In [0]:
%scala
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame

// ===============================
// PARAMETERS
// ===============================
// Job parameters are read from Databricks widgets:
//   periods       - comma-separated list of periods to ingest. loadBronze reads the
//                   first four characters as the year and the text after the fifth
//                   character as the month (presumably "2023-01,2023-02" — confirm).
//   skip_cleaning - "true" / "false"; when true, STEP 1 (data cleaning) is bypassed.

val inputPeriods = dbutils.widgets.get("periods")
  .split(",")
  .map(_.trim)
  .filter(_.nonEmpty) // tolerate blank entries such as a trailing comma
  .toList
require(inputPeriods.nonEmpty, "Widget 'periods' must contain at least one period")

// Trim so that values like " true" do not fail Boolean parsing
val skipCleaning = dbutils.widgets.get("skip_cleaning").trim.toBoolean

// Name of the Delta table that receives the engineered features
val featureTable = "silver_layer_yellow_taxi_including_features"

// ===============================
// HELPER FUNCTIONS
// ===============================
/**
 * Loads the validated bronze table for a single period.
 *
 * The period must consist of a four-digit year, one separator character and the
 * month (for example "2023-01"; the separator itself is not inspected). Year and
 * month are inserted verbatim into the table name, so the month must be written
 * the same way it appears in the bronze table names.
 *
 * @param period period identifier, at least six characters long
 * @return the table `hive_metastore.default.bronze_layer_<year>_<month>_yellow_taxi_valid`
 * @throws IllegalArgumentException if `period` is too short to contain year, separator and month
 */
def loadBronze(period: String): DataFrame = {
  require(
    period.length >= 6,
    s"Invalid period '$period': expected <year><separator><month>, e.g. 2023-01"
  )
  val tyear  = period.substring(0, 4)
  // Take everything after the separator so that two-digit months ("01", "10", ...)
  // are kept intact instead of being cut down to their first character.
  val tmonth = period.substring(5)
  val tableName = s"hive_metastore.default.bronze_layer_${tyear}_${tmonth}_yellow_taxi_valid"
  spark.table(tableName)
}



/**
 * Removes outliers from a numeric column using the 1.5 * IQR rule.
 *
 * The first and third quartiles are approximated with `approxQuantile`
 * (relative error 0.01); only rows whose value lies within
 * [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are kept. Rows where the column is null do
 * not satisfy the range predicate and are therefore dropped as well.
 *
 * @param df      input DataFrame
 * @param colName name of the numeric column to filter on
 * @return `df` restricted to rows inside the IQR fences
 */
def removeOutliers(df: DataFrame, colName: String): DataFrame =
  df.stat.approxQuantile(colName, Array(0.25, 0.75), 0.01) match {
    case Array(q1, q3) =>
      val iqr = q3 - q1
      df.filter(col(colName).between(q1 - 1.5 * iqr, q3 + 1.5 * iqr))
    case _ =>
      // approxQuantile returns no quantiles when the column has no non-null values
      // (empty input or all nulls). Indexing the result would throw, so just apply
      // the same "nulls are dropped" outcome the regular path produces.
      df.filter(col(colName).isNotNull)
  }

// ===============================
// STEP 0: INGESTION
// ===============================
// Load every requested period and stack the results into one DataFrame,
// matching columns by name. reduceOption is used so that an empty period list
// fails with an explicit message instead of an opaque "empty.reduceLeft".
val rawDF: DataFrame = inputPeriods
  .map(loadBronze)
  .reduceOption(_ unionByName _)
  .getOrElse(
    throw new IllegalArgumentException("No periods to ingest: the 'periods' parameter is empty")
  )

// Quick visual check of the ingested rows
rawDF.show(5)


// ===============================
// STEP 1: DATA CLEANING (optional)
// ===============================
// Either passes the raw data through untouched (skipCleaning = true) or runs the
// cleaning pipeline: de-duplication -> validity filters -> null removal -> outlier removal.
val processedDF: DataFrame =
  if (skipCleaning) {
    println("⏩ Skipping Data Cleaning. Using raw ingested data.")
    rawDF
  } else {
    // Columns that together identify a trip for de-duplication
    val tripKeyCols = Seq(
      "VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime",
      "passenger_count", "PULocationID", "DOLocationID"
    )
    // A row must satisfy every one of these conditions to be kept
    val validityRules = Seq(
      col("fare_amount") >= 0,
      col("trip_distance") >= 0,
      col("passenger_count") > 0,
      col("total_amount") >= 0
    )
    // Rows with a null in any of these fields are discarded
    val requiredCols = Seq(
      "tpep_pickup_datetime", "tpep_dropoff_datetime",
      "fare_amount", "trip_distance", "total_amount"
    )

    val uniqueTrips = rawDF.dropDuplicates(tripKeyCols)
    val validTrips = validityRules.foldLeft(uniqueTrips)((df, rule) => df.filter(rule))
    val completeTrips = validTrips.na.drop(requiredCols)

    // Outliers are removed on trip_distance first, then on fare_amount
    val withoutOutliers = Seq("trip_distance", "fare_amount")
      .foldLeft(completeTrips)((df, c) => removeOutliers(df, c))

    println(s"✅ Data Cleaning complete. Records after cleaning: ${withoutOutliers.count()}")
    withoutOutliers
  }

// ===============================
// STEP 2: FEATURE ENGINEERING
// ===============================
// Adds the derived feature columns to processedDF.
val feDF = {
  val pickupTs  = col("tpep_pickup_datetime")
  val dropoffTs = col("tpep_dropoff_datetime")
  val fare      = col("fare_amount")
  val distance  = col("trip_distance")
  val duration  = col("trip_duration_min")

  // Applied strictly in order: fare_per_min depends on trip_duration_min created first.
  val derivedColumns = Seq(
    // Trip duration in minutes
    "trip_duration_min" -> ((unix_timestamp(dropoffTs) - unix_timestamp(pickupTs)) / 60.0),
    // Temporal features
    "pickup_hour"       -> hour(pickupTs),
    "pickup_dayofweek"  -> date_format(pickupTs, "E"),
    "pickup_month"      -> month(pickupTs),
    // Efficiency metrics (null when the denominator is not positive)
    "fare_per_mile"     -> when(distance > 0, fare / distance),
    "fare_per_min"      -> when(duration > 0, fare / duration),
    // Revenue component flags (1 / 0)
    "has_tip"           -> when(col("tip_amount") > 0, 1).otherwise(0),
    "is_airport_trip"   -> when(col("airport_fee") > 0, 1).otherwise(0)
  )

  derivedColumns.foldLeft(processedDF) { case (df, (name, expr)) =>
    df.withColumn(name, expr)
  }
}

// ===============================
// SAVE FINAL FEATURED DATA
// ===============================
// Write the featured data as a Delta table, overwriting whatever the table held before.
feDF.write
  .format("delta")
  .mode("overwrite")
  .saveAsTable(featureTable)

println(s"Feature Engineering complete. Output table: $featureTable")

///val featureTableName = "featured_customers"

///fs.createFeatureTable(
///  name = featureTableName,
///  primaryKeys = Seq("id"),      // The unique key(s) for your features
///  df = feDF,                     // The DataFrame with features
///  description = "Customer features for ML models"
///)


In [0]:
%scala
// ===============================
// Feature Statistics
// ===============================
// Numeric feature columns that are profiled here
val numericCols = Seq(
  "trip_duration_min",
  "trip_distance",
  "fare_amount",
  "fare_per_mile",
  "fare_per_min",
  "tip_amount"
)

// count / mean / stddev / min / max for each of the numeric features
val numericSummary = feDF.describe(numericCols: _*)
display(numericSummary)




In [0]:
// Render only the trip_duration_min column of the featured data with Databricks' display
display(feDF.select("trip_duration_min"))

Databricks visualization. Run in Databricks to view.

In [0]:
import org.apache.spark.sql.functions._

// Counts trips per distinct value of one temporal feature column of feDF.
// The result keeps the grouping column and adds:
//   num_trips     - number of rows in the group
//   feature_type  - the name of the grouping column
//   feature_value - the group value as a string (a no-op cast for string columns)
def tripCountsBy(feature: String): DataFrame =
  feDF.groupBy(feature).count()
    .withColumnRenamed("count", "num_trips")
    .withColumn("feature_type", lit(feature))
    .withColumn("feature_value", col(feature).cast("string"))

// Pickup hour counts
val hourDF = tripCountsBy("pickup_hour")

// Pickup day of week counts
val dayDF = tripCountsBy("pickup_dayofweek")

// Pickup month counts
val monthDF = tripCountsBy("pickup_month")

// Stack the three breakdowns (hour, day of week, month) into one long-format table
val temporalDF = Seq(hourDF, dayDF, monthDF)
  .map(_.select("feature_type", "feature_value", "num_trips"))
  .reduce(_ unionByName _)

display(temporalDF)


In [0]:
// ===============================
// Binary Feature Distributions
// ===============================
// 0/1 indicator features; one row-count table is displayed per feature
val binaryCols = Seq("has_tip", "is_airport_trip")
for (flag <- binaryCols) {
  val distribution = feDF.groupBy(flag).count()
  display(distribution)
}

In [0]:
// ===============================
// Correlation Matrix (Numeric Features)
// ===============================
// Correlation (via feDF.stat.corr) for every ordered pair of numeric features,
// including each feature with itself. One Spark computation runs per pair.
val corrValues = numericCols.flatMap { first =>
  numericCols.map { second =>
    (first, second, feDF.stat.corr(first, second))
  }
}

// Long-format table: one row per (feature1, feature2) pair
val corrDF = corrValues.toDF("feature1", "feature2", "correlation")
display(corrDF)