# Big Data project A.Y. 2024-2025 - First Job

## Members

- Giovanni Antonioni
- Luca Rubboli - 0001083742

### Define useful parameters

- Dataset location
- Iterator (defined like this to overcome different names for same columns in dataset)

In [1]:
import org.apache.spark
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder
  .appName("First job")
  .getOrCreate()

val sc = spark.sparkContext

Intitializing Scala interpreter ...

Spark Web UI available at http://10.201.106.166:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1747155350627)
SparkSession available as 'spark'


import org.apache.spark
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5e2b1d9a
sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@2d11f91c


In [2]:
val decimals: Int = 4
val datasetDir = "dataset"
val outputDir = "output/firstJobOutput"
val yellowDatasetDir = s"$datasetDir/yellow_cab"
val greenDatasetDir = s"$datasetDir/green_cab"
val fhvDatasetDir = s"$datasetDir/fhv_cab"
val fhvhvDatasetDir = s"$datasetDir/fhvhv_cab"
val datasetDirMap: Map[String, String] = Map("yellow" -> yellowDatasetDir, "green" -> greenDatasetDir, "fhv" -> fhvDatasetDir, "fhvhv" -> fhvhvDatasetDir)
val datasetIterator: Iterable[(String, String, String)] = Seq(
  ("yellow", "tpep_dropoff_datetime", "tpep_pickup_datetime"),
  ("green", "lpep_dropoff_datetime", "lpep_pickup_datetime"),
  // ("fhv", "tpep_dropoff_datetime", "tpep_pickup_datetime"),
  // ("fhvhv", "tpep_dropoff_datetime", "tpep_pickup_datetime"),
)

decimals: Int = 4
datasetDir: String = dataset
outputDir: String = output/firstJobOutput
yellowDatasetDir: String = dataset/yellow_cab
greenDatasetDir: String = dataset/green_cab
fhvDatasetDir: String = dataset/fhv_cab
fhvhvDatasetDir: String = dataset/fhvhv_cab
datasetDirMap: Map[String,String] = Map(yellow -> dataset/yellow_cab, green -> dataset/green_cab, fhv -> dataset/fhv_cab, fhvhv -> dataset/fhvhv_cab)
datasetIterator: Iterable[(String, String, String)] = List((yellow,tpep_dropoff_datetime,tpep_pickup_datetime), (green,lpep_dropoff_datetime,lpep_pickup_datetime))


## Define Columns for analysis
- Columns names
- Time zones for overprice
- Columns used in classification for average price calculation
- Columns which values are used in analysis

In [3]:
val colDurationMinutes: String = "duration_minutes"
val colDurationMinutesBinLabel: String = "duration_minutes_bin_label"
val colYear: String = "year"
val colWeekdaySurcharge: String = "weekday_surcharge"
val colAggregateFee: String = "fees"
val colAggregateFeeBin: String = "agg_fee_bin_label"
val colDistanceBin: String = "distance_bin_label"
val colFareAmount: String = "fare_amount"
val colPricePerDistance: String = "cost_per_distance"
val colPricePerTime: String = "cost_per_time"
val colAvgPricePerDistance: String = "avg_cost_per_distance"
val colAvgPricePerTime: String = "avg_cost_per_time"
val colPricePerDistanceDiff: String = "cost_per_distance_diff"
val colPricePerDistanceDiffPcg: String = "cost_per_distance_diff_pcg"
val colPricePerTimeDiff: String = "cost_per_time_diff"
val colPricePerTimeDiffPcg: String = "cost_per_time_diff_pcg"
val colPricePerDistanceDiffPcgLabel: String = colPricePerDistanceDiffPcg + "_label"
val colPricePerTimeDiffPcgLabel: String = colPricePerTimeDiffPcg + "_label"

val timeZoneOver: String = "overnight"
val timeZones = Map(timeZoneOver -> (20, 6), "regular" -> (6, 20))
val weekDaySurcharge: Double = 2.5

val colDurationOvernightPcg: String = s"${timeZoneOver}_duration_pcg"

val colToUse: Set[String] = Set(
  "tpep_pickup_datetime",
  "tpep_dropoff_datetime",
  "lpep_pickup_datetime",
  "lpep_dropoff_datetime",
  "passenger_count",
  "trip_distance",
  "ratecodeid",
  "store_and_fwd_flag",
  "payment_type",
  "fare_amount",
  "extra",
  "mta_tax",
  "tip_amount",
  "tolls_amount",
  "improvement_surcharge",
  "total_amount",
  "congestion_surcharge",
  "airport_fee")

val colFees: Set[String] = Set(
  "extra",
  "mta_tax",
  "improvement_surcharge",
  "congestion_surcharge",
  "airport_fee")

val colsForClassification: Seq[String] = Seq(
  "passenger_count",
  "store_and_fwd_flag",
  "payment_type",
  colAggregateFeeBin,
  colDurationMinutesBinLabel,
  colDistanceBin,
  colYear,
  s"${colDurationOvernightPcg}_label",
  colPricePerDistanceDiffPcgLabel,
  colPricePerTimeDiffPcgLabel
)

val colsForValuesAnalysis: Seq[String] = Seq(
  "passenger_count",
  "store_and_fwd_flag",
  "payment_type",
  colAggregateFeeBin,
  colDurationMinutesBinLabel,
  colDistanceBin,
  colYear,
  s"${colDurationOvernightPcg}_label",
)

colDurationMinutes: String = duration_minutes
colDurationMinutesBinLabel: String = duration_minutes_bin_label
colYear: String = year
colWeekdaySurcharge: String = weekday_surcharge
colAggregateFee: String = fees
colAggregateFeeBin: String = agg_fee_bin_label
colDistanceBin: String = distance_bin_label
colFareAmount: String = fare_amount
colPricePerDistance: String = cost_per_distance
colPricePerTime: String = cost_per_time
colAvgPricePerDistance: String = avg_cost_per_distance
colAvgPricePerTime: String = avg_cost_per_time
colPricePerDistanceDiff: String = cost_per_distance_diff
colPricePerDistanceDiffPcg: String = cost_per_distance_diff_pcg
colPricePerTimeDiff: String = cost_per_time_diff
colPricePerTimeDiffPcg: String = cost_per_time_diff_pcg
colPricePerDistanceDiffPcgLabel: String = ...


### Define preprocess rules

In [4]:
val featureFilters: Map[String, Any => Boolean] = Map(
  "passenger_count" -> {
    case i: Int => i > 0
    case f: Float => val i = f.toInt; i > 0
    case d: Double => val i = d.toInt; i > 0
    case _ => false
  },
  "trip_distance" -> {
    case i: Int => i > 0
    case i: Float => i > 0
    case i: Double => i > 0
    case _ => false
  },
  "ratecodeid" -> {
    case i: Int => (i >= 1 && i <= 6) || i == 99
    case f: Float => val i = f.toInt; (i >= 1 && i <= 6) || i == 99
    case d: Double => val i = d.toInt; (i >= 1 && i <= 6) || i == 99
    case _ => false
  },
  "store_and_fwd_flag" -> {
    case i: String => i == "Y" || i == "N"
    case _ => false
  },
  "payment_type" -> {
    case i: Int => i >= 1 && i <= 6
    case f: Float => val i = f.toInt; i >= 1 && i <= 6
    case d: Double => val i = d.toInt; i >= 1 && i <= 6
    case _ => false
  },
  "fare_amount" -> {
    case i: Int => i > 0
    case i: Float => i > 0
    case i: Double => i > 0
    case _ => false
  },
  "tolls_amount" -> {
    case i: Int => i >= 0 && i < 200
    case i: Float => i >= 0 && i < 200
    case i: Double => i >= 0 && i < 200
    case _ => false
  }
)

val taxFilter: Any => Boolean = {
  case tax: Int => tax >= 0 && tax < 20
  case tax: Float => tax >= 0 && tax < 20
  case tax: Double => tax >= 0 && tax < 20
  case _ => false
}

featureFilters: Map[String,Any => Boolean] = Map(trip_distance -> $Lambda$2372/0x0000000801086040@6445100d, tolls_amount -> $Lambda$2377/0x000000080108a040@666ac400, payment_type -> $Lambda$2375/0x0000000801088840@6ae6cbce, fare_amount -> $Lambda$2376/0x0000000801089040@51b2ae58, passenger_count -> $Lambda$2371/0x0000000801085840@1dbb191b, store_and_fwd_flag -> $Lambda$2374/0x0000000801087840@2dc18468, ratecodeid -> $Lambda$2373/0x0000000801087040@71a592a5)
taxFilter: Any => Boolean = $Lambda$2378/0x000000080108a840@35a132a7


### Utils functions for rdd

In [5]:
import java.time.temporal.ChronoUnit
import java.time.{DayOfWeek, LocalDate, LocalDateTime}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import scala.math.BigDecimal.RoundingMode


def binColByStepValue(rdd: RDD[Row], indexOfColToDiscrete: Int, stepValue: Int = 5): RDD[Row] = {
  rdd.map { row =>
    val value: Double = row.get(indexOfColToDiscrete) match {
      case i: Int => i.toDouble
      case d: Double => d
      case l: Long => l.toDouble
      case s: String => try { s.toDouble } catch { case _: Throwable => Double.NaN}
      case _ => Double.NaN
    }

    val rawBin = (value / stepValue).toInt * stepValue
    val binBase = if (value < 0 && value % stepValue == 0) rawBin + stepValue else rawBin
    val label = if (value < 0) { s"[${(binBase - stepValue).toInt}|${binBase.toInt})" } else { s"[${binBase.toInt}|${(binBase + stepValue).toInt})" }

    Row.fromSeq(row.toSeq :+ label)
  }
}

val castForFilter: Any => Any = {
  case s: String => if (s.matches("""^-?\d+\.\d+$""")) s.toDouble else if (s.matches("""^-?\d+$""")) s.toInt else s.trim
  case d: Double => d
  case i: Int => i
  case l: Long => l.toDouble
  case f: Float => f.toDouble
  case b: Boolean => b
  case null => null
  case other => other.toString.trim
}

val preciseBucketUDF: (Map[String, (Int, Int)], LocalDateTime, LocalDateTime, Int) => Map[String, Double] = { (timeZones: Map[String, (Int, Int)], start: LocalDateTime, end: LocalDateTime, decimals: Int) =>

  val overlap: (LocalDateTime, LocalDateTime, LocalDateTime, LocalDateTime, Int) => Double = { (start1: LocalDateTime, end1: LocalDateTime, start2: LocalDateTime, end2: LocalDateTime, decimals: Int) =>
    val overlapStart = if (start1.isAfter(start2)) start1 else start2
    val overlapEnd = if (end1.isBefore(end2)) end1 else end2
    if (overlapEnd.isAfter(overlapStart)) BigDecimal(ChronoUnit.MILLIS.between(overlapStart, overlapEnd) / 60000.0).setScale(decimals, RoundingMode.HALF_UP).toDouble else 0.0
  }

  var result = timeZones.keys.map(_ -> 0.0).toMap

  if (!(start == null || end == null)) {

    if (!end.isBefore(start)) {

      var current = start.toLocalDate.atStartOfDay

      while (!current.isAfter(end)) {
        val nextDay = current.plusDays(1)

        timeZones.foreach {
          case (label, (startHour, endHour)) if startHour > endHour =>
            val bucketStartBeforeMidnight = current.withHour(startHour).withMinute(0).withSecond(0).withNano(0)
            val bucketEndBeforeMidnight = current.withHour(23).withMinute(59).withSecond(59)
            val bucketStartAfterMidnight = current.withHour(0).withMinute(0).withSecond(0).withNano(0)
            val bucketEndAfterMidnight = current.withHour(endHour).withMinute(0).withSecond(0).withNano(0)

            val minutesBeforeMidnight = overlap(start, end, bucketStartBeforeMidnight, bucketEndBeforeMidnight, decimals)
            val minutesAfterMidnight = overlap(start, end, bucketStartAfterMidnight, bucketEndAfterMidnight, decimals)

            result = result.updated(label, result(label) + minutesBeforeMidnight + minutesAfterMidnight)

          case (label, (startHour, endHour)) =>
            val bucketStart = current.withHour(startHour).withMinute(0).withSecond(0).withNano(0)
            val bucketEnd = if (endHour == 24) current.plusDays(1).withHour(0).withMinute(0).withSecond(0).withNano(0) else current.withHour(endHour).withMinute(0).withSecond(0).withNano(0)

            val minutes = overlap(start, end, bucketStart, bucketEnd, decimals)

            result = result.updated(label, result(label) + minutes)
        }

        current = nextDay
      }
    }
  }
  result
}

val isUSHolidayOrWeekend: LocalDate => Boolean = { date =>
  val month = date.getMonthValue
  val day = date.getDayOfMonth
  val dayOfWeek = date.getDayOfWeek

  val isIndependenceDay = month == 7 && day == 4
  val isChristmas = month == 12 && day == 25
  val isNewYear = month == 1 && day == 1
  val isLaborDay = month == 9 && dayOfWeek == DayOfWeek.MONDAY && day <= 7

  val isThanksgiving = month == 11 && dayOfWeek == DayOfWeek.THURSDAY && day >= 22 && day <= 28 && ((day - 1) / 7 + 1 == 4)

  isIndependenceDay || isChristmas || isNewYear || isLaborDay || isThanksgiving || dayOfWeek == DayOfWeek.SATURDAY || dayOfWeek == DayOfWeek.SUNDAY
}

import java.time.temporal.ChronoUnit
import java.time.{DayOfWeek, LocalDate, LocalDateTime}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import scala.math.BigDecimal.RoundingMode
binColByStepValue: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], indexOfColToDiscrete: Int, stepValue: Int)org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]
castForFilter: Any => Any = $Lambda$2515/0x00000008010f6840@3e1ad086
preciseBucketUDF: (Map[String,(Int, Int)], java.time.LocalDateTime, java.time.LocalDateTime, Int) => Map[String,Double] = $Lambda$2516/0x00000008010f7040@60345872
isUSHolidayOrWeekend: java.time.LocalDate => Boolean = $Lambda$2517/0x00000008010f7840@1555ca19


In [6]:
val projectDir: String = "/Users/luca/Desktop/Luca/Università/Magistrale/Corsi/BigData/Drivers"

def getDatasetPath(localPath: String): String = {
  "file://" + projectDir + "/" + localPath
}

projectDir: String = /Users/luca/Desktop/Luca/Università/Magistrale/Corsi/BigData/Drivers
getDatasetPath: (localPath: String)String


# Actual job

1) Select dataset, dropoff and pickup columns

In [7]:
val name: String = "green"
val dropoff: String = "lpep_dropoff_datetime"
val pickup: String = "lpep_pickup_datetime"

name: String = green
dropoff: String = lpep_dropoff_datetime
pickup: String = lpep_pickup_datetime


2) Load dataset

In [8]:
val dataset = spark.read.parquet(getDatasetPath(datasetDirMap(name)))
var headers: Seq[String] = dataset.columns.map(_.toLowerCase)
val indexesToUse: Seq[Int] = headers.zipWithIndex.collect {
  case(h, i) if colToUse.contains(h.toLowerCase) => i
}

dataset: org.apache.spark.sql.DataFrame = [VendorID: int, lpep_pickup_datetime: timestamp_ntz ... 18 more fields]
headers: Seq[String] = ArraySeq(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pulocationid, dolocationid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge)
indexesToUse: Seq[Int] = ArraySeq(1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 19)


In [9]:
import java.io._

def isSerializable(obj: Any): Boolean = {
  try {
    val bos = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bos)
    out.writeObject(obj)
    out.close()
    true
  } catch {
    case e: Exception =>
    println(s"Serialization failed: ${e}")
    e.printStackTrace()
    false
  }
}

import java.io._
isSerializable: (obj: Any)Boolean


### Filter taxes and features based on filter conditions previously defined

In [10]:
import org.apache.spark.sql.DataFrame

def transformRDD(dataset: DataFrame, headers: Seq[String], idxs: Seq[Int], castFunc: Any => Any): RDD[Row] = {
  dataset.rdd.map(row => Row.fromSeq(idxs.map(row.get).map(castFunc)))
}

var rdd = transformRDD(dataset, headers, indexesToUse, castForFilter)
headers = headers.filter(head => colToUse.contains(head.toLowerCase))

def applyFilters(rdd: RDD[Row], headers: Seq[String], colOfFees: Set[String], taxFilter: Any => Boolean, featFilter: Map[String, Any => Boolean]): RDD[Row] = {
  rdd.filter { row =>
    headers.zip(row.toSeq).forall {case(header: String, value) =>
      val taxFilterCondition = if (colOfFees.contains(header.toLowerCase)) taxFilter(value) else true
      featFilter.get(header.toLowerCase) match {
        case Some(filterFunc) => taxFilterCondition && filterFunc(value)
        case None => taxFilterCondition // no filter defined for this column, so accept it
      }
    }
  }
}

rdd = applyFilters(rdd, headers, colFees, taxFilter, featureFilters)

import org.apache.spark.sql.DataFrame
transformRDD: (dataset: org.apache.spark.sql.DataFrame, headers: Seq[String], idxs: Seq[Int], castFunc: Any => Any)org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[9] at filter at <console>:51
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge)
applyFilters: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], headers: Seq[String], colOfFees: Set[String], taxFilter: Any => Boolean, featFilter: Map[String,Any => Boolean])org.apache.spark.rdd.RDD[org.apache....


### Add duration and timezones

In [11]:
import java.time.format.DateTimeFormatter
import java.time.Duration

def addDuration(rdd: RDD[Row], headers: Seq[String], pickup: String, dropoff: String, decimals: Int): RDD[Row] = {
  rdd.map {row =>
    val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm[:ss]")

    val pickupStr = row.getAs[String](headers.indexOf(pickup)).trim
    val dropoffStr = row.getAs[String](headers.indexOf(dropoff)).trim

    val pickupTS = LocalDateTime.parse(pickupStr, formatter)
    val dropoffTS = LocalDateTime.parse(dropoffStr, formatter)
    val durationMillis = Duration.between(pickupTS, dropoffTS).toMillis
    val durationMinutes = BigDecimal(durationMillis / 60000.0).setScale(decimals, RoundingMode.HALF_UP).toDouble

    val pickupYear = pickupTS.getYear

    Row.fromSeq(row.toSeq ++ Seq(durationMinutes, pickupYear))
  }.filter { row => row.getAs[Double](row.toSeq.length - 2) > 0.0 }
}

rdd = addDuration(rdd, headers, pickup, dropoff, decimals)
headers = headers ++ Seq(colDurationMinutes, colYear)

rdd = binColByStepValue(rdd, headers.indexOf(colDurationMinutes), 5)
headers = headers :+ colDurationMinutesBinLabel

def addTimeZones(rdd: RDD[Row], headers: Seq[String], timezones: Map[String, (Int, Int)], weekDaySurcharge: Double, colDuration: String, pickup: String, dropoff: String, decimals: Int, preciseBucketUDF: (Map[String, (Int, Int)], LocalDateTime, LocalDateTime, Int) => Map[String, Double], isUSHolidayOrWeekendTZ: LocalDate => Boolean): RDD[Row] = {
  rdd.map { row =>
    val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm[:ss]")

    val timeZonesDuration: Map[String, Double] = preciseBucketUDF(timezones, LocalDateTime.parse(row.getAs[String](headers.indexOf(pickup)).trim, formatter), LocalDateTime.parse(row.getAs[String](headers.indexOf(dropoff)).trim, formatter), decimals)

    val weekday_surcharge: Double = if (isUSHolidayOrWeekendTZ(LocalDateTime.parse(row.getAs[String](headers.indexOf(pickup)).trim, formatter).toLocalDate)) 0 else weekDaySurcharge
    val colsToAdd: Seq[Double] = timezones.keys.toSeq.flatMap { tz =>
      val duration = timeZonesDuration.getOrElse(tz, 0.0)
      val totalDuration = row.getAs[Double](headers.indexOf(colDuration))
      Seq(duration, BigDecimal(duration * 100 / totalDuration).setScale(decimals, RoundingMode.HALF_UP).toDouble)
    }
    Row.fromSeq((row.toSeq ++ colsToAdd) :+ weekday_surcharge)
  }
}

rdd = addTimeZones(rdd, headers, timeZones, weekDaySurcharge, colDurationMinutes, pickup, dropoff, decimals, preciseBucketUDF, isUSHolidayOrWeekend)

val headersToAdd: Seq[String] = timeZones.keys.toSeq.flatMap { tz =>
  Seq(tz + "_duration", tz + "_duration_pcg")
} :+ colWeekdaySurcharge

headers = headers ++ headersToAdd

rdd.take(5).foreach(println)

[2024-05-01T00:07:08,2024-05-01T00:15:03,N,1.0,1.0,1.24,9.3,1.0,0.5,2.0,0.0,1.0,13.8,1.0,0.0,7.9167,2024.0,[5|10),7.9167,100.0,0.0,0.0,2.5]
[2024-05-01T00:30:48,2024-05-01T00:35:49,N,1.0,1.0,0.94,7.2,1.0,0.5,1.94,0.0,1.0,11.64,1.0,0.0,5.0167,2024.0,[5|10),5.0167,100.0,0.0,0.0,2.5]
[2024-05-01T00:34:13,2024-05-01T00:38:07,N,1.0,1.0,0.84,6.5,1.0,0.5,0.0,0.0,1.0,9.0,2.0,0.0,3.9,2024.0,[0|5),3.9,100.0,0.0,0.0,2.5]
[2024-05-01T00:58:01,2024-05-01T01:14:41,N,1.0,1.0,6.07,25.4,1.0,0.5,5.0,0.0,1.0,32.9,1.0,0.0,16.6667,2024.0,[15|20),16.6667,100.0,0.0,0.0,2.5]
[2024-05-01T00:11:45,2024-05-01T00:20:38,N,1.0,2.0,2.06,12.1,1.0,0.5,2.92,0.0,1.0,17.52,1.0,0.0,8.8833,2024.0,[5|10),8.8833,100.0,0.0,0.0,2.5]


import java.time.format.DateTimeFormatter
import java.time.Duration
addDuration: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], headers: Seq[String], pickup: String, dropoff: String, decimals: Int)org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[13] at map at <console>:76
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge, duration_minutes, year, duration_minutes_bin_label, overnight_duration, overnight_duration_pcg, regular_duration, regular_duration_pcg, weekday_surcharge)
rdd: org.apa...


### Add Aggregate fees and bins

In [12]:
def addAggregateFees(rdd: RDD[Row], headers: Seq[String], colOfFees: Set[String]): RDD[Row] = {
  rdd.map { row =>
    val fees = colOfFees
      .filter(col => headers.contains(col.toLowerCase))
      .map(col => row.getAs[Double](headers.indexOf(col.toLowerCase))).sum

    Row.fromSeq(row.toSeq :+ fees)
  }
}

rdd = addAggregateFees(rdd, headers, colFees)
headers = headers :+ colAggregateFee

rdd = binColByStepValue(rdd, headers.indexOf(colAggregateFee), 2)
headers = headers :+ colAggregateFeeBin

addAggregateFees: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], headers: Seq[String], colOfFees: Set[String])org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[15] at map at <console>:35
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge, duration_minutes, year, duration_minutes_bin_label, overnight_duration, overnight_duration_pcg, regular_duration, regular_duration_pcg, weekday_surcharge, fees, agg_fee_bin_label)
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD...


### Add price per mile and minute

In [13]:
def addPricePerDistanceAndTime(rdd: RDD[Row], headers: Seq[String], colFareAmount: String, colDuration: String, colDistance: String): RDD[Row] = {
  rdd.map { row =>
    val pricePerTime = Math.round(row.getAs[Double](headers.indexOf(colFareAmount)) / row.getAs[Double](headers.indexOf(colDuration)) * 100) / 100.0
    val pricePerDistance = Math.round(row.getAs[Double](headers.indexOf(colFareAmount)) / row.getAs[Double](headers.indexOf(colDistance)) * 100) / 100.0

    Row.fromSeq(row.toSeq ++ Seq(pricePerTime, pricePerDistance))
  }
}

rdd = addPricePerDistanceAndTime(rdd, headers, colFareAmount, colDurationMinutes, "trip_distance")
headers = headers ++ Seq(colPricePerTime, colPricePerDistance)

addPricePerDistanceAndTime: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], headers: Seq[String], colFareAmount: String, colDuration: String, colDistance: String)org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[16] at map at <console>:43
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge, duration_minutes, year, duration_minutes_bin_label, overnight_duration, overnight_duration_pcg, regular_duration, regular_duration_pcg, weekday_surcharge, fees, agg_fee_bin_label, cost_per_time, cost_per...


### Add distance bin and duration in overnight time zone

In [14]:
rdd = binColByStepValue(rdd, headers.indexOf("trip_distance"), 5)
headers = headers :+ colDistanceBin

rdd = binColByStepValue(rdd, headers.indexOf(colDurationOvernightPcg), 5)
headers = headers :+ (colDurationOvernightPcg + "_label")

rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[18] at map at <console>:35
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge, duration_minutes, year, duration_minutes_bin_label, overnight_duration, overnight_duration_pcg, regular_duration, regular_duration_pcg, weekday_surcharge, fees, agg_fee_bin_label, cost_per_time, cost_per_distance, distance_bin_label, overnight_duration_pcg_label)
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[18] at map at <console>:35
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_d...


### Add key for average calculation based on columns for classification

In [15]:
val actualHeader = headers
def addKey(rdd: RDD[Row], colsClassification: Seq[String], headers: Seq[String]): RDD[(String, Row)] = {
  rdd.map { row =>
    val key = colsClassification.filter(col => headers.contains(col.toLowerCase))
    .map(col => row.get(headers.indexOf(col.toLowerCase)))
    .mkString("_")
    (key, row)
  }
}

val rddWithKey = addKey(rdd, colsForClassification, actualHeader)

actualHeader: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge, duration_minutes, year, duration_minutes_bin_label, overnight_duration, overnight_duration_pcg, regular_duration, regular_duration_pcg, weekday_surcharge, fees, agg_fee_bin_label, cost_per_time, cost_per_distance, distance_bin_label, overnight_duration_pcg_label)
addKey: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], colsClassification: Seq[String], headers: Seq[String])org.apache.spark.rdd.RDD[(String, org.apache.spark.sql.Row)]
rddWithKey: org.apache.spark.rdd.RDD[(String, org.apache.spark.sql.Row)] = Map...


### Calculate prices per distance and time

In [16]:
import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel

val numPartitions = spark.sparkContext.defaultParallelism
val partitioner = new HashPartitioner(numPartitions)

def calculatePrices(rdd: RDD[(String, Row)], headers: Seq[String], colPriceDistance: String, colPriceTime: String): RDD[(String, (Double, Double, Long))] = {
  rdd.mapValues { row =>
    val costPerDistance = row.getAs[Double](headers.indexOf(colPriceDistance))
    val costPerTime = row.getAs[Double](headers.indexOf(colPriceTime))
    (costPerDistance, costPerTime, 1L)
  }
}

val rddForAvg = calculatePrices(rddWithKey, headers, colPricePerDistance, colPricePerTime).partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK)

import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel
numPartitions: Int = 12
partitioner: org.apache.spark.HashPartitioner = org.apache.spark.HashPartitioner@c
calculatePrices: (rdd: org.apache.spark.rdd.RDD[(String, org.apache.spark.sql.Row)], headers: Seq[String], colPriceDistance: String, colPriceTime: String)org.apache.spark.rdd.RDD[(String, (Double, Double, Long))]
rddForAvg: org.apache.spark.rdd.RDD[(String, (Double, Double, Long))] = ShuffledRDD[21] at partitionBy at <console>:57


### Calculate average prices per distance and time

In [17]:
def calculateAvgPrices(rdd: RDD[(String, (Double, Double, Long))], decimals: Int): RDD[(String, (Double, Double))] = {
  rdd.aggregateByKey((0.0, 0.0, 0L))((acc, v) => (acc._1 + v._1, acc._2 + v._2, acc._3 + v._3), (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3)).mapValues {
    case(sumDist, sumTime, count) =>
      val avgDist = BigDecimal(sumDist / count).setScale(decimals, BigDecimal.RoundingMode.HALF_UP).toDouble
      val avgTime = BigDecimal(sumTime / count).setScale(decimals, BigDecimal.RoundingMode.HALF_UP).toDouble
      (avgDist, avgTime)
  }.filter { case(_, (dist, time)) => dist > 0.0 && time > 0.0 }
}

val rddWithAvgPrices = calculateAvgPrices(rddForAvg, decimals)
rddForAvg.unpersist()

calculateAvgPrices: (rdd: org.apache.spark.rdd.RDD[(String, (Double, Double, Long))], decimals: Int)org.apache.spark.rdd.RDD[(String, (Double, Double))]
rddWithAvgPrices: org.apache.spark.rdd.RDD[(String, (Double, Double))] = MapPartitionsRDD[24] at filter at <console>:47
res1: rddForAvg.type = ShuffledRDD[21] at partitionBy at <console>:57


### Join average prices to previous rdd

In [18]:
import org.apache.spark.broadcast.Broadcast

val broadcastAvgPrices: Broadcast[Map[String, (Double, Double)]] = spark.sparkContext.broadcast(rddWithAvgPrices.collectAsMap().toMap)

def applyJoin(rdd: RDD[(String, Row)], broadcastMap: Broadcast[Map[String, (Double, Double)]]): RDD[Row] = {
  rddWithKey.flatMap { case (key, originalRow) =>
    broadcastMap.value.get(key).map { case(avgCostPerDistance, avgCostPerTime) =>
      Row.fromSeq(originalRow.toSeq ++ Seq(avgCostPerDistance, avgCostPerTime))
    }
  }
}

rdd = applyJoin(rddWithKey, broadcastAvgPrices)
headers = headers ++ Seq(colAvgPricePerDistance, colAvgPricePerTime)

rdd.take(100).foreach(println)

[2024-05-01T00:07:08,2024-05-01T00:15:03,N,1.0,1.0,1.24,9.3,1.0,0.5,2.0,0.0,1.0,13.8,1.0,0.0,7.9167,2024.0,[5|10),7.9167,100.0,0.0,0.0,2.5,1.5,[0|2),1.17,7.5,[0|5),[100|105),8.3097,1.3835]
[2024-05-01T00:30:48,2024-05-01T00:35:49,N,1.0,1.0,0.94,7.2,1.0,0.5,1.94,0.0,1.0,11.64,1.0,0.0,5.0167,2024.0,[5|10),5.0167,100.0,0.0,0.0,2.5,1.5,[0|2),1.44,7.66,[0|5),[100|105),8.3097,1.3835]
[2024-05-01T00:34:13,2024-05-01T00:38:07,N,1.0,1.0,0.84,6.5,1.0,0.5,0.0,0.0,1.0,9.0,2.0,0.0,3.9,2024.0,[0|5),3.9,100.0,0.0,0.0,2.5,1.5,[0|2),1.67,7.74,[0|5),[100|105),79.1046,14.1859]
[2024-05-01T00:58:01,2024-05-01T01:14:41,N,1.0,1.0,6.07,25.4,1.0,0.5,5.0,0.0,1.0,32.9,1.0,0.0,16.6667,2024.0,[15|20),16.6667,100.0,0.0,0.0,2.5,1.5,[0|2),1.52,4.18,[5|10),[100|105),4.9056,1.8955]
[2024-05-01T00:11:45,2024-05-01T00:20:38,N,1.0,2.0,2.06,12.1,1.0,0.5,2.92,0.0,1.0,17.52,1.0,0.0,8.8833,2024.0,[5|10),8.8833,100.0,0.0,0.0,2.5,1.5,[0|2),1.36,5.87,[0|5),[100|105),9.4582,1.5518]
[2024-05-01T00:29:06,2024-05-01T00:36:03,N,1.0,

import org.apache.spark.broadcast.Broadcast
broadcastAvgPrices: org.apache.spark.broadcast.Broadcast[Map[String,(Double, Double)]] = Broadcast(5)
applyJoin: (rdd: org.apache.spark.rdd.RDD[(String, org.apache.spark.sql.Row)], broadcastMap: org.apache.spark.broadcast.Broadcast[Map[String,(Double, Double)]])org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[25] at flatMap at <console>:51
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge, duration_minutes, year, duration_minutes_bin_label, overnight_dur...


### Add price comparison

In [19]:
def addPriceComparison(rdd: RDD[Row], headers: Seq[String], colPriceDistance: String, colAvgPriceDistance: String, colPriceTime: String, colAvgPriceTime: String, decimals: Int) = {
  rdd.map { row =>
    val priceColsToAdd: Seq[Double] = Seq((colPriceDistance, colAvgPriceDistance), (colPriceTime, colAvgPriceTime))
      .flatMap { case (colPrice, colAvgPrice) =>
        val price = row.getAs[Double](headers.indexOf(colPrice))
        val priceAvg = row.getAs[Double](headers.indexOf(colAvgPrice))
        val priceDiff = BigDecimal(price - priceAvg).setScale(decimals, BigDecimal.RoundingMode.HALF_UP).toDouble
        val priceDiffPcg = BigDecimal(priceDiff / priceAvg * 100).setScale(decimals, BigDecimal.RoundingMode.HALF_UP).toDouble

        Seq(priceDiff, priceDiffPcg)
      }
    Row.fromSeq(row.toSeq ++ priceColsToAdd)
  }
}

rdd = addPriceComparison(rdd, headers, colPricePerDistance, colAvgPricePerDistance, colPricePerTime, colAvgPricePerTime, decimals)
headers = headers ++ Seq(colPricePerDistanceDiff, colPricePerDistanceDiffPcg, colPricePerTimeDiff, colPricePerTimeDiffPcg)

addPriceComparison: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], headers: Seq[String], colPriceDistance: String, colAvgPriceDistance: String, colPriceTime: String, colAvgPriceTime: String, decimals: Int)org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[26] at map at <console>:51
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge, duration_minutes, year, duration_minutes_bin_label, overnight_duration, overnight_duration_pcg, regular_duration, regular_duration_pcg, weekday_surcharge, fees...


### Bin price difference per time and distance

In [20]:
rdd = binColByStepValue(rdd,headers.indexOf(colPricePerDistanceDiffPcg), 5)
rdd = binColByStepValue(rdd, headers.indexOf(colPricePerTimeDiffPcg), 5)

headers = headers ++ Seq(colPricePerDistanceDiffPcgLabel, colPricePerTimeDiffPcgLabel)

rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[28] at map at <console>:35
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[28] at map at <console>:35
headers: Seq[String] = ArraySeq(lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, payment_type, congestion_surcharge, duration_minutes, year, duration_minutes_bin_label, overnight_duration, overnight_duration_pcg, regular_duration, regular_duration_pcg, weekday_surcharge, fees, agg_fee_bin_label, cost_per_time, cost_per_distance, distance_bin_label, overnight_duration_pcg_label, avg_cost_per_distance, avg_cost_per_time, cost_per_distance_...


### Reduce to analysis columns only

In [21]:
val headersForAnalysis = headers.filter(head => colsForClassification.contains(head.toLowerCase))

def reduceToAnalysis(rdd: RDD[Row], headers: Seq[String]): RDD[Row] = {
  rdd.map { row =>
    Row.fromSeq(headers.indices.map(row.get))
  }
}

rdd = reduceToAnalysis(rdd, headersForAnalysis)

val totalCount = rdd.count()

headersForAnalysis: Seq[String] = ArraySeq(store_and_fwd_flag, passenger_count, payment_type, year, duration_minutes_bin_label, agg_fee_bin_label, distance_bin_label, overnight_duration_pcg_label, cost_per_distance_diff_pcg_label, cost_per_time_diff_pcg_label)
reduceToAnalysis: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], headers: Seq[String])org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[29] at map at <console>:45
totalCount: Long = 587441


### Group by feature value

In [22]:
def groupByFeatures(rdd: RDD[Row], colForValuesAnalysis: Seq[String], colPriceDistanceDiffPcgLabel: String, colPriceTimeDiffPcgLabel: String, headersAnalysis: Seq[String], decimals: Int, totalCount: Long): Seq[RDD[Row]] = {
  colForValuesAnalysis.map { colName =>
    val groupCols = Seq(colPriceDistanceDiffPcgLabel, colPriceTimeDiffPcgLabel):+ colName
    val grouped = rdd.map { row =>
      val key = groupCols.map(col => row.get(headersAnalysis.indexOf(col.toLowerCase)))
      (key, 1)
    }.reduceByKey(_ + _).map { case (keySeq, count) =>
      val value = keySeq.last.toString
      val costDistLabel = keySeq(0).toString
      val costTimeLabel = keySeq(1).toString
      val pcg = BigDecimal(count.toDouble / totalCount * 100).setScale(decimals, BigDecimal.RoundingMode.HALF_UP).toDouble
      Row.fromSeq(Seq(colName, value, count, pcg, costDistLabel, costTimeLabel))
    }
    grouped
  }
}

val rddFeatures = groupByFeatures(rdd, colsForValuesAnalysis, colPricePerDistanceDiffPcgLabel, colPricePerTimeDiffPcgLabel, headersForAnalysis, decimals, totalCount)

rddFeatures.foreach(rdd => rdd.take(1).foreach(println))

[passenger_count,2024-03-04T17:08:32,1,2.0E-4,0.5,0.0]
[store_and_fwd_flag,2024-04-26T12:26:05,1,2.0E-4,0.5,0.0]
[payment_type,N,108,0.0184,0.5,0.11]
[agg_fee_bin_label,2.34,1,2.0E-4,0.0,4.8]
[duration_minutes_bin_label,1.0,34,0.0058,0.5,2.59]
[distance_bin_label,26.8,2,3.0E-4,0.5,8.58]
[year,1.0,35,0.006,0.5,2.59]
[overnight_duration_pcg_label,1.0,26,0.0044,0.5,2.59]


groupByFeatures: (rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row], colForValuesAnalysis: Seq[String], colPriceDistanceDiffPcgLabel: String, colPriceTimeDiffPcgLabel: String, headersAnalysis: Seq[String], decimals: Int, totalCount: Long)Seq[org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]]
rddFeatures: Seq[org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]] = List(MapPartitionsRDD[32] at map at <console>:52, MapPartitionsRDD[35] at map at <console>:52, MapPartitionsRDD[38] at map at <console>:52, MapPartitionsRDD[41] at map at <console>:52, MapPartitionsRDD[44] at map at <console>:52, MapPartitionsRDD[47] at map at <console>:52, MapPartitionsRDD[50] at map at <console>:52, MapPartitionsRDD[53] at map at <console>:52)


### Reduce to single rdd and write output

In [23]:
import org.apache.spark.sql.types._

val headersForSchema = Seq(
  StructField("feature", StringType),
  StructField("value", StringType),
  StructField("count", IntegerType),
  StructField("pcg", DoubleType),
  StructField("cost_distance_label", StringType),
  StructField("cost_time_label", StringType)
)

val schema = StructType(headersForSchema)

val dfForAnalysis = spark.createDataFrame(rddFeatures.reduce(_ union _), schema)
dfForAnalysis.show(10)
dfForAnalysis.write.mode("overwrite").parquet(getDatasetPath(outputDir + f"/$name"))

+---------------+-------------------+-----+------+-------------------+---------------+
|        feature|              value|count|   pcg|cost_distance_label|cost_time_label|
+---------------+-------------------+-----+------+-------------------+---------------+
|passenger_count|2024-03-04T17:08:32|    1|2.0E-4|                0.5|            0.0|
|passenger_count|2024-06-09T10:00:59|    1|2.0E-4|                0.5|           2.01|
|passenger_count|2024-05-29T07:50:36|    1|2.0E-4|                0.5|            1.1|
|passenger_count|2024-06-02T15:46:54|    1|2.0E-4|                1.5|            2.0|
|passenger_count|2024-06-05T15:06:53|    1|2.0E-4|                0.5|            0.0|
|passenger_count|2024-12-12T15:30:33|    1|2.0E-4|                0.5|            0.0|
|passenger_count|2024-02-29T15:40:56|    1|2.0E-4|                0.5|           1.87|
|passenger_count|2024-09-27T10:46:26|    1|2.0E-4|                0.5|            0.0|
|passenger_count|2024-04-03T09:38:07|    1|

import org.apache.spark.sql.types._
headersForSchema: Seq[org.apache.spark.sql.types.StructField] = List(StructField(feature,StringType,true), StructField(value,StringType,true), StructField(count,IntegerType,true), StructField(pcg,DoubleType,true), StructField(cost_distance_label,StringType,true), StructField(cost_time_label,StringType,true))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(feature,StringType,true),StructField(value,StringType,true),StructField(count,IntegerType,true),StructField(pcg,DoubleType,true),StructField(cost_distance_label,StringType,true),StructField(cost_time_label,StringType,true))
dfForAnalysis: org.apache.spark.sql.DataFrame = [feature: string, value: string ... 4 more fields]
