# Big Data project A.Y. 2024-2025

## Members

- Giovanni Antonioni
- Luca Rubboli - 0001083742

## Second job

In [39]:
import org.apache.spark.sql.SparkSession

// Obtain (or reuse) the Spark session for this notebook, tagged with the job's application name.
val spark = SparkSession
  .builder()
  .appName("Second job optimization")
  .getOrCreate()

// Handle on the session's SparkContext.
val sc = spark.sparkContext

import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@66755c3
sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@4fe78b9


## Definition of parameters for the job

The variables used throughout this notebook are defined here.


In [40]:
// Root folder of the input data; resolved against the project directory by getDatasetPath.
val datasetFolder = "./dataset"
// Output location for this job (not referenced by the cells visible in this notebook).
val outputDir = "/output/secondJobRDD"

// Taxi ride folders, one per service.
val yellowCab = datasetFolder + "/yellow_cab"
val greenCab = datasetFolder + "/green_cab"

// Weather observations and the lookup table for the WMO codes they contain.
val weatherData = datasetFolder + "/weather/weather_data_2017_2024.csv"
val weatherWmoLookup = datasetFolder + "/weather/wmo_lookup_codes.csv"

datasetFolder: String = ./dataset
outputDir: String = /output/secondJobRDD
yellowCab: String = ./dataset/yellow_cab
greenCab: String = ./dataset/green_cab
weatherData: String = ./dataset/weather/weather_data_2017_2024.csv
weatherWmoLookup: String = ./dataset/weather/wmo_lookup_codes.csv


## Columns for the analysis

In [41]:
import org.apache.spark.sql.types._

// Columns that carry the same name and type in both taxi datasets.
val commonFields = List[(String, DataType)](
  "VendorID" -> IntegerType,
  "fare_amount" -> DoubleType,
  "tip_amount" -> DoubleType,
  "payment_type" -> LongType,
  "trip_distance" -> DoubleType,
  "total_amount" -> DoubleType,
  "passenger_count" -> LongType
).map { case (name, dataType) => StructField(name, dataType) }

// Yellow cab schema: "tpep_"-prefixed pickup/drop-off timestamps followed by the shared columns.
val schemaYellow = StructType(
  List(
    StructField("tpep_pickup_datetime", TimestampType),
    StructField("tpep_dropoff_datetime", TimestampType)
  ) ++ commonFields
)

// Green cab schema: "lpep_"-prefixed pickup/drop-off timestamps followed by the shared columns.
val schemaGreen = StructType(
  List(
    StructField("lpep_pickup_datetime", TimestampType),
    StructField("lpep_dropoff_datetime", TimestampType)
  ) ++ commonFields
)

import org.apache.spark.sql.types._
commonFields: List[org.apache.spark.sql.types.StructField] = List(StructField(VendorID,IntegerType,true), StructField(fare_amount,DoubleType,true), StructField(tip_amount,DoubleType,true), StructField(payment_type,LongType,true), StructField(trip_distance,DoubleType,true), StructField(total_amount,DoubleType,true), StructField(passenger_count,LongType,true))
schemaYellow: org.apache.spark.sql.types.StructType = StructType(StructField(tpep_pickup_datetime,TimestampType,true),StructField(tpep_dropoff_datetime,TimestampType,true),StructField(VendorID,IntegerType,true),StructField(fare_amount,DoubleType,true),StructField(tip_amount,DoubleType,true),StructField(payment_type,LongType,true),StructField(trip_distance,DoubleType,true),StructField(total_amount,...


# Load Datasets

First, we load the taxi ride datasets.

In [42]:
import java.sql.Timestamp

/**
 * A single taxi trip, in the shape shared by the yellow and green datasets.
 *
 * @param vendorId        value of the `VendorID` column
 * @param pickupDatetime  pickup timestamp (`tpep_pickup_datetime` / `lpep_pickup_datetime`)
 * @param dropoffDatetime drop-off timestamp (`tpep_dropoff_datetime` / `lpep_dropoff_datetime`)
 * @param fareAmount      value of the `fare_amount` column
 * @param tipAmount       value of the `tip_amount` column
 * @param paymentType     value of the `payment_type` column, narrowed to `Int`
 * @param tripDistance    value of the `trip_distance` column
 * @param totalAmount     value of the `total_amount` column
 * @param passengerCount  value of the `passenger_count` column, narrowed to `Int`
 * @param serviceType     `"yellow"` or `"green"`, assigned when the dataset is loaded
 */
case class Ride(
  vendorId: Int,
  pickupDatetime: Timestamp,
  dropoffDatetime: Timestamp,
  fareAmount: Double,
  tipAmount: Double,
  paymentType: Int,
  tripDistance: Double,
  totalAmount: Double,
  passengerCount: Int,
  serviceType: String
)

import java.sql.Timestamp
defined class Ride


In [43]:
// Absolute location of the project on the local machine; dataset paths are resolved against it.
val projectDir: String = "/Users/giovanniantonioni/IdeaProjects/Drivers"

/**
 * Builds a `file://` URI for a path expressed relative to `projectDir`.
 *
 * @param localPath path relative to the project directory
 * @return `file://` followed by the project directory, a slash and `localPath`
 */
def getDatasetPath(localPath: String): String =
  s"file://$projectDir/$localPath"

projectDir: String = /Users/giovanniantonioni/IdeaProjects/Drivers
getDatasetPath: (localPath: String)String


In [44]:
import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel

// Use the context's default parallelism as the number of partitions for keyed RDDs.
val numPartitions: Int = spark.sparkContext.defaultParallelism
// Hash partitioner shared by the RDDs that get repartitioned by key in later cells.
val partitioner: HashPartitioner = new HashPartitioner(numPartitions)

import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel
numPartitions: Int = 12
partitioner: org.apache.spark.HashPartitioner = org.apache.spark.HashPartitioner@c


In [45]:
import java.time.ZoneId
/**
 * Loads the parquet files of one taxi service and converts them into rides keyed by pickup date.
 *
 * The yellow and green datasets differ only in the prefix of their timestamp columns and in the
 * service label, so both go through this single pipeline: read with the given schema, select the
 * analysis columns, drop rows containing nulls, drop exact duplicates, map to [[Ride]].
 *
 * @param schema         schema applied while reading the parquet files
 * @param relativePath   dataset folder, relative to the project directory (see getDatasetPath)
 * @param datetimePrefix prefix of the pickup/drop-off columns (`"tpep"` or `"lpep"`)
 * @param serviceType    label stored in `Ride.serviceType`
 * @return rides keyed by the local date of their pickup, in the JVM default time zone
 */
def loadRides(
    schema: StructType,
    relativePath: String,
    datetimePrefix: String,
    serviceType: String
): org.apache.spark.rdd.RDD[(java.time.LocalDate, Ride)] =
  spark.read
    .schema(schema)
    .option("recursiveFileLookup", "true")
    .parquet(getDatasetPath(relativePath))
    .select(
      $"VendorID",
      $"${datetimePrefix}_pickup_datetime".alias("pickup_datetime"),
      $"${datetimePrefix}_dropoff_datetime".alias("dropoff_datetime"),
      $"fare_amount",
      $"tip_amount",
      $"payment_type",
      $"trip_distance",
      $"total_amount",
      $"passenger_count"
    )
    .na.drop()
    .dropDuplicates()
    .rdd
    // Positions follow the order of the select above.
    .map(r => Ride(
      r.getInt(0),
      r.getTimestamp(1),
      r.getTimestamp(2),
      r.getDouble(3),
      r.getDouble(4),
      r.getLong(5).toInt,
      r.getDouble(6),
      r.getDouble(7),
      r.getLong(8).toInt,
      serviceType
    ))
    .keyBy(ride => ride.pickupDatetime.toInstant.atZone(ZoneId.systemDefault()).toLocalDate)

val yellowDataset = loadRides(schemaYellow, yellowCab, "tpep", "yellow")

val greenDataset = loadRides(schemaGreen, greenCab, "lpep", "green")

// Both services in one RDD, hash-partitioned by pickup date and marked for in-memory caching.
val joined = yellowDataset.union(greenDataset)
                          .partitionBy(partitioner)
                          .persist(StorageLevel.MEMORY_ONLY)


import java.time.ZoneId
yellowDataset: org.apache.spark.rdd.RDD[(java.time.LocalDate, Ride)] = MapPartitionsRDD[154] at keyBy at <console>:86
greenDataset: org.apache.spark.rdd.RDD[(java.time.LocalDate, Ride)] = MapPartitionsRDD[164] at keyBy at <console>:118
joined: org.apache.spark.rdd.RDD[(java.time.LocalDate, Ride)] = ShuffledRDD[166] at partitionBy at <console>:121


Once the taxi data is loaded, we proceed to load the weather data.


In [46]:
/**
 * Weather observation for one date.
 *
 * @param wmoCode          numeric WMO weather code read from the weather file
 * @param dateOfRelevation observation date, stored as a timestamp at the start of that day
 * @param description      text associated with the code in the WMO lookup table
 */
case class WeatherInfo(
  wmoCode: Int,
  dateOfRelevation: Timestamp,
  description: String
)

defined class WeatherInfo


In [47]:
// Weather observations as (WMO code, date string): column 1 holds the code, column 0 the date.
val weatherFileRDD = spark.read
  .format("CSV")
  .option("header", "true")
  .load(getDatasetPath(weatherData))
  .rdd
  .map { row =>
    val wmoCode = row.getString(1).trim.toInt
    val date = row.getString(0).trim
    (wmoCode, date)
  }

// WMO lookup table as (code, description).
// The file is semicolon-delimited, so it is parsed with ";" as the separator instead of being read
// with the default comma separator and split by hand: that way a description containing a comma
// is no longer cut short, and a row without a description no longer fails with a MatchError.
val wmoLookupFileRDD = spark.read
  .format("CSV")
  .option("header", "true")
  .option("sep", ";")
  // Explicit schema so the two columns exist regardless of how the header line is written.
  .schema(StructType(Seq(
    StructField("code", StringType),
    StructField("description", StringType)
  )))
  .load(getDatasetPath(weatherWmoLookup))
  .rdd
  .map { row =>
    // A missing description gets the same placeholder used for codes absent from the table.
    val description = Option(row.getString(1)).fold("unknown")(_.trim)
    (row.getString(0).trim.toInt, description)
  }

// The lookup table is collected to the driver and broadcast to the executors.
val broadcastWmo = spark.sparkContext.broadcast(wmoLookupFileRDD.collectAsMap())

weatherFileRDD: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[182] at map at <console>:54
wmoLookupFileRDD: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[198] at map at <console>:61
broadcastWmo: org.apache.spark.broadcast.Broadcast[scala.collection.Map[Int,String]] = Broadcast(37)


In [48]:
import java.time.LocalDate

/**
 * Turns raw (WMO code, date string) pairs into [[WeatherInfo]] records.
 *
 * @param rdd pairs of WMO code and date; the date must be parseable by `LocalDate.parse`
 * @param wmo broadcast map from WMO code to its description
 * @return one record per input pair, dated at the start of the day and described as "unknown"
 *         when the code is not in the map
 */
def mapWeatherRDD(
    rdd: org.apache.spark.rdd.RDD[(Int, String)],
    wmo: org.apache.spark.broadcast.Broadcast[scala.collection.Map[Int,String]]
): org.apache.spark.rdd.RDD[WeatherInfo] =
  rdd.map { observation =>
    val (wmoCode, isoDate) = observation
    val startOfDay = LocalDate.parse(isoDate).atStartOfDay()
    WeatherInfo(
      wmoCode,
      Timestamp.valueOf(startOfDay),
      wmo.value.getOrElse(wmoCode, "unknown")
    )
  }


// Weather records marked for in-memory caching.
val transformedWeatherClassRDD =
  mapWeatherRDD(weatherFileRDD, broadcastWmo).persist(StorageLevel.MEMORY_ONLY)

import java.time.LocalDate
mapWeatherRDD: (rdd: org.apache.spark.rdd.RDD[(Int, String)], wmo: org.apache.spark.broadcast.Broadcast[scala.collection.Map[Int,String]])org.apache.spark.rdd.RDD[WeatherInfo]
transformedWeatherClassRDD: org.apache.spark.rdd.RDD[WeatherInfo] = MapPartitionsRDD[199] at map at <console>:56


# Filtering


In [49]:
// Keeps only plausible rides; the date key is dropped because only the ride itself is needed.
val filtered = joined
  .values
  .filter { ride =>
    // Vendor ids accepted for each service.
    val knownVendor = (ride.serviceType, ride.vendorId) match {
      case ("yellow", 1 | 2 | 6 | 7) => true
      case ("green", 1 | 2 | 6)      => true
      case _                          => false
    }

    knownVendor &&
      ride.fareAmount > 0 &&
      ride.tipAmount >= 0 &&
      // Tips above 150% of the fare are discarded.
      ride.tipAmount <= ride.fareAmount * 1.5 &&
      (ride.paymentType >= 1 && ride.paymentType <= 6) &&
      ride.tripDistance > 0 &&
      ride.dropoffDatetime.after(ride.pickupDatetime)
  }
  .persist(StorageLevel.MEMORY_ONLY)

// NOTE(review): no action runs on `filtered` before this call, so the cached copy of `joined`
// looks like it is released before it is ever used; confirm the intended ordering.
joined.unpersist()

filtered: org.apache.spark.rdd.RDD[Ride] = MapPartitionsRDD[207] at filter at <console>:57
res12: joined.type = ShuffledRDD[166] at partitionBy at <console>:121


In [50]:
/** A ride together with its duration in minutes (drop-off time minus pickup time). */
case class RideWithDurationMinutes(info: Ride, durationMinutes: Double)

defined class RideWithDurationMinutes


In [51]:
// Attaches to each ride the time elapsed between pickup and drop-off, expressed in minutes.
val withTripDuration = filtered.map { ride =>
  val elapsedMillis = ride.dropoffDatetime.getTime - ride.pickupDatetime.getTime
  RideWithDurationMinutes(ride, elapsedMillis.toDouble / (1000 * 60))
}


withTripDuration: org.apache.spark.rdd.RDD[RideWithDurationMinutes] = MapPartitionsRDD[208] at map at <console>:48


In [52]:
/**
 * Reads the value located at a given quantile of an RDD keyed by position.
 *
 * @param sortedRDD pairs of (position, value); the callers build it by sorting the values in
 *                  ascending order and indexing them with `zipWithIndex`
 * @param quantile  fraction of the data lying below the requested value, e.g. 0.02 or 0.98
 * @param count     number of elements in `sortedRDD`
 * @return the value stored at position `floor(quantile * count)`, clamped to the valid range
 *         `[0, count - 1]`; if no pair has that position, the smallest value in the RDD
 * @throws NoSuchElementException if `sortedRDD` is empty
 */
def getQuantile(sortedRDD: org.apache.spark.rdd.RDD[(Long, Double)], quantile: Double, count: Long): Double = {
  // Without clamping, quantile = 1.0 would ask for position `count`, which does not exist, and
  // the fallback below would silently return the minimum instead of the maximum.
  val idx = math.max(0L, math.min((quantile * count).toLong, count - 1))
  sortedRDD.lookup(idx).headOption.getOrElse {
    sortedRDD.map(_._2).takeOrdered(1).headOption.getOrElse(
      throw new NoSuchElementException("Cannot compute a quantile of an empty RDD")
    )
  }
}

getQuantile: (sortedRDD: org.apache.spark.rdd.RDD[(Long, Double)], quantile: Double, count: Long)Double


In [53]:
// The two measures on which outliers are detected.
val tripDistances = withTripDuration.map(_.info.tripDistance)
val tripDurations = withTripDuration.map(_.durationMinutes)

// Values in ascending order, keyed by their position in that order.
val distanceSorted = tripDistances
  .sortBy(distance => distance)
  .zipWithIndex()
  .map { case (value, position) => (position, value) }
val durationSorted = tripDurations
  .sortBy(minutes => minutes)
  .zipWithIndex()
  .map { case (value, position) => (position, value) }

tripDistances: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[209] at map at <console>:48
tripDurations: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[210] at map at <console>:49
distanceSorted: org.apache.spark.rdd.RDD[(Long, Double)] = MapPartitionsRDD[217] at map at <console>:51
durationSorted: org.apache.spark.rdd.RDD[(Long, Double)] = MapPartitionsRDD[224] at map at <console>:52


In [54]:
// Number of rides, needed to translate a quantile into a position.
val count = withTripDuration.count()

// 2nd and 98th percentile of distance and duration.
val distanceLower = getQuantile(distanceSorted, 0.02, count)
val distanceUpper = getQuantile(distanceSorted, 0.98, count)
val durationLower = getQuantile(durationSorted, 0.02, count)
val durationUpper = getQuantile(durationSorted, 0.98, count)

// Keeps the rides whose distance and duration both fall inside the percentile bounds (inclusive).
val filteredWithoutOutliers = withTripDuration.filter { ride =>
  val distance = ride.info.tripDistance
  val minutes = ride.durationMinutes
  distanceLower <= distance && distance <= distanceUpper &&
    durationLower <= minutes && minutes <= durationUpper
}

count: Long = 2627298
distanceLower: Double = 0.37
distanceUpper: Double = 19.08
durationLower: Double = 2.533333333333333
durationUpper: Double = 61.0
filteredWithoutOutliers: org.apache.spark.rdd.RDD[RideWithDurationMinutes] = MapPartitionsRDD[233] at filter at <console>:60


In [55]:
/**
 * A ride enriched with features derived from its pickup time, amounts, distance and duration.
 *
 * @param rideWithMinutes the ride and its duration in minutes
 * @param hourOfDay       hour of the pickup (24-hour clock)
 * @param dayOfWeek       day of the pickup in `java.util.Calendar` numbering
 * @param monthOfYear     month of the pickup
 * @param year            year of the pickup
 * @param isWeekend       1 when the pickup falls on Saturday or Sunday, 0 otherwise
 * @param tripHourBucket  part of the day: "late_night", "morning", "midday", "evening" or "night"
 * @param tipPercentage   tip as a percentage of the total amount (0.0 when the total is 0)
 * @param speedMph        trip distance divided by the duration in hours (0.0 when the duration
 *                        is not positive)
 * @param isRushHour      true for Monday-Friday pickups in hours 7-9 or 16-18
 * @param isLongTrip      true when the distance exceeds 5.0 or the duration exceeds 20 minutes
 */
case class RideWithEnrichedInformation(
  rideWithMinutes: RideWithDurationMinutes,
  hourOfDay: Int,
  dayOfWeek: Int,
  monthOfYear: Int,
  year: Int,
  isWeekend: Int,
  tripHourBucket: String,
  tipPercentage: Double,
  speedMph: Double,
  isRushHour: Boolean,
  isLongTrip: Boolean
)

defined class RideWithEnrichedInformation


In [56]:
// Derives time-of-pickup features and ratio features for every ride that passed outlier removal.
val enriched = filteredWithoutOutliers.map { ride =>
  import java.util.Calendar

  // Calendar in the JVM default time zone, positioned at the pickup instant.
  val pickupCalendar = Calendar.getInstance()
  pickupCalendar.setTime(ride.info.pickupDatetime)

  val hourOfDay = pickupCalendar.get(Calendar.HOUR_OF_DAY)
  // Calendar numbering: SUNDAY = 1 ... SATURDAY = 7.
  val dayOfWeek = pickupCalendar.get(Calendar.DAY_OF_WEEK)
  // Calendar.MONTH is zero-based (JANUARY = 0); shift it so that monthOfYear is 1-12.
  val monthOfYear = pickupCalendar.get(Calendar.MONTH) + 1
  val year = pickupCalendar.get(Calendar.YEAR)

  val isWeekend = if (dayOfWeek == Calendar.SATURDAY || dayOfWeek == Calendar.SUNDAY) 1 else 0

  val tripHourBucket = hourOfDay match {
    case h if h >= 0 && h <= 5   => "late_night"
    case h if h >= 6 && h <= 9   => "morning"
    case h if h >= 10 && h <= 15 => "midday"
    case h if h >= 16 && h <= 19 => "evening"
    case _                       => "night"
  }

  // Tip relative to the total amount; guarded against a zero total.
  val tipPercentage =
    if (ride.info.totalAmount != 0) (ride.info.tipAmount / ride.info.totalAmount) * 100 else 0.0
  // Distance per hour; guarded against a non-positive duration.
  val speedMph =
    if (ride.durationMinutes > 0) ride.info.tripDistance / (ride.durationMinutes / 60.0) else 0.0

  // Working days only, in hours 7-9 or 16-18.
  val isWorkingDay = dayOfWeek >= Calendar.MONDAY && dayOfWeek <= Calendar.FRIDAY
  val isRushHour = isWorkingDay &&
    ((hourOfDay >= 7 && hourOfDay <= 9) || (hourOfDay >= 16 && hourOfDay <= 18))

  val isLongTrip = ride.info.tripDistance > 5.0 || ride.durationMinutes > 20.0

  RideWithEnrichedInformation(
    ride,
    hourOfDay,
    dayOfWeek,
    monthOfYear,
    year,
    isWeekend,
    tripHourBucket,
    tipPercentage,
    speedMph,
    isRushHour,
    isLongTrip
  )
}

enriched: org.apache.spark.rdd.RDD[RideWithEnrichedInformation] = MapPartitionsRDD[234] at map at <console>:48


In [57]:
/**
 * An enriched ride together with the label of the bin each numeric feature falls into.
 *
 * @param enrichedInfo     the enriched ride
 * @param tripDistanceBin  bin label for the trip distance
 * @param tripDurationBin  bin label for the trip duration in minutes
 * @param fareAmountBin    bin label for the fare amount
 * @param tipPercentageBin bin label for the tip percentage
 * @param speedBin         bin label for the speed
 */
case class RideWithBins(
  enrichedInfo: RideWithEnrichedInformation,
  tripDistanceBin: String,
  tripDurationBin: String,
  fareAmountBin: String,
  tipPercentageBin: String,
  speedBin: String
)

defined class RideWithBins


In [58]:
// Assigns every numeric feature of a ride to a labelled bin.
// The thresholds and labels are built once per partition rather than once per ride.
val binned = enriched.mapPartitions { rides =>

  // Each config is (ascending thresholds, labels); a value gets the label of the first
  // threshold it is strictly below, or the last label when it is below none of them.
  val distanceBins = (Seq(1.0, 3.0, 6.0), Seq("0-1", "1-3", "3-6", "6+"))
  val durationBins = (Seq(5.0, 15.0, 30.0), Seq("0-5", "5-15", "15-30", "30+"))
  val fareBins = (Seq(5.0, 10.0, 20.0, 40.0), Seq("0-5", "5-10", "10-20", "20-40", "40+"))
  val tipBins = (Seq(5.0, 10.0, 20.0, 30.0), Seq("0-5%", "5-10%", "10-20%", "20-30%", "30%+"))
  val speedBins = (Seq(5.0, 15.0, 30.0), Seq("0-5mph", "5-15mph", "15-30mph", "30mph+"))

  def assignBin(value: Double, config: (Seq[Double], Seq[String])): String = {
    val (bins, labels) = config
    require(labels.length == bins.length + 1, "You need one more label than bin thresholds.")
    bins.indexWhere(threshold => value < threshold) match {
      case -1  => labels.last
      case idx => labels(idx)
    }
  }

  rides.map { ride =>
    val trip = ride.rideWithMinutes

    RideWithBins(
      ride,
      assignBin(trip.info.tripDistance, distanceBins),
      // The duration was already computed when the ride was wrapped; reuse it.
      assignBin(trip.durationMinutes, durationBins),
      assignBin(trip.info.fareAmount, fareBins),
      assignBin(ride.tipPercentage, tipBins),
      assignBin(ride.speedMph, speedBins)
    )
  }
}

binned: org.apache.spark.rdd.RDD[RideWithBins] = MapPartitionsRDD[235] at map at <console>:48


# Join weather and Ride data


In [59]:
/**
 * A binned ride paired with the weather observation of its pickup date.
 *
 * @param ride        the binned ride
 * @param weatherInfo weather observation joined on the pickup date
 */
case class RideWithWeather(
  ride: RideWithBins,
  weatherInfo: WeatherInfo
)

/**
 * Final record of the pipeline: ride, weather observation and a coarse weather category.
 *
 * @param ride           the binned ride
 * @param weather        weather observation joined on the pickup date
 * @param generalWeather category derived from the WMO code: "clear", "cloudy", "foggy", "rainy",
 *                       "snowy", "stormy" or "unknown"
 */
case class RideFinalOutput(
  ride: RideWithBins,
  weather: WeatherInfo,
  generalWeather: String
)

defined class RideWithWeather
defined class RideFinalOutput


In [60]:
// Key both sides by calendar day so that each ride meets the weather of its pickup date.
val weatherByDate = transformedWeatherClassRDD.keyBy { weather =>
  weather.dateOfRelevation.toLocalDateTime.toLocalDate
}
val rideByDate = binned.keyBy { ride =>
  ride.enrichedInfo.rideWithMinutes.info.pickupDatetime.toLocalDateTime.toLocalDate
}

// Same partitioner on both sides of the join.
val partitionedWeather = weatherByDate.partitionBy(partitioner).persist()
val partitionedRides = rideByDate.partitionBy(partitioner).persist()

val joinedWeather = partitionedRides
  .join(partitionedWeather)
  .values
  .map { case (ride, weather) => RideWithWeather(ride, weather) }

// NOTE(review): no action runs between the persist() calls above and these unpersist() calls,
// so the cached copies look like they are never materialised; confirm the pair is needed.
partitionedWeather.unpersist()
partitionedRides.unpersist()


// Adds a coarse weather category derived from the WMO code.
val finalRDD = joinedWeather.map { matched =>
  val generalWeather = matched.weatherInfo.wmoCode match {
    case 0 | 1                    => "clear"
    case 2 | 3 | 4                => "cloudy"
    case 45 | 48                  => "foggy"
    case c if c >= 50 && c <= 67  => "rainy"
    case c if c >= 70 && c <= 77  => "snowy"
    case c if c >= 80 && c <= 99  => "stormy"
    case _                        => "unknown"
  }

  RideFinalOutput(matched.ride, matched.weatherInfo, generalWeather)
}

// Marked for caching: the aggregations in the following cells all start from this RDD.
finalRDD.cache()

weatherByDate: org.apache.spark.rdd.RDD[(java.time.LocalDate, WeatherInfo)] = MapPartitionsRDD[236] at keyBy at <console>:58
rideByDate: org.apache.spark.rdd.RDD[(java.time.LocalDate, RideWithBins)] = MapPartitionsRDD[237] at keyBy at <console>:59
partitionedWeather: org.apache.spark.rdd.RDD[(java.time.LocalDate, WeatherInfo)] = ShuffledRDD[238] at partitionBy at <console>:61
partitionedRides: org.apache.spark.rdd.RDD[(java.time.LocalDate, RideWithBins)] = ShuffledRDD[239] at partitionBy at <console>:62
joinedWeather: org.apache.spark.rdd.RDD[RideWithWeather] = MapPartitionsRDD[243] at map at <console>:64
finalRDD: org.apache.spark.rdd.RDD[RideFinalOutput] = MapPartitionsRDD[244] at map at <console>:72
res13: finalRDD.type = MapPartitionsRDD[244] at map at <console>:72


# Export the results

In [61]:
import org.apache.spark.sql.Row
import org.apache.spark.rdd.RDD

// Names of the binned features that can be combined.
val binFields = Seq(
  "tripDistanceBin",
  "tripDurationBin",
  "fareAmountBin",
  "tipPercentageBin",
  "speedBin"
)

// Every ordered pair of two different features, e.g. (tripDistanceBin, fareAmountBin).
val binFieldPairs = binFields.flatMap { x =>
  binFields.filter(y => x != y).map(y => (x, y))
}

// Average tip percentage for every combination of bins of two different features.
val combinationRDD: RDD[Row] = finalRDD
  .flatMap { output =>
    val bins = output.ride
    val tip = bins.enrichedInfo.tipPercentage

    // Bin label of this ride for the feature with the given name.
    def binOf(field: String): String = field match {
      case "tripDistanceBin" => bins.tripDistanceBin
      case "tripDurationBin" => bins.tripDurationBin
      case "fareAmountBin" => bins.fareAmountBin
      case "tipPercentageBin" => bins.tipPercentageBin
      case "speedBin" => bins.speedBin
    }

    binFieldPairs.map { pair =>
      val (fieldX, fieldY) = pair
      ((fieldX, fieldY, binOf(fieldX), binOf(fieldY)), (tip, 1L))
    }
  }
  // Accumulates (sum of tip percentages, number of rides) per key.
  .reduceByKey { (left, right) => (left._1 + right._1, left._2 + right._2) }
  .map { case ((fieldX, fieldY, binX, binY), (tipSum, rides)) =>
    Row(fieldX, fieldY, binX, binY, tipSum / rides)
  }

import org.apache.spark.sql.Row
import org.apache.spark.rdd.RDD
binFields: Seq[String] = List(tripDistanceBin, tripDurationBin, fareAmountBin, tipPercentageBin, speedBin)
binFieldPairs: Seq[(String, String)] = List((tripDistanceBin,tripDurationBin), (tripDistanceBin,fareAmountBin), (tripDistanceBin,tipPercentageBin), (tripDistanceBin,speedBin), (tripDurationBin,tripDistanceBin), (tripDurationBin,fareAmountBin), (tripDurationBin,tipPercentageBin), (tripDurationBin,speedBin), (fareAmountBin,tripDistanceBin), (fareAmountBin,tripDurationBin), (fareAmountBin,tipPercentageBin), (fareAmountBin,speedBin), (tipPercentageBin,tripDistanceBin), (tipPercentageBin,tripDurationBin), (tipPercentageBin,fareAmountBin), (tipPercentageBin,speedBin), (speedBin,tripDistanceBin), (speedBin,tripDurationBin), (...


In [62]:
// Names of the binned features to aggregate on.
val binFields = Seq(
  "tripDistanceBin",
  "tripDurationBin",
  "fareAmountBin",
  "tipPercentageBin",
  "speedBin"
)

// Average tip percentage for every bin of every feature.
val allTipByBinRDD: RDD[Row] = finalRDD
  .flatMap { output =>
    val bins = output.ride
    val tip = bins.enrichedInfo.tipPercentage

    for (feature <- binFields) yield {
      val label = feature match {
        case "fareAmountBin" => bins.fareAmountBin
        case "tripDistanceBin" => bins.tripDistanceBin
        case "tripDurationBin" => bins.tripDurationBin
        case "tipPercentageBin" => bins.tipPercentageBin
        case "speedBin" => bins.speedBin
      }
      ((feature, label), (tip, 1L))
    }
  }
  // Accumulates (sum of tip percentages, number of rides) per key.
  .reduceByKey { (left, right) => (left._1 + right._1, left._2 + right._2) }
  .map { case ((feature, label), (tipSum, rides)) =>
    Row(feature, label, tipSum / rides)
  }

binFields: Seq[String] = List(tripDistanceBin, tripDurationBin, fareAmountBin, tipPercentageBin, speedBin)
allTipByBinRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[250] at map at <console>:71


In [63]:
// Average tip percentage per general weather category.
val avgTipByWeather = finalRDD
    .map { output => (output.generalWeather, (output.ride.enrichedInfo.tipPercentage, 1L)) }
    // Accumulates (sum of tip percentages, number of rides) per category.
    .reduceByKey { (left, right) => (left._1 + right._1, left._2 + right._2) }
    .map { entry =>
      val (weather, (tipSum, rides)) = entry
      Row(weather, tipSum / rides)
    }

avgTipByWeather: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[253] at map at <console>:52


In [64]:
// Average tip percentage per part of the day.
val tipByHourBucket = finalRDD
    .map { output =>
      val enrichedRide = output.ride.enrichedInfo
      (enrichedRide.tripHourBucket, (enrichedRide.tipPercentage, 1L))
    }
    // Accumulates (sum of tip percentages, number of rides) per bucket.
    .reduceByKey { (left, right) => (left._1 + right._1, left._2 + right._2) }
    .map { entry =>
      val (bucket, (tipSum, rides)) = entry
      Row(bucket, tipSum / rides)
    }


tipByHourBucket: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[256] at map at <console>:52


In [65]:
// Layout of the per-feature averages: feature name, bin label, average tip percentage.
val allTipByBinSchema = new StructType()
  .add("feature", StringType)
  .add("bin", StringType)
  .add("avg_tip_pct", DoubleType)

val dfAllTipByBinDF = spark.createDataFrame(allTipByBinRDD, allTipByBinSchema)

// Layout of the pairwise averages: the two feature names, their bin labels, average tip percentage.
val schema = new StructType()
  .add("featureX", StringType)
  .add("featureY", StringType)
  .add("binX", StringType)
  .add("binY", StringType)
  .add("avg_tip_pct", DoubleType)

val combinationDF = spark.createDataFrame(combinationRDD, schema)

allTipByBinSchema: org.apache.spark.sql.types.StructType = StructType(StructField(feature,StringType,true),StructField(bin,StringType,true),StructField(avg_tip_pct,DoubleType,true))
dfAllTipByBinDF: org.apache.spark.sql.DataFrame = [feature: string, bin: string ... 1 more field]
schema: org.apache.spark.sql.types.StructType = StructType(StructField(featureX,StringType,true),StructField(featureY,StringType,true),StructField(binX,StringType,true),StructField(binY,StringType,true),StructField(avg_tip_pct,DoubleType,true))
combinationDF: org.apache.spark.sql.DataFrame = [featureX: string, featureY: string ... 3 more fields]


In [66]:
// Layout of the per-weather averages: weather category, average tip percentage.
val weatherSchema = new StructType()
  .add("weather", StringType)
  .add("avg_tip_pct", DoubleType)

val weatherDF = spark.createDataFrame(avgTipByWeather, weatherSchema)

weatherSchema: org.apache.spark.sql.types.StructType = StructType(StructField(weather,StringType,true),StructField(avg_tip_pct,DoubleType,true))
weatherDF: org.apache.spark.sql.DataFrame = [weather: string, avg_tip_pct: double]


In [67]:
// Layout of the per-hour-bucket averages: part of the day, average tip percentage.
val bucketSchema = new StructType()
  .add("hour_bucket", StringType)
  .add("avg_tip_pct", DoubleType)

val hourBucketDF = spark.createDataFrame(tipByHourBucket, bucketSchema)

bucketSchema: org.apache.spark.sql.types.StructType = StructType(StructField(hour_bucket,StringType,true),StructField(avg_tip_pct,DoubleType,true))
hourBucketDF: org.apache.spark.sql.DataFrame = [hour_bucket: string, avg_tip_pct: double]
