# Big Data project A.Y. 2024-2025

## Members

- Giovanni Antonioni
- Luca Rubboli - 0001083742

## First job

### Define useful parameters

- Dataset location
- Iterator (defined like this to overcome different names for same columns in dataset)
- Time zones for overnight calculation

In [1]:
val datasetDir = "../../../dataset/dataset_csv"
val yellowDatasetDir = s"$datasetDir/dataset_yellow"
val greenDatasetDir = s"$datasetDir/dataset_green"
val fhvDatasetDir = s"$datasetDir/dataset_fhv"
val fhvhvDatasetDir = s"$datasetDir/dataset_fhvhv"
val datasetDirMap: Map[String, String] = Map("yellow" -> yellowDatasetDir, "green" -> greenDatasetDir,
"fhv" -> fhvDatasetDir, "fhvhv" -> fhvhvDatasetDir)
val datasetIterator: Iterable[(String, String, String)] = Seq(
    ("yellow", "tpep_dropoff_datetime", "tpep_pickup_datetime"),
    ("green", "lpep_dropoff_datetime", "lpep_pickup_datetime"),
    // ("fhv", "tpep_dropoff_datetime", "tpep_pickup_datetime"),
    // ("fhvhv", "tpep_dropoff_datetime", "tpep_pickup_datetime"),
)
val outputDir = "/output/firstJobOutput"
val timeZoneOver: String = "overnight"
val timeZones = Map(timeZoneOver -> (20, 6), "regular" -> (6, 20))

Intitializing Scala interpreter ...

Spark Web UI available at http://mac.fritz.box:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1745353954459)
SparkSession available as 'spark'


datasetDir: String = ../../../dataset/dataset_csv
yellowDatasetDir: String = ../../../dataset/dataset_csv/dataset_yellow
greenDatasetDir: String = ../../../dataset/dataset_csv/dataset_green
fhvDatasetDir: String = ../../../dataset/dataset_csv/dataset_fhv
fhvhvDatasetDir: String = ../../../dataset/dataset_csv/dataset_fhvhv
datasetDirMap: Map[String,String] = Map(yellow -> ../../../dataset/dataset_csv/dataset_yellow, green -> ../../../dataset/dataset_csv/dataset_green, fhv -> ../../../dataset/dataset_csv/dataset_fhv, fhvhv -> ../../../dataset/dataset_csv/dataset_fhvhv)
datasetIterator: Iterable[(String, String, String)] = List((yellow,tpep_dropoff_datetime,tpep_pickup_datetime), (green,lpep_dropoff_datetime,lpep_pickup_datetime))
outputDir: String = /output/firstJobOutput
timeZoneOver: St...


### Define preprocess rules
- Filters for columns
- Columns used in classification for average price calculation
- Columns which values are used in analysis

In [2]:
import org.apache.spark.sql.Column

val featureFilters: Map[String, Column] = Map(
    "passenger_count" -> (col("passenger_count") > 0),
    "trip_distance" -> (col("trip_distance") > 0),
    "RatecodeID" -> (col("RatecodeID").between(1, 6) || col("RatecodeID") === 99),
    "store_and_fwd_flag" -> (col("store_and_fwd_flag") === "Y" || col("store_and_fwd_flag") === "N"),
    "payment_type" -> col("payment_type").between(1, 6),
    "fare_amount" -> (col("fare_amount") > 0),
    "tolls_amount" -> (col("tolls_amount") < 200),
)
val taxFilter: Column => Column = _ >= 0

val colsForClassification: Seq[String] = Seq(
    "passenger_count",
    "store_and_fwd_flag",
    "payment_type",
    "aggregate_fee_bin_label",
    "duration_minutes_bin_label",
    "trip_distance_bin_label",
    "year",
    s"duration_minutes_${timeZoneOver}_pcg_bin"
)
val colsForValuesAnalysis: Seq[String] = Seq(
    "passenger_count",
    "store_and_fwd_flag",
    "payment_type",
    "aggregate_fee_bin_label",
    "duration_minutes_bin_label",
    "trip_distance_bin_label",
    "year",
    s"duration_minutes_${timeZoneOver}_pcg_bin"
)

import org.apache.spark.sql.Column
featureFilters: Map[String,org.apache.spark.sql.Column] = Map(trip_distance -> (trip_distance > 0), tolls_amount -> (tolls_amount < 200), RatecodeID -> (((RatecodeID >= 1) AND (RatecodeID <= 6)) OR (RatecodeID = 99)), payment_type -> ((payment_type >= 1) AND (payment_type <= 6)), fare_amount -> (fare_amount > 0), passenger_count -> (passenger_count > 0), store_and_fwd_flag -> ((store_and_fwd_flag = Y) OR (store_and_fwd_flag = N)))
taxFilter: org.apache.spark.sql.Column => org.apache.spark.sql.Column = $Lambda$2191/0x0000000800f50040@354a296e
colsForClassification: Seq[String] = List(passenger_count, store_and_fwd_flag, payment_type, aggregate_fee_bin_label, duration_minutes_bin_label, trip_distance_bin_label, year, duration_minutes_overnight_pcg_bin)
c...


In [3]:
import org.apache.spark

val spark = SparkSession.builder
  .appName("First job")
  .getOrCreate()

val sc = spark.SparkContext.getOrCreate()

import org.apache.spark
sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@72686277


In [None]:
object CsvParser {

    val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"

    // (VendorID 0, pickup_time 1, dropoff_time 2, passengers 3, distance 4, ratecodeID 5, store_fwd 6, PUloc 7, DOloc 8, pay_type 9, fare_amount 10, extra 11, mta_tax 12, tip 13, tolls 14, improvement 15, tot_amount 16, cong_surcharge 17, airportfee 18)
    def parseYellowDataLine(line: String): Option[(String, String, Int)] = {
        try {
            val input = line.split(commaRegex)

            val intIndices = Set(0, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18)
            val strIndices = Set(1, 2, 6)

            val result = for {
                i <- 0 to 18 if i != 15  // you skipped input(15)
            } yield {
                val value = i
                if (intIndices.contains(i)) value else value
            }

            println(result)

            Some(input(0).trim.toInt, input(1).trim, input(2).trim, input(3).trim.toInt, input(4).trim.toInt, input(5).trim.toInt, input(6).trim,
                input(7).trim.toInt, input(8).trim.toInt, input(9).trim.toInt, input(10).trim.toInt, input(11).trim.toInt, input(12).trim.toInt, input(13).trim.toInt, input(14).trim.toInt, input(16).trim.toInt, input(17).trim.toInt, input(18).trim.toInt)
        } catch {
            case _: Exception => None
        }
    }

    // ("VendorID" 0,"lpep_pickup_datetime" 1,"lpep_dropoff_datetime" 2,"store_and_fwd_flag" 3,"RatecodeID" 4,"PULocationID" 5,"DOLocationID" 6,"passenger_count" 7,"trip_distance" 8,"fare_amount" 9,"extra" 10,"mta_tax" 11,"tip_amount" 12,"tolls_amount" 13,"ehail_fee" 14,"improvement_surcharge" 15,"total_amount" 16,"payment_type" 17,"trip_type" 18,"congestion_surcharge" 19)
    def parseGreenDataLine(line: String): Option[(String, String, Int)] = {
        try {
            val input = line.split(commaRegex)
            Some(input(0).trim.toInt, input(1).trim, input(2).trim, input(3).trim, input(4).trim.toInt, input(5).trim.toInt, input(6).trim.toInt,
            input(7).trim.toInt, input(8).trim.toInt, input(9).trim.toInt, input(10).trim.toInt, input(11).trim.toInt, input(12).trim.toInt, input(13).trim.toInt, input(15).trim.toInt, input(16).trim.toInt, input(17).trim.toInt, input(18).trim.toInt)
        } catch {
            case _: Exception => None
        }
    }
}

In [8]:
var yellowFile = spark.read
  .parquet(Commons.getDatasetPath(deploymentMode, datasetDirMap(name)))

dataset.columns.foreach(println)

val myRdd = dataset.rdd

val yellowFile = sc.textFile(f"${greenDatasetDir}/green-tripdata-2024-01.csv")
val header = yellowFile.first() // primacol, secondacol, ...
println(header)

"VendorID","lpep_pickup_datetime","lpep_dropoff_datetime","store_and_fwd_flag","RatecodeID","PULocationID","DOLocationID","passenger_count","trip_distance","fare_amount","extra","mta_tax","tip_amount","tolls_amount","ehail_fee","improvement_surcharge","total_amount","payment_type","trip_type","congestion_surcharge"


yellowFile: org.apache.spark.rdd.RDD[String] = ../../../dataset/dataset_csv/dataset_green/green-tripdata-2024-01.csv MapPartitionsRDD[5] at textFile at <console>:29
header: String = "VendorID","lpep_pickup_datetime","lpep_dropoff_datetime","store_and_fwd_flag","RatecodeID","PULocationID","DOLocationID","passenger_count","trip_distance","fare_amount","extra","mta_tax","tip_amount","tolls_amount","ehail_fee","improvement_surcharge","total_amount","payment_type","trip_type","congestion_surcharge"


In [5]:
val data = yellowFile.filter(_ != header)
val parsed = data.map(_.split(","))


org.apache.spark.SparkException:  Task not serializable