In [0]:
# Intializing Spark Session
! pip install pyspark
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("MLlib lab") \
    .getOrCreate()

sc = spark.sparkContext

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import Row
import time

In [0]:
import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, TimestampNTZType, LongType, DoubleType,IntegerType

parquet_files = '/content/drive/MyDrive/fhvhv_tripdata_2024-01.parquet'
schema = StructType([
  StructField('hvfhs_license_num', StringType(), nullable=True),
  StructField('dispatching_base_num', StringType(), nullable=True),
  StructField('originating_base_num', StringType(), nullable=True),
  StructField('request_datetime', TimestampNTZType(), nullable=True),
  StructField('on_scene_datetime', TimestampNTZType(), nullable=True),
  StructField('pickup_datetime', TimestampNTZType(), nullable=True),
  StructField('dropoff_datetime', TimestampNTZType(), nullable=True),
  StructField('PULocationID', LongType(), nullable=True),
  StructField('DOLocationID', LongType(), nullable=True),
  StructField('trip_miles', DoubleType(), nullable=True),
  StructField('trip_time', LongType(), nullable=True),
  StructField('base_passenger_fare', DoubleType(), nullable=True),
  StructField('tolls', DoubleType(), nullable=True),
  StructField('bcf', DoubleType(), nullable=True),
  StructField('sales_tax', DoubleType(), nullable=True),
  StructField('congestion_surcharge', DoubleType(), nullable=True),
  StructField('airport_fee', IntegerType(), nullable=True),
  StructField('tips', DoubleType(), nullable=True),
  StructField('driver_pay', DoubleType(), nullable=True),
  StructField('shared_request_flag', StringType(), nullable=True),
  StructField('shared_match_flag', StringType(), nullable=True),
  StructField('access_a_ride_flag', StringType(), nullable=True),
  StructField('wav_request_flag', StringType(), nullable=True),
  StructField('wav_match_flag', IntegerType(), nullable=True)
])
# Create empty dataframe
union_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

# Process each Parquet file
for file in parquet_files:
  if 'parquet' in parquet_files:
    df = spark.read.parquet(parquet_files)
    # Read the Parquet file with schema inference
  #df = spark.read.parquet(file.path)

    # (column with mismatch, desirable type)
  mismatch_col = [
      ("wav_match_flag", "string"),
      ("airport_fee", "double"),
      ("PULocationID", "bigint"),
      ("DOLocationID", "bigint")
    ]

  df = df.withColumns({c: F.col(c).cast(t) for c, t in mismatch_col})

    # Union the casted DataFrame with the union_df
  union_df = union_df.union(df)

# Revert the variable name after unioning the whole data
df = union_df

total_rows = df.count()
print(f"Total number of rows:{total_rows}")

Total number of rows:1026034844


In [0]:
#df=df.withColumn("day",F.dayofyear('pickup_datetime'))
df=df.withColumn("trip_meters",F.col('trip_miles')*1609.35)
df=df.withColumn("hour",F.hour('pickup_datetime'))
#df=df.withColumn("month",F.month('pickup_datetime'))
df=df.withColumn("Date",F.to_date('pickup_datetime'))

In [0]:
import holidays
year_range = (2019, 2025)
hds = []
for y in range(year_range[0], year_range[1]):
  hds += holidays.US(state="NY", years=y).keys()

In [0]:
df = df.withColumns({
  "isWeekend": F.when(F.col("Date").isin(hds) | F.dayofweek(F.col("Date")).isin([1, 7]), 1).otherwise(0),
  "isOvernight": F.when(F.col("hour").isin(list(range(20, 24))+list(range(0, 6))), 1).otherwise(0)}) \
  .withColumn("isRushhour",
              F.when(F.col("hour").isin(list(range(16, 20))) | (F.col("isWeekend") == 0), 1).otherwise(0)
              )

In [0]:
drop = ['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num', 'request_datetime', 'on_scene_datetime', 'pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge',
 'airport_fee', 'tips', 'driver_pay', 'shared_request_flag', 'shared_match_flag', 'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag','trip_miles','Date']
df = df.drop(*drop)

In [0]:
df = df.select('trip_time', 'trip_meters', 'hour', 'isWeekend', 'isOvernight', 'isRushhour', 'base_passenger_fare')

In [0]:
df1 =df.cache()

In [0]:
from pyspark.sql.functions import rand
# Set the random seed (for reproducibility)
seed = 42
# Split the DataFrame into training and test sets using a fixed random split
train_df, test_df = df1.randomSplit([0.7, 0.3], seed=seed)
# Print the sizes of the training and test sets
print(f"Training Data Size: {train_df.count()}")
print(f"Test Data Size: {test_df.count()}")

Training Data Size: 718214235
Test Data Size: 307820609


In [0]:
train_rdd = train_df.rdd.repartition(16).cache()

In [0]:
test_rdd = test_df.rdd.repartition(4)


In [0]:
column_names = df1.columns

In [0]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
def build_model(partition_data_it):
  partition_data_df = pd.DataFrame(partition_data_it,columns=column_names)
  reg = DecisionTreeRegressor(max_depth=5)
  X_train = partition_data_df[['trip_time', 'trip_meters', 'hour', 'isWeekend', 'isOvernight', 'isRushhour']]
  y_train = partition_data_df["base_passenger_fare"]
  model = reg.fit(X_train.values,y_train.values)
  return [model]

In [0]:
models = train_rdd.mapPartitions(build_model).collect()

In [0]:
len(models)

In [0]:
def predict(instance):
  return[m.predict([instance[:-1]])[0] for m in models]

In [0]:
test_rdd.map(predict).take(3)

In [0]:
def agg_predictions(preds):
  mean = sum(preds)/ len(preds)
  return float(mean)

In [0]:
test_rdd.map(predict).map(agg_predictions).take(5)

In [0]:
def transform(instance):
  return Row(**instance.asDict(), \
             raw_prediction = agg_predictions(predict(instance)))

In [0]:
predictions = test_rdd.map(transform).toDF()

In [0]:
predictions.show(5)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="rmse",labelCol='base_passenger_fare',predictionCol="raw_prediction")

In [0]:
evaluator.evaluate(predictions)

In [0]:
partitions = [1,2,4,8,16,32,64,128,256,512]

In [0]:

for i in partitions:
  start = time.time()
  train_rdd = train_df.rdd.repartition(i)
  models = train_rdd.mapPartitions(build_model).collect()
  predictions = test_rdd.map(transform).toDF()
  end = time.time()
  with open("Local_Ensemble_dt.csv", "a") as f:
        print(
              f"{i},{evaluator.evaluate(predictions)},{end - start}",
              file=f,
              )


Root Mean Square Error of the model with 1 partitions: 9.658999559247526
Time Required to run in seconds: 27.397446393966675
Root Mean Square Error of the model with 2 partitions: 9.51505236024468
Time Required to run in seconds: 27.85700821876526
Root Mean Square Error of the model with 4 partitions: 9.475726336789966
Time Required to run in seconds: 33.78821897506714
Root Mean Square Error of the model with 8 partitions: 9.365018043951174
Time Required to run in seconds: 37.46826457977295
Root Mean Square Error of the model with 16 partitions: 9.345751586720935
Time Required to run in seconds: 57.52696967124939
Root Mean Square Error of the model with 32 partitions: 9.343354247297578
Time Required to run in seconds: 84.19667625427246
Root Mean Square Error of the model with 64 partitions: 9.381641885933675
Time Required to run in seconds: 143.88914561271667
Root Mean Square Error of the model with 128 partitions: 9.566239350984334
Time Required to run in seconds: 262.69355821609497
R

In [0]:
for j in range(1,11):
  def build_model(partition_data_it):
    partition_data_df = pd.DataFrame(partition_data_it,columns=column_names)
    reg = DecisionTreeRegressor(max_depth=j)
    X_train = partition_data_df[['trip_time', 'trip_meters', 'hour', 'isWeekend', 'isOvernight', 'isRushhour']]
    y_train = partition_data_df["base_passenger_fare"]
    model = reg.fit(X_train.values,y_train.values)
    return [model]
  start_time = time.time()
  train_rdd = train_df.rdd.repartition(j)
  models = train_rdd.mapPartitions(build_model).collect()
  predictions = test_rdd.map(transform).toDF()
  print(f"Root Mean Square Error of the model with " f"{j} partitions: {evaluator.evaluate(predictions)}")
  end_time= time.time()
  print(f"Time Required to run in seconds: {end_time-start_time}")

Root Mean Square Error of the model with 1 partitions: 15.524996797170678
Time Required to run in seconds: 44.71863126754761
Root Mean Square Error of the model with 2 partitions: 12.128528200779945
Time Required to run in seconds: 52.64550304412842
Root Mean Square Error of the model with 3 partitions: 10.025461163313398
Time Required to run in seconds: 64.19073796272278
Root Mean Square Error of the model with 4 partitions: 9.570910653161535
Time Required to run in seconds: 67.51734042167664
Root Mean Square Error of the model with 5 partitions: 9.320390831927257
Time Required to run in seconds: 70.78808665275574
Root Mean Square Error of the model with 6 partitions: 9.397083225926577
Time Required to run in seconds: 71.75579500198364
Root Mean Square Error of the model with 7 partitions: 9.375406942406496
Time Required to run in seconds: 77.8984055519104
Root Mean Square Error of the model with 8 partitions: 9.33079364868918
Time Required to run in seconds: 73.29802751541138
Root Me