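# Generalized Linear Regression

Fits a Spark ML generalized linear regression (Gaussian family, identity link) to predict `total_amount` for yellow taxi trips, then reports RMSE, MAE and R² on a held-out 20% split.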

In [74]:
from pyspark.sql.functions import col, unix_timestamp
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline


In [59]:
# Spark Session
# getOrCreate() hands back the already-running session when one exists,
# so re-running this cell does not start a second one.
spark = (
    SparkSession.builder
    .appName("YellowTaxiTripPrediction")
    .getOrCreate()
)

# Restrict Spark's log output to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

In [66]:
# Load the trip data.
# NOTE(review): the path ends in ".parquet" but the data is read with the CSV
# reader (header-less rows) -- presumably the directory holds CSV part files;
# confirm and consider renaming the dataset.
data_path = "../data/full_tripdata.parquet"

# Column names and Spark SQL types, in file order. Passing an explicit schema
# replaces inferSchema=True, which needs an extra pass over the whole input
# just to guess the types, and it names the columns at read time so no
# follow-up toDF() rename is required.
column_types = [
    ("PULocationID", "INT"),
    ("DOLocationID", "INT"),
    ("passenger_count", "INT"),
    ("tpep_pickup_datetime", "TIMESTAMP"),
    ("tpep_dropoff_datetime", "TIMESTAMP"),
    ("VendorID", "INT"),
    ("trip_distance", "DOUBLE"),
    ("payment_type", "INT"),
    ("fare_amount", "DOUBLE"),
    ("tip_amount", "DOUBLE"),
    ("total_amount", "DOUBLE"),
    ("hour", "INT"),
    ("day_of_week", "INT"),
    ("trip_duration_seconds", "DOUBLE"),
    ("speed_mph", "DOUBLE"),
    ("extras", "DOUBLE"),
    ("PU_Borough", "STRING"),
    ("PU_Zone", "STRING"),
    ("DO_Borough", "STRING"),
    ("DO_Zone", "STRING"),
]
column_names = [name for name, _ in column_types]

# Backticks keep names such as `hour` from being parsed as SQL keywords.
schema_ddl = ", ".join(f"`{name}` {dtype}" for name, dtype in column_types)

df = spark.read.csv(data_path, header=False, schema=schema_ddl)

df.printSchema()

[Stage 54:>                                                       (0 + 20) / 20]

root
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- trip_duration_seconds: double (nullable = true)
 |-- speed_mph: double (nullable = true)
 |-- extras: double (nullable = true)
 |-- PU_Borough: string (nullable = true)
 |-- PU_Zone: string (nullable = true)
 |-- DO_Borough: string (nullable = true)
 |-- DO_Zone: string (nullable = true)



                                                                                

In [67]:
# Print the first five rows as a quick sanity check of the load.
df.show(n=5)

+------------+------------+---------------+--------------------+---------------------+--------+-------------+------------+-----------+----------+------------+----+-----------+---------------------+------------------+------------------+----------+--------------------+----------+--------------------+
|PULocationID|DOLocationID|passenger_count|tpep_pickup_datetime|tpep_dropoff_datetime|VendorID|trip_distance|payment_type|fare_amount|tip_amount|total_amount|hour|day_of_week|trip_duration_seconds|         speed_mph|            extras|PU_Borough|             PU_Zone|DO_Borough|             DO_Zone|
+------------+------------+---------------+--------------------+---------------------+--------+-------------+------------+-----------+----------+------------+----+-----------+---------------------+------------------+------------------+----------+--------------------+----------+--------------------+
|         236|          68|              2| 2024-02-01 00:04:45|  2024-02-01 00:19:58|       1|     

In [68]:
# Columns kept for modelling: the regression label plus every predictor.
label_col = "total_amount"
selected_cols = ["trip_distance", "passenger_count", "PULocationID", "DOLocationID",
                 "VendorID", "total_amount", "payment_type", "hour", "day_of_week",
                 "trip_duration_seconds", "speed_mph", "extras"]
df_model = df.select(*selected_cols)

# Predictors are everything selected except the label. Deriving the list
# (instead of typing it out a second time) keeps the assembler in sync with
# selected_cols; the resulting order is the same as the original inputCols.
feature_cols = [name for name in selected_cols if name != label_col]

# Assemble features: pack the predictor columns into one vector column
# named "features".
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

In [69]:
# Train and test split: 80% of rows for fitting, 20% held out, seeded with 42.
split_weights = [0.8, 0.2]
split_seed = 42
train_data, test_data = df_model.randomSplit(split_weights, seed=split_seed)

In [70]:
# Cleaning data
# Drop every row that has a null in any modelling column. The columns checked
# are exactly selected_cols (the label plus all predictors), so that list is
# reused here rather than pasting the same twelve names twice more.
train_data = train_data.na.drop(subset=selected_cols)
test_data = test_data.na.drop(subset=selected_cols)

In [71]:
# Model creation and training
# Estimator settings gathered in one mapping: Gaussian family with an
# identity link, regParam 0.1, at most 10 iterations.
glr_settings = {
    "labelCol": "total_amount",
    "featuresCol": "features",
    "maxIter": 10,
    "regParam": 0.1,
    "family": "gaussian",
    "link": "identity",
}
glr = GeneralizedLinearRegression(**glr_settings)

# Two stages: assemble the feature vector, then fit the regression on it.
pipeline = Pipeline(stages=[assembler, glr])
model = pipeline.fit(train_data)

                                                                                

In [72]:
# Score the held-out rows with the fitted pipeline.
predictions = model.transform(test_data)


def _evaluator_for(metric):
    """Return a RegressionEvaluator comparing `prediction` to `total_amount` on `metric`."""
    return RegressionEvaluator(
        labelCol="total_amount",
        predictionCol="prediction",
        metricName=metric,
    )


# Evaluators and metrics
evaluator_rmse = _evaluator_for("rmse")
evaluator_mae = _evaluator_for("mae")
evaluator_r2 = _evaluator_for("r2")

rmse = evaluator_rmse.evaluate(predictions)
mae = evaluator_mae.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

for report_line in (f"RMSE: {rmse:.2f}", f"MAE: {mae:.2f}", f"R²: {r2:.2f}"):
    print(report_line)



RMSE: 16.20
MAE: 9.71
R²: 0.49


                                                                                