## Train two models using PySpark:
- Linear Regression (baseline)
- Random Forest Regressor (nonlinear)

## 1. Load the transformed data (metastore table, with parquet fallback)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T

# Use the `spark` session already defined in this kernel; build a Hive-enabled
# one only when the name does not exist yet.
try:
    spark
except NameError:
    session_builder = SparkSession.builder.enableHiveSupport()
    spark = session_builder.getOrCreate()

# Where the transformed flights data lives: metastore table first, parquet path as fallback.
out_trans_table = 'default.flights_2006_transformed'
out_trans_path = 'hdfs://namenode:8020/data/parquet/flights_2006_transformed'

try:
    print('Trying to read from metastore table', out_trans_table)
    df = spark.table(out_trans_table)
except Exception as table_err:
    # Any failure on the table read (e.g. metastore unavailable) falls back
    # to reading the parquet files directly.
    print('Failed to read table:', table_err)
    print('Falling back to parquet path', out_trans_path)
    df = spark.read.parquet(out_trans_path)
    print('Loaded parquet from', out_trans_path)
else:
    print('Loaded table', out_trans_table)

Trying to read from metastore table default.flights_2006_transformed
Failed to read table: 'java.lang.RuntimeException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient;'
Falling back to parquet path hdfs://namenode:8020/data/parquet/flights_2006_transformed
Loaded parquet from hdfs://namenode:8020/data/parquet/flights_2006_transformed


In [2]:
# Sanity-check the loaded frame: number of rows, column types, and a few sample rows.
row_count = df.count()
print('rows =', row_count)
df.printSchema()
df.show(n=5, truncate=False)

rows = 7003802
root
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- TaxiIn: double (nullable = true)
 |-- TaxiOut: double (nullable = true)
 |-- ArrDelay: double (nullable = true)
 |-- DepHour: integer (nullable = true)
 |-- ArrHour: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)

+----------+---------+-------+----------+-------+----------+--------+--------+---------+------+-------+--------+-------+-------+----+-----+
|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|DepDelay|Distance|Cancelled|TaxiIn|TaxiOut|ArrDelay|DepHour|ArrHour|Year|Month|
+----------+---------+-------+----------+-------+-------

## 2. Prepare features and label
We will use `ArrDelay` as the numeric label (regression).
Assemble a small set of numeric features (tweak as needed). Keep the pipeline minimal.

In [4]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.types import DoubleType, IntegerType

# Prepare the modelling frame: drop cancelled flights, normalise column types,
# select the feature columns, drop rows with nulls, and assemble `features_vec`.
# NOTE: this cell rebinds `df`, so re-run the load cell before re-running it.

# 1) Filter out cancelled flights (keep only Cancelled == 0)
if 'Cancelled' in df.columns:
    df = df.filter(F.col('Cancelled') == 0)
    print('Filtered cancelled flights. rows =', df.count())
else:
    print('No Cancelled column found; continuing without filter')

# 2) Cast common columns to numeric types to avoid ML errors.
#    All casts are applied in one select(): calling withColumn() in a loop adds
#    a projection per call and inflates the query plan.
to_cast_double = ['DepDelay','Distance','TaxiIn','TaxiOut','ArrDelay','DepTime','ArrTime']
to_cast_int = ['DepHour','ArrHour','DayOfWeek','DayofMonth','Month']
cast_types = {c: DoubleType() for c in to_cast_double}
cast_types.update({c: IntegerType() for c in to_cast_int})
df = df.select(*[
    F.col(c).cast(cast_types[c]).alias(c) if c in cast_types else F.col(c)
    for c in df.columns
])

# 3) Choose feature list (explicit) and exclude Year / Cancelled
candidate_features = ['DepDelay','Distance','TaxiIn','TaxiOut','DepHour','ArrHour','DayOfWeek','Month']
features = [c for c in candidate_features if c in df.columns and c not in ('Year','Cancelled')]
print('Selected features (Year/Cancelled excluded):', features)
# Fail fast instead of assembling an empty feature vector, which is what would
# happen if this cell were re-run on the already-reduced `df`.
if not features:
    raise ValueError('None of the candidate features are present in the dataframe; '
                     're-run the load cell before this one')

# 4) Ensure label is present
label_col = 'ArrDelay'
if label_col not in df.columns:
    raise ValueError('Label ArrDelay not found in dataframe')

# 5) Drop rows with nulls in features or label (simple baseline)
use_cols = features + [label_col]
df = df.select(*use_cols).na.drop()
print('Rows after dropping nulls:', df.count())

# Optional: limit for quick dev iterations (uncomment to use)
# df = df.limit(200000)

# 6) Assemble feature vector for MLlib; keep only the vector and the label
assembler = VectorAssembler(inputCols=features, outputCol='features_vec')
df = assembler.transform(df).select('features_vec', label_col)
print('Prepared dataframe with features_vec and label:')
df.show(5, truncate=False)

Filtered cancelled flights. rows = 7003802
Selected features (Year/Cancelled excluded): ['DepDelay', 'Distance', 'TaxiIn', 'TaxiOut', 'DepHour', 'ArrHour', 'DayOfWeek', 'Month']
Rows after dropping nulls: 7003802
Prepared dataframe with features_vec and label:
+---------------------------------------+--------+
|features_vec                           |ArrDelay|
+---------------------------------------+--------+
|[-5.0,984.0,5.0,16.0,13.0,17.0,1.0,8.0]|2.0     |
|[-5.0,109.0,9.0,9.0,13.0,14.0,5.0,8.0] |-2.0    |
|[0.0,641.0,6.0,21.0,18.0,19.0,4.0,8.0] |1.0     |
|[20.0,258.0,7.0,15.0,19.0,20.0,5.0,8.0]|18.0    |
|[-6.0,2454.0,6.0,18.0,22.0,6.0,4.0,8.0]|-13.0   |
+---------------------------------------+--------+
only showing top 5 rows



## 3. Train / Test split
Simple random split: 80% train / 20% test.

In [5]:
# Seeded 80/20 random split of the prepared frame.
split_weights = [0.8, 0.2]
train_df, test_df = df.randomSplit(split_weights, seed=42)
n_train = train_df.count()
n_test = test_df.count()
print('train rows =', n_train, 'test rows =', n_test)

train rows = 5602507 test rows = 1401295


## 4. Baseline: Linear Regression (PySpark)
Simple linear regression using `features_vec` as input. We'll measure RMSE and R2.

In [6]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit the linear baseline on the training split.
lr = LinearRegression(
    featuresCol='features_vec',
    labelCol=label_col,
    maxIter=50,
    regParam=0.1,
)
lr_model = lr.fit(train_df)
print('Linear Regression training done')

# Score the held-out split; the two evaluators differ only in the metric
# they report and are reused by the Random Forest cell.
preds_lr = lr_model.transform(test_df)
evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(labelCol=label_col, predictionCol='prediction', metricName=metric)
    for metric in ('rmse', 'r2')
)
rmse_lr = evaluator_rmse.evaluate(preds_lr)
r2_lr = evaluator_r2.evaluate(preds_lr)
print(f'Linear Regression RMSE: {rmse_lr:.4f}, R2: {r2_lr:.4f}')

Linear Regression training done
Linear Regression RMSE: 11.2483, R2: 0.9056


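RMSE = 11.25 → Predictions are typically about 11 minutes away from the actual arrival delay. Because RMSE squares the errors before averaging, large misses count more heavily than they would in a mean absolute error. Lower RMSE is better.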

R² = 0.9056 → The model explains ~90.6% of the variance in arrival delays on the test set. This is very high for a baseline linear model, which indicates that the selected features have a strong linear relationship with `ArrDelay`.

## 5. Nonlinear model: Random Forest Regressor (PySpark)
Train a Random Forest Regressor as a stronger nonlinear baseline. Keep hyperparameters small for speed.

In [7]:
from pyspark.ml.regression import RandomForestRegressor

# Deliberately small forest so training stays fast; raise numTrees for better performance.
rf_params = dict(
    featuresCol='features_vec',
    labelCol=label_col,
    numTrees=50,
    maxDepth=8,
    seed=42,
)
rf = RandomForestRegressor(**rf_params)
rf_model = rf.fit(train_df)
print('Random Forest training done')

# Score the held-out split with the evaluators created for the linear baseline.
preds_rf = rf_model.transform(test_df)
rmse_rf, r2_rf = (ev.evaluate(preds_rf) for ev in (evaluator_rmse, evaluator_r2))
print(f'Random Forest RMSE: {rmse_rf:.4f}, R2: {r2_rf:.4f}')

Random Forest training done
Random Forest RMSE: 17.0213, R2: 0.7837


RMSE = 17.02 → Predictions are typically about 17 minutes off. This is higher than the Linear Regression RMSE (11.25), so the Random Forest is less accurate in this setup.

R² = 0.7837 → The model explains ~78.4% of the variance in arrival delays, which is less than Linear Regression (~90.6%).

## 6. Performance Evaluation 

Although Random Forest is nonlinear and can capture complex relationships, here it underperformed the linear model (RMSE 17.02 vs. 11.25, R² 0.78 vs. 0.91). There are two likely explanations: the features may have mostly linear relationships with `ArrDelay`, or the Random Forest may need more hyperparameter tuning (it was kept deliberately small, with `numTrees=50` and `maxDepth=8`) or more data preprocessing.