In [None]:
import pyspark
import findspark
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

from pyspark.sql.functions import col, dayofweek, when, month, hour, lag, avg, sum as spark_sum
from pyspark.sql.window import Window
from pyspark.ml.feature import MinMaxScaler


##### Read parquet files

In [None]:
findspark.init()

# Create Spark session
spark = SparkSession.builder.appName("ParquetViewer").config("spark.driver.memory", "8g").config("spark.executor.memory", "4g").getOrCreate()

# Load local Parquet file
df = spark.read.parquet("/Users/caitlinyap/GitHub/IS459-G1T7/ML/merged_df1_df3_df7_df8/")

# Show the first 5 rows
df.show(5)

# Print schema
df.printSchema()

+----------+---------+--------------------+--------------------+--------------------+------------+--------------------+------------------+--------------------+--------------+-------------------+-----------+-----------------+--------+-------------------+----------+---------+--------+--------------------------+-----------------------+----------+----------+--------+---------------------------+----------------------+----------------------+-------+-------------------+-------------------+--------------+--------------+---------------+-------------------+-------------------+-------------------+--------------------+-------------------+----------------------+--------------------------+--------------------------+---------+----+
|       day|    LCLid|       energy_median|         energy_mean|          energy_max|energy_count|          energy_std|        energy_sum|          energy_min|temperatureMax| temperatureMaxTime|windBearing|             icon|dewPoint| temperatureMinTime|cloudCover|windSpeed

In [None]:
from pyspark.sql.functions import col, sum

# Count missing values in each column
missing_values = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
missing_values.show()



+---+-----+-------------+-----------+----------+------------+----------+----------+----------+--------------+------------------+-----------+-----+--------+------------------+----------+---------+--------+--------------------------+-----------------------+----------+----------+--------+---------------------------+----------------------+----------------------+-------+-----+----------+--------------+--------------+---------------+-----------+-------------------+-----------+-------+------------------+----------------------+--------------------------+--------------------------+---------+-------+
|day|LCLid|energy_median|energy_mean|energy_max|energy_count|energy_std|energy_sum|energy_min|temperatureMax|temperatureMaxTime|windBearing| icon|dewPoint|temperatureMinTime|cloudCover|windSpeed|pressure|apparentTemperatureMinTime|apparentTemperatureHigh|precipType|visibility|humidity|apparentTemperatureHighTime|apparentTemperatureLow|apparentTemperatureMax|uvIndex| time|sunsetTime|temperatureLow|te

                                                                                

### Feature engineering

In [None]:
# Time-based features
df = df.withColumn("day_of_week", dayofweek(col("day")))
df = df.withColumn("is_weekend", when((col("day_of_week") == 1) | (col("day_of_week") == 7), 1).otherwise(0))
df = df.withColumn("month", month(col("day")))
df = df.withColumn("hour", hour(col("time")))

# Lag feature (previous day's energy consumption)
window_spec = Window.partitionBy("LCLid").orderBy("day")
df = df.withColumn("lag_1_day", lag("energy_sum", 1).over(window_spec))

# Weather-based features
df = df.withColumn("temperature_variability", col("temperatureMax") - col("temperatureMin"))
df = df.withColumn("humidity_temp_interaction", col("humidity") * col("temperatureMax"))

# Precipitation & Cloud Cover features
df = df.withColumn("cloud_temp_interaction", col("cloudCover") * col("temperatureMax"))

# Rolling sum of precipitation over 7 days
df = df.withColumn("rolling_precipitation_7d", spark_sum("precipType").over(window_spec.rowsBetween(-6, 0)))

# Drop only rows where `energy_sum` is NULL
df = df.dropna(subset=["energy_sum"])

# Fill missing values instead of dropping
df = df.fillna({
    "lag_1_day": 0,  
    "temperature_variability": df.select(avg("temperatureMax") - avg("temperatureMin")).collect()[0][0],
    "humidity_temp_interaction": df.select(avg("humidity") * avg("temperatureMax")).collect()[0][0],
    "cloud_temp_interaction": df.select(avg("cloudCover") * avg("temperatureMax")).collect()[0][0],
    "rolling_precipitation_7d": 0  
})

# Drop columns with too many missing values
columns_to_drop = ["precipType", "summary", "Type"]
df = df.drop(*columns_to_drop)

# Verify that we still have rows
print(f"Dataset size after cleaning: {df.count()}")

# Assemble features
feature_cols = [
    "day_of_week", "is_weekend", "lag_1_day",
    "temperature_variability", "humidity_temp_interaction",
    "cloud_temp_interaction", "rolling_precipitation_7d"
]

if "features" in df.columns:
    df = df.drop("features")

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
df = scaler.fit(df).transform(df)

Dataset size after cleaning: 3517002


                                                                                

##### Train test split

In [9]:
# Train-Test Split
train_df, test_df = df.randomSplit([0.9, 0.1], seed=42)

# Verify train-test sizes
print(f"Training set size: {train_df.count()}")
print(f"Test set size: {test_df.count()}")

# Stop training if dataset is empty
if train_df.count() == 0:
    raise ValueError("Training dataset is empty. Check preprocessing!")

                                                                                

Training set size: 3165126


                                                                                

Test set size: 351876


                                                                                

##### Random forest model

In [None]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Train Model
rf = RandomForestRegressor(featuresCol="features", labelCol="energy_sum", numTrees=100)
rf_model = rf.fit(train_df)

# Predictions
predictions = rf_model.transform(test_df)

# Evaluate model
evaluator = RegressionEvaluator(labelCol="energy_sum", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Test RMSE: {rmse}")

                                                                                

Training set size: 3165126


                                                                                

Test set size: 351876




Test RMSE: 4.627385526912342


                                                                                

##### XGBoost model

In [10]:
from xgboost.spark import SparkXGBRegressor

os.environ["DMLC_TRACKER_URI"] = "127.0.0.1"
os.environ["DMLC_TRACKER_PORT"] = "9091"
os.environ["DMLC_NUM_WORKER"] = "1"
os.environ["DMLC_NUM_SERVER"] = "1"

train_df_xgb, test_df_xgb = df.randomSplit([0.9, 0.1], seed=42)

In [None]:
# Train XGBoost Model
xgb = SparkXGBRegressor(
    features_col="scaled_features",
    label_col="energy_sum",
    max_depth=10,
    eta=0.05,
    subsample=0.8,
    # num_round=150,
    # n_workers=1,  # Run in single-worker mode
    # use_external_storage=False
)

xgb_model = xgb.fit(train_df_xgb)

# Make Predictions
predictions = xgb_model.transform(test_df_xgb)

# Evaluate Model Performance
evaluator = RegressionEvaluator(labelCol="energy_sum", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Test RMSE (XGBoost): {rmse}")

2025-03-13 01:56:31,155 INFO XGBoost-PySpark: _fit Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'max_depth': 10, 'subsample': 0.8, 'num_round': 150, 'eta': 0.05, 'n_workers': 1, 'use_external_storage': False, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2025-03-13 01:57:17,543 INFO XGBoost-PySpark: _train_booster Training on CPUs 1]
[01:57:18] Task 0 got rank 0
Parameters: { "n_workers", "num_round", "use_external_storage" } are not used.

2025-03-13 01:57:54,180 INFO XGBoost-PySpark: _fit Finished xgboost training!   
2025-03-13 01:58:47,725 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs

Test RMSE (XGBoost): 3.9490515479235446


                                                                                