In [1]:
# Cell 1: Markdown
"""
# COVID-19 Prediction Model
*By: Saurabh (Model Builder)*

## Objective:
- Build machine learning model to predict COVID trends
- Train and test the model
- Evaluate performance
- Make predictions
"""

'\n# COVID-19 Prediction Model\n*By: Saurabh (Model Builder)*\n\n## Objective:\n- Build machine learning model to predict COVID trends\n- Train and test the model\n- Evaluate performance\n- Make predictions\n'

In [2]:
# Cell 2: Imports and Setup
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName("COVID19_ML_Model").getOrCreate()
print("Spark ML session created!")

Spark ML session created!


In [3]:
# Cell 3: Load Processed Data
df = spark.read.csv("C:/Users/edifi/Downloads/cleaned_data.csv", header=True, inferSchema=True)

print(f"Loaded {df.count()} records")
df.show(5)

Loaded 328041 records
+-----------+----------+----------+-------------+
|    Country|   DateStr|TotalCases|DailyNewCases|
+-----------+----------+----------+-------------+
|Afghanistan|01-01-2021|     52513|            0|
|Afghanistan|01-01-2022|    158107|       105594|
|Afghanistan|01-01-2023|    207616|        49509|
|Afghanistan|01-02-2021|     52586|            0|
|Afghanistan|01-02-2022|    158189|       105603|
+-----------+----------+----------+-------------+
only showing top 5 rows


In [4]:
# Cell 4: Feature Selection
major_countries = df.groupBy("Country") \
    .agg(F.max("TotalCases").alias("MaxCases")) \
    .filter(F.col("MaxCases") > 10000) \
    .select("Country")

df_model = df.join(major_countries, "Country")

# Handle mixed date formats
df_model = df_model.withColumn("Date",
    F.when(
        F.col("DateStr").rlike("^\\d{2}-\\d{2}-\\d{4}$"),
        F.to_date(F.col("DateStr"), "dd-MM-yyyy")
    ).when(
        F.col("DateStr").rlike("^\\d{1,2}/\\d{1,2}/\\d{2}$"),
        F.to_date(F.col("DateStr"), "M/d/yy")
    ).otherwise(None)
)

# Filter out unparseable dates
df_model = df_model.filter(F.col("Date").isNotNull())

# Create time features
df_model = df_model.withColumn("DaysSinceStart",
                               F.datediff(F.col("Date"), F.lit("2020-01-22")))
df_model = df_model.withColumn("DayOfWeek", F.dayofweek("Date"))
df_model = df_model.withColumn("Month", F.month("Date"))

print("Features prepared:")
df_model.select("Date", "DaysSinceStart", "DayOfWeek", "Month", "DailyNewCases", "TotalCases").show(10)

Features prepared:
+----------+--------------+---------+-----+-------------+----------+
|      Date|DaysSinceStart|DayOfWeek|Month|DailyNewCases|TotalCases|
+----------+--------------+---------+-----+-------------+----------+
|2021-01-01|           345|        6|    1|            0|     52513|
|2022-01-01|           710|        7|    1|       105594|    158107|
|2023-01-01|          1075|        1|    1|        49509|    207616|
|2021-02-01|           376|        2|    2|            0|     52586|
|2022-02-01|           741|        3|    2|       105603|    158189|
|2023-02-01|          1106|        4|    2|        49438|    207627|
|2021-03-01|           404|        2|    3|            0|     52709|
|2022-03-01|           769|        3|    3|       105474|    158183|
|2023-03-01|          1134|        4|    3|        49471|    207654|
|2021-04-01|           435|        5|    4|            0|     52909|
+----------+--------------+---------+-----+-------------+----------+
only showing to

In [5]:
# Cell 5: Prepare Training Data
feature_cols = ["DaysSinceStart", "DayOfWeek", "Month", "TotalCases"]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)

df_model_clean = df_model.filter(F.col("Date").isNotNull()) \
                         .filter(F.col("DailyNewCases").isNotNull()) \
                         .filter(F.col("DaysSinceStart").isNotNull())

df_ml = assembler.transform(df_model_clean).select("features", "DailyNewCases")
df_ml = df_ml.filter(F.col("DailyNewCases").isNotNull())

print(f"ML dataset ready: {df_ml.count()} samples")
df_ml.show(5)

ML dataset ready: 304038 samples
+--------------------+-------------+
|            features|DailyNewCases|
+--------------------+-------------+
|[345.0,6.0,1.0,52...|            0|
|[710.0,7.0,1.0,15...|       105594|
|[1075.0,1.0,1.0,2...|        49509|
|[376.0,2.0,2.0,52...|            0|
|[741.0,3.0,2.0,15...|       105603|
+--------------------+-------------+
only showing top 5 rows


In [6]:
# Cell 6: Train-Test Split
train_data, test_data = df_ml.randomSplit([0.8, 0.2], seed=42)

print(f"Training set: {train_data.count()} samples")
print(f"Test set: {test_data.count()} samples")

Training set: 243209 samples
Test set: 60829 samples


In [7]:
# Cell 7: Build Model - Linear Regression
"""
## Step 5: Train Linear Regression Model
"""
# Create Linear Regression model
lr = LinearRegression(
    featuresCol="features",
    labelCol="DailyNewCases",
    maxIter=10
)

# Train model
print("Training Linear Regression model...")
lr_model = lr.fit(train_data)

print(f"✅ Model trained!")
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

Training Linear Regression model...
✅ Model trained!
Coefficients: [28.803690145425563,63.193763077030034,-1081.9559582542445,0.6169982857519708]
Intercept: 54098.180003030444


In [8]:
# Cell 8: Model Evaluation
"""
## Step 6: Evaluate Model Performance
"""
# Make predictions on test data
lr_predictions = lr_model.transform(test_data)

# Show some predictions
print("Sample predictions:")
lr_predictions.select("DailyNewCases", "prediction").show(10)

# Evaluate model
evaluator = RegressionEvaluator(
    labelCol="DailyNewCases",
    predictionCol="prediction",
    metricName="rmse"
)

rmse = evaluator.evaluate(lr_predictions)
mae_evaluator = RegressionEvaluator(labelCol="DailyNewCases", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(lr_predictions)
r2_evaluator = RegressionEvaluator(labelCol="DailyNewCases", predictionCol="prediction", metricName="r2")
r2 = r2_evaluator.evaluate(lr_predictions)

print("\n" + "="*50)
print("LINEAR REGRESSION MODEL PERFORMANCE")
print("="*50)
print(f"RMSE: {rmse:,.2f}")
print(f"MAE: {mae:,.2f}")
print(f"R²: {r2:.4f}")
print("="*50)

Sample predictions:
+-------------+------------------+
|DailyNewCases|        prediction|
+-------------+------------------+
|            0|52756.119057252836|
|            0|52756.119057252836|
|            0|52756.119057252836|
|            0|52756.119057252836|
|            0|52756.119057252836|
|            0|52756.119057252836|
|            0|52756.119057252836|
|            0|52756.119057252836|
|            0|52756.119057252836|
|            0|52756.119057252836|
+-------------+------------------+
only showing top 10 rows

LINEAR REGRESSION MODEL PERFORMANCE
RMSE: 1,230,476.77
MAE: 283,332.37
R²: 0.8668


In [9]:
# Cell 9: Alternative Model - Random Forest
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="DailyNewCases",
    numTrees=20,
    seed=42
)

print("Training Random Forest model...")
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

evaluator_rmse = RegressionEvaluator(labelCol="DailyNewCases", predictionCol="prediction", metricName="rmse")
evaluator_mae  = RegressionEvaluator(labelCol="DailyNewCases", predictionCol="prediction", metricName="mae")
evaluator_r2   = RegressionEvaluator(labelCol="DailyNewCases", predictionCol="prediction", metricName="r2")

rf_rmse = evaluator_rmse.evaluate(rf_predictions)
rf_mae  = evaluator_mae.evaluate(rf_predictions)
rf_r2   = evaluator_r2.evaluate(rf_predictions)

print("\n" + "="*50)
print("RANDOM FOREST MODEL PERFORMANCE")
print("="*50)
print(f"RMSE: {rf_rmse:,.2f}")
print(f"MAE:  {rf_mae:,.2f}")
print(f"R²:   {rf_r2:.4f}")
print("="*50)

print("\n✅ Best Model: Linear Regression" if r2 > rf_r2 else "\n✅ Best Model: Random Forest")

Training Random Forest model...

RANDOM FOREST MODEL PERFORMANCE
RMSE: 2,016,656.12
MAE:  399,999.70
R²:   0.6422

✅ Best Model: Linear Regression


In [10]:
# Cell 10: Save Model and Results
best_model_name = "Linear Regression"
best_predictions = lr_predictions

print("✅ Best model selected: Linear Regression")
print(f"LR RMSE: {rmse:,.2f}")

✅ Best model selected: Linear Regression
LR RMSE: 1,230,476.77


In [11]:
# Cell 11: Model Summary
"""
## Final Model Report
"""
print("\n" + "="*60)
print("MODEL BUILDING COMPLETE")
print("="*60)
print(f"Best Model: {best_model_name}")
print(f"Training Samples: {train_data.count()}")
print(f"Test Samples: {test_data.count()}")
print(f"Features Used: {', '.join(feature_cols)}")
print(f"Model Performance (RMSE): {min(rmse, rf_rmse):,.2f}")
print("="*60)
print("\n✅ Model ready for predictions!")

spark.stop()


MODEL BUILDING COMPLETE
Best Model: Linear Regression
Training Samples: 243209
Test Samples: 60829
Features Used: DaysSinceStart, DayOfWeek, Month, TotalCases
Model Performance (RMSE): 1,230,476.77

✅ Model ready for predictions!
