In [1]:
import os

import pandas as pd
import numpy as np
from pathlib import Path

from datetime import datetime

from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Import the function from evaluation.py
from evaluation import test_model

# Setting

In [None]:
# Directory that holds the preprocessed data
data_file_path = Path("../data")

# Directory where the model results are saved
model_results_file_path = Path("./model_results")

# Name of the column the models are trained to predict
TARGET_VAR = "price_per_dozen"

# Estimators to compare, keyed by the display name stored in the results table
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
}

# Load the merged dataset; the first CSV column is used as the index.
# The path is joined with the Path `/` operator instead of string formatting.
dataset = pd.read_csv(data_file_path / "merged_data.csv", index_col=0)

In [None]:
# Replace the 'date' column (parsed with the "%Y-%m" format) by numeric
# year and month features.
#
# The cell is guarded so that re-running it after 'date' has already been
# dropped is a no-op instead of a KeyError, and it rebinds `dataset` rather
# than mutating it in place.
if "date" in dataset.columns:
    parsed_dates = pd.to_datetime(dataset["date"], format="%Y-%m")
    dataset = dataset.assign(
        year=parsed_dates.dt.year,
        month=parsed_dates.dt.month,
    ).drop(columns=["date"])
elif not {"year", "month"}.issubset(dataset.columns):
    # Neither the raw column nor the derived features are present:
    # fail loudly rather than continue with a frame missing its time features.
    raise KeyError("dataset has no 'date' column and no 'year'/'month' features")

In [24]:
# Preview the first five rows of the prepared frame
dataset.head(n=5)

Unnamed: 0,price_per_dozen,disaster_deaths_adjusted,disaster_cost_adjusted,disaster_cost_unadjusted,disaster_type_Flooding,disaster_type_Freeze,disaster_type_Severe Storm,disaster_type_Tropical Cyclone,disaster_type_Wildfire,disaster_type_Winter Storm,human_outbreaks_per_million,human_illnesses_per_million,covid_hospitalization_per_million,temp_overall,infected_flock_cnt,infected_bird_cnt,infected_h5n1_people_cnt,change_in_price_per_dozen,year,month
0,0.879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.16129,0.0,0.0,0.0,0.0,1980,1
1,0.774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.366667,0.0,0.0,0.0,-0.105,1980,2
2,0.812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.487097,0.0,0.0,0.0,0.038,1980,3
3,0.797,7.0,7.919502,6.562162,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.588889,0.0,0.0,0.0,-0.015,1980,4
4,0.737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.634409,0.0,0.0,0.0,-0.06,1980,5


# Regression

In [None]:
# Split the frame into the feature matrix (everything but the target) and the target.
# NOTE(review): X still contains `change_in_price_per_dozen`. In the preview rows it
# equals the month-over-month difference of `price_per_dozen`, i.e. it is derived from
# the target. This looks like target leakage; confirm whether it should be excluded.
X = dataset.drop(columns=[TARGET_VAR])
Y = dataset[[TARGET_VAR]]  # double brackets keep Y a one-column DataFrame, as before

# 5-fold cross-validation with a fixed seed so the fold assignment is reproducible.
# NOTE(review): shuffle=True mixes rows from different year/month values across folds;
# if the goal is forecasting, an ordered splitter such as TimeSeriesSplit may be more
# appropriate. Confirm the intended evaluation setup.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One entry per (fold, model) pair; converted to a DataFrame once at the end.
results_list = []

# enumerate(start=1) numbers the folds 1..5 without a hand-maintained counter.
for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = Y.iloc[train_index], Y.iloc[val_index]

    for name, model in models.items():
        # test_model is imported from evaluation.py (not shown here); presumably it
        # fits the model on the training split and scores it on the validation split.
        metrics = test_model(model, X_train, y_train, X_val, y_val)
        # Label the row with the display name from `models` and the fold number.
        metrics["Model"] = name
        metrics["Fold"] = fold
        results_list.append(metrics)

df_results = pd.DataFrame(results_list)  # one row per (fold, model)
df_results


LinearRegression on Enhanced Features:

Evaluation Metrics:
           Model  Mean Squared Error  Root Mean Squared Error  Mean Absolute Error  r-squared
LinearRegression            0.125719                 0.354569             0.262925   0.713899

Ridge on Enhanced Features:

Evaluation Metrics:
Model  Mean Squared Error  Root Mean Squared Error  Mean Absolute Error  r-squared
Ridge            0.125575                 0.354366              0.26317   0.714228

Lasso on Enhanced Features:

Evaluation Metrics:
Model  Mean Squared Error  Root Mean Squared Error  Mean Absolute Error  r-squared
Lasso            0.123502                 0.351429             0.262732   0.718945

LinearRegression on Enhanced Features:

Evaluation Metrics:
           Model  Mean Squared Error  Root Mean Squared Error  Mean Absolute Error  r-squared
LinearRegression            0.145268                  0.38114             0.277616   0.713194

Ridge on Enhanced Features:

Evaluation Metrics:
Model  Mean Squared 

In [None]:
# Make sure the output directory exists first: DataFrame.to_csv does not create
# missing parent directories and raises OSError when the folder is absent.
model_results_file_path.mkdir(parents=True, exist_ok=True)

# Persist the per-fold metrics (index included, as before).
df_results.to_csv(model_results_file_path / "linear_ridge_lasso_cv_result.csv")

Unnamed: 0,Model,Mean Squared Error,Root Mean Squared Error,Mean Absolute Error,r-squared,Fold
0,Linear Regression,0.125719,0.354569,0.262925,0.713899,1
1,Ridge Regression,0.125575,0.354366,0.26317,0.714228,1
2,Lasso Regression,0.123502,0.351429,0.262732,0.718945,1
3,Linear Regression,0.145268,0.38114,0.277616,0.713194,2
4,Ridge Regression,0.145915,0.381989,0.277966,0.711915,2
5,Lasso Regression,0.155037,0.393748,0.286054,0.693905,2
6,Linear Regression,0.103373,0.321516,0.237069,0.798266,3
7,Ridge Regression,0.103956,0.322421,0.237427,0.797128,3
8,Lasso Regression,0.109782,0.331334,0.243338,0.785758,3
9,Linear Regression,0.144962,0.380739,0.285029,0.725007,4
