In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
# Load the merged dataset; the first CSV column is used as the row index.
data = pd.read_csv(filepath_or_buffer='../data/merged_data.csv', index_col=0)
# Preview the first five rows.
data.head(5)

Unnamed: 0,price_per_dozen,change_in_price_per_dozen,disaster_deaths_adjusted,disaster_cost_adjusted,human_outbreaks_per_million,human_illnesses_per_million,covid_hospitalization_per_million,infected_flock_cnt,infected_bird_cnt,infected_h5n1_people_cnt,temp_variance,gas_price_per_gallon
0,0.879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.605921,1.11
1,0.774,-0.105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.400545,1.186
2,0.812,0.038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.771467,1.23
3,0.797,-0.015,7.0,7.919502,0.0,0.0,0.0,0.0,0.0,0.0,1.669675,1.242
4,0.737,-0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.375845,1.244


In [3]:
# Predictor matrix: an independent copy of the selected columns of `data`.
X = data.loc[:, [
    'disaster_cost_adjusted',
    'human_outbreaks_per_million',
    'covid_hospitalization_per_million',
    'infected_flock_cnt',
    'infected_h5n1_people_cnt',
    'gas_price_per_gallon',
    'temp_variance',
]].copy()
# Target, kept as a single-column DataFrame (flattened to 1-D where it is fit).
Y = data.loc[:, ['price_per_dozen']]
# Show the head of both frames side by side as a tuple.
(X.head(), Y.head())

(   disaster_cost_adjusted  human_outbreaks_per_million  \
 0                0.000000                          0.0   
 1                0.000000                          0.0   
 2                0.000000                          0.0   
 3                7.919502                          0.0   
 4                0.000000                          0.0   
 
    covid_hospitalization_per_million  infected_flock_cnt  \
 0                                0.0                 0.0   
 1                                0.0                 0.0   
 2                                0.0                 0.0   
 3                                0.0                 0.0   
 4                                0.0                 0.0   
 
    infected_h5n1_people_cnt  gas_price_per_gallon  temp_variance  
 0                       0.0                 1.110       2.605921  
 1                       0.0                 1.186       5.400545  
 2                       0.0                 1.230      16.771467  
 3  

In [4]:
# Standardize every feature column of X.
# NOTE(review): the scaler is fit on all rows of X, so any cross-validation run
# on X_scaled sees scaling statistics computed from its own held-out folds.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [5]:
# Registry of estimators to evaluate, keyed by a display name.
models = dict(SVR=SVR())

In [6]:
# Hyperparameter search space per model, looked up by the model's name.
# 5 values of C x 6 of gamma x 5 of epsilon = 150 candidates for SVR.
param_grids = dict(
    SVR=dict(
        C=[0.001, 0.1, 1, 10, 100],
        gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1],
        epsilon=[0.001, 0.01, 0.1, 0.5, 1],
    ),
)

In [7]:
# Scorers handed to the grid search. The error metrics are wrapped with
# greater_is_better=False, so their scores are negated (higher is better).
scoring = dict(
    MSE=make_scorer(mean_squared_error, greater_is_better=False),
    MAE=make_scorer(mean_absolute_error, greater_is_better=False),
    R2=make_scorer(r2_score, greater_is_better=True),
)

In [8]:
# Tune (where a grid exists) and cross-validate every model in `models`,
# collecting one summary row per model in `results_list`.
#
# NOTE(review): the reported scores are computed on the same data, with the
# same cv=5 setting, that was used to pick the hyperparameters, and X_scaled
# was standardized on the full dataset beforehand. Both make the figures
# optimistic; use nested CV and an in-fold scaler for an unbiased estimate.
results_list = []

# scikit-learn expects a 1-D target; flatten the single-column frame once.
y_flat = Y.values.ravel()

# Metrics for the final report, all evaluated in a single cross-validation pass.
cv_metrics = ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"]

for name, model in models.items():
    if name in param_grids:
        # Exhaustive search over the model's grid; the candidate with the best
        # mean R2 is refit on all rows and exposed as best_estimator_.
        grid_search = GridSearchCV(
            model, param_grids[name],
            cv=5, scoring=scoring, refit="R2", n_jobs=-1, verbose=True
        )
        grid_search.fit(X_scaled, y_flat)

        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_
    else:
        # No grid registered for this model: use it with its current settings.
        model.fit(X_scaled, y_flat)
        best_params = "N/A"
        best_model = model

    # One cross-validation run yields all three metrics for the same fits,
    # instead of refitting the estimator separately for each metric.
    cv_scores = cross_validate(best_model, X_scaled, y_flat, cv=5, scoring=cv_metrics)
    r2_scores = cv_scores["test_r2"]
    # The "neg_*" scorers return negated errors; flip the sign back.
    mse_scores = -cv_scores["test_neg_mean_squared_error"]
    mae_scores = -cv_scores["test_neg_mean_absolute_error"]

    # 'RMSE' is the square root of the mean fold MSE, while 'RMSE Std' is the
    # spread of the per-fold RMSE values; the two use different aggregations.
    results_list.append({
        'Model': name,
        'Best Params': best_params,
        'MAE': np.mean(mae_scores),
        'MAE Std': np.std(mae_scores),
        'RMSE': np.sqrt(np.mean(mse_scores)),
        'RMSE Std': np.std(np.sqrt(mse_scores)),
        'MSE': np.mean(mse_scores),
        'MSE Std': np.std(mse_scores),
        'R2': np.mean(r2_scores),
        'R2 Std': np.std(r2_scores)
    })

Fitting 5 folds for each of 150 candidates, totalling 750 fits


In [9]:
# Turn the per-model summary dicts into a table (one row per model) and show it.
df_results = pd.DataFrame(data=results_list)
df_results

Unnamed: 0,Model,Best Params,MAE,MAE Std,RMSE,RMSE Std,MSE,MSE Std,R2,R2 Std
0,SVR,"{'C': 10, 'epsilon': 0.1, 'gamma': 0.001}",0.233039,0.152707,0.446443,0.277006,0.199312,0.276633,-0.452878,0.384302


In [10]:
# Persist the cross-validation summary table as a CSV file.
# `Path` is imported in the notebook's top import cell.
model_results_file_path = Path("./model_results")
# Create the output directory if needed; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
model_results_file_path.mkdir(parents=True, exist_ok=True)
df_results.to_csv(model_results_file_path / "svm_cv_result.csv")

In [11]:
# C = 10: The regularization parameter. A larger C places a higher penalty on errors that fall outside the epsilon-tube (the regression analogue of misclassification in a classifier), so the model tries to fit the training data more closely.
# A value of C = 10 implies the model has a stronger emphasis on reducing training error, which may lead to less bias but possibly more variance (risk of overfitting).
# epsilon = 0.1: The epsilon parameter controls the width of the margin where no penalty is applied to errors. 
# A value of 0.1 suggests that the model allows some flexibility in the errors without penalizing them. A moderate value like this is often a good starting point to strike a balance between fitting the data closely and maintaining a smooth model.
# gamma = 0.001: The gamma parameter of the RBF kernel (the default kernel of SVR) controls how far the influence of each individual training point reaches.
# A small gamma value like 0.001 means that each training point has a broad influence on the fitted regression function, leading to a smoother function and preventing the model from reacting too strongly to small changes in the data, which can help with generalization and avoiding overfitting.

In [None]:
# MAE of 0.233 indicates that, on average, the model is off by about 0.23 units from the true values.
# The RMSE of 0.45 shows the average magnitude of errors, with larger errors penalized more, and the MSE confirms that the model's errors are quite substantial.
# R² of -0.45 indicates that the model is performing poorly and is unable to explain the variance in the target variable better than a naive model.
# The relatively high standard deviations in the metrics (especially in R² and RMSE) suggest that the model's performance is inconsistent across the different folds of the cross-validation.
# Since the R² is negative, the model is not explaining the variance well and does worse than a model that always predicts the mean. This could be due to various factors, such as:
# - The feature set may not be a good match for the target variable.
# - SVR might not be the best model for this problem.