In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_validate

# Settings

In [4]:
# Name of the column the regressors are trained to predict
TARGET_VAR = "price_per_dozen"

# Directory that holds the preprocessed input data
data_file_path = Path("../data")

# Directory the model evaluation results are written to
model_results_file_path = Path("./model_results")

# Initialize models: candidate estimators keyed by the short name that is
# reported in the 'Model' column of the results table.
models = {'xgb1': XGBRegressor()}

# Hyper-parameter search space per model name. A model whose name is missing
# from this mapping is fitted directly, without a grid search.
param_grids = {
    'xgb1': {
        'objective': ['reg:squarederror'],
        'learning_rate': [0.1],  # so called `eta` value
        # NOTE(review): range(10) yields 0..9. XGBoost documents max_depth=0 as
        # "no limit on depth" (and as unsupported by the exact tree method), so
        # the first candidate is not a depth-0 tree - confirm that is intended
        # or switch to range(1, 10).
        'max_depth': range(10),
        'min_child_weight': [4],
        'subsample': [0.7],
        'colsample_bytree': [0.7],
        'n_estimators': [1000],
    },
}

# Kept for backward compatibility: a standalone copy of the 'xgb1' grid.
# It is derived from `param_grids` so the two definitions cannot drift apart.
parameters = dict(param_grids['xgb1'])

# Load Dataset: read the merged CSV, using its first column as the row index
merged_data_csv = f'{data_file_path}/merged_data.csv'
dataset = pd.read_csv(merged_data_csv, index_col=0)

# Regression

## Try with all features

In [5]:
# Predictors: the full set of candidate feature columns.
feature_columns = [
    'disaster_cost_adjusted',
    'human_outbreaks_per_million',
    'covid_hospitalization_per_million',
    'infected_flock_cnt',
    'infected_h5n1_people_cnt',
    'gas_price_per_gallon',
    'temp_variance',
]
X = dataset[feature_columns].copy()

# Target as a 1-D Series of shape (n_samples,). A one-column DataFrame also
# works with XGBoost, but makes several scikit-learn regressors emit a
# DataConversionWarning if they are ever added to `models`.
Y = dataset[TARGET_VAR]

# List to store all results (one dict per model)
results_list = []

# Multi-metric scoring. The error metrics are registered with
# greater_is_better=False, so scikit-learn reports them negated; the signs are
# flipped back when the results row is assembled below.
scoring = {
    "MSE": make_scorer(mean_squared_error, greater_is_better=False),
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
    "R2": make_scorer(r2_score)
}

# Loop through models
for name, model in models.items():
    if name in param_grids:  # Apply GridSearchCV if hyperparameters exist
        grid_search = GridSearchCV(
            model, param_grids[name],
            cv=5, scoring=scoring, refit="R2", n_jobs=-1, verbose=1
        )
        grid_search.fit(X, Y)

        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_

        # The search has already scored the winning configuration on each of
        # the 5 folds, so read those per-fold scores from cv_results_ instead
        # of refitting the same configuration on the same folds again.
        # NOTE: these scores belong to the configuration that was *selected*
        # on these very folds, so they are optimistically biased; an unbiased
        # estimate would need nested cross-validation.
        best_index = grid_search.best_index_
        scores = {
            f"test_{metric}": np.array([
                grid_search.cv_results_[f"split{fold}_test_{metric}"][best_index]
                for fold in range(grid_search.n_splits_)
            ])
            for metric in scoring
        }
    else:
        model.fit(X, Y)  # Train model directly if no hyperparameters
        best_params = "N/A"
        best_model = model

        # No search was run, so score the model with plain 5-fold CV
        scores = cross_validate(best_model, X, Y, cv=5, scoring=scoring)

    # Store results (standard deviations are unaffected by the sign flip)
    results_list.append({
        'Model': name,
        'Best Params': best_params,
        'MAE': -np.mean(scores["test_MAE"]),
        'MAE Std': np.std(scores["test_MAE"]),
        'RMSE': np.sqrt(-np.mean(scores["test_MSE"])),
        'RMSE Std': np.std(np.sqrt(-scores["test_MSE"])),
        'MSE': -np.mean(scores["test_MSE"]),
        'MSE Std': np.std(scores["test_MSE"]),
        'R2': np.mean(scores["test_R2"]),
        'R2 Std': np.std(scores["test_R2"])
    })

# Convert results to a DataFrame
df_results = pd.DataFrame(results_list)
df_results.head()

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0,Model,Best Params,MAE,MAE Std,RMSE,RMSE Std,MSE,MSE Std,R2,R2 Std
0,xgb1,"{'colsample_bytree': 0.7, 'learning_rate': 0.1...",0.330155,0.185305,0.511999,0.276887,0.262143,0.309567,-1.805337,1.342354


In [6]:
# to_csv does not create missing parent folders, so make sure the results
# directory exists before writing (a no-op when it is already there).
model_results_file_path.mkdir(parents=True, exist_ok=True)
df_results.to_csv(model_results_file_path / 'xgboost_result.csv')

## Try with some non-zero features

In [7]:
# Predictors: reduced feature set for this experiment.
feature_columns = [
    'disaster_cost_adjusted',
    'human_outbreaks_per_million',
    'infected_h5n1_people_cnt',
    'gas_price_per_gallon',
    'temp_variance',
]
X = dataset[feature_columns].copy()

# Target as a 1-D Series of shape (n_samples,). A one-column DataFrame also
# works with XGBoost, but makes several scikit-learn regressors emit a
# DataConversionWarning if they are ever added to `models`.
Y = dataset[TARGET_VAR]

# List to store all results (one dict per model)
results_list = []

# Multi-metric scoring. The error metrics are registered with
# greater_is_better=False, so scikit-learn reports them negated; the signs are
# flipped back when the results row is assembled below.
scoring = {
    "MSE": make_scorer(mean_squared_error, greater_is_better=False),
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
    "R2": make_scorer(r2_score)
}

# Loop through models
for name, model in models.items():
    if name in param_grids:  # Apply GridSearchCV if hyperparameters exist
        grid_search = GridSearchCV(
            model, param_grids[name],
            cv=5, scoring=scoring, refit="R2", n_jobs=-1, verbose=1
        )
        grid_search.fit(X, Y)

        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_

        # The search has already scored the winning configuration on each of
        # the 5 folds, so read those per-fold scores from cv_results_ instead
        # of refitting the same configuration on the same folds again.
        # NOTE: these scores belong to the configuration that was *selected*
        # on these very folds, so they are optimistically biased; an unbiased
        # estimate would need nested cross-validation.
        best_index = grid_search.best_index_
        scores = {
            f"test_{metric}": np.array([
                grid_search.cv_results_[f"split{fold}_test_{metric}"][best_index]
                for fold in range(grid_search.n_splits_)
            ])
            for metric in scoring
        }
    else:
        model.fit(X, Y)  # Train model directly if no hyperparameters
        best_params = "N/A"
        best_model = model

        # No search was run, so score the model with plain 5-fold CV
        scores = cross_validate(best_model, X, Y, cv=5, scoring=scoring)

    # Store results (standard deviations are unaffected by the sign flip)
    results_list.append({
        'Model': name,
        'Best Params': best_params,
        'MAE': -np.mean(scores["test_MAE"]),
        'MAE Std': np.std(scores["test_MAE"]),
        'RMSE': np.sqrt(-np.mean(scores["test_MSE"])),
        'RMSE Std': np.std(np.sqrt(-scores["test_MSE"])),
        'MSE': -np.mean(scores["test_MSE"]),
        'MSE Std': np.std(scores["test_MSE"]),
        'R2': np.mean(scores["test_R2"]),
        'R2 Std': np.std(scores["test_R2"])
    })

# Convert results to a DataFrame
df_results = pd.DataFrame(results_list)
df_results.head()

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0,Model,Best Params,MAE,MAE Std,RMSE,RMSE Std,MSE,MSE Std,R2,R2 Std
0,xgb1,"{'colsample_bytree': 0.7, 'learning_rate': 0.1...",0.393293,0.220159,0.572035,0.289965,0.327224,0.316968,-2.776632,1.68918


In [None]:
# to_csv does not create missing parent folders, so make sure the results
# directory exists before writing (a no-op when it is already there).
model_results_file_path.mkdir(parents=True, exist_ok=True)
df_results.to_csv(model_results_file_path / 'xgb_cv_result.csv')