In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Settings

In [None]:
# Directory containing the preprocessed input data
data_file_path = Path("../data")

# Directory where model results are saved
model_results_file_path = Path("./model_results")

# Name of the column the regressor is trained to predict
TARGET_VAR = "price_per_dozen"

# Base estimator; the hyperparameters to try come from the grid below
xgb1 = XGBRegressor()

# Hyperparameter grid for the grid search. Every value is a list of
# candidates; only `max_depth` lists more than one, so the search evaluates
# four parameter combinations.
parameters = {
    'objective': ['reg:squarederror'],
    'learning_rate': [0.1],  # the so-called `eta` value
    'max_depth': [4, 5, 6, 7],
    'min_child_weight': [4],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [1000],
}

# Load the merged dataset, using the first CSV column as the index.
# The path is joined with the pathlib `/` operator rather than by formatting
# the Path object into a string.
dataset = pd.read_csv(data_file_path / 'merged_data.csv', index_col=0)

In [None]:
# NOTE(review): this re-reads the same CSV that the settings cell already
# loads into `dataset`. The original note here said "change the merged egg
# price" -- presumably a reminder to point this at an updated file; confirm,
# or drop this cell if the reload is not needed.
dataset = pd.read_csv(data_file_path / 'merged_data.csv', index_col=0)

In [None]:
# Display the column labels of the loaded dataset (cell output)
dataset.columns

In [None]:
# Feature matrix: an independent copy of the predictor columns
X = dataset[[
    'disaster_cost_adjusted',
    'human_outbreaks_per_million',
    'covid_hospitalization_per_million',
    'infected_flock_cnt',
    'infected_h5n1_people_cnt',
    'gas_price_per_gallon',
    'temp_variance',
]].copy()

# Target: a label slice that starts and ends at TARGET_VAR, which keeps the
# result a one-column DataFrame rather than a Series
Y = dataset.loc[:, slice(TARGET_VAR, TARGET_VAR)]

# Regression

In [None]:
# Initialize the cross-validated grid search.
# Three metrics are recorded for every candidate; "R-squared" is the one used
# to select and refit the best estimator. make_scorer is called with its
# defaults, so MSE and MAE are reported as plain (positive) error values.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R-squared", r2_score),
    )
}

xgb_grid = GridSearchCV(
    estimator=xgb1,
    param_grid=parameters,
    scoring=scoring,
    refit="R-squared",
    cv=5,
    n_jobs=None,
    verbose=True,
)

In [None]:
# Run the grid search on the feature matrix and target
xgb_grid.fit(X, Y)

# Best mean cross-validated score for the refit metric, and the parameter
# combination that produced it
best_score, best_params = xgb_grid.best_score_, xgb_grid.best_params_

In [None]:
# Report the best score and its parameters, one per line
print(best_score, best_params, sep="\n")

In [None]:
# Keep the full cross-validation results of the fitted search for plotting below
results = xgb_grid.cv_results_

In [None]:
# Plot the mean cross-validated MSE and MAE for every candidate in the search.
fig, ax = plt.subplots(figsize=(13, 13))
ax.set_title("GridSearchCV evaluating using multiple scorers simultaneously", fontsize=16)

# `max_depth` is the only hyperparameter with more than one candidate in the
# grid, so it is the quantity shown on the x-axis.
ax.set_xlabel("max_depth")
ax.set_ylabel("Score")

# Mean test score per candidate, copied into plain float arrays
Y_axis_mse = np.array(results["mean_test_MSE"], dtype=float)
Y_axis_mae = np.array(results["mean_test_MAE"], dtype=float)
# R-squared is extracted as well but is not drawn on this figure
Y_axis_r2 = np.array(results["mean_test_R-squared"], dtype=float)

# Read the x values from the search results instead of hard-coding them, so
# they always line up with the score arrays. results["params"] holds one
# dict per candidate, in the same order as the mean_test_* arrays.
# NOTE(review): a single line per metric is only meaningful while `max_depth`
# is the sole varying hyperparameter.
X_axis = [candidate["max_depth"] for candidate in results["params"]]

ax.plot(X_axis, Y_axis_mse, label='MSE')
ax.plot(X_axis, Y_axis_mae, label='MAE')

# One tick per evaluated depth (avoids fractional tick labels)
ax.set_xticks(X_axis)

ax.legend(loc="best")
ax.grid(False)
plt.show()