# Hyper Parameter Tuning with XGBoost Regression on basic weather data

Based on the XGBoostV1 file, this will add some basic hyperparameter tuning to see if we can improve our model based on MAE.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# dfFootWeather = pd.read_csv("./data_files/FootTrafficWeather_July2022_Melbourne.csv")
dfFootWeather = pd.read_csv("./data_files/FootTrafficWeatherMelb_20130101_20220701.csv", parse_dates=["date"])
thisFileName = "07b.RegressionXGboostTuning"

print(dfFootWeather.shape)
print(dfFootWeather.info())
dfFootWeather.head()

(3103, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3103 entries, 0 to 3102
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              3103 non-null   datetime64[ns]
 1   total_people      3103 non-null   float64       
 2   total_rain        3098 non-null   float64       
 3   rain_quality      3098 non-null   object        
 4   max_temp          3102 non-null   float64       
 5   max_temp_quality  3101 non-null   object        
 6   min_temp          3102 non-null   float64       
 7   min_temp_quality  3102 non-null   object        
 8   solar_exp         3102 non-null   float64       
 9   WeekDay           3103 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1), object(3)
memory usage: 242.5+ KB
None


Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,N,14.7,Y,4.3,Y,4.8,6
1,2022-07-30,462115.0,0.0,N,13.0,Y,2.1,Y,11.3,5
2,2022-07-29,405511.0,1.0,N,12.7,Y,6.5,Y,11.2,4
3,2022-07-28,334858.0,1.0,N,13.2,Y,9.3,Y,9.3,3
4,2022-07-27,340569.0,3.0,N,15.3,Y,9.3,Y,7.7,2


In [2]:
dfFootWeather = dfFootWeather[dfFootWeather["total_rain"].notna()]
dfFootWeather = dfFootWeather[dfFootWeather["solar_exp"].notna()]

# assume missing quality is an N
dfFootWeather.loc[dfFootWeather["max_temp_quality"].isna(), "max_temp_quality"] = "N"

print(dfFootWeather.shape)
print(dfFootWeather.info())
dfFootWeather.head()

(3097, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3097 entries, 0 to 3102
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              3097 non-null   datetime64[ns]
 1   total_people      3097 non-null   float64       
 2   total_rain        3097 non-null   float64       
 3   rain_quality      3097 non-null   object        
 4   max_temp          3097 non-null   float64       
 5   max_temp_quality  3097 non-null   object        
 6   min_temp          3097 non-null   float64       
 7   min_temp_quality  3097 non-null   object        
 8   solar_exp         3097 non-null   float64       
 9   WeekDay           3097 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1), object(3)
memory usage: 266.1+ KB
None


Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,N,14.7,Y,4.3,Y,4.8,6
1,2022-07-30,462115.0,0.0,N,13.0,Y,2.1,Y,11.3,5
2,2022-07-29,405511.0,1.0,N,12.7,Y,6.5,Y,11.2,4
3,2022-07-28,334858.0,1.0,N,13.2,Y,9.3,Y,9.3,3
4,2022-07-27,340569.0,3.0,N,15.3,Y,9.3,Y,7.7,2


### Feature Engineering

First, we need to convert any non-number columns into numbers that the model can understand. This first version isn't doing anything beyond that, later on we should probably look for any missing data flags, and maybe do some column Min/Max scaling or other.

Convert the 3 Quality Y/N columns into 1/0 values, use the shared utility function for future code reuse

In [3]:
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "rain_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "max_temp_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "min_temp_quality")
dfFootWeather.head()

Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,0,14.7,1,4.3,1,4.8,6
1,2022-07-30,462115.0,0.0,0,13.0,1,2.1,1,11.3,5
2,2022-07-29,405511.0,1.0,0,12.7,1,6.5,1,11.2,4
3,2022-07-28,334858.0,1.0,0,13.2,1,9.3,1,9.3,3
4,2022-07-27,340569.0,3.0,0,15.3,1,9.3,1,7.7,2


In [4]:
print(type(dfFootWeather["date"].dtype))
print(dfFootWeather["date"].dtype == "object")
print(dfFootWeather["date"][0])

<class 'numpy.dtype[datetime64]'>
False
2022-07-31 00:00:00


In [5]:
dfFootWeather = dfutil.separateYmdCol(dfFootWeather, "date")
print(dfFootWeather.info())
dfFootWeather.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3097 entries, 0 to 3102
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_people      3097 non-null   float64
 1   total_rain        3097 non-null   float64
 2   rain_quality      3097 non-null   int64  
 3   max_temp          3097 non-null   float64
 4   max_temp_quality  3097 non-null   int64  
 5   min_temp          3097 non-null   float64
 6   min_temp_quality  3097 non-null   int64  
 7   solar_exp         3097 non-null   float64
 8   WeekDay           3097 non-null   int64  
 9   date_year         3097 non-null   int64  
 10  date_month        3097 non-null   int64  
 11  date_day          3097 non-null   int64  
dtypes: float64(5), int64(7)
memory usage: 379.1 KB
None


Unnamed: 0,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,date_year,date_month,date_day
0,327383.0,0.0,0,14.7,1,4.3,1,4.8,6,2022,7,31
1,462115.0,0.0,0,13.0,1,2.1,1,11.3,5,2022,7,30
2,405511.0,1.0,0,12.7,1,6.5,1,11.2,4,2022,7,29
3,334858.0,1.0,0,13.2,1,9.3,1,9.3,3,2022,7,28
4,340569.0,3.0,0,15.3,1,9.3,1,7.7,2,2022,7,27
5,316316.0,4.4,0,13.2,1,8.8,1,6.4,1,2022,7,26
6,274106.0,0.0,0,16.8,1,8.1,1,5.1,0,2022,7,25
7,406977.0,7.8,0,19.3,1,10.4,1,10.1,6,2022,7,24
8,371336.0,1.2,0,14.5,1,8.1,1,5.1,5,2022,7,23
9,416838.0,0.0,0,19.0,1,3.4,1,9.9,4,2022,7,22


With Hyperparameter Tuning, we're not doing K-Folds, instead, we'll use Gridsearch CV to try different permutations of parameters

In [6]:
# Split the columns between the target and features
targetColName = "total_people"
col_names = dfFootWeather.columns
feature_cols = col_names.drop([targetColName])
trainFeatures = dfFootWeather[feature_cols]
trainTargets = dfFootWeather[targetColName]


In [7]:
# Configure basic XGBoost Regression object and the grid of possible parameter settings to test
randomSeed = databasic.get_random_seed()
model = xgb.XGBRegressor(objective="reg:squarederror", booster="gbtree", seed=randomSeed)
gbm_param_grid = { 'colsample_bytree': [0.3, 0.7], 'n_estimators': [10, 100, 200, 500, 1000], 'max_depth': [2, 4, 6, 8, 10], "eta": [0.3, 0.1, 0.01] }

x_train, x_vali, y_train, y_vali = train_test_split(trainFeatures, trainTargets, test_size=0.2, random_state=databasic.get_random_seed())

tuned_mse = GridSearchCV(param_grid=gbm_param_grid, estimator=model, scoring="neg_mean_absolute_error", cv=5, verbose=1)
tuned_mse.fit(x_train, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", tuned_mse.best_params_)
print("Lowest MAE found: ", tuned_mse.best_score_)

Fitting 5 folds for each of 150 candidates, totalling 750 fits
Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.1, 'max_depth': 6, 'n_estimators': 200}
Lowest MAE found:  -43476.65617123382


### Run 1
- Fitting 5 folds for each of 150 candidates, totalling 750 fits
- Time to run: 8m 2.1s
- Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.1, 'max_depth': 6, 'n_estimators': 200}
- Lowest MAE found:  -43476.65617123382