# Hyper Parameter Tuning with XGBoost Regression on basic weather data

Based on the XGBoostV1 file, this will add some basic hyperparameter tuning to see if we can improve our model based on MAE.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# dfFootWeather = pd.read_csv("./data_files/FootTrafficWeather_July2022_Melbourne.csv")
dfFootWeather = pd.read_csv("./data_files/FT_Street_Melb_20130101_20220701.csv", parse_dates=["date"])
thisFileName = "11b.StreetXGboostTuning"

print(dfFootWeather.shape)
print(dfFootWeather.info())
dfFootWeather.head()

c:\Users\nelso\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\nelso\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


(24493, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24493 entries, 0 to 24492
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      24493 non-null  datetime64[ns]
 1   street                    24493 non-null  object        
 2   total_people              24493 non-null  int64         
 3   total_rain                23394 non-null  float64       
 4   rain_quality              23394 non-null  object        
 5   max_temp                  23429 non-null  float64       
 6   max_temp_quality          23422 non-null  object        
 7   min_temp                  23422 non-null  float64       
 8   min_temp_quality          23422 non-null  object        
 9   solar_exp                 24486 non-null  float64       
 10  WeekDay                   24493 non-null  int64         
 11  population_annual         24493 non-null  int64         
 12  popula

Unnamed: 0,date,street,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,population_change_annual
0,2022-07-31,Bourke Street Mall (North),15434,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78
1,2022-07-31,Spencer St-Collins St (North),12349,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78
2,2022-07-31,Southern Cross Station,1661,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78
3,2022-07-31,QV Market-Peel St,3203,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78
4,2022-07-31,Melbourne Central,23363,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78


In [2]:
dfFootWeather = dfFootWeather[dfFootWeather["total_rain"].notna()]
dfFootWeather = dfFootWeather[dfFootWeather["solar_exp"].notna()]

# assume missing quality is an N
dfFootWeather.loc[dfFootWeather["max_temp_quality"].isna(), "max_temp_quality"] = "N"

print(dfFootWeather.shape)
print(dfFootWeather.info())
dfFootWeather.head()

(23387, 13)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 23387 entries, 0 to 23428
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      23387 non-null  datetime64[ns]
 1   street                    23387 non-null  object        
 2   total_people              23387 non-null  int64         
 3   total_rain                23387 non-null  float64       
 4   rain_quality              23387 non-null  object        
 5   max_temp                  23387 non-null  float64       
 6   max_temp_quality          23387 non-null  object        
 7   min_temp                  23387 non-null  float64       
 8   min_temp_quality          23387 non-null  object        
 9   solar_exp                 23387 non-null  float64       
 10  WeekDay                   23387 non-null  int64         
 11  population_annual         23387 non-null  int64         
 12  popula

Unnamed: 0,date,street,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,population_change_annual
0,2022-07-31,Bourke Street Mall (North),15434,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78
1,2022-07-31,Spencer St-Collins St (North),12349,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78
2,2022-07-31,Southern Cross Station,1661,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78
3,2022-07-31,QV Market-Peel St,3203,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78
4,2022-07-31,Melbourne Central,23363,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78


### Feature Engineering

First, we need to convert any non-number columns into numbers that the model can understand. This first version isn't doing anything beyond that, later on we should probably look for any missing data flags, and maybe do some column Min/Max scaling or other.

Convert the 3 Quality Y/N columns into 1/0 values, use the shared utility function for future code reuse

In [3]:
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "rain_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "max_temp_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "min_temp_quality")
dfFootWeather.head()

Unnamed: 0,date,street,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,population_change_annual
0,2022-07-31,Bourke Street Mall (North),15434,0.0,0,14.7,1,4.3,1,4.8,6,5151000,1.78
1,2022-07-31,Spencer St-Collins St (North),12349,0.0,0,14.7,1,4.3,1,4.8,6,5151000,1.78
2,2022-07-31,Southern Cross Station,1661,0.0,0,14.7,1,4.3,1,4.8,6,5151000,1.78
3,2022-07-31,QV Market-Peel St,3203,0.0,0,14.7,1,4.3,1,4.8,6,5151000,1.78
4,2022-07-31,Melbourne Central,23363,0.0,0,14.7,1,4.3,1,4.8,6,5151000,1.78


In [4]:
print(type(dfFootWeather["date"].dtype))
print(dfFootWeather["date"].dtype == "object")
print(dfFootWeather["date"][0])

<class 'numpy.dtype[datetime64]'>
False
2022-07-31 00:00:00


In [5]:
dfFootWeather = dfutil.separateYmdCol(dfFootWeather, "date")

# Do one hot encoding on street
dfFootWeather = pd.get_dummies(data=dfFootWeather, columns=["street"])

In [6]:

print(dfFootWeather.info())
dfFootWeather.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23387 entries, 0 to 23428
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   total_people                          23387 non-null  int64  
 1   total_rain                            23387 non-null  float64
 2   rain_quality                          23387 non-null  int64  
 3   max_temp                              23387 non-null  float64
 4   max_temp_quality                      23387 non-null  int64  
 5   min_temp                              23387 non-null  float64
 6   min_temp_quality                      23387 non-null  int64  
 7   solar_exp                             23387 non-null  float64
 8   WeekDay                               23387 non-null  int64  
 9   population_annual                     23387 non-null  int64  
 10  population_change_annual              23387 non-null  float64
 11  date_year      

Unnamed: 0,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,...,date_year,date_month,date_day,street_Bourke Street Mall (North),street_Chinatown-Swanston St (North),street_Collins Place (North),street_Melbourne Central,street_QV Market-Peel St,street_Southern Cross Station,street_Spencer St-Collins St (North)
0,15434,0.0,0,14.7,1,4.3,1,4.8,6,5151000,...,2022,7,31,1,0,0,0,0,0,0
1,12349,0.0,0,14.7,1,4.3,1,4.8,6,5151000,...,2022,7,31,0,0,0,0,0,0,1
2,1661,0.0,0,14.7,1,4.3,1,4.8,6,5151000,...,2022,7,31,0,0,0,0,0,1,0
3,3203,0.0,0,14.7,1,4.3,1,4.8,6,5151000,...,2022,7,31,0,0,0,0,1,0,0
4,23363,0.0,0,14.7,1,4.3,1,4.8,6,5151000,...,2022,7,31,0,0,0,1,0,0,0
5,1410,0.0,0,14.7,1,4.3,1,4.8,6,5151000,...,2022,7,31,0,0,1,0,0,0,0
6,11123,0.0,0,14.7,1,4.3,1,4.8,6,5151000,...,2022,7,31,0,1,0,0,0,0,0
7,15937,0.0,0,13.0,1,2.1,1,11.3,5,5151000,...,2022,7,30,0,0,0,0,0,0,1
8,2540,0.0,0,13.0,1,2.1,1,11.3,5,5151000,...,2022,7,30,0,0,0,0,0,1,0
9,4457,0.0,0,13.0,1,2.1,1,11.3,5,5151000,...,2022,7,30,0,0,0,0,1,0,0


With Hyperparameter Tuning, we're not doing K-Folds, instead, we'll use Gridsearch CV to try different permutations of parameters

In [7]:
# Split the columns between the target and features
targetColName = "total_people"
col_names = dfFootWeather.columns
feature_cols = col_names.drop([targetColName])
trainFeatures = dfFootWeather[feature_cols]
trainTargets = dfFootWeather[targetColName]


In [8]:
# Configure basic XGBoost Regression object and the grid of possible parameter settings to test
randomSeed = databasic.get_random_seed()
model = xgb.XGBRegressor(objective="reg:squarederror", booster="gbtree", seed=randomSeed)
gbm_param_grid = { 'colsample_bytree': [0.3, 0.7], 'n_estimators': [10, 100, 200, 500, 1000], 'max_depth': [2, 4, 6, 8, 10], "eta": [0.3, 0.1, 0.01] }

x_train, x_vali, y_train, y_vali = train_test_split(trainFeatures, trainTargets, test_size=0.2, random_state=databasic.get_random_seed())

tuned_mse = GridSearchCV(param_grid=gbm_param_grid, estimator=model, scoring="neg_mean_absolute_error", cv=5, verbose=1)
tuned_mse.fit(x_train, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", tuned_mse.best_params_)
print("Lowest MAE found: ", tuned_mse.best_score_)

Fitting 5 folds for each of 150 candidates, totalling 750 fits
Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.01, 'max_depth': 10, 'n_estimators': 1000}
Lowest MAE found:  -1536.2282090195527


In [9]:
# Print the best parameters and lowest RMSE
print("Best parameters found: ", tuned_mse.best_params_)
print("Lowest MAE found: ", tuned_mse.best_score_)

Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.01, 'max_depth': 10, 'n_estimators': 1000}
Lowest MAE found:  -1536.2282090195527


### Run 1
- Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.01, 'max_depth': 10, 'n_estimators': 1000}
- Lowest MAE found:  -1536.2282090195527
- Time to run: 25m 42s
