# XGBoost Regression - Using V3 Data File

Do the XGBoost with the data file that includes lockdowns, holidays and retail data

In [33]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# dfFootWeather = pd.read_csv("./data_files/FootTrafficWeather_July2022_Melbourne.csv")
dfFootWeather = pd.read_csv("./data_files/FootTrafficWeatherMelb2_20130101_20220701_v3.csv", parse_dates=["date"])
thisFileName = "09b.RegressionXGboostV3"

print(dfFootWeather.shape)
print(dfFootWeather.info())
dfFootWeather.head()

(3499, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3499 entries, 0 to 3498
Data columns (total 17 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   date                                        3499 non-null   datetime64[ns]
 1   total_people                                3499 non-null   float64       
 2   total_rain                                  3342 non-null   float64       
 3   rain_quality                                3342 non-null   object        
 4   max_temp                                    3347 non-null   float64       
 5   max_temp_quality                            3346 non-null   object        
 6   min_temp                                    3346 non-null   float64       
 7   min_temp_quality                            3346 non-null   object        
 8   solar_exp                                   3498 non-null   float64       
 9

Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,population_change_annual,is_holiday,is_lockdown,OfflineRetail_Original_Turnover,OfflineRetail_Seasonally_Adjusted_Turnover,OfflineRetail_Trend_Turnover
0,2022-07-31,327383.0,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78,,,8562.7,8947.3,
1,2022-07-30,462115.0,0.0,N,13.0,Y,2.1,Y,11.3,5,5151000,1.78,,,8562.7,8947.3,
2,2022-07-29,405511.0,1.0,N,12.7,Y,6.5,Y,11.2,4,5151000,1.78,,,8562.7,8947.3,
3,2022-07-28,334858.0,1.0,N,13.2,Y,9.3,Y,9.3,3,5151000,1.78,,,8562.7,8947.3,
4,2022-07-27,340569.0,3.0,N,15.3,Y,9.3,Y,7.7,2,5151000,1.78,,,8562.7,8947.3,


# Data Load and Preparation Stage


In [34]:
dfFootWeather = dfFootWeather[dfFootWeather["total_rain"].notna()]
dfFootWeather = dfFootWeather[dfFootWeather["solar_exp"].notna()]

# assume missing quality is an N
dfFootWeather.loc[dfFootWeather["max_temp_quality"].isna(), "max_temp_quality"] = "N"

print(dfFootWeather.shape)
print(dfFootWeather.info())
dfFootWeather.head()

(3341, 17)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3341 entries, 0 to 3346
Data columns (total 17 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   date                                        3341 non-null   datetime64[ns]
 1   total_people                                3341 non-null   float64       
 2   total_rain                                  3341 non-null   float64       
 3   rain_quality                                3341 non-null   object        
 4   max_temp                                    3341 non-null   float64       
 5   max_temp_quality                            3341 non-null   object        
 6   min_temp                                    3341 non-null   float64       
 7   min_temp_quality                            3341 non-null   object        
 8   solar_exp                                   3341 non-null   float64       
 9

Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,population_change_annual,is_holiday,is_lockdown,OfflineRetail_Original_Turnover,OfflineRetail_Seasonally_Adjusted_Turnover,OfflineRetail_Trend_Turnover
0,2022-07-31,327383.0,0.0,N,14.7,Y,4.3,Y,4.8,6,5151000,1.78,,,8562.7,8947.3,
1,2022-07-30,462115.0,0.0,N,13.0,Y,2.1,Y,11.3,5,5151000,1.78,,,8562.7,8947.3,
2,2022-07-29,405511.0,1.0,N,12.7,Y,6.5,Y,11.2,4,5151000,1.78,,,8562.7,8947.3,
3,2022-07-28,334858.0,1.0,N,13.2,Y,9.3,Y,9.3,3,5151000,1.78,,,8562.7,8947.3,
4,2022-07-27,340569.0,3.0,N,15.3,Y,9.3,Y,7.7,2,5151000,1.78,,,8562.7,8947.3,


### Feature Engineering

First, we need to convert any non-number columns into numbers that the model can understand. This first version isn't doing anything beyond that, later on we should probably look for any missing data flags, and maybe do some column Min/Max scaling or other.

Convert the 3 Quality Y/N columns into 1/0 values, use the shared utility function for future code reuse

In [35]:
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "rain_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "max_temp_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "min_temp_quality")
dfFootWeather.head()

Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,population_change_annual,is_holiday,is_lockdown,OfflineRetail_Original_Turnover,OfflineRetail_Seasonally_Adjusted_Turnover,OfflineRetail_Trend_Turnover
0,2022-07-31,327383.0,0.0,0,14.7,1,4.3,1,4.8,6,5151000,1.78,,,8562.7,8947.3,
1,2022-07-30,462115.0,0.0,0,13.0,1,2.1,1,11.3,5,5151000,1.78,,,8562.7,8947.3,
2,2022-07-29,405511.0,1.0,0,12.7,1,6.5,1,11.2,4,5151000,1.78,,,8562.7,8947.3,
3,2022-07-28,334858.0,1.0,0,13.2,1,9.3,1,9.3,3,5151000,1.78,,,8562.7,8947.3,
4,2022-07-27,340569.0,3.0,0,15.3,1,9.3,1,7.7,2,5151000,1.78,,,8562.7,8947.3,


In [36]:
print(type(dfFootWeather["date"].dtype))
print(dfFootWeather["date"].dtype == "object")
print(dfFootWeather["date"][0])
print(dfFootWeather.info())

<class 'numpy.dtype[datetime64]'>
False
2022-07-31 00:00:00
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3341 entries, 0 to 3346
Data columns (total 17 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   date                                        3341 non-null   datetime64[ns]
 1   total_people                                3341 non-null   float64       
 2   total_rain                                  3341 non-null   float64       
 3   rain_quality                                3341 non-null   int64         
 4   max_temp                                    3341 non-null   float64       
 5   max_temp_quality                            3341 non-null   int64         
 6   min_temp                                    3341 non-null   float64       
 7   min_temp_quality                            3341 non-null   int64         
 8   solar_exp                   

Fill in the nulls in the holiday and lockdown columns with 0

Also, The OfflineRetail_Trend_Turnover column has nulls in the early data. In general, this value tends to be very close to the OfflineRetail_Seasonally_Adjusted_Turnover  , which has no nulls. Therefore, where Trend is null, populate it with the Seasonally Adjusted, so that null/0 values in the Trend don't skew results

In [37]:
dfFootWeather.loc[dfFootWeather["is_holiday"].isna(), "is_holiday"] = 0
dfFootWeather.loc[dfFootWeather["is_lockdown"].isna(), "is_lockdown"] = 0

dfFootWeather["OfflineRetail_Trend_Turnover"] = dfFootWeather["OfflineRetail_Trend_Turnover"].fillna(dfFootWeather["OfflineRetail_Seasonally_Adjusted_Turnover"])

print(dfFootWeather.info())
dfFootWeather.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3341 entries, 0 to 3346
Data columns (total 17 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   date                                        3341 non-null   datetime64[ns]
 1   total_people                                3341 non-null   float64       
 2   total_rain                                  3341 non-null   float64       
 3   rain_quality                                3341 non-null   int64         
 4   max_temp                                    3341 non-null   float64       
 5   max_temp_quality                            3341 non-null   int64         
 6   min_temp                                    3341 non-null   float64       
 7   min_temp_quality                            3341 non-null   int64         
 8   solar_exp                                   3341 non-null   float64       
 9   WeekDay 

Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,population_change_annual,is_holiday,is_lockdown,OfflineRetail_Original_Turnover,OfflineRetail_Seasonally_Adjusted_Turnover,OfflineRetail_Trend_Turnover
0,2022-07-31,327383.0,0.0,0,14.7,1,4.3,1,4.8,6,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3
1,2022-07-30,462115.0,0.0,0,13.0,1,2.1,1,11.3,5,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3
2,2022-07-29,405511.0,1.0,0,12.7,1,6.5,1,11.2,4,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3
3,2022-07-28,334858.0,1.0,0,13.2,1,9.3,1,9.3,3,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3
4,2022-07-27,340569.0,3.0,0,15.3,1,9.3,1,7.7,2,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3


In [38]:
dfFootWeather = dfutil.separateYmdCol(dfFootWeather, "date")
print(dfFootWeather.info())
dfFootWeather.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3341 entries, 0 to 3346
Data columns (total 19 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   total_people                                3341 non-null   float64
 1   total_rain                                  3341 non-null   float64
 2   rain_quality                                3341 non-null   int64  
 3   max_temp                                    3341 non-null   float64
 4   max_temp_quality                            3341 non-null   int64  
 5   min_temp                                    3341 non-null   float64
 6   min_temp_quality                            3341 non-null   int64  
 7   solar_exp                                   3341 non-null   float64
 8   WeekDay                                     3341 non-null   int64  
 9   population_annual                           3341 non-null   int64  
 10  population_c

Unnamed: 0,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,population_annual,population_change_annual,is_holiday,is_lockdown,OfflineRetail_Original_Turnover,OfflineRetail_Seasonally_Adjusted_Turnover,OfflineRetail_Trend_Turnover,date_year,date_month,date_day
0,327383.0,0.0,0,14.7,1,4.3,1,4.8,6,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,31
1,462115.0,0.0,0,13.0,1,2.1,1,11.3,5,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,30
2,405511.0,1.0,0,12.7,1,6.5,1,11.2,4,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,29
3,334858.0,1.0,0,13.2,1,9.3,1,9.3,3,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,28
4,340569.0,3.0,0,15.3,1,9.3,1,7.7,2,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,27
5,316316.0,4.4,0,13.2,1,8.8,1,6.4,1,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,26
6,274106.0,0.0,0,16.8,1,8.1,1,5.1,0,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,25
7,406977.0,7.8,0,19.3,1,10.4,1,10.1,6,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,24
8,371336.0,1.2,0,14.5,1,8.1,1,5.1,5,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,23
9,416838.0,0.0,0,19.0,1,3.4,1,9.9,4,5151000,1.78,0.0,0.0,8562.7,8947.3,8947.3,2022,7,22


# Modeling and Prediction Stage

Do a K-Folds Cross Validation using XGBoost and get an MAE and an RMSE for mean error and indication of variance

Using Best Param Results from Hyperparameter Tuning:
- Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.1, 'max_depth': 6, 'n_estimators': 200}
- Lowest MAE found:  -43476.65617123382

In [39]:
# Test a basic XGBoost Regression with KFolds Cross Validation
randomSeed = databasic.get_random_seed()
model = xgb.XGBRegressor(objective="reg:squarederror", booster="gbtree", 
    n_estimators=200, max_depth=6, colsample_bytree=0.7, eta=0.1,
    seed=randomSeed)
    
modellingLog = ""   

targetColName = "total_people"
col_names = dfFootWeather.columns
feature_cols = col_names.drop([targetColName])
trainFeatures = dfFootWeather[feature_cols]
trainTargets = dfFootWeather[targetColName]


In [40]:

lstMae = []
lstRmse = []
kfolds = KFold(n_splits=5, random_state=randomSeed, shuffle=True)
for k, (train_index, test_index) in enumerate(kfolds.split(dfFootWeather)):
    # x_train = trainFeatures.loc[train_index, ]
    # x_vali = trainFeatures.loc[test_index, ]

    # y_train = trainTargets.loc[train_index, ]
    # y_vali = trainTargets.loc[test_index, ]
    x_train = trainFeatures.loc[trainFeatures.index.intersection(train_index)]
    x_vali = trainFeatures.loc[trainFeatures.index.intersection(test_index)]
    
    y_train = trainTargets.loc[trainTargets.index.intersection(train_index)]
    y_vali = trainTargets.loc[trainTargets.index.intersection(test_index)]
        
    model.fit(x_train, y_train)
    y_pred = model.predict(x_vali)

    # Compute the mae
    mae = mean_absolute_error(y_pred, y_vali)
    lstMae.append(mae)

    # Compute the rmse
    rmse = np.sqrt(mean_squared_error(y_pred, y_vali))
    lstRmse.append(rmse)
    
    print("Fold {0} MAE: {1}, RMSE: {2}".format(str(k), str(mae), str(rmse)))

    dfPredicted = x_vali
    dfPredicted["total_people"] = y_vali
    dfPredicted["total_people_predicted"] = y_pred
    dfPredicted.to_csv("./predictions/" + thisFileName+"_KFold" + str(k) + ".csv", index=False)

print("Final Result")
print("----------")
print("Average Mean Absolute Error (MAE): " + str(np.mean(lstMae)))
print("Average Root Mean Squared Error (RMSE): " + str(np.mean(lstRmse)))


Fold 0 MAE: 30962.96384529148, RMSE: 44046.3768223358
Fold 1 MAE: 33045.41118237488, RMSE: 51644.677353465115
Fold 2 MAE: 32856.19667433547, RMSE: 52012.140177202724
Fold 3 MAE: 36055.54738813389, RMSE: 52783.331336387324
Fold 4 MAE: 32199.165593594804, RMSE: 45846.75367254319
Final Result
----------
Average Mean Absolute Error (MAE): 33023.85693674611
Average Root Mean Squared Error (RMSE): 49266.65587238683


Run 1:
- Average Mean Absolute Error (MAE): 33316.382904179016
- Average Root Mean Squared Error (RMSE): 50619.67625220834

Run 2:
- Average Mean Absolute Error (MAE): 33270.610055330384
- Average Root Mean Squared Error (RMSE): 49644.41175939449

Run 3:
- Average Mean Absolute Error (MAE): 33023.85693674611
- Average Root Mean Squared Error (RMSE): 49266.65587238683

In [41]:
avgTotalPeople = np.mean(dfFootWeather["total_people"])
avgMae = np.mean([ 33316.382904179016, 33270.610055330384, 33023.85693674611 ])
avgRmse = np.mean([ 50619.67625220834, 49644.41175939449, 49266.65587238683 ])

predictionAccuracy = 100 - np.round((avgMae / avgTotalPeople) * 100, 2)
percentAvgAccuracyError = np.round((avgRmse / avgTotalPeople) * 100, 2)

print("Predictions made to an accuracy of: " + str(predictionAccuracy) + "%")
print("Predictions Error: +/-" + str(percentAvgAccuracyError) + "%")

Predictions made to an accuracy of: 93.23%
Predictions Error: +/-10.16%
