# Experiment with an XGBoost Regression on basic weather data

Updated to use the full Foot Traffic Weather data from 2013 - July 2022

This will use XGBoost Regression as the model. It will use 5x K-Folds Cross Validation to train then fit the model and evaluate the MAE and RMSE. For each fold, it will write out the data with the predictions to the /predictions folder so we can look at what the model is predicting vs the true total people

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# dfFootWeather = pd.read_csv("./data_files/FootTrafficWeather_July2022_Melbourne.csv")
dfFootWeather = pd.read_csv("./data_files/FootTrafficWeatherMelb_20130101_20220701.csv", parse_dates=["date"])
thisFileName = "07a.RegressionXGboostV1"

print(dfFootWeather.shape)
print(dfFootWeather.info())
dfFootWeather.head()

c:\Users\nelso\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\nelso\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


(3103, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3103 entries, 0 to 3102
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              3103 non-null   datetime64[ns]
 1   total_people      3103 non-null   float64       
 2   total_rain        3098 non-null   float64       
 3   rain_quality      3098 non-null   object        
 4   max_temp          3102 non-null   float64       
 5   max_temp_quality  3101 non-null   object        
 6   min_temp          3102 non-null   float64       
 7   min_temp_quality  3102 non-null   object        
 8   solar_exp         3102 non-null   float64       
 9   WeekDay           3103 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1), object(3)
memory usage: 242.5+ KB
None


Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,N,14.7,Y,4.3,Y,4.8,6
1,2022-07-30,462115.0,0.0,N,13.0,Y,2.1,Y,11.3,5
2,2022-07-29,405511.0,1.0,N,12.7,Y,6.5,Y,11.2,4
3,2022-07-28,334858.0,1.0,N,13.2,Y,9.3,Y,9.3,3
4,2022-07-27,340569.0,3.0,N,15.3,Y,9.3,Y,7.7,2


In [2]:
dfFootWeather = dfFootWeather[dfFootWeather["total_rain"].notna()]
dfFootWeather = dfFootWeather[dfFootWeather["solar_exp"].notna()]

# assume missing quality is an N
dfFootWeather.loc[dfFootWeather["max_temp_quality"].isna(), "max_temp_quality"] = "N"

print(dfFootWeather.shape)
print(dfFootWeather.info())
dfFootWeather.head()

(3097, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3097 entries, 0 to 3102
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              3097 non-null   datetime64[ns]
 1   total_people      3097 non-null   float64       
 2   total_rain        3097 non-null   float64       
 3   rain_quality      3097 non-null   object        
 4   max_temp          3097 non-null   float64       
 5   max_temp_quality  3097 non-null   object        
 6   min_temp          3097 non-null   float64       
 7   min_temp_quality  3097 non-null   object        
 8   solar_exp         3097 non-null   float64       
 9   WeekDay           3097 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1), object(3)
memory usage: 266.1+ KB
None


Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,N,14.7,Y,4.3,Y,4.8,6
1,2022-07-30,462115.0,0.0,N,13.0,Y,2.1,Y,11.3,5
2,2022-07-29,405511.0,1.0,N,12.7,Y,6.5,Y,11.2,4
3,2022-07-28,334858.0,1.0,N,13.2,Y,9.3,Y,9.3,3
4,2022-07-27,340569.0,3.0,N,15.3,Y,9.3,Y,7.7,2


### Feature Engineering

First, we need to convert any non-number columns into numbers that the model can understand. This first version isn't doing anything beyond that, later on we should probably look for any missing data flags, and maybe do some column Min/Max scaling or other.

Convert the 3 Quality Y/N columns into 1/0 values, use the shared utility function for future code reuse

In [3]:
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "rain_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "max_temp_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "min_temp_quality")
dfFootWeather.head()

Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,0,14.7,1,4.3,1,4.8,6
1,2022-07-30,462115.0,0.0,0,13.0,1,2.1,1,11.3,5
2,2022-07-29,405511.0,1.0,0,12.7,1,6.5,1,11.2,4
3,2022-07-28,334858.0,1.0,0,13.2,1,9.3,1,9.3,3
4,2022-07-27,340569.0,3.0,0,15.3,1,9.3,1,7.7,2


In [4]:
print(type(dfFootWeather["date"].dtype))
print(dfFootWeather["date"].dtype == "object")
print(dfFootWeather["date"][0])

<class 'numpy.dtype[datetime64]'>
False
2022-07-31 00:00:00


In [5]:
dfFootWeather = dfutil.separateYmdCol(dfFootWeather, "date")
print(dfFootWeather.info())
dfFootWeather.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3097 entries, 0 to 3102
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_people      3097 non-null   float64
 1   total_rain        3097 non-null   float64
 2   rain_quality      3097 non-null   int64  
 3   max_temp          3097 non-null   float64
 4   max_temp_quality  3097 non-null   int64  
 5   min_temp          3097 non-null   float64
 6   min_temp_quality  3097 non-null   int64  
 7   solar_exp         3097 non-null   float64
 8   WeekDay           3097 non-null   int64  
 9   date_year         3097 non-null   int64  
 10  date_month        3097 non-null   int64  
 11  date_day          3097 non-null   int64  
dtypes: float64(5), int64(7)
memory usage: 379.1 KB
None


Unnamed: 0,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,date_year,date_month,date_day
0,327383.0,0.0,0,14.7,1,4.3,1,4.8,6,2022,7,31
1,462115.0,0.0,0,13.0,1,2.1,1,11.3,5,2022,7,30
2,405511.0,1.0,0,12.7,1,6.5,1,11.2,4,2022,7,29
3,334858.0,1.0,0,13.2,1,9.3,1,9.3,3,2022,7,28
4,340569.0,3.0,0,15.3,1,9.3,1,7.7,2,2022,7,27
5,316316.0,4.4,0,13.2,1,8.8,1,6.4,1,2022,7,26
6,274106.0,0.0,0,16.8,1,8.1,1,5.1,0,2022,7,25
7,406977.0,7.8,0,19.3,1,10.4,1,10.1,6,2022,7,24
8,371336.0,1.2,0,14.5,1,8.1,1,5.1,5,2022,7,23
9,416838.0,0.0,0,19.0,1,3.4,1,9.9,4,2022,7,22


Do a K-Folds Cross Validation using XGBoost and get an MAE and an RMSE for mean error and indication of variance

In [6]:
# Test a basic XGBoost Regression with KFolds Cross Validation
randomSeed = databasic.get_random_seed()
model = xgb.XGBRegressor(objective="reg:squarederror", booster="gbtree", n_estimators=10, seed=randomSeed)
modellingLog = ""   

targetColName = "total_people"
col_names = dfFootWeather.columns
feature_cols = col_names.drop([targetColName])
trainFeatures = dfFootWeather[feature_cols]
trainTargets = dfFootWeather[targetColName]


In [7]:

lstMae = []
lstRmse = []
kfolds = KFold(n_splits=5, random_state=randomSeed, shuffle=True)
for k, (train_index, test_index) in enumerate(kfolds.split(dfFootWeather)):
    # x_train = trainFeatures.loc[train_index, ]
    # x_vali = trainFeatures.loc[test_index, ]

    # y_train = trainTargets.loc[train_index, ]
    # y_vali = trainTargets.loc[test_index, ]
    x_train = trainFeatures.loc[trainFeatures.index.intersection(train_index)]
    x_vali = trainFeatures.loc[trainFeatures.index.intersection(test_index)]
    
    y_train = trainTargets.loc[trainTargets.index.intersection(train_index)]
    y_vali = trainTargets.loc[trainTargets.index.intersection(test_index)]
        
    model.fit(x_train, y_train)
    y_pred = model.predict(x_vali)

    # Compute the mae
    mae = mean_absolute_error(y_pred, y_vali)
    lstMae.append(mae)

    # Compute the rmse
    rmse = np.sqrt(mean_squared_error(y_pred, y_vali))
    lstRmse.append(rmse)
    
    print("Fold {0} MAE: {1}, RMSE: {2}".format(str(k), str(mae), str(rmse)))

    dfPredicted = x_vali
    dfPredicted["total_people"] = y_vali
    dfPredicted["total_people_predicted"] = y_pred
    dfPredicted.to_csv("./predictions/" + thisFileName+"_KFold" + str(k) + ".csv", index=False)

print("Final Result")
print("----------")
print("Average Mean Absolute Error (MAE): " + str(np.mean(lstMae)))
print("Average Root Mean Squared Error (RMSE): " + str(np.mean(lstRmse)))


Fold 0 MAE: 52384.49160061591, RMSE: 72545.467279801
Fold 1 MAE: 52144.398607610885, RMSE: 78816.78073744326
Fold 2 MAE: 48033.41348544034, RMSE: 66273.9552746938
Fold 3 MAE: 47737.739883571, RMSE: 71487.40357355853
Fold 4 MAE: 45622.45601360234, RMSE: 64739.47646786812
Final Result
----------
Average Mean Absolute Error (MAE): 49184.4999181681
Average Root Mean Squared Error (RMSE): 70772.61666667296


Run 1:
- Average Mean Absolute Error (MAE): 48515.98126093765
- Average Root Mean Squared Error (RMSE): 68578.87551743323

Run 2:
- Average Mean Absolute Error (MAE): 48290.50871574182
- Average Root Mean Squared Error (RMSE): 67880.30173393416

Run 3:
- Average Mean Absolute Error (MAE): 48295.535915381195
- Average Root Mean Squared Error (RMSE): 68610.7752058299

In [8]:
avgTotalPeople = np.mean(dfFootWeather["total_people"])
avgMae = np.mean([ 48515.98126093765, 48290.50871574182, 48295.535915381195 ])
avgRmse = np.mean([ 68578.87551743323, 67880.30173393416, 68610.7752058299 ])

predictionAccuracy = 100 - np.round((avgMae / avgTotalPeople) * 100, 2)
percentAvgAccuracyError = np.round((avgRmse / avgTotalPeople) * 100, 2)

print("Predictions made to an accuracy of: " + str(predictionAccuracy) + "%")
print("Predictions Error: +/-" + str(percentAvgAccuracyError) + "%")

Predictions made to an accuracy of: 90.37%
Predictions Error: +/-13.6%
