# Experiment with an XGBoost Regression on basic weather data

Using the basic weather data, just for July 2022 data

This will use XGBoost Regression as the model. It will use 5x K-Folds Cross Validation to train then fit the model and evaluate the MAE and RMSE. For each fold, it will write out the data with the predictions to the /predictions folder so we can look at what the model is predicting vs the true total people

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# dfFootWeather = pd.read_csv("./data_files/FootTrafficWeather_July2022_Melbourne.csv")
dfFootWeather = pd.read_csv("./data_files/FootTrafficWeather_July2022_Melbourne.csv", parse_dates=["date"])
thisFileName = "07.RegressionXGboostV1"

print(dfFootWeather.info())
dfFootWeather.head()

c:\Users\nelso\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\nelso\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              31 non-null     datetime64[ns]
 1   total_people      31 non-null     int64         
 2   total_rain        31 non-null     float64       
 3   rain_quality      31 non-null     object        
 4   max_temp          31 non-null     float64       
 5   max_temp_quality  31 non-null     object        
 6   min_temp          31 non-null     float64       
 7   min_temp_quality  31 non-null     object        
 8   solar_exp         31 non-null     float64       
 9   weekday           31 non-null     int64         
dtypes: datetime64[ns](1), float64(4), int64(2), object(3)
memory usage: 2.5+ KB
None


Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,weekday
0,2022-01-07,435062,0.0,N,12.5,Y,5.9,Y,5.4,6
1,2022-02-07,501883,0.0,N,12.5,Y,7.0,Y,6.8,7
2,2022-03-07,386038,0.0,N,13.2,Y,3.7,Y,8.8,1
3,2022-04-07,356396,0.0,N,13.3,Y,4.6,Y,9.5,2
4,2022-05-07,370637,0.0,N,15.0,Y,7.5,Y,9.3,3


### Feature Engineering

First, we need to convert any non-number columns into numbers that the model can understand. This first version isn't doing anything beyond that, later on we should probably look for any missing data flags, and maybe do some column Min/Max scaling or other.

Convert the 3 Quality Y/N columns into 1/0 values, use the shared utility function for future code reuse

In [3]:
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "rain_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "max_temp_quality")
dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, "min_temp_quality")
dfFootWeather.head()

Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,weekday
0,2022-01-07,435062,0.0,0,12.5,1,5.9,1,5.4,6
1,2022-02-07,501883,0.0,0,12.5,1,7.0,1,6.8,7
2,2022-03-07,386038,0.0,0,13.2,1,3.7,1,8.8,1
3,2022-04-07,356396,0.0,0,13.3,1,4.6,1,9.5,2
4,2022-05-07,370637,0.0,0,15.0,1,7.5,1,9.3,3


In [4]:
print(type(dfFootWeather["date"].dtype))
print(dfFootWeather["date"].dtype == "object")
print(dfFootWeather["date"][0])

<class 'numpy.dtype[datetime64]'>
False
2022-01-07 00:00:00


In [5]:
dfFootWeather = dfutil.separateYmdCol(dfFootWeather, "date")
print(dfFootWeather.info())
dfFootWeather.head(20)

Do a K-Folds Cross Validation using XGBoost and get an MAE and an RMSE for mean error and indication of variance

In [None]:
# Test a basic XGBoost Regression with KFolds Cross Validation
randomSeed = databasic.get_random_seed()
model = xgb.XGBRegressor(objective="reg:squarederror", booster="gbtree", n_estimators=10, seed=randomSeed)
modellingLog = ""   

targetColName = "total_people"
col_names = dfFootWeather.columns
feature_cols = col_names.drop([targetColName])
trainFeatures = dfFootWeather[feature_cols]
trainTargets = dfFootWeather[targetColName]


In [None]:

lstMae = []
lstRmse = []
kfolds = KFold(n_splits=5, random_state=randomSeed, shuffle=True)
for k, (train_index, test_index) in enumerate(kfolds.split(dfFootWeather)):
    x_train = trainFeatures.loc[train_index, ]
    x_vali = trainFeatures.loc[test_index, ]

    y_train = trainTargets.loc[train_index, ]
    y_vali = trainTargets.loc[test_index, ]

    model.fit(x_train, y_train)
    y_pred = model.predict(x_vali)

    # Compute the mae
    mae = mean_absolute_error(y_pred, y_vali)
    lstMae.append(mae)

    # Compute the rmse
    rmse = np.sqrt(mean_squared_error(y_pred, y_vali))
    lstRmse.append(rmse)
    
    print("Fold {0} MAE: {1}, RMSE: {2}".format(str(k), str(mae), str(rmse)))

    dfPredicted = x_vali
    dfPredicted["total_people"] = y_vali
    dfPredicted["total_people_predicted"] = y_pred
    dfPredicted.to_csv("./predictions/" + thisFileName+"_KFold" + str(k) + ".csv", index=False)

print("Final Result")
print("----------")
print("Average Mean Absolute Error (MAE): " + str(np.mean(lstMae)))
print("Average Root Mean Squared Error (RMSE): " + str(np.mean(lstRmse)))
