# XGBoost Regression - Adding Annual Population data

Found the annual population and annual growth percentage data for Greater Melbourne, filtered down to just the 2013-2022 data.

Join this data to the dataset and run another regression.

In [63]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Load the daily foot-traffic + weather data; "date" is parsed to datetime on read
# so it can be split into year/month/day parts further down.
dfFootWeather = pd.read_csv("./data_files/FootTrafficWeatherMelb_20130101_20220701.csv", parse_dates=["date"])

# Prefix for the per-fold prediction CSVs written by the cross-validation cell.
# NOTE(review): this value looks carried over from an earlier notebook; confirm it
# is the intended prefix here so that notebook's prediction files are not overwritten.
thisFileName = "07c.RegressionXGboostParams1"

print(dfFootWeather.shape)
# info() prints its report itself and returns None, so it must not be wrapped in print().
dfFootWeather.info()
dfFootWeather.head()

(3103, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3103 entries, 0 to 3102
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              3103 non-null   datetime64[ns]
 1   total_people      3103 non-null   float64       
 2   total_rain        3098 non-null   float64       
 3   rain_quality      3098 non-null   object        
 4   max_temp          3102 non-null   float64       
 5   max_temp_quality  3101 non-null   object        
 6   min_temp          3102 non-null   float64       
 7   min_temp_quality  3102 non-null   object        
 8   solar_exp         3102 non-null   float64       
 9   WeekDay           3103 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1), object(3)
memory usage: 242.5+ KB
None


Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,N,14.7,Y,4.3,Y,4.8,6
1,2022-07-30,462115.0,0.0,N,13.0,Y,2.1,Y,11.3,5
2,2022-07-29,405511.0,1.0,N,12.7,Y,6.5,Y,11.2,4
3,2022-07-28,334858.0,1.0,N,13.2,Y,9.3,Y,9.3,3
4,2022-07-27,340569.0,3.0,N,15.3,Y,9.3,Y,7.7,2


# Data Load and Preparation Stage


In [64]:
# Drop days that have no rainfall or no solar-exposure reading. The explicit copy
# makes the result an independent frame, so the column assignment below cannot
# trigger a SettingWithCopyWarning.
# NOTE(review): max_temp / min_temp each have one missing value in the raw file; the
# recorded output shows none remain after this filter, but nothing here enforces that.
dfFootWeather = dfFootWeather.dropna(subset=["total_rain", "solar_exp"]).copy()

# assume missing quality is an N
dfFootWeather["max_temp_quality"] = dfFootWeather["max_temp_quality"].fillna("N")

print(dfFootWeather.shape)
# info() prints its report itself and returns None, so it must not be wrapped in print().
dfFootWeather.info()
dfFootWeather.head()

(3097, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3097 entries, 0 to 3102
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              3097 non-null   datetime64[ns]
 1   total_people      3097 non-null   float64       
 2   total_rain        3097 non-null   float64       
 3   rain_quality      3097 non-null   object        
 4   max_temp          3097 non-null   float64       
 5   max_temp_quality  3097 non-null   object        
 6   min_temp          3097 non-null   float64       
 7   min_temp_quality  3097 non-null   object        
 8   solar_exp         3097 non-null   float64       
 9   WeekDay           3097 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1), object(3)
memory usage: 266.1+ KB
None


Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,N,14.7,Y,4.3,Y,4.8,6
1,2022-07-30,462115.0,0.0,N,13.0,Y,2.1,Y,11.3,5
2,2022-07-29,405511.0,1.0,N,12.7,Y,6.5,Y,11.2,4
3,2022-07-28,334858.0,1.0,N,13.2,Y,9.3,Y,9.3,3
4,2022-07-27,340569.0,3.0,N,15.3,Y,9.3,Y,7.7,2


### Feature Engineering

First, we need to convert any non-numeric columns into numbers that the model can understand. This first version doesn't do anything beyond that; later on we should probably look for any missing-data flags, and maybe apply Min/Max scaling (or similar) to some columns.

Convert the 3 Quality Y/N columns into 1/0 values, use the shared utility function for future code reuse

In [65]:
# Run the shared helper over each of the three quality-flag columns in turn
# (the Y/N flags come back as 1/0 integers).
for qualityCol in ("rain_quality", "max_temp_quality", "min_temp_quality"):
    dfFootWeather = dfutil.convertBoolColToInt(dfFootWeather, qualityCol)
dfFootWeather.head()

Unnamed: 0,date,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay
0,2022-07-31,327383.0,0.0,0,14.7,1,4.3,1,4.8,6
1,2022-07-30,462115.0,0.0,0,13.0,1,2.1,1,11.3,5
2,2022-07-29,405511.0,1.0,0,12.7,1,6.5,1,11.2,4
3,2022-07-28,334858.0,1.0,0,13.2,1,9.3,1,9.3,3
4,2022-07-27,340569.0,3.0,0,15.3,1,9.3,1,7.7,2


In [66]:
# Sanity check: confirm "date" was parsed as a datetime column rather than left as
# plain strings (object dtype).
print(type(dfFootWeather["date"].dtype))
print(dfFootWeather["date"].dtype == "object")
# Positional access: rows were filtered above without resetting the index, so a
# label lookup of 0 would raise KeyError if the first row had been dropped.
print(dfFootWeather["date"].iloc[0])

<class 'numpy.dtype[datetime64]'>
False
2022-07-31 00:00:00


In [67]:
# Split "date" into integer date_year / date_month / date_day columns via the shared
# helper (the recorded output shows the original "date" column is removed).
dfFootWeather = dfutil.separateYmdCol(dfFootWeather, "date")
# info() prints its report itself and returns None, so it must not be wrapped in print().
dfFootWeather.info()
dfFootWeather.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3097 entries, 0 to 3102
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_people      3097 non-null   float64
 1   total_rain        3097 non-null   float64
 2   rain_quality      3097 non-null   int64  
 3   max_temp          3097 non-null   float64
 4   max_temp_quality  3097 non-null   int64  
 5   min_temp          3097 non-null   float64
 6   min_temp_quality  3097 non-null   int64  
 7   solar_exp         3097 non-null   float64
 8   WeekDay           3097 non-null   int64  
 9   date_year         3097 non-null   int64  
 10  date_month        3097 non-null   int64  
 11  date_day          3097 non-null   int64  
dtypes: float64(5), int64(7)
memory usage: 379.1 KB
None


Unnamed: 0,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,date_year,date_month,date_day
0,327383.0,0.0,0,14.7,1,4.3,1,4.8,6,2022,7,31
1,462115.0,0.0,0,13.0,1,2.1,1,11.3,5,2022,7,30
2,405511.0,1.0,0,12.7,1,6.5,1,11.2,4,2022,7,29
3,334858.0,1.0,0,13.2,1,9.3,1,9.3,3,2022,7,28
4,340569.0,3.0,0,15.3,1,9.3,1,7.7,2,2022,7,27
5,316316.0,4.4,0,13.2,1,8.8,1,6.4,1,2022,7,26
6,274106.0,0.0,0,16.8,1,8.1,1,5.1,0,2022,7,25
7,406977.0,7.8,0,19.3,1,10.4,1,10.1,6,2022,7,24
8,371336.0,1.2,0,14.5,1,8.1,1,5.1,5,2022,7,23
9,416838.0,0.0,0,19.0,1,3.4,1,9.9,4,2022,7,22


Load the population file and join to our dataframe

In [68]:
# Annual Greater Melbourne population figures, one row per year.
# Note: the CSV headers for the population columns carry a leading space
# (" Population", " Annual Change"), as the info() report shows.
dfPop = pd.read_csv("./data_files/greatermelb_population_annual.csv")
print(dfPop.shape)
# info() prints its report itself and returns None, so it must not be wrapped in print().
dfPop.info()
dfPop.head()

(10, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            10 non-null     object 
 1    Population     10 non-null     int64  
 2    Annual Change  10 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 368.0+ bytes
None


Unnamed: 0,date,Population,Annual Change
0,31/12/2013,4217000,2.5
1,31/12/2014,4322000,2.49
2,31/12/2015,4430000,2.5
3,31/12/2016,4541000,2.51
4,31/12/2017,4654000,2.49


In [69]:
# The CSV headers carry a leading space (" Population"); strip whitespace before
# renaming, because rename() silently skips keys it cannot find and would otherwise
# leave the columns untouched if the file were ever cleaned up.
dfPop = dfPop.rename(columns=str.strip).rename(
    columns={"Population": "population_annual", "Annual Change": "population_change_annual"}
)

# Dates are day-first strings such as "31/12/2013". Parse the whole column at once
# with an explicit format instead of relying on per-row format inference, and keep
# the year as int64 so it matches the join key in the foot-traffic frame.
dfPop["date_year"] = pd.to_datetime(dfPop["date"], format="%d/%m/%Y").dt.year.astype("int64")

# Only the year is needed as the join key.
dfPop = dfPop.drop(columns=["date"])
dfPop.head()

Unnamed: 0,population_annual,population_change_annual,date_year
0,4217000,2.5,2013
1,4322000,2.49,2014
2,4430000,2.5,2015
3,4541000,2.51,2016
4,4654000,2.49,2017


In [70]:
# Attach the annual population figures to every daily row.
# - how="left" keeps every daily row; the default inner join would silently drop
#   days whose year has no population record.
# - validate="many_to_one" raises if dfPop has more than one row for a year, which
#   would otherwise duplicate daily rows.
dfFootWeather = pd.merge(dfFootWeather, dfPop, on="date_year", how="left", validate="many_to_one")

# Fail loudly rather than train on rows that have no population figure.
missingPopulation = dfFootWeather["population_annual"].isna()
if missingPopulation.any():
    missingYears = sorted(dfFootWeather.loc[missingPopulation, "date_year"].unique())
    raise ValueError("No population data for year(s): " + str(missingYears))

# info() prints its report itself and returns None, so it must not be wrapped in print().
dfFootWeather.info()
dfFootWeather.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3097 entries, 0 to 3096
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   total_people              3097 non-null   float64
 1   total_rain                3097 non-null   float64
 2   rain_quality              3097 non-null   int64  
 3   max_temp                  3097 non-null   float64
 4   max_temp_quality          3097 non-null   int64  
 5   min_temp                  3097 non-null   float64
 6   min_temp_quality          3097 non-null   int64  
 7   solar_exp                 3097 non-null   float64
 8   WeekDay                   3097 non-null   int64  
 9   date_year                 3097 non-null   int64  
 10  date_month                3097 non-null   int64  
 11  date_day                  3097 non-null   int64  
 12  population_annual         3097 non-null   int64  
 13  population_change_annual  3097 non-null   float64
dtypes: float

Unnamed: 0,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,date_year,date_month,date_day,population_annual,population_change_annual
0,327383.0,0.0,0,14.7,1,4.3,1,4.8,6,2022,7,31,5151000,1.78
1,462115.0,0.0,0,13.0,1,2.1,1,11.3,5,2022,7,30,5151000,1.78
2,405511.0,1.0,0,12.7,1,6.5,1,11.2,4,2022,7,29,5151000,1.78
3,334858.0,1.0,0,13.2,1,9.3,1,9.3,3,2022,7,28,5151000,1.78
4,340569.0,3.0,0,15.3,1,9.3,1,7.7,2,2022,7,27,5151000,1.78
5,316316.0,4.4,0,13.2,1,8.8,1,6.4,1,2022,7,26,5151000,1.78
6,274106.0,0.0,0,16.8,1,8.1,1,5.1,0,2022,7,25,5151000,1.78
7,406977.0,7.8,0,19.3,1,10.4,1,10.1,6,2022,7,24,5151000,1.78
8,371336.0,1.2,0,14.5,1,8.1,1,5.1,5,2022,7,23,5151000,1.78
9,416838.0,0.0,0,19.0,1,3.4,1,9.9,4,2022,7,22,5151000,1.78


# Modeling and Prediction Stage

Run a K-Fold cross validation using XGBoost and report the MAE and RMSE: the MAE gives the mean error, and the RMSE gives an indication of the variance of the errors.

Using Best Param Results from Hyperparameter Tuning:
- Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.1, 'max_depth': 6, 'n_estimators': 200}
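- Lowest MAE found:  -43476.65617123382 (shown as a negative value, presumably because the tuning search used a negated-MAE score; the MAE itself is about 43,477)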

In [71]:
# Set up the XGBoost regressor with the tuned hyperparameters, then separate the
# prediction target from the feature columns ahead of cross validation.
targetColName = "total_people"
modellingLog = ""

randomSeed = databasic.get_random_seed()
xgbParams = {
    "objective": "reg:squarederror",
    "booster": "gbtree",
    "n_estimators": 200,
    "max_depth": 6,
    "colsample_bytree": 0.7,
    "eta": 0.1,
    "seed": randomSeed,
}
model = xgb.XGBRegressor(**xgbParams)

# Every column except the target is used as a model feature.
col_names = dfFootWeather.columns
feature_cols = col_names.drop(targetColName)
trainTargets = dfFootWeather[targetColName]
trainFeatures = dfFootWeather[feature_cols]


In [72]:

# 5-fold cross validation: fit on four folds, score the held-out fold with MAE and
# RMSE, and save each fold's predictions to CSV for later inspection.
lstMae = []
lstRmse = []
kfolds = KFold(n_splits=5, random_state=randomSeed, shuffle=True)
for k, (train_index, test_index) in enumerate(kfolds.split(trainFeatures)):
    # KFold yields positional row numbers, so rows must be selected with .iloc.
    # Label-based selection only matches when the index happens to be 0..n-1 and
    # would silently pick the wrong (or fewer) rows on an index with gaps.
    x_train = trainFeatures.iloc[train_index]
    x_vali = trainFeatures.iloc[test_index]

    y_train = trainTargets.iloc[train_index]
    y_vali = trainTargets.iloc[test_index]

    model.fit(x_train, y_train)
    y_pred = model.predict(x_vali)

    # Compute the mae (scikit-learn metric signature is (y_true, y_pred))
    mae = mean_absolute_error(y_vali, y_pred)
    lstMae.append(mae)

    # Compute the rmse
    rmse = np.sqrt(mean_squared_error(y_vali, y_pred))
    lstRmse.append(rmse)

    print("Fold {0} MAE: {1}, RMSE: {2}".format(str(k), str(mae), str(rmse)))

    # assign() builds a new frame, so the validation slice is not mutated and no
    # SettingWithCopyWarning is raised.
    dfPredicted = x_vali.assign(total_people=y_vali, total_people_predicted=y_pred)
    dfPredicted.to_csv("./predictions/" + thisFileName+"_KFold" + str(k) + ".csv", index=False)

print("Final Result")
print("----------")
print("Average Mean Absolute Error (MAE): " + str(np.mean(lstMae)))
print("Average Root Mean Squared Error (RMSE): " + str(np.mean(lstRmse)))


Fold 0 MAE: 38740.018170362906, RMSE: 57597.96260659657
Fold 1 MAE: 44320.804892263106, RMSE: 68019.1101422841
Fold 2 MAE: 38013.02772869547, RMSE: 52438.59824632655
Fold 3 MAE: 44510.84156338348, RMSE: 65623.7924475841
Fold 4 MAE: 39813.222365963244, RMSE: 57459.67366133987
Final Result
----------
Average Mean Absolute Error (MAE): 41079.58294413364
Average Root Mean Squared Error (RMSE): 60227.82742082623


Run 1:
- Average Mean Absolute Error (MAE): 39974.85765021118
- Average Root Mean Squared Error (RMSE): 59220.14325103734

Run 2:
- Average Mean Absolute Error (MAE): 40131.682509491824
- Average Root Mean Squared Error (RMSE): 59640.9317324833

Run 3:
- Average Mean Absolute Error (MAE): 40558.228277959315
- Average Root Mean Squared Error (RMSE): 60033.38379598523