In [40]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBRegressor
import numpy as np

In [41]:
df = pd.read_excel("Realized Schedule 20210101-20220228.xlsx", engine='openpyxl')
df.shape

(39449, 9)

In [42]:
df = df.dropna(how='any')
df.shape

(39449, 9)

In [43]:
for feature in ["Airline", "Destination", "AircraftType", "FlightType", "Sector"]:
    df[feature] = df[feature].astype(str).astype('category')

df['Minute'] = df['ScheduleTime'].dt.minute
df['Hour'] = df['ScheduleTime'].dt.hour
df['Day'] = df['ScheduleTime'].dt.day
df['Week'] = df['ScheduleTime'].dt.isocalendar().week.astype(int)
df['Month'] = df['ScheduleTime'].dt.month
season_mapping = {12: "Winter", 1: "Winter", 2: "Winter",
                  3: "Spring", 4: "Spring", 5: "Spring",
                  6: "Summer", 7: "Summer", 8: "Summer",
                  9: "Autumn", 10: "Autumn", 11: "Autumn"}
df['Season'] = df['Month'].apply(lambda x: season_mapping[x])

In [44]:
df.head()

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor,Minute,Hour,Day,Week,Month,Season
0,2021-01-01 06:35:00,IA,874,DEN,73W,J,US,142,0.408451,35,6,1,53,1,Winter
1,2021-01-01 10:35:00,JZ,818,YHM,AT7,J,CA,74,0.189189,35,10,1,53,1,Winter
2,2021-01-01 12:05:00,IA,876,DEN,73W,J,US,142,0.570423,5,12,1,53,1,Winter
3,2021-01-01 13:20:00,CN,514,EST,AT7,J,US,72,0.333333,20,13,1,53,1,Winter
4,2021-01-01 14:20:00,LJ,3140,DEN,32A,J,US,186,0.204301,20,14,1,53,1,Winter


In [45]:
df_dummies = pd.get_dummies(df, columns=["FlightNumber", "Airline", "Destination", "AircraftType", "FlightType", "Sector", "Season"]).drop(["ScheduleTime", "LoadFactor"], axis=1)
df_dummies

Unnamed: 0,SeatCapacity,Minute,Hour,Day,Week,Month,FlightNumber_500,FlightNumber_501,FlightNumber_504,FlightNumber_505,...,Sector_IS,Sector_MX,Sector_NL,Sector_QA,Sector_SG,Sector_US,Season_Autumn,Season_Spring,Season_Summer,Season_Winter
0,142,35,6,1,53,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,74,35,10,1,53,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,142,5,12,1,53,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,72,20,13,1,53,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,186,20,14,1,53,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39444,144,45,18,28,9,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
39445,156,25,19,28,9,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
39446,98,0,20,28,9,2,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
39447,186,30,19,28,9,2,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [46]:
X = VarianceThreshold(threshold=0.005).fit_transform(df_dummies)
y = df['LoadFactor'].to_numpy()

In [47]:
X.shape

(39449, 163)

# Splitting the data
Since normal splitting stratigies assumes that data is i.i.d and we can clearly see that there are trends in the data we'll use `TimeSeriesSplit`

In [48]:
tscv = TimeSeriesSplit(n_splits=10)

xgb_params = {"n_estimators": [50,100,200,300], "reg_lambda": np.logspace(-3, -1, 5)}
RF_params = {"n_estimators": [50,100,200,300]}
RF_regr = RandomForestRegressor(oob_score=True, n_jobs=-1)
xgb_regr = XGBRegressor()

gs = GridSearchCV(RF_regr, param_grid=RF_params, cv=tscv, verbose=3, n_jobs=-1)
gs.fit(X, y)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
