#### **Table of Contents**

* [Data Preprocessing](#section-one)
* [Baseline model](#section-two)
* [Adding early stopping and cross-validation](#section-three)
* [Hyperparameter tuning](#section-four)

In [None]:
# Input data files are available in the read-only "../input/" directory
import os
# walk the whole tree under /kaggle/input and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # join the containing directory and the file name into one path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="section-one"></a>
### **Data Preprocessing**

In [None]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split

import xgboost as xgb
from sklearn.metrics import mean_absolute_error as mae

In [None]:
# import the data
# train and test have their 'time' column parsed into datetimes while loading
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/train.csv', parse_dates=['time'])
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv', parse_dates=['time'])
# the sample submission is loaded without any date parsing
sample_sub = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv')

In [None]:
# first define the time_of_day sub-function
def _time_of_day(hour):
    if hour in [2,3,4,5]:
        return "dawn"
    elif hour in [6,7,8,9]:
        return "morning"
    elif hour in [10,11,12,13]:
        return "noon"
    elif hour in [14,15,16,17]:
        return "afternoon"
    elif hour in [18,19,20,21]:
        return "evening"
    else: return "midnight"

def create_datetime_features(input_data):
    """Return a copy of ``input_data`` extended with features derived from ``time``.

    The input frame is not modified. The added columns are, in order:
    ``time_idx``, ``date``, ``month``, ``month_name``, ``day_of_month``,
    ``hour``, ``minute``, ``day_of_week``, ``day_of_week_name``,
    ``time_of_day`` and ``is_weekend`` ("Yes" for Saturday/Sunday, else "No").
    """
    output_data = input_data.copy()
    stamps = output_data['time']

    # integer code of each timestamp; the encoder is fitted on this frame only,
    # so codes from two different frames are not comparable
    time_codes = OrdinalEncoder().fit_transform(stamps.values.reshape(-1, 1))
    output_data['time_idx'] = time_codes.astype(int)

    # plain calendar fields read straight off the datetime accessor
    calendar_fields = {
        'date': stamps.dt.date,
        'month': stamps.dt.month,
        'month_name': stamps.dt.month_name(),
        'day_of_month': stamps.dt.day,
        'hour': stamps.dt.hour,
        'minute': stamps.dt.minute,
        'day_of_week': stamps.dt.dayofweek,
        'day_of_week_name': stamps.dt.day_name(),
    }
    for column, values in calendar_fields.items():
        output_data[column] = values

    # features built on top of the calendar fields
    output_data['time_of_day'] = output_data['hour'].map(_time_of_day)
    on_weekend = output_data['day_of_week_name'].isin(['Saturday', 'Sunday'])
    output_data['is_weekend'] = on_weekend.map({True: "Yes", False: "No"})

    return output_data

# add the datetime-derived columns to both frames (each call returns an extended copy)
train = create_datetime_features(train)
test = create_datetime_features(test)

In [None]:
# training frame: add the combined features, then drop the identifier
# xy is the x and y values joined as text with no separator
train['xy'] = train['x'].astype(str).str.cat(train['y'].astype(str))
# day_of_week_name and time_of_day joined with a single space
train['day_of_week_time_of_day'] = train['day_of_week_name'].str.cat(train['time_of_day'], sep=' ')
train = train.drop(columns='row_id')

# test frame: exactly the same steps
test['xy'] = test['x'].astype(str).str.cat(test['y'].astype(str))
test['day_of_week_time_of_day'] = test['day_of_week_name'].str.cat(test['time_of_day'], sep=' ')
test = test.drop(columns='row_id')

train.head()

In [None]:
# remove time, time_idx, month and day_of_week from both frames
train, test = (
    frame.drop(columns=['time', 'time_idx', 'month', 'day_of_week'])
    for frame in (train, test)
)

In [None]:
# one-hot-encode every selected feature except hour and minute, which are passed through unchanged
def encode_features(train_input_df, test_input_df):
    """One-hot-encode the train and test frames with encoders fitted on train only.

    ``hour`` and ``minute`` are copied through as-is. The remaining selected
    columns are one-hot encoded in two groups (the group called
    ``ordinal_feats`` is one-hot encoded too, despite its name). Columns not
    named in any of the three lists, such as ``date``, do not appear in the output.

    Parameters
    ----------
    train_input_df : pandas.DataFrame
        Training frame; each encoder is fitted on this frame.
    test_input_df : pandas.DataFrame
        Test frame; only transformed, never used for fitting.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames, each carrying the row index of its input.

    Raises
    ------
    ValueError
        Presumably raised by the encoder when the test frame contains a
        category that was absent from train, since ``handle_unknown='error'``
        is set; confirm against the scikit-learn documentation.
    """
    passthrough_feats = ['hour', 'minute']
    ordinal_feats = ['month_name', 'day_of_month', 'day_of_week_name', 'time_of_day', 'day_of_week_time_of_day']
    cat_feats = ['is_weekend', 'direction', 'x', 'y', 'xy']

    # NOTE(review): `sparse=` and `get_feature_names()` are left exactly as they were so the
    # generated column labels do not change; code elsewhere in the notebook may select
    # columns by those labels. Newer scikit-learn releases spell these differently - confirm
    # against the installed version before upgrading.
    ohe = OneHotEncoder(sparse=False, handle_unknown='error')

    def _encode_group(feats):
        """Fit the encoder on the train columns ``feats`` and encode both frames."""
        train_enc = ohe.fit_transform(train_input_df[feats])
        # reuse the input's index: the concat below aligns on index, so a fresh
        # RangeIndex would misalign rows whenever the input index is not 0..n-1
        train_df = pd.DataFrame(train_enc, columns=ohe.get_feature_names(), index=train_input_df.index)
        test_enc = ohe.transform(test_input_df[feats])
        test_df = pd.DataFrame(test_enc, columns=ohe.get_feature_names(), index=test_input_df.index)
        return train_df, test_df

    train_ord_df, test_ord_df = _encode_group(ordinal_feats)
    train_cat_df, test_cat_df = _encode_group(cat_feats)

    train_output_df = pd.concat([train_input_df[passthrough_feats], train_ord_df, train_cat_df], axis=1)
    test_output_df = pd.concat([test_input_df[passthrough_feats], test_ord_df, test_cat_df], axis=1)

    return train_output_df, test_output_df

# encode both frames, then attach the target column to the encoded training frame
X, X_test = encode_features(train, test)
X['congestion'] = train.congestion

<a id="section-two"></a>
### **Baseline Model**

In [None]:
# split the training data into training (~70%) and validation sets (~30%)
# as we are not time-series forecasting, it is ok to shuffle the data
# the features are "every column except the target", so the split does not depend on
# which one-hot column label happens to come last
X_train, X_val, y_train, y_val = train_test_split(X.drop(columns=['congestion']), X[['congestion']], test_size=0.3, random_state=22)

In [None]:
# preview the first rows of the training features
X_train.head()

In [None]:
%%time
# instantiate an XGBRegressor object
# only the objective (squared error) and the seed (22) are set explicitly here
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=22)

# and fit the model to the training data
# (the validation split is not passed to this fit call)
xgb_reg.fit(X_train, y_train)

In [None]:
# predict on both splits with the fitted baseline model
tr_preds, val_preds = (xgb_reg.predict(split) for split in (X_train, X_val))

# mean absolute error of those predictions against the true targets
tr_mae, val_mae = mae(y_train, tr_preds), mae(y_val, val_preds)

print("MAE on the training data is:", round(tr_mae, 4), "and MAE on the validation data is:", round(val_mae, 4))

In [None]:
# take a look at which features were considered most important
importances = xgb_reg.feature_importances_
# positions of the features ordered from highest to lowest importance
i = np.argsort(importances)[::-1]
features = X_train.columns

# one row per feature, already sorted by descending importance
d = {'index': i, 'importance': importances[i], 'feature': features[i]}
feat_imps = pd.DataFrame(data=d)

# bar chart with importance on the x-axis and the feature name on the y-axis
fig, axs = plt.subplots(figsize=(15,8))
sns.set_color_codes("muted")
sns.barplot(x="importance", y="feature", data=feat_imps, color="b")
# the trailing semicolon stops the notebook from echoing the call's return value
sns.despine();

In [None]:
# predict congestion for the encoded test features
test_preds = xgb_reg.predict(X_test)

# row_id was dropped from `test` during preprocessing, so reload the original file to get it back
test_orig = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv', parse_dates=['time'])

# pair every row_id with its prediction, using the column names the submission expects
test_pred_df = pd.DataFrame({'row_id': test_orig['row_id'], 'congestion': test_preds})

# write the submission file
test_pred_df.to_csv('/kaggle/working/submission.csv', index=False)

<a id="section-three"></a>
### **Adding early stopping and cross-validation**

In [None]:
%%time

# refit the same model while recording MAE on both splits, then plot the two curves;
# the plot is what indicates there would definitely be some value in increasing the number of iterations
# NOTE(review): early_stopping_rounds / eval_metric are passed to fit() here; whether fit() accepts
# them depends on the installed xgboost version - confirm before upgrading
eval_set = [(X_train, y_train), (X_val, y_val)]
xgb_reg.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="mae", eval_set=eval_set, verbose=False)

# per-iteration scores: 'validation_0' is the first eval_set entry (training data),
# 'validation_1' the second (validation data)
evals_result = xgb_reg.evals_result()
max_iter = len(evals_result['validation_0']['mae'])

# wide table with one row per iteration, numbered from 1
evals_df = pd.DataFrame({'iter': np.arange(1,max_iter+1,1), 
                         'train': evals_result['validation_0']['mae'],
                         'val': evals_result['validation_1']['mae']})
# reshape to long form so that each split is drawn as its own line
evals_df_long = pd.melt(evals_df, id_vars='iter', var_name='df', value_name='score')
sns.lineplot(data=evals_df_long, x="iter", y="score", hue="df");

In [None]:
#%%time

#X_features = X.loc[:, 'hour':'x4_23']
#y = X[['congestion']]

# instantiate the k-fold cross-validator
#kf = KFold(n_splits=5, shuffle=True, random_state=22)

#all_test_preds = []
#tr_maes = []
#val_maes = []

#for fold, (train_idx, val_idx) in enumerate(kf.split(X_features)):
    # create the training and validation data
#    X_train, y_train = X_features.iloc[train_idx], y.iloc[train_idx]
#    X_val, y_val = X_features.iloc[val_idx], y.iloc[val_idx]
    
    # train the model
#    xgb_mod = xgb.XGBRegressor(objective='reg:squarederror', random_state=22)
#    xgb_mod.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="mae", eval_set=[(X_val, y_val)], verbose=False)
    
    # make predictions on the training and validation data with the model trained in this fold
#    tr_preds = xgb_mod.predict(X_train)
#    val_preds = xgb_mod.predict(X_val)
    
    # and analyse the accuracy
#    tr_mae = mae(y_train, tr_preds)
#    val_mae = mae(y_val, val_preds)
    
#    tr_maes.append(tr_mae)
#    val_maes.append(val_mae)
    
    # and make predictions on the test data
#    test_preds = xgb_mod.predict(X_test)
#    all_test_preds.append(test_preds)
    
#    print("Fold:", fold, "Training MAE:", round(tr_mae, 4), "Validation MAE:", round(val_mae, 4))

In [None]:
# calculate the average accuracy (mae) scores for the training and validation data
#print("Average MAE on the training data is:", round(np.array(tr_maes).mean(), 4))
#print("Average MAE on the validation data is:", round(np.array(val_maes).mean(), 4))

In [None]:
# create the submission file
#mean_test_preds = np.mean(all_test_preds, axis=0)

#test_pred_df = pd.concat([test_orig[['row_id']], pd.Series(mean_test_preds, name="pred")], axis=1)
#test_pred_df.columns = ['row_id', 'congestion']

# submit the predictions
#test_pred_df.to_csv('/kaggle/working/submission.csv', index=False)

<a id="section-four"></a>
### **Hyperparameter tuning**