In [None]:
# Input data files are available in the read-only "../input/" directory
import os
# Lazily build the full path of every file below the Kaggle input directory,
# then list them one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.model_selection import TimeSeriesSplit, train_test_split

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

from sklearn.metrics import mean_absolute_error as mae

In [None]:
# import the data; `time` is parsed to datetime so the .dt accessor can be used later
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/train.csv', parse_dates=['time'])
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv', parse_dates=['time'])
sample_sub = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv')

# take a look at the training data (particularly to check for missing values)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
train.info()
train.head()

In [None]:
# sanity-check the test set: print its (rows, columns) shape, then preview the first rows
print(test.shape)
test.head()

In [None]:
# visualise univariate variable distributions: one panel per variable
fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(25, 6))

# target distribution; note the spikes in congestion at about 15, 30 and 35
sns.histplot(data=train, x="congestion", kde=True, ax=axs[0])

# row counts per level of each location/direction variable
for column, axis in zip(("x", "y", "direction"), axs[1:]):
    sns.countplot(data=train, y=column, palette="crest", ax=axis)

sns.despine();

In [None]:
# visualise bivariate variable distributions against congestion:
# one panel per coordinate, each with a violin plot plus a pointplot overlay
fig, axs = plt.subplots(1, 2, figsize=(25, 6))

for column, axis in zip(("x", "y"), axs):
    sns.violinplot(data=train, x=column, y="congestion", palette='crest_r', ax=axis)
    # the overlay must use the same variable as the violin it sits on
    # (the right-hand panel previously overlaid `x` on the `y` violins)
    sns.pointplot(data=train, x=column, y="congestion", join=False, ci=0, capsize=0.7, scale=0, palette='crest', ax=axis)

sns.despine();

In [None]:
# congestion by direction: violin plot plus a pointplot overlay on the same axes
fig, ax = plt.subplots(figsize=(15, 6))

sns.violinplot(data=train, x="direction", y="congestion", palette='crest', scale='width', ax=ax)
# overlay the point estimate for the same variable as the violins
# (previously plotted `x`, whose categories do not match `direction`)
sns.pointplot(data=train, x="direction", y="congestion", join=False, ci=0, capsize=0.7, scale=0, palette='crest', ax=ax)
sns.despine();

In [None]:
# now go to town on the datetime feature engineering
# first define the time_of_day sub-function
def _time_of_day(hour):
    if hour in [2,3,4,5]:
        return "dawn"
    elif hour in [6,7,8,9]:
        return "morning"
    elif hour in [10,11,12,13]:
        return "noon"
    elif hour in [14,15,16,17]:
        return "afternoon"
    elif hour in [18,19,20,21]:
        return "evening"
    else: return "midnight"

def create_datetime_features(input_data):
    """Return a copy of ``input_data`` with calendar features derived from ``time``.

    Added columns: ``time_idx`` (ordinal code of each timestamp), ``date``,
    ``month``, ``month_name``, ``day_of_month``, ``hour``, ``minute``,
    ``day_of_week``, ``day_of_week_name``, ``time_of_day`` (label from
    ``_time_of_day``) and ``is_weekend`` ("Yes"/"No"). The input frame itself
    is left unmodified.
    """
    frame = input_data.copy()
    stamps = frame['time']

    # A fresh encoder is fitted on every call, so time_idx is relative to the
    # timestamps of the frame passed in (not shared between train and test).
    time_encoder = OrdinalEncoder()
    frame['time_idx'] = time_encoder.fit_transform(stamps.values.reshape(-1, 1)).astype(int)

    # plain calendar components
    frame['date'] = stamps.dt.date
    frame['month'] = stamps.dt.month
    frame['month_name'] = stamps.dt.month_name()
    frame['day_of_month'] = stamps.dt.day
    frame['hour'] = stamps.dt.hour
    frame['minute'] = stamps.dt.minute
    frame['day_of_week'] = stamps.dt.dayofweek
    frame['day_of_week_name'] = stamps.dt.day_name()

    # coarser groupings derived from the components above
    frame['time_of_day'] = frame['hour'].apply(_time_of_day)
    weekend_flags = frame['day_of_week_name'].isin(['Saturday', 'Sunday'])
    frame['is_weekend'] = weekend_flags.map({True: "Yes", False: "No"})

    return frame

# add the datetime-derived columns to both datasets (each call returns a new frame)
train, test = (create_datetime_features(frame) for frame in (train, test))

In [None]:
# create a new feature combining x and y: the two values are cast to strings
# and concatenated without a separator
train['xy'] = train.x.astype(str) + train.y.astype('str')
test['xy'] = test.x.astype(str) + test.y.astype('str')

# get rid of row_id
# errors='ignore' keeps this cell re-runnable: on a second execution row_id is
# already gone and a plain drop would raise KeyError
train = train.drop(columns='row_id', errors='ignore')
test = test.drop(columns='row_id', errors='ignore')

train.head()

In [None]:
# visualise new bivariate variable distributions against congestion
# note that only those that looked like there might be a relationship were plotted
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(25, 15))

# arguments shared by every panel / by every pointplot overlay
target_kws = dict(data=train, y="congestion")
overlay_kws = dict(join=False, ci=0, capsize=0.7, scale=0, palette='crest_r')

# boxen plots (no overlay)
p1 = sns.boxenplot(x="month_name", palette='crest', ax=axs[0, 0], **target_kws)
p3 = sns.boxenplot(x="minute", palette='crest', ax=axs[1, 0], **target_kws)
p4 = sns.boxenplot(x="day_of_week_name", palette='crest', ax=axs[1, 1], **target_kws)
p6 = sns.boxenplot(x="is_weekend", palette='crest', ax=axs[2, 1], **target_kws)

# violin plots, each followed by a pointplot overlay on the same axes
p2 = sns.violinplot(x="hour", palette='crest', ax=axs[0, 1], **target_kws)
sns.pointplot(x="hour", ax=axs[0, 1], **target_kws, **overlay_kws)

p5 = sns.violinplot(x="time_of_day", palette='crest', ax=axs[2, 0], **target_kws)
sns.pointplot(x="time_of_day", ax=axs[2, 0], **target_kws, **overlay_kws)

p7 = sns.violinplot(x="xy", palette='crest', ax=axs[3, 0], **target_kws)
sns.pointplot(x="xy", ax=axs[3, 0], **target_kws, **overlay_kws)

fig.tight_layout()
sns.despine();

In [None]:
# day of week by time of day: one boxen panel per weekday, three panels per row
sns.catplot(
    data=train,
    x="time_of_day",
    y="congestion",
    col="day_of_week_name",
    col_wrap=3,
    kind="boxen",
    palette="crest",
    sharex=False,
    aspect=2,
);

In [None]:
# visualise correlation matrix to determine if there are any monotonic (Kendall) correlations between variables
# note that only numeric variables are included: they are selected explicitly,
# because newer pandas versions raise on non-numeric columns instead of
# silently dropping them
numeric_train = train.select_dtypes(include=['number', 'bool'])
corrs = numeric_train.corr(method='kendall')

# hide the upper triangle (including the diagonal) so each pair is shown once
mask = np.triu(np.ones(corrs.shape, dtype=bool))

# plt.subplots returns (figure, axes); unpack both and draw on the axes explicitly
fig, ax = plt.subplots(figsize=(15, 8))
corr_plot = sns.heatmap(corrs, mask=mask, cmap="YlGnBu", annot=True, linewidth=0.5, cbar=False, ax=ax)

In [None]:
# drop time, time_idx, month, day_of_week
# the list is defined once so train and test cannot drift apart, and
# errors='ignore' keeps the cell re-runnable after the columns are already gone
redundant_cols = ['time', 'time_idx', 'month', 'day_of_week']
train = train.drop(columns=redundant_cols, errors='ignore')
test = test.drop(columns=redundant_cols, errors='ignore')

In [None]:
# dummify all features except for date, hour and minute
def _one_hot_frames(train_df, test_df, feats):
    """One-hot encode ``feats``: fit on ``train_df``, apply to both frames.

    Returns a ``(train_encoded, test_encoded)`` pair of DataFrames that keep
    the row index of their source frame. The first category of every feature
    is dropped; a category present only in ``test_df`` raises an error.
    """
    # The default (sparse) output is densified with .toarray() so that no
    # `sparse=` / `sparse_output=` keyword is needed; its name differs between
    # scikit-learn versions.
    ohe = OneHotEncoder(drop='first', handle_unknown='error')
    train_enc = ohe.fit_transform(train_df[feats]).toarray()
    test_enc = ohe.transform(test_df[feats]).toarray()

    # Positional "x<i>_<category>" column names, built by hand because
    # downstream cells select columns by these labels (e.g. 'x4_23') and
    # `get_feature_names()` is not available in newer scikit-learn releases.
    # drop='first' removes the first category of each feature.
    names = [
        "x%d_%s" % (pos, str(cat))
        for pos, cats in enumerate(ohe.categories_)
        for cat in cats[1:]
    ]

    # Reuse the source index so the column-wise concat in the caller aligns
    # row-for-row even when the inputs do not carry a default RangeIndex.
    train_out = pd.DataFrame(train_enc, columns=names, index=train_df.index)
    test_out = pd.DataFrame(test_enc, columns=names, index=test_df.index)
    return train_out, test_out


def encode_features(train_input_df, test_input_df):
    """Build the model matrices for train and test.

    ``date``, ``hour`` and ``minute`` are passed through unchanged; every
    other feature is one-hot encoded (first level dropped) with encoders
    fitted on the training frame only.

    Returns:
        (train_output_df, test_output_df): frames with the pass-through
        columns first, then the encoded calendar features, then the encoded
        location/direction features.
    """
    passthrough_feats = ['date', 'hour', 'minute']
    # called "ordinal" in the sense of ordered calendar features; they are
    # one-hot encoded like the rest
    ordinal_feats = ['month_name', 'day_of_month', 'day_of_week_name', 'time_of_day']
    cat_feats = ['is_weekend', 'direction', 'x', 'y', 'xy']

    # each group gets its own freshly fitted encoder
    train_ord_df, test_ord_df = _one_hot_frames(train_input_df, test_input_df, ordinal_feats)
    train_cat_df, test_cat_df = _one_hot_frames(train_input_df, test_input_df, cat_feats)

    train_output_df = pd.concat([train_input_df[passthrough_feats], train_ord_df, train_cat_df], axis=1)
    test_output_df = pd.concat([test_input_df[passthrough_feats], test_ord_df, test_cat_df], axis=1)

    return train_output_df, test_output_df

# encode both datasets, then re-attach the target to the encoded training frame
X, X_test = encode_features(train_input_df=train, test_input_df=test)
X['congestion'] = train['congestion']

In [None]:
# preview the encoded training frame (feature columns plus the re-attached congestion target)
X.head()

In [None]:
# split the training data into training (~70%) and validation sets (~30%)
# as we are not time-series forecasting, it is ok to shuffle the data
# Columns are selected by dropping the target rather than slicing up to the
# hard-coded label 'x4_23', which depends on the encoder's column naming.
features_all = X.drop(columns='congestion')
X_train_all, X_val_all, y_train, y_val = train_test_split(features_all, X[['congestion']], test_size=0.3, random_state=22)

# `date` is kept in the *_all frames for plotting but is not a model input
X_train = X_train_all.drop(columns='date')
X_val = X_val_all.drop(columns='date')

# check the split datasets have the correct shape
print("y_train shape:", y_train.shape)
print("X_train shape:", X_train.shape)

print("y_val shape:", y_val.shape)
print("X_val shape:", X_val.shape)

In [None]:
# estimate a simple OLS model of congestion on the encoded features;
# a constant column is added to the design matrix to act as the intercept
X_train_const = sm.add_constant(X_train)
ols_model = sm.OLS(endog=y_train, exog=X_train_const)
ols_est = ols_model.fit()

# full regression report
print(ols_est.summary())

In [None]:
# make predictions on the training and validation data
# (the validation matrix gets the same constant column the model was fitted with)
X_val_const = sm.add_constant(X_val)
tr_preds, val_preds = (ols_est.predict(design) for design in (X_train_const, X_val_const))

# and analyse the accuracy (mean absolute error on each split)
tr_mae = mae(y_train, tr_preds)
val_mae = mae(y_val, val_preds)

print("MAE on the training data is:", round(tr_mae, 4), "and MAE on the validation data is:", round(val_mae, 4))

In [None]:
# plot the predictions against the actual values
def _actuals_vs_preds(dates, actuals, preds):
    """Return a long-format frame of actual and predicted congestion per date.

    ``dates`` and ``actuals`` are joined column-wise, ``preds`` is attached by
    index, and the result is melted to the columns ``date``, ``type``
    ('actuals' or 'preds') and ``congestion``.
    """
    wide = pd.concat([dates, actuals], axis=1)
    wide['pred'] = pd.Series(preds, name="pred")
    wide.columns = ['date', 'actuals', 'preds']
    return pd.melt(wide, id_vars=['date'], value_vars=['actuals', 'preds'], var_name='type', value_name='congestion')


tr_pred_df = _actuals_vs_preds(X_train_all[['date']], y_train, tr_preds)
val_pred_df = _actuals_vs_preds(X_val_all[['date']], y_val, val_preds)

In [None]:
# actuals vs predictions over time: training data on top, validation data below
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))

line_kws = dict(x="date", y="congestion", hue="type", style="type", palette='crest_r', ci=None, sort=True)
for pred_frame, axis in zip((tr_pred_df, val_pred_df), axs):
    sns.lineplot(data=pred_frame, ax=axis, **line_kws)

sns.despine();

In [None]:
# influence plots show the (externally) studentised results against the leverage of each observation as measured by the hat matrix
#fig = sm.graphics.influence_plot(ols_est, criterion="cooks")
#fig.tight_layout(pad=1.0)

In [None]:
# also plot the leverage-resid2 plot, which is closely related to the influence plot
#fig = sm.graphics.plot_leverage_resid2(ols_est)
#fig.tight_layout(pad=1.0)

In [None]:
# make predictions on the test data
X_test_all = X_test.copy()
# `date` is not a model input; dropping it by name avoids the hard-coded
# 'x4_23' label slice, and errors='ignore' keeps the cell re-runnable
X_test = X_test_all.drop(columns='date', errors='ignore')

# has_constant='add' forces the constant column to be added even if a test
# feature column happens to be constant
X_test_const = sm.add_constant(X_test, has_constant='add')
test_preds = ols_est.predict(X_test_const)

# need to reimport as changed the test file (row_id was dropped from `test`)
test_orig = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv', parse_dates=['time'])

# both pieces are aligned on the row index when the frame is built
test_pred_df = pd.DataFrame({'row_id': test_orig['row_id'], 'congestion': test_preds})

# a length mismatch or NaN here would mean the predictions and row ids were misaligned
assert len(test_pred_df) == len(test_orig), "prediction rows do not match test rows"
assert test_pred_df.notna().all().all(), "missing values in submission frame"

test_pred_df.head()

In [None]:
# write the submission file (row_id, congestion) without the pandas index column
submission_path = '/kaggle/working/submission.csv'
test_pred_df.to_csv(submission_path, index=False)