In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# Load data and basic pre-processing

In [None]:
# Locations of the competition CSVs inside the Kaggle input directory.
train_csv = '../input/tabular-playground-series-mar-2022/train.csv'
test_csv = '../input/tabular-playground-series-mar-2022/test.csv'

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [None]:
def add_date_features(df):
    """Return a copy of ``df`` with calendar features derived from ``time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``time`` has been converted with ``pd.to_datetime``
        and the columns ``day`` (day of week, Monday=0), ``month``, ``hour``
        and ``week`` (ISO week number) have been added or replaced. The input
        frame is left untouched, so the call can be repeated on the raw data.
    """
    timestamps = pd.to_datetime(df['time'])
    return df.assign(
        time=timestamps,
        day=timestamps.dt.dayofweek,
        month=timestamps.dt.month,
        hour=timestamps.dt.hour,
        # ISO week number: the first days of January may belong to week 52/53
        # of the previous ISO year.
        week=timestamps.dt.isocalendar().week,
    )

In [None]:
# Derive the calendar columns for the training frame first, then the test frame.
df_train, df_test = (add_date_features(frame) for frame in (df_train, df_test))

# How does total congestion evolve on a weekly basis?

In [None]:
# Mean congestion per ISO week, averaged over all rows of the training data
weekly_grouped = df_train.groupby('week', as_index=False)['congestion'].mean()

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(weekly_grouped['week'], weekly_grouped['congestion'])
ax.set(
    ylabel='Average weekly total congestion',
    xlabel='Ordinal week',
)
plt.show()

# Weekly congestion average per road

In [None]:
# Mean congestion per ISO week for every (x, y) location, one panel per location.
# The distinct coordinates are computed once instead of on every loop pass.
road_xs = df_train['x'].unique()
road_ys = df_train['y'].unique()
grouped = df_train.groupby(['week', 'x', 'y'])['congestion'].mean().reset_index()

# squeeze=False keeps `ax` two-dimensional even when there is only a single row
# or column of panels, so the ax[i, j] indexing below always works.
fig, ax = plt.subplots(len(road_ys), len(road_xs), figsize=(30, 20), squeeze=False)

# Rows of the grid follow y, columns follow x.
for i, _y in enumerate(road_ys):
    for j, _x in enumerate(road_xs):
        view = grouped[(grouped['x'] == _x) & (grouped['y'] == _y)]

        panel = ax[i, j]
        panel.plot(view['week'], view['congestion'])
        panel.set_ylabel('Average weekly total congestion')
        panel.set_xlabel('Ordinal week')
        panel.set_title(f'Road (x, y): ({_x}, {_y})')
plt.show()

* There seems to be an unusual drop in the average congestion in week 40.
* It is particularly noticeable for roads (1, 1), (1, 2) and (2, 1).

# Create week lag congestion features

In [None]:
# Distinct coordinates and directions present in the training data
x_unique         = df_train['x'].unique()
y_unique         = df_train['y'].unique()
direction_unique = df_train['direction'].unique()

print(x_unique)
print(y_unique)
print(direction_unique)

print(f'Number of pairings: {len(x_unique) * len(y_unique) * len(direction_unique)}')

nlags = 20

# Lag the congestion series separately for every (x, y, direction) roadway:
# congestion_week_lag_k holds the congestion found k rows earlier within the
# same roadway, in the frame's current row order.
#
# Each row must appear exactly once in the result: the next cell concatenates
# `lagged_dfs`, so appending the same frame once per lag would replicate every
# row `nlags` times.
#
# NOTE(review): shift(k) moves the series by k *rows*, not by k calendar weeks,
# whereas the test-set lag built further down uses a 7-day time offset.
# Confirm which definition the `congestion_week_lag_*` features should follow.
congestion_by_road = df_train.groupby(['x', 'y', 'direction'])['congestion']

lagged = df_train.copy(deep=True)
for lag in range(1, nlags + 1):
    lagged[f'congestion_week_lag_{lag}'] = congestion_by_road.shift(lag)

# Kept as a list because the next cell combines it with pd.concat(lagged_dfs).
lagged_dfs = [lagged]

In [None]:
# Stitch the lagged frames back into a single training frame and discard every
# row that contains a missing value (e.g. rows without a complete lag history).
df_train = pd.concat(lagged_dfs).dropna()

# lag correlations

In [None]:
# Correlation between the current congestion and each of its lagged copies
lags = ['congestion'] + [f'congestion_week_lag_{k}' for k in range(1, nlags + 1)]
lag_view = df_train[lags]

fig, ax = plt.subplots(1, 1, figsize=(15, 15))
lag_correlations = lag_view.corr()
diverging_cmap = sns.color_palette("vlag", as_cmap=True)
sns.heatmap(lag_correlations, cmap=diverging_cmap, square=True, ax=ax, annot=True)
plt.show()

# Simple boosted regression tree (BRT) model

In [None]:
# Encode the travel direction as the sine of an angle that advances by pi/4 per
# compass step, starting at NB and going through EB.
# REF: https://www.kaggle.com/inversion/tps-mar-22-cyclical-features
from math import sin, cos, pi

# NOTE(review): with the sine alone NB and SB share the value 0.0, and the
# NE/SE and SW/NW pairs are numerically (almost) identical. `cos` is imported
# but no cosine feature is created — confirm whether one was intended.
sin_vals = dict(
    NB=0.0,
    NE=sin(1 * pi/4),
    EB=1.0,
    SE=sin(3 * pi/4),
    SB=0.0,
    SW=sin(5 * pi/4),
    WB=-1.0,
    NW=sin(7 * pi/4),
)

direction_codes = df_train['direction']
df_train['sin'] = direction_codes.map(sin_vals)

In [None]:
# Model inputs: location, calendar fields, the first lag and the direction encoding
feature_columns = ['x', 'y', 'day', 'hour', 'congestion_week_lag_1', 'sin']
training = df_train[feature_columns]
print(training)

In [None]:
# Random 80/20 hold-out split of the feature matrix and the congestion target
from sklearn.model_selection import train_test_split

congestion_target = df_train['congestion']
X_train, X_test, y_train, y_test = train_test_split(
    training,
    congestion_target,
    test_size=0.2,
    random_state=42,
)

# hyperparameter tuning

## Search space

In [None]:
# Hyperopt search space for the two tree-ensemble hyperparameters
from hyperopt import hp

# hp.quniform(label, low, high, step); the objective function casts the sampled
# values to int before building the model.
params = dict(
    n_estimators=hp.quniform('n_estimators', 25, 50, 5),
    max_depth=hp.quniform('max_depth', 1, 20, 2),
)

# Optional extra dimension, currently disabled:
#mln = [v + 1 for v in range(1, 20)] + [None]
#params['max_leaf_nodes'] = hp.choice('max_leaf_nodes', mln)

## Objective function

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from hyperopt import STATUS_OK

def objective(params):
    """Hyperopt objective: fit an XGBoost regressor and score it on the hold-out split.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters. ``n_estimators`` and ``max_depth`` are cast to
        ``int``; every other entry is forwarded to ``XGBRegressor`` unchanged.
        The dict itself is not modified.

    Returns
    -------
    dict
        ``loss`` (mean absolute error on ``X_test``/``y_test``), ``status``
        (``STATUS_OK``) and ``RMSE`` (root mean squared error on the same data).

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    """
    print(f'Running with {params}')

    # Work on a copy so the caller's dict keeps the values that were sampled.
    model_params = dict(params)
    model_params['n_estimators'] = int(model_params['n_estimators'])
    model_params['max_depth']    = int(model_params['max_depth'])
    # A 'max_leaf_nodes' entry, if re-enabled in the search space, would need
    # the same int cast unless it is None.

    # fit model
    model = XGBRegressor(**model_params, random_state=42, verbosity=2, tree_method='hist')
    #model = GradientBoostingRegressor(**params, random_state=42)
    #model = RandomForestRegressor(**params, random_state=42)
    model.fit(X_train, y_train)

    # make predictions with fitted model
    y_pred = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    mae = mean_absolute_error(y_test, y_pred)
    # Take the square root explicitly: the `squared=False` flag of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f'Loss: {mae}')

    # return metrics
    return {
        'loss'   : mae,
        'status' : STATUS_OK,
        'RMSE'   : rmse,
    }

## Run trials

In [None]:
# Disabled hyperparameter search: the code below is wrapped in a string literal,
# so running this cell only evaluates the string and starts no trials.
"""from hyperopt import fmin, tpe, Trials

trials = Trials()
best = fmin(objective,
            space = params,
            algo = tpe.suggest,
            max_evals = 50,
            trials = trials)
print(best)"""

# Checking with cross validation

In [None]:
# Disabled 5-fold cross-validation check: the code below is wrapped in a string
# literal, so running this cell does not fit any model.
"""from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

model = XGBRegressor(max_depth=18, n_estimators=45, tree_method='hist')

scores = cross_val_score(model,
                         X_train,
                         y_train,
                         cv = 5,
                         scoring = 'neg_mean_absolute_error',
                         verbose = 1)"""

In [None]:
# Disabled as well: the prints are inside a string literal and do not run.
"""print(np.mean(scores))
print(np.std(scores))"""
# NOTE(review): the two numbers below look like a recorded output of the prints
# above (mean and standard deviation of the fold scores) — confirm.
#-4.737009782253329
#0.005871818057510222

# Fitting model on all training data

In [None]:
from xgboost import XGBRegressor

# Hyperparameters of the final model, which is fitted on every training row.
# Alternative configuration kept for reference: max_depth=16, n_estimators=25.
final_model_params = dict(max_depth=18, n_estimators=45, tree_method='hist', verbosity=2)

model = XGBRegressor(**final_model_params)
model.fit(training, df_train['congestion'])

# Preparing test data

In [None]:
# prepare testing data
# Placeholder lag column (all NaN). Because a column with this name already
# exists when the lag table is merged in further down, pandas suffixes the two
# copies with `_x`/`_y`; the cleanup cell then drops the `_x` placeholder.
df_test['congestion_week_lag_1'] = np.nan
# Same direction encoding as applied to the training frame.
df_test['sin'] = df_test['direction'].map(sin_vals)

In [None]:
# Lookup table for the test-set lag feature: every training observation is
# re-stamped 7 days later, so that joining on (time, x, y, direction) yields
# the congestion observed exactly one week before the joined row.
df_lagged = (
    # Select the needed columns first; this avoids copying the whole frame.
    df_train[['time', 'x', 'y', 'congestion', 'direction']]
    .assign(time=lambda frame: frame['time'] + dt.timedelta(days=7))
    .rename(columns={'congestion': 'congestion_week_lag_1'})
    # Keep one row per join key so the merge cannot multiply test rows.
    .drop_duplicates(subset=['time', 'x', 'y', 'direction'])
)

In [None]:
# Attach last week's congestion to every test row.
# A left join keeps all test rows, in their original order, even when no
# training observation exists exactly 7 days earlier; those rows get NaN
# instead of disappearing (an inner join would drop them, and the predictions
# would then no longer line up with the submission file).
# df_test already carries a placeholder `congestion_week_lag_1` column, so the
# merge produces `..._x` (placeholder) and `..._y` (looked-up value).
testing = df_test.merge(
    df_lagged.drop_duplicates(),  # exact duplicate lag rows would multiply test rows
    on=['time', 'x', 'y', 'direction'],
    how='left',
)

# Fail loudly if the lag table still holds conflicting rows for one key.
assert len(testing) == len(df_test), 'lag merge changed the number of test rows'
print(testing)

In [None]:
# Remove the helper columns that are not model inputs (including the NaN
# placeholder from the merge) and give the merged lag its final name.
unused_columns = ['time', 'month', 'week', 'direction', 'congestion_week_lag_1_x']
testing = (
    testing
    .drop(columns=unused_columns)
    .rename(columns={'congestion_week_lag_1_y': 'congestion_week_lag_1'})
)

In [None]:
# Keep exactly the model's feature columns, in training order.
# Label selection raises KeyError when a feature is missing, whereas reindex()
# would silently create it as an all-NaN column and let the model predict on it.
testing = testing.loc[:, list(training.columns)]

In [None]:
# Quick look at the final test matrix: first rows, then the row count
print(testing.head(), len(testing), sep='\n')

# Evaluate test data

In [None]:
# Predict congestion for every row of the prepared test matrix.
pred = model.predict(testing)

In [None]:
# Put the predictions into the `congestion` column of the sample submission
# (assigned by position) and write the result next to the notebook.
submission_template = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')
submission = submission_template.assign(congestion=pred)
submission.to_csv('submission.csv', index=False)