In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
!pip install pytorch-tabnet
from pytorch_tabnet.tab_model import TabNetRegressor

- The validation strategy is adapted from AMBROSM's notebook:
[https://www.kaggle.com/code/ambrosm/tpsmar22-random-forest](https://www.kaggle.com/code/ambrosm/tpsmar22-random-forest)

In [None]:
# Read the competition's train/test tables and the sample-submission template.
train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')
sub = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')

# Report (rows, columns): train first, then test.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Preview the first 65 rows of the raw training table.
# NOTE(review): 65 is presumably one full timestamp's worth of roadways — TODO confirm.
train.head(65)

In [None]:
# Remove outlier days: drop every training row that falls on a listed holiday.

train['time'] = pd.to_datetime(train['time'])

# (month, day) pairs to exclude: Memorial Day, July 4, Labor Day.
holiday_dates = [(5, 27), (7, 4), (9, 2)]

stamp = train.time.dt
on_holiday = pd.Series(False, index=train.index)
for month_no, day_no in holiday_dates:
    on_holiday |= (stamp.month == month_no) & (stamp.day == day_no)

# Keep only the rows that matched none of the holiday dates.
train = train[~on_holiday]

In [None]:
# Stack train on top of test so features are derived on both in one pass.
# Row labels are kept as-is (no index reset). `time` is parsed again because the
# test rows still carry it in the form read from the CSV.
df = pd.concat([train, test], axis=0)
df = df.assign(time=pd.to_datetime(df['time']))
df.head()

# Feature engineering (FE)

In [None]:
# --- Calendar features derived from the timestamp -------------------------------
time_parts = df['time'].dt

df['hr'] = time_parts.hour
df['month'] = time_parts.month
df['dow'] = time_parts.dayofweek
df['Sat'] = (df['dow']==5).astype('int8')
df['Sun'] = (df['dow']==6).astype('int8')

df['day'] = time_parts.day
df['daytime'] = time_parts.hour * 60 + time_parts.minute  # minutes since midnight
df['min'] = time_parts.minute
# `Series.dt.week` was deprecated and then removed from pandas; isocalendar().week
# is its replacement. It yields a nullable UInt32 column, so cast to int64 to keep
# the feature matrix a plain numeric array when `.values` is taken later.
df['week'] = time_parts.isocalendar().week.astype('int64')

# --- Lag / aggregate features per (roadway, time-of-day slot) -------------------
# Shifts work on row order inside each group, so rows are assumed to be in
# chronological order (train first, then test) — TODO confirm against the CSVs.
slot_keys = ['x','y','direction','hr','min']
slot_cong = df.groupby(slot_keys)['congestion']

df['yesterday'] = slot_cong.shift(1)   # previous available observation of the slot
df['lastweek'] = slot_cong.shift(7)    # seven observations back in the same slot
# NOTE(review): the median uses every labelled row of the slot, including rows
# that are later held out for validation.
df['median_cong'] = slot_cong.transform('median')
# Shift by one before rolling so the window never contains the row's own target.
# Without the shift the feature leaks `congestion` on train rows and is always
# NaN on test rows (their congestion is missing).
df['rolling_7_std'] = slot_cong.transform(lambda s: s.shift(1).rolling(7).std())

# Single string id per roadway (x, y and direction concatenated).
df['x+y+direction'] = df['x'].astype('str') + df['y'].astype('str') + df['direction'].astype('str')

df.head()

In [None]:
# Disabled experiment: when uncommented, keeps only rows with month >= 8.
#df = df[df.month >= 8]

# Pre-process

In [None]:
# Integer-encode the string-valued columns over the combined train+test frame.
# The same encoder object is refitted for each column in turn.
obj_cols = ['direction','x+y+direction']
le = LabelEncoder()
df = df.assign(**{col_name: le.fit_transform(df[col_name]) for col_name in obj_cols})


# Use all Monday–Thursday afternoons (weekday <= 3) in August and September for validation

In [None]:
# Impute remaining NaNs (the shifted/rolling lag features, and the test rows'
# missing `congestion`) with the median of the respective column.
# numeric_only=True is passed explicitly so the datetime `time` column is left
# out of the median computation regardless of the pandas version's default.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Undo the earlier concat: the first len(train) rows are the training part and
# the last len(test) rows are the test part of the combined frame.
n_train_rows = len(train)
n_test_rows = len(test)
train = df.iloc[:n_train_rows]
test = df.iloc[-n_test_rows:]

# Identifier, raw timestamp and target are not model inputs.
non_feature_cols = ['row_id','time','congestion']

X_train = train.drop(columns=non_feature_cols)
y_train = train['congestion']
X_test = test.drop(columns=non_feature_cols)


In [None]:
# Inspect the first rows of the feature matrix (column order matters for cat_idxs below).
X_train.head()

In [None]:
features = list(X_train.columns)

# Columns TabNet should treat as categorical. Positions are looked up by name so
# that adding, removing or reordering features cannot silently point at the wrong column.
cat_cols = ['x', 'y', 'direction', 'x+y+direction']
cat_idxs = [features.index(col_name) for col_name in cat_cols]
print(cat_idxs)

# An embedding table must cover every code that can occur, i.e. max code + 1,
# taken over both train and test. This equals nunique() only when the codes are
# contiguous from 0 and all present in train.
cat_dims = [int(max(X_train[col_name].max(), X_test[col_name].max())) + 1
            for col_name in cat_cols]
cat_dims

In [None]:
# Hold-out mask: rows from August onwards, with weekday 0-3 (Monday through
# Thursday), at 12:00 or later. All remaining rows are used for fitting.
when = train.time.dt
val_idx = (when.month >= 8) & (when.weekday <= 3) & (when.hour >= 12)
train_idx = ~val_idx

fit_rows = train.loc[train_idx]
held_out_rows = train.loc[val_idx]

# Identifier, raw timestamp and target are not model inputs.
non_feature_cols = ['row_id','time','congestion']

# Targets are reshaped to column vectors of shape (n, 1).
X_tr = fit_rows.drop(non_feature_cols, axis = 1)
y_tr = fit_rows['congestion'].to_numpy().reshape(-1,1)

X_val = held_out_rows.drop(non_feature_cols, axis = 1)
y_val = held_out_rows['congestion'].to_numpy().reshape(-1,1)

for part in (X_tr, X_val):
    print(part.shape)

# TabNet local validation

In [None]:
%%time

# cat_dims (computed in an earlier cell) has to accompany cat_idxs: it gives the
# number of categories per categorical column so that embeddings can be built.
# NOTE(review): with cat_idxs alone, pytorch-tabnet either rejects the
# configuration or skips the embeddings, depending on the installed version.
clf = TabNetRegressor(cat_idxs=cat_idxs, cat_dims=cat_dims)

# Fit on the non-validation rows; MAE is tracked on both splits and training
# stops early after 3 epochs without improvement (at most 10 epochs).
clf.fit(X_tr.values, y_tr,
    eval_set=[(X_tr.values, y_tr),(X_val.values, y_val)],
    eval_name=['train','val'],
    eval_metric=['mae'],
    max_epochs=10,
    patience=3,
   )

   
   

# Refit on the full training set

In [None]:
# Target as a column vector of shape (n_samples, 1), the layout passed to clf.fit.
y_train_new = np.expand_dims(y_train.to_numpy(), axis=1)

In [None]:
# Sanity check: features and target should have the same number of rows.
for arr in (X_train.values, y_train_new):
    print(arr.shape)

In [None]:
# Fit on every training row; the only evaluation set is the training data
# itself, monitored with MAE (up to 20 epochs, patience 3).
clf.fit(
    X_train.values,
    y_train_new,
    eval_set=[(X_train.values, y_train_new)],
    eval_metric=['mae'],
    max_epochs=20,
    patience=3,
)

In [None]:
# Predict the test rows, then round the predictions to whole numbers.
raw_pred = clf.predict(X_test.values)
pred_test = np.round(raw_pred)

# Post-processing

In [None]:
# Write the flattened predictions into the submission template (row order of
# `sub` is taken to match the row order of `test`).
sub['congestion'] = pred_test.reshape(-1)

# Compute the quantiles of workday afternoons in September except Labor Day
sep = train[(train.time.dt.hour >= 12) & (train.time.dt.weekday < 5) &
            (train.time.dt.dayofyear >= 246)]

# Per (time slot, roadway) lower/upper bounds: the 15% and 70% quantiles.
slot_keys = ['hr', 'min', 'x', 'y', 'direction']
sep_cong = sep.groupby(slot_keys).congestion
bounds = pd.concat(
    [sep_cong.quantile(0.15).rename('lower'),
     sep_cong.quantile(0.7).rename('upper')],
    axis=1,
).reset_index()

# Attach the bounds to each test row by key instead of relying on the groupby
# sort order happening to coincide with the test row order. A left merge keeps
# the test row order; a slot with no September history gets NaN bounds, which
# `clip` treats as "do not clip".
aligned = test[slot_keys].merge(bounds, on=slot_keys, how='left')
assert len(aligned) == len(sub), "quantile bounds do not line up with the submission rows"
lower = aligned['lower'].to_numpy()
upper = aligned['upper'].to_numpy()

# Clip the submission data to the quantiles
submission_out = sub.copy()
submission_out['congestion'] = sub.congestion.clip(lower, upper).round().astype(int)


submission_out.to_csv('submission.csv', index = False)

In [None]:
# Display the clipped submission frame that was written to submission.csv.
submission_out

In [None]:
# Summary statistics of the submission columns as a final sanity check.
submission_out.describe()