In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

- The validation strategy is borrowed from AMBROSM's notebook:
[https://www.kaggle.com/code/ambrosm/tpsmar22-random-forest](https://www.kaggle.com/code/ambrosm/tpsmar22-random-forest)

In [None]:
# Single place to change the input location; the three CSVs below all live in
# this directory.
DATA_DIR = '../input/tabular-playground-series-mar-2022'

# Training rows (these carry the `congestion` target used later on).
train = pd.read_csv(f'{DATA_DIR}/train.csv')
print(train.shape)
# Rows to predict.
test = pd.read_csv(f'{DATA_DIR}/test.csv')
print(test.shape)
# Submission template; its `congestion` column is overwritten after prediction.
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Show the first 65 training rows.
train.iloc[:65]

In [None]:
# Stack train on top of test so features are derived once for both, and parse
# the `time` column into datetimes in the same step.
df = pd.concat([train, test]).assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)
df.head()

# Feature engineering

In [None]:
# --- Calendar features derived from the timestamp -------------------------
# Column insertion order is kept as-is because it defines the feature order
# the model is trained on.
df['hr'] = df['time'].dt.hour
df['month'] = df['time'].dt.month
df['dow'] = df['time'].dt.dayofweek
df['Sat'] = (df['dow'] == 5).astype('int8')
df['Sun'] = (df['dow'] == 6).astype('int8')

df['day'] = df['time'].dt.day
df['daytime'] = df['time'].dt.hour * 60 + df['time'].dt.minute  # minutes since midnight
df['min'] = df['time'].dt.minute
# `Series.dt.week` is deprecated and was removed in pandas 2.0; the ISO
# calendar week cast to int64 is what `.dt.week` used to return.
df['week'] = df['time'].dt.isocalendar().week.astype('int64')

# --- Lag / aggregate features ---------------------------------------------
# One group per (location, direction, time-of-day) slot, so consecutive rows
# inside a group are consecutive observations of that slot (consecutive days
# when no timestamps are missing).
slot_keys = ['x', 'y', 'direction', 'hr', 'min']
slot_congestion = df.groupby(slot_keys)['congestion']

df['yesterday'] = slot_congestion.shift(1)
df['lastweek'] = slot_congestion.shift(7)
# NOTE(review): this median is taken over every labelled row of the slot,
# including the row's own target and the rows later used for validation.
df['median_cong'] = slot_congestion.transform('median')
# Shift by one before rolling so the window only covers *previous*
# observations. Without the shift the window contains the current row's
# congestion: that leaks the target for training rows and yields NaN for every
# test row (whose congestion is missing).
df['rolling_7_std'] = slot_congestion.transform(
    lambda s: s.shift(1).rolling(7).std()
)
# Single identifier for a roadway: x, y and direction concatenated as text.
df['x+y+direction'] = df['x'].astype('str') + df['y'].astype('str') + df['direction'].astype('str')

df.head()

# Preprocess

In [None]:
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
# Replace the values of the columns listed in `obj_cols` with integer codes.
le = LabelEncoder()
obj_cols = ['direction', 'x+y+direction']
for col in obj_cols:
    # The encoder is refit on each column, so codes are independent per column.
    df[col] = le.fit(df[col]).transform(df[col])


# Use all Monday-Thursday afternoons (12:00 onwards) in August and September for validation

In [None]:
# Undo the earlier concat: the leading len(train) rows are the training rows
# and the trailing len(test) rows are the test rows, now with all features.
n_train, n_test = len(train), len(test)
train = df.iloc[:n_train]
test = df.iloc[-n_test:]

# Identifier, raw timestamp and target are not used as model inputs.
non_features = ['row_id', 'time', 'congestion']
X_train = train.drop(columns=non_features)
y_train = train['congestion']
X_test = test.drop(columns=non_features)



In [None]:
# Hold-out mask: month >= 8, weekday <= 3 (Monday through Thursday, since
# pandas numbers Monday as 0) and hour >= 12.
stamp = train['time'].dt
val_idx = (stamp.month >= 8) & (stamp.weekday <= 3) & (stamp.hour >= 12)

# Everything outside the hold-out window is used for fitting.
train_idx = ~val_idx

# Drop the non-feature columns once, then slice both partitions from the result.
feature_frame = train.drop(columns=['row_id', 'time', 'congestion'])

X_tr = feature_frame.loc[train_idx]
y_tr = train.loc[train_idx, 'congestion']

X_val = feature_frame.loc[val_idx]
y_val = train.loc[val_idx, 'congestion']

for part in (X_tr, X_val):
    print(part.shape)

# XGB local validation

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

In [None]:
# Hyper-parameters gathered in one mapping so they are easy to scan and tweak.
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.025,
    'max_depth': 9,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'tree_method': 'gpu_hist',
}
clf = xgb.XGBRegressor(**xgb_params)

# MAE is reported on both the fitting rows and the hold-out rows every 100
# rounds; training stops after 50 rounds without improvement.
# NOTE(review): newer xgboost releases expect `early_stopping_rounds` and
# `eval_metric` on the constructor rather than on `fit` - confirm against the
# installed version before upgrading.
watchlist = [(X_tr, y_tr), (X_val, y_val)]
clf.fit(
    X_tr,
    y_tr,
    eval_set=watchlist,
    eval_metric='mae',
    early_stopping_rounds=50,
    verbose=100,
)

# Predictions are rounded to whole numbers before scoring.
pred = np.round(clf.predict(X_val))

print('oof mae: ', mean_absolute_error(y_val, pred))

In [None]:
# Pair every feature name with the importance the fitted model assigned to it.
cols = list(X_tr.columns)
feature_imp = pd.DataFrame(
    sorted(zip(clf.feature_importances_, cols)),
    columns=['Value', 'Feature'],
)

# Keep the 20 largest importances for the chart.
top_features = feature_imp.sort_values(by="Value", ascending=False).head(20)

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=top_features, ax=ax)
ax.set_title('XGB Most Important Features')
fig.tight_layout()
plt.show()

# Refit on the full training set

In [None]:
# Fit the same estimator again, this time on all training rows (no hold-out);
# the training MAE is logged every 100 rounds.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train)],
    eval_metric='mae',
    verbose=100,
)

In [None]:
# Predict the test rows and round to whole numbers.
pred_test = clf.predict(X_test).round()

# Post-processing

In [None]:
# Start from the rounded model predictions.
sub['congestion'] = pred_test

# Compute the quantiles of workday afternoons in September except Labor Day
# (hour >= 12, weekday < 5, day-of-year >= 246).
sep = train[(train.time.dt.hour >= 12) & (train.time.dt.weekday < 5) &
            (train.time.dt.dayofyear >= 246)]
clip_keys = ['hr', 'min', 'x', 'y', 'direction']
sep_congestion = sep.groupby(clip_keys).congestion
bounds = pd.DataFrame({
    'lower': sep_congestion.quantile(0.15),
    'upper': sep_congestion.quantile(0.7),
}).reset_index()

# Look the bounds up by key for every test row. Taking `.values` of the
# groupby result and applying it positionally only works when the test rows
# happen to be ordered exactly like the sorted group keys and every group is
# present; the merge removes that hidden assumption.
aligned = test[clip_keys].merge(bounds, on=clip_keys, how='left')
assert len(aligned) == len(sub), 'expected exactly one bound pair per submission row'

# A test row without a matching September group is left unclipped.
lower = aligned['lower'].fillna(-np.inf).to_numpy()
upper = aligned['upper'].fillna(np.inf).to_numpy()

# Clip the submission data to the quantiles
submission_out = sub.copy()
submission_out['congestion'] = sub.congestion.clip(lower, upper).round().astype(int)


submission_out.to_csv('submission.csv', index = False)

In [None]:
# Display the clipped submission frame that was written to submission.csv.
submission_out

In [None]:
# Summary statistics of the submission as a quick sanity check.
submission_out.describe()