In [1]:
import numpy as np 
import pandas as pd 
import math
import datetime

from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.ensemble import GradientBoostingRegressor


from catboost import CatBoostRegressor, Pool
from sklearn.preprocessing import LabelEncoder

from numpy import mean, median

from warnings import simplefilter
simplefilter("ignore")

In [2]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor

In [3]:
# Load the competition data. `row_id` becomes the index and `time` is parsed
# to datetime so the `.dt` accessor can be used later on.
df_train = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv',
    index_col="row_id",
    parse_dates=['time'],
)
df_test = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/test.csv',
    index_col="row_id",
    parse_dates=['time'],
)

# Keep the original row_id indexes: the merges further down return frames
# with a fresh positional index.
df_train_idx, df_test_idx = df_train.index, df_test.index

In [4]:
# Build one string key per roadway: x coordinate, y coordinate and direction
# concatenated (e.g. x=0, y=1, direction='EB' -> '01EB').
for _frame in (df_train, df_test):
    _frame['road'] = _frame['x'].astype(str) + _frame['y'].astype(str) + _frame['direction']

# Encode the string key as an integer id. The encoder is fitted on the
# training roads only and then re-used for the test set.
le = LabelEncoder()
df_train['road'] = le.fit_transform(df_train['road'])
df_test['road'] = le.transform(df_test['road'])

In [5]:
# Lag feature: shift every training observation 20 minutes forward in time so
# that, after the join, each row sees the congestion of the same road at the
# previous timestamp in a column called `lag`.
minute = (
    df_train
    .assign(time=df_train['time'] + pd.Timedelta(20, unit="m"))
    .rename(columns={'congestion': 'lag'})
    [['time', 'direction', 'road', 'lag']]
)

# Left joins keep every original row; rows without an observation 20 minutes
# earlier get NaN in `lag`. Note that merge() returns a fresh positional index.
_lag_keys = ['time', 'direction', 'road']
df_train = df_train.merge(minute, on=_lag_keys, how='left')
df_test = df_test.merge(minute, on=_lag_keys, how='left')

In [6]:
# Preview a long slice of the merged frame: the earliest rows have no
# observation 20 minutes before them, so `lag` is NaN there, while later rows
# carry the previous congestion value.
df_train.head(500)

Unnamed: 0,time,x,y,direction,congestion,road,lag
0,1991-04-01 00:00:00,0,0,EB,70,0,
1,1991-04-01 00:00:00,0,0,NB,49,1,
2,1991-04-01 00:00:00,0,0,SB,24,2,
3,1991-04-01 00:00:00,0,1,EB,18,3,
4,1991-04-01 00:00:00,0,1,NB,60,4,
...,...,...,...,...,...,...,...
495,1991-04-01 02:20:00,2,0,NB,82,40,47.0
496,1991-04-01 02:20:00,2,0,SB,63,41,67.0
497,1991-04-01 02:20:00,2,0,WB,67,42,57.0
498,1991-04-01 02:20:00,2,1,EB,57,43,33.0


In [7]:
# Total time span covered by the training data. Using min/max instead of a
# hard-coded last index label keeps the cell valid if the row count changes.
df_train['time'].max() - df_train['time'].min()

Timedelta('182 days 11:40:00')

In [8]:
def add_datetime_features(df):
    """Add calendar and time-of-day columns derived from ``df['time']``.

    The frame is modified in place and nothing is returned. Columns added:
    ``month``, ``day``, ``weekday``, ``weekend`` (weekday >= 5), ``hour``,
    ``minute``, ``afternoon`` (hour >= 12) and ``daytime_id`` (index of the
    20-minute slot within the day).
    """
    stamp = df['time']

    # Calendar position.
    df['month'] = stamp.dt.month
    df['day'] = stamp.dt.day
    df['weekday'] = stamp.dt.weekday
    df['weekend'] = stamp.dt.weekday >= 5

    # Time of day.
    df['hour'] = stamp.dt.hour
    df['minute'] = stamp.dt.minute
    df['afternoon'] = df['hour'] >= 12

    # Minutes since midnight, expressed in 20-minute slots.
    minutes_since_midnight = stamp.dt.hour * 60 + stamp.dt.minute
    df['daytime_id'] = (minutes_since_midnight / 20).astype(int)

In [9]:
# Derive the calendar / time-of-day columns on both frames (in place).
for _frame in (df_train, df_test):
    add_datetime_features(_frame)

In [10]:
# Typical congestion for every (road, 20-minute slot) pair, cast to int.
# NOTE(review): this rebinds the name `median` that the first cell imported
# from numpy; the name is kept because the following cells read it.
median = (
    df_train
    .groupby(['road', 'daytime_id'])['congestion']
    .median()
    .astype(int)
)

In [11]:
# Attach the per-(road, daytime_id) median to every training row.
# Both inputs carry a `congestion` column, so pandas suffixes them:
# `congestion_x` is the row's own value, `congestion_y` the group median.
# (The former `df = df_train.copy()` was dropped: it was overwritten
# immediately and only cost a full copy of the frame.)
df = df_train.merge(median, left_on=['road', 'daytime_id'], right_index=True)

In [12]:
# Attach the training median of each (road, daytime_id) pair to the test rows.
# The test frame has no `congestion` column of its own, so the merged-in
# median keeps the plain name `congestion`.
# (The former `df2 = df_test.copy()` was dropped: it was overwritten
# immediately and only cost an extra copy of the frame.)
df2 = df_test.merge(median, left_on=['road', 'daytime_id'], right_index=True)

In [13]:
# In the merged test frame the `congestion` column holds the group median;
# copy it onto df_test as `median` (assignment aligns on the row index).
medt = df2.loc[:, 'congestion']
df_test['median'] = medt

In [14]:
# Sanity check: the test frame should now show the datetime features and the
# `median` column next to `lag`.
df_test.head()

Unnamed: 0,time,x,y,direction,road,lag,month,day,weekday,weekend,hour,minute,afternoon,daytime_id,median
0,1991-09-30 12:00:00,0,0,EB,0,70.0,9,30,0,False,12,0,True,36,47
1,1991-09-30 12:00:00,0,0,NB,1,40.0,9,30,0,False,12,0,True,36,34
2,1991-09-30 12:00:00,0,0,SB,2,51.0,9,30,0,False,12,0,True,36,50
3,1991-09-30 12:00:00,0,1,EB,3,24.0,9,30,0,False,12,0,True,36,26
4,1991-09-30 12:00:00,0,1,NB,4,65.0,9,30,0,False,12,0,True,36,71


In [15]:
# `congestion_y` is the group median brought in by the merge on the training
# frame; store it as the `median` feature.
med = df.loc[:, 'congestion_y']
df_train['median'] = med

# Rows without an observation 20 minutes earlier have no lag value: fall back
# to the median of the same road and time slot.
for _frame in (df_train, df_test):
    _frame['lag'] = _frame['lag'].fillna(_frame['median'])

In [16]:
# Sanity check: `lag` should no longer be empty on the first rows, which were
# filled from `median`.
df_train.head()

Unnamed: 0,time,x,y,direction,congestion,road,lag,month,day,weekday,weekend,hour,minute,afternoon,daytime_id,median
0,1991-04-01,0,0,EB,70,0,38.0,4,1,0,False,0,0,False,0,38
1,1991-04-01,0,0,NB,49,1,36.0,4,1,0,False,0,0,False,0,36
2,1991-04-01,0,0,SB,24,2,24.0,4,1,0,False,0,0,False,0,24
3,1991-04-01,0,1,EB,18,3,21.0,4,1,0,False,0,0,False,0,21
4,1991-04-01,0,1,NB,60,4,67.0,4,1,0,False,0,0,False,0,67


In [17]:
# Inputs for the feature-relevance check: the target on one side, and every
# column except the target, the raw coordinates/direction (already encoded in
# `road`) and the raw timestamp on the other.
y_train = df_train['congestion']
X_train = df_train.drop(columns=['congestion', 'x', 'y', 'direction', 'time'])

In [18]:
from sklearn.feature_selection import mutual_info_regression

# Mutual information between each candidate feature and the target.
# The estimator adds small random noise to continuous features, so a fixed
# random_state is needed for the scores to be reproducible across runs.
mi_scores = mutual_info_regression(X_train, y_train, random_state=0)
mi_scores = pd.Series(mi_scores, name="MI_score", index=X_train.columns)
mi_scores = mi_scores.sort_values(ascending=False)

# Tidy two-column table (feature, MI_score), highest score first.
df_mi_scores = pd.DataFrame(mi_scores).reset_index().rename(columns={'index':'feature'})
df_mi_scores

Unnamed: 0,feature,MI_score
0,lag,0.771276
1,median,0.751966
2,road,0.70758
3,daytime_id,0.042295
4,hour,0.042196
5,afternoon,0.018478
6,weekend,0.005591
7,weekday,0.004442
8,day,0.001692
9,month,0.001204


The 20-minute lag and the median have the highest mutual-information scores, so they seem to be the most informative features.

The models below use the median congestion and the congestion value from 20 minutes earlier to predict the test congestion (the `x` and `y` coordinates are also left in the feature set). See this [notebook](https://www.kaggle.com/code/robertturro/tps-march-2022-useful-visuals) for the EDA and visuals.

In [19]:
# Target and model inputs. Everything listed here is dropped from both frames;
# given the columns shown in the previews above, what remains is
# x, y, lag and median.
y_train = df_train['congestion']

_unused_columns = [
    'time', 'direction', 'month', 'day', 'weekday', 'weekend',
    'hour', 'minute', 'afternoon', 'road', 'daytime_id',
]
train = df_train.drop(columns=_unused_columns + ['congestion'])
test = df_test.drop(columns=_unused_columns)

In [20]:
# Four regressors whose predictions are averaged further down.
cat = CatBoostRegressor(logging_level='Silent', eval_metric='MAE', loss_function='MAE', random_state=42)
ada = AdaBoostRegressor(n_estimators=500, random_state=0)
bag = BaggingRegressor(n_estimators=500, random_state=0)
# Seeded like the other models: without random_state the histogram binning
# subsample and the early-stopping validation split are drawn at random, so
# two runs of the notebook would not give the same predictions.
# NOTE(review): learning_rate=0.001 is very small; unless the number of
# boosting iterations is raised as well, this model probably stays close to a
# constant prediction - confirm this is intended.
hist = HistGradientBoostingRegressor(learning_rate=0.001, random_state=0)

# Fit every model on the same training matrix.
cat.fit(train, y_train)
ada.fit(train, y_train)
bag.fit(train, y_train)
hist.fit(train, y_train)

# Predict the test set with each model.
cat_pred = cat.predict(test)
ada_pred = ada.predict(test)
# NOTE: `bad_pred` holds the *bagging* predictions; the misspelt name is kept
# because the cell that fills the submission frame reads it.
bad_pred = bag.predict(test)
hist_pred = hist.predict(test)

In [21]:
# Submission template, indexed by row_id like the test set.
sample_submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/sample_submission.csv',
    index_col="row_id",
)

In [22]:
# Park each model's predictions in its own column of the submission frame.
# (`bad_pred` is the bagging model's output.)
_model_predictions = {
    'cat': cat_pred,
    'ada': ada_pred,
    'bag': bad_pred,
    'hist': hist_pred,
}
for _column, _values in _model_predictions.items():
    sample_submission[_column] = _values

In [23]:
# Blend = plain average of the four model columns only.
# Averaging the whole frame would also include the placeholder `congestion`
# column that ships with the sample submission (dividing by five instead of
# four), and re-running the cell would then feed the previous blend back in.
sample_submission['congestion'] = sample_submission[['cat', 'ada', 'bag', 'hist']].mean(axis=1)

In [24]:
# Convert to whole numbers. Round first: a bare astype(int) truncates toward
# zero and would bias every prediction downwards by up to one unit.
sample_submission = sample_submission.round().astype(int)

# Keep only the blended `congestion` column for the submission.
sub = sample_submission.drop(columns=['cat', 'ada', 'bag', 'hist'])

In [25]:
# Restore the original row_id index on the test frame (the merges replaced it
# with a positional one) and attach the blended prediction, aligned on row_id.
df_test = (
    df_test
    .set_index(df_test_idx)
    .assign(congestion=sample_submission['congestion'])
)

The clipping post-processing below is adapted from [TPSMAR22 Generalizing the Special Values](https://www.kaggle.com/code/ambrosm/tpsmar22-generalizing-the-special-values).

In [26]:
submission_in = sub.copy()

# Reference period for the bounds: weekday afternoons (hour >= 12) from day
# 246 of the year onwards.
sep = df_train[(df_train.time.dt.hour >= 12) & (df_train.time.dt.weekday < 5) &
            (df_train.time.dt.dayofyear >= 246)]

# Per (time slot, roadway) quantiles used as clipping bounds.
# Both bounds MUST come from the same grouping: `.values` discards the group
# labels, so the arrays are only comparable (and only line up with rows
# ordered by time, x, y, direction) when the key order is identical.
# Previously `upper` was grouped by (..., 'y', 'x', ...), which sorted its
# groups differently from `lower`.
_bound_keys = ['hour', 'minute', 'x', 'y', 'direction']
_sep_congestion = sep.groupby(_bound_keys).congestion
lower = _sep_congestion.quantile(0.15).values
upper = _sep_congestion.quantile(0.7).values

In [27]:
# Clip every prediction into its [lower, upper] band. The bounds are plain
# arrays, so they are matched to the submission rows by position.
submission_out = submission_in.assign(
    congestion=submission_in['congestion'].clip(lower=lower, upper=upper)
)

# Submission

In [28]:
# The quantile bounds can be fractional, so clipped values may be non-integer.
# Round to the nearest whole number; a bare astype(int) would truncate them.
submission_out['congestion'] = submission_out['congestion'].round().astype(int)

In [29]:
# Turn the row_id index back into a regular column so it is written to the CSV.
submission_out = submission_out.reset_index(drop=False)

In [30]:
# Write the submission file; the positional index is not part of the output.
submission_out.to_csv(path_or_buf='submission.csv', index=False)