In [None]:
import numpy as np 
import pandas as pd 
import math
import datetime

from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor, Pool
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from numpy import mean, median
# check xgboost version
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from warnings import simplefilter
import tensorflow as tf
simplefilter("ignore")

In [None]:
# Load the competition data; `row_id` becomes the index and `time` is parsed as datetime.
csv_options = dict(index_col="row_id", parse_dates=['time'])
df_train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv', **csv_options)
df_test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv', **csv_options)
# Keep the original row ids: later merges replace the index with a fresh RangeIndex.
df_train_idx, df_test_idx = df_train.index, df_test.index

In [None]:
# Preview the first five training rows.
df_train.head(5)

In [None]:
# Preview the first five test rows.
df_test.head(5)

In [None]:
# Number of training rows.
df_train.shape[0]

In [None]:
# Row count per travel direction.
df_train.direction.value_counts()

In [None]:
# Row count per `y` coordinate value.
df_train.y.value_counts()

# Feature Engineering

In [None]:
# Create a `road` key identifying each (x, y, direction) combination.
# The parts are joined with an explicit separator so that distinct coordinates can
# never collapse into the same string (without one, x=1,y=12 and x=11,y=2 would
# both produce "112..."). For single-digit coordinates the sorted order of the
# keys, and therefore the encoded labels, is the same as without the separator.
df_train['road'] = df_train['x'].astype(str) + '_' + df_train['y'].astype(str) + '_' + df_train['direction']
df_test['road']  = df_test['x'].astype(str) + '_' + df_test['y'].astype(str) + '_' + df_test['direction']
# Label-encode the road key: fit on train only, then reuse the same mapping for test
# (`transform` raises if the test set contains a road not seen in train).
le = LabelEncoder()
df_train['road'] = le.fit_transform(df_train['road'])
df_test['road']  = le.transform(df_test['road'])

In [None]:

# Row count per encoded road id.
df_train.road.value_counts()

In [None]:
# Derive weekday, hour and minute from the timestamp, for both train and test.
for frame in (df_train, df_test):
    stamps = frame['time'].dt
    frame['weekday'] = stamps.weekday
    frame['hour'] = stamps.hour
    frame['minute'] = stamps.minute

In [None]:
# Minimum training congestion per ('road', 'weekday', 'hour', 'minute') slot,
# attached to both train and test as the `min` column.
slot_keys = ['road', 'weekday', 'hour', 'minute']
mins = (
    df_train.groupby(slot_keys)['congestion']
    .min()
    .astype(int)
    .rename('min')
    .reset_index()
)
df_train = df_train.merge(mins, how='left', on=slot_keys)
df_test = df_test.merge(mins, how='left', on=slot_keys)

In [None]:
# Maximum training congestion per ('road', 'weekday', 'hour', 'minute') slot,
# attached to both train and test as the `max` column.
slot_keys = ['road', 'weekday', 'hour', 'minute']
maxs = (
    df_train.groupby(slot_keys)['congestion']
    .max()
    .astype(int)
    .rename('max')
    .reset_index()
)
df_train = df_train.merge(maxs, how='left', on=slot_keys)
df_test = df_test.merge(maxs, how='left', on=slot_keys)

In [None]:
# Median training congestion per ('road', 'weekday', 'hour', 'minute') slot
# (truncated to int), attached to both train and test as the `median` column.
slot_keys = ['road', 'weekday', 'hour', 'minute']
medians = (
    df_train.groupby(slot_keys)['congestion']
    .median()
    .astype(int)
    .rename('median')
    .reset_index()
)
df_train = df_train.merge(medians, how='left', on=slot_keys)
df_test = df_test.merge(medians, how='left', on=slot_keys)

In [None]:
# Day-lag features: `lag_k` is the congestion recorded for the same road exactly
# k days earlier (k = 1..7), looked up in the training data.
# Only three columns are needed to build every lag, so take them once up front
# instead of copying the whole (growing) training frame on each iteration.
# NOTE(review): assumes (time, road) pairs are unique in train, so the left merges
# never duplicate rows - confirm.
lag_source = df_train[['time', 'road', 'congestion']]
for delta in range(1, 8):
    name = f'lag_{delta}'
    # Shift timestamps forward by `delta` days so a row at time t picks up the value from t - delta days.
    day = lag_source.rename(columns={'congestion': name}).assign(
        time=lambda shifted: shifted['time'] + pd.Timedelta(delta, unit="d")
    )
    df_train = df_train.merge(day, on=['time', 'road'], how='left')
    df_test = df_test.merge(day, on=['time', 'road'], how='left')
# Every remaining NaN in either frame (not just the lag columns) is replaced with
# the overall median training congestion, computed once.
congestion_median = df_train["congestion"].median()
df_train = df_train.fillna(congestion_median)
df_test = df_test.fillna(congestion_median)

In [None]:
# Inspect the columns currently present in the test frame.
df_test.columns

In [None]:
# Morning congestion level per calendar day and road.
# Despite the column name `morning_avg`, the statistic is the *median* congestion
# (truncated to int) over rows with 6 <= hour < 12, grouped by (month, day, road).
for frame in (df_train, df_test):
    frame['month'] = frame['time'].dt.month
    frame['day'] = frame['time'].dt.day

morning_keys = ['month', 'day', 'road']
is_morning = df_train['hour'].ge(6) & df_train['hour'].lt(12)
df_mornings = df_train[is_morning]
morning_avgs = (
    df_mornings.groupby(morning_keys)['congestion']
    .median()
    .astype(int)
    .rename('morning_avg')
    .reset_index()
)
df_train = df_train.merge(morning_avgs, how='left', on=morning_keys)
df_test = df_test.merge(morning_avgs, how='left', on=morning_keys)

In [None]:
# 25th-percentile training congestion per ('road', 'weekday', 'hour'), truncated to int.
# A scalar quantile keeps the result indexed by the group keys only, so there is no
# extra quantile index level to drop afterwards via its auto-generated name.
hourly_keys = ['road', 'weekday', 'hour']
quantile25 = (
    df_train.groupby(hourly_keys)['congestion']
    .quantile(0.25)
    .astype(int)
    .rename('quantile25')
    .reset_index()
)
df_train = df_train.merge(quantile25, on=hourly_keys, how='left')
df_test = df_test.merge(quantile25, on=hourly_keys, how='left')

In [None]:
# 75th-percentile training congestion per ('road', 'weekday', 'hour'), truncated to int.
# A scalar quantile keeps the result indexed by the group keys only, so there is no
# extra quantile index level to drop afterwards via its auto-generated name.
hourly_keys = ['road', 'weekday', 'hour']
quantile75 = (
    df_train.groupby(hourly_keys)['congestion']
    .quantile(0.75)
    .astype(int)
    .rename('quantile75')
    .reset_index()
)
df_train = df_train.merge(quantile75, on=hourly_keys, how='left')
df_test = df_test.merge(quantile75, on=hourly_keys, how='left')

In [None]:
# Modelling copy of the training data: the raw `direction` string is dropped
# (the `road` column already encodes it) and `hour` is cast to int.
# `df_train` itself is left untouched; `df_test` loses `direction` as well.
df_preproc = (
    df_train
    .drop(columns=['direction'])
    .assign(hour=lambda frame: frame['hour'].astype(int))
)
df_test = df_test.drop(columns=['direction'])


In [None]:

# Clipping bounds for the final predictions, derived from recent training data
# (day-of-year 246 onward).
#
# The quantiles are taken per time-of-day slot and road, NOT per full timestamp:
# grouping on `time` puts every row in its own group, so the "quantiles" are just
# the raw values and the arrays get one entry per recent training row rather than
# one per test row, which `prediction.clip(lower, upper)` cannot broadcast against.
# NOTE(review): `sep` mixes all weekdays; restrict it if the bounds should only
# reflect days comparable to the test period - confirm.
sep = df_train[(df_train.time.dt.dayofyear >= 246)]
bound_keys = ['hour', 'minute', 'road']
recent_congestion = sep.groupby(bound_keys).congestion
bounds = pd.concat(
    [
        recent_congestion.quantile(0.15).rename('lower'),
        recent_congestion.quantile(0.7).rename('upper'),
    ],
    axis=1,
).reset_index()
# A left merge keeps the test row order, so bound i belongs to test row i.
aligned_bounds = df_test[bound_keys].merge(bounds, on=bound_keys, how='left')
# Slots without any recent history get infinite bounds, i.e. stay unclipped.
lower = aligned_bounds['lower'].fillna(-np.inf).values
upper = aligned_bounds['upper'].fillna(np.inf).values

In [None]:
# Split the preprocessed training frame into target and features; the raw
# timestamp is not used as a feature in either train or test.
Y = df_preproc['congestion']
X = df_preproc.drop(columns=['time', 'congestion'])
X_test = df_test.drop(columns=['time'])

In [None]:
# from sklearn.feature_selection import mutual_info_regression

# mi_scores = mutual_info_regression(X_train, Y_train)
# mi_scores = pd.Series(mi_scores, name="MI_score", index=X_train.columns)
# mi_scores = mi_scores.sort_values(ascending=False)
# df_mi_scores = pd.DataFrame(mi_scores).reset_index().rename(columns={'index':'feature'})
# df_mi_scores

In [None]:
# Calendar columns that were only needed to build the aggregate features;
# remove them from both feature matrices.
dropped = ['month', 'weekday', 'day', "minute"]
X, X_test = (features.drop(columns=dropped) for features in (X, X_test))

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42,stratify=X['road'])

# Model Fitting & Prediction

In [None]:
# CatBoost regression (MAE objective); the XGBoost and Keras experiments are kept commented out for reference
#param = {"booster":"gblinear", "objective":"reg:linear"}
  
# model = XGBRegressor(n_estimators=1000, max_depth=7)
# model.fit(X,Y)
# Fit a CatBoost regressor on the full training set, optimising and reporting MAE.
catboost_params = dict(
    logging_level='Silent',
    random_state=42,
    eval_metric='MAE',
    loss_function='MAE',
)
model = CatBoostRegressor(**catboost_params).fit(X, Y)
# tf.random.set_seed(42)
# model_6 = tf.keras.Sequential([
#   tf.keras.layers.Dense(80, activation = tf.keras.activations.relu),
#     tf.keras.layers.Dense(30,activation = tf.keras.activations.relu),
#   tf.keras.layers.Dense(10,activation = tf.keras.activations.relu),
#   tf.keras.layers.Dense(1)
# ])
# model_6.compile(loss=tf.keras.losses.mae,
#                 optimizer=tf.keras.optimizers.Adam(),
#                 metrics=['mae'])
# model_6.fit(X, Y, epochs=100,batch_size=32)

In [None]:
# folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

# scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=folds)
# scores Y

In [None]:
# Predict test congestion, then clamp each value into its [lower, upper] bound.
raw_prediction = model.predict(X_test)
prediction = np.clip(raw_prediction, lower, upper)

# Submission

In [None]:
# Pair each prediction with its original test row id and write the submission file.
submission_columns = {'row_id': df_test_idx, 'congestion': prediction}
final_df = pd.DataFrame(submission_columns)
final_df.to_csv('submission.csv', index=False)