In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pyplot
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; this only silences the warning, it does not change what assignments do.
pd.options.mode.chained_assignment = None

## Data Info

In [None]:
# Read the competition's training and test tables, then preview the first
# training rows as the cell output.
train_path = '../input/tabular-playground-series-mar-2022/train.csv'
test_path = '../input/tabular-playground-series-mar-2022/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)
train.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Number of distinct values in each training column.
train.nunique()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the test frame.
test.info()

## EDA

In [None]:
# Convert the textual `time` column of both frames to datetime values so the
# `.dt` accessor can be used further down.
for frame in (train, test):
    frame['time'] = pd.to_datetime(frame['time'], format='%Y-%m-%d %H:%M:%S')


In [None]:
# Mean congestion per calendar month found in the training data.
temp = train['congestion'].groupby(train['time'].dt.month).mean()
fig, ax = plt.subplots(figsize=(18, 6))
ax.set_title('Month')
ax.bar(temp.index, temp, color='#ffd700')
# One single-letter label per bar (presumably April..September -- confirm
# against the months actually present in the data).
ax.set_xticks(temp.index)
ax.set_xticklabels('AMJJAS')
plt.show()

In [None]:
# Mean congestion per day of the week (pandas numbers Monday as 0).
temp = train['congestion'].groupby(train['time'].dt.dayofweek).mean()
fig, ax = plt.subplots(figsize=(18, 6))
ax.set_title('Day of week')
ax.bar(temp.index, temp, color='#ffd700')
# One single-letter label per weekday, Monday first.
ax.set_xticks(temp.index)
ax.set_xticklabels('MTWTFSS')
plt.show()

In [None]:
# Mean congestion per time of day, with the time expressed as fractional hours
# (hour + minute / 60).
fractional_hour = train['time'].dt.hour + train['time'].dt.minute / 60
temp = train['congestion'].groupby(fractional_hour).mean()
fig, ax = plt.subplots(figsize=(18, 6))
ax.set_title('Time of the day')
ax.bar(temp.index, temp, color='#ffd700', width=0.34)
ax.set_xticks(range(24))
ax.set_xlabel('Time of day')
ax.set_ylabel('Congestion')
plt.show()

In [None]:
def add_datetime_features(df):
    """Add calendar and time-of-day columns derived from ``df['time']``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``time`` column. The new columns
        (``month``, ``day``, ``weekday``, ``hour``, ``minute``, ``afternoon``,
        ``weekend``, ``moment``) are written onto this same object.

    Returns
    -------
    None
    """
    stamp = df['time'].dt
    df['month'] = stamp.month
    df['day'] = stamp.day
    df['weekday'] = stamp.weekday
    df['hour'] = stamp.hour
    df['minute'] = stamp.minute
    # True from 12:00 onwards.
    df['afternoon'] = df['hour'] >= 12
    # 1 for weekday numbers 5 and 6 (Saturday/Sunday), otherwise 0.
    df['weekend'] = np.where(df['weekday'] >= 5, 1, 0)
    # Index of the 20-minute slot within the day (three slots per hour).
    df['moment'] = df['hour'] * 3 + df['minute'] // 20

In [None]:
# Derive the datetime feature columns on both frames (the helper mutates in place).
for frame in (train, test):
    add_datetime_features(frame)

In [None]:
# Hour-of-day bin edges and the integer code given to each bin. Bins are
# right-closed, and include_lowest=True puts hour 0 into the first bin.
b = [0, 6, 9, 11, 16, 19, 24]
l = [0, 1, 2, 3, 4, 5]
for frame in (train, test):
    hour_of_day = frame['time'].dt.hour
    frame['day_part'] = pd.cut(hour_of_day, bins=b, labels=l, include_lowest=True).astype(int)
    # The raw timestamp and the row identifier are not used as model inputs.
    frame.drop(columns=['time', 'row_id'], inplace=True)

In [None]:
from sklearn import preprocessing
# Learn the direction -> integer mapping from the training frame only, then
# apply that same mapping to both frames.
le = preprocessing.LabelEncoder().fit(train['direction'])
train['direction'] = le.transform(train['direction'])
test['direction'] = le.transform(test['direction'])

In [None]:
# Per (month, day, direction, x, y): median training congestion over the hours
# 6..11, truncated to int. Despite the column name this is a median, not a mean.
morning_keys = ['month', 'day', 'direction', 'x', 'y']
df_mornings = train[train['hour'].between(6, 11)]
morning_avgs = (
    df_mornings.groupby(morning_keys)['congestion']
    .median()
    .astype(int)
    .rename('morning_avg')
    .reset_index()
)
# Left joins keep every row; keys with no morning data end up as NaN.
train = train.merge(morning_avgs, on=morning_keys, how='left')
test = test.merge(morning_avgs, on=morning_keys, how='left')

In [None]:
# Lowest training congestion seen for each (direction, x, y, weekday, hour,
# minute) slot, attached to both frames as the `min` column.
slot_keys = ['direction', 'x', 'y', 'weekday', 'hour', 'minute']
mins = (
    train.groupby(slot_keys)['congestion']
    .min()
    .astype(int)
    .rename('min')
    .reset_index()
)
train = train.merge(mins, on=slot_keys, how='left')
test = test.merge(mins, on=slot_keys, how='left')

In [None]:
# Highest training congestion seen for each (direction, x, y, weekday, hour,
# minute) slot, attached to both frames as the `max` column.
slot_keys = ['direction', 'x', 'y', 'weekday', 'hour', 'minute']
maxs = (
    train.groupby(slot_keys)['congestion']
    .max()
    .astype(int)
    .rename('max')
    .reset_index()
)
train = train.merge(maxs, on=slot_keys, how='left')
test = test.merge(maxs, on=slot_keys, how='left')

In [None]:
# Median training congestion (truncated to int) for each (direction, x, y,
# weekday, hour, minute) slot, attached to both frames as the `median` column.
slot_keys = ['direction', 'x', 'y', 'weekday', 'hour', 'minute']
medians = (
    train.groupby(slot_keys)['congestion']
    .median()
    .astype(int)
    .rename('median')
    .reset_index()
)
train = train.merge(medians, on=slot_keys, how='left')
test = test.merge(medians, on=slot_keys, how='left')

In [None]:
# Preview the training frame now that the engineered columns have been added.
train.head()

In [None]:
# Re-check dtypes and non-null counts after the feature merges.
train.info()

## Feature Importance

In [None]:
# Separate the target column from the predictors.
y = train['congestion']
X = train.drop(columns='congestion')

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit a single decision tree on every predictor and read its impurity-based
# feature importances. The seed is fixed because scikit-learn permutes the
# features at each split, so tie-breaking (and therefore the importances)
# can otherwise differ from run to run.
model = DecisionTreeRegressor(random_state=0)
model.fit(X, y)
importance = model.feature_importances_
# Report each score with its column name so it can be matched to a feature,
# not just to a positional index.
feature_names = list(X.columns)
for i, (name, v) in enumerate(zip(feature_names, importance)):
    print('Feature: %0d (%s), Score: %.5f' % (i, name, v))
plt.figure(figsize=(18, 6))
plt.title('Decision tree feature importance')
plt.bar(feature_names, importance)
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Keep only the hand-picked feature columns, in this order, for both the
# training predictors and the test frame.
important_features = ['moment', 'median', 'min', 'max', 'morning_avg']
X = X[important_features]
test = test[important_features]


In [None]:
from sklearn.decomposition import PCA

In [None]:
# Reduce the selected features to two principal components. The projection is
# fitted on the training predictors only and then reused for the test frame.
pca = PCA(n_components=2)
# NOTE: from here on X is the PCA output rather than the original DataFrame
# (a NumPy array under scikit-learn's default output settings).
X = pca.fit_transform(X)
# Share of total variance captured by each of the two components.
print(pca.explained_variance_ratio_)
test = pca.transform(test)

## Modeling

In [None]:
import lightgbm as lgb
from catboost import CatBoostRegressor

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
# 5-fold cross-validation: one LightGBM and one CatBoost regressor per fold.
# The fitted models are kept for test-time averaging. `mae` and `r2` receive
# two entries per fold, LightGBM first and CatBoost second.
lgbm_models = []
cat_models = []
mae = []
r2 = []
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)
for train_index, val_index in kf.split(X):
    x_train, x_val = X[train_index], X[val_index]
    # KFold yields positional indices, so select target rows by position
    # (.iloc) rather than by label; label lookup is only equivalent while
    # `y` happens to carry a default RangeIndex.
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    lgbm = lgb.LGBMRegressor(learning_rate=0.4, objective='mae', n_estimators=600, random_state=0)
    cat = CatBoostRegressor(logging_level='Silent', eval_metric='MAE', loss_function='MAE', n_estimators=5000, early_stopping_rounds=20, random_state=0)
    # Early stopping and log silencing are passed as callbacks: the `verbose`
    # keyword of `fit` was removed in LightGBM 4, while callbacks are accepted
    # by both 3.3+ and 4.x.
    lgbm.fit(
        x_train, y_train,
        eval_set=[(x_val, y_val)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=20, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )
    cat.fit(x_train, y_train, eval_set=(x_val,y_val))
    lgbm_models.append(lgbm)
    cat_models.append(cat)
    # Predict the validation fold once per model and reuse it for both metrics.
    lgbm_val_pred = lgbm.predict(x_val)
    cat_val_pred = cat.predict(x_val)
    mae.append(mean_absolute_error(y_val, lgbm_val_pred))
    mae.append(mean_absolute_error(y_val, cat_val_pred))
    r2.append(r2_score(y_val, lgbm_val_pred))
    r2.append(r2_score(y_val, cat_val_pred))

In [None]:
# Predict the test set with every fold's LightGBM model and average the
# per-fold predictions element-wise.
preds_lgbm = [fold_model.predict(test) for fold_model in lgbm_models]
lgbm_prediction = np.mean(preds_lgbm, axis=0)

In [None]:
# Predict the test set with every fold's CatBoost model and average the
# per-fold predictions element-wise.
preds_cat = [fold_model.predict(test) for fold_model in cat_models]
cat_prediction = np.mean(preds_cat, axis=0)

In [None]:
# Mean validation score over all folds; each list mixes the LightGBM and
# CatBoost entries, so these are averages across both model families.
for label, scores in (('mae:', mae), ('r2:', r2)):
    print(label, np.mean(scores))

## Blended target

In [None]:
# Load the external "special values" table indexed by row_id, keeping only its
# congestion column under the name `special`.
special = (
    pd.read_csv('../input/tps-mar-22-special-values/special v2.csv', index_col="row_id")
    .loc[:, ['congestion']]
    .rename(columns={'congestion':'special'})
)

In [None]:
# Build the submission: blend the two fold-averaged predictions (60% LightGBM,
# 40% CatBoost), then override with the external special value wherever one
# exists for that row_id.
sample = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')
sample['congestion'] = lgbm_prediction * 0.6 + cat_prediction * 0.4
# `special` is indexed by row_id, whereas `sample` keeps row_id as an ordinary
# column and has a default 0..n-1 index. Join the row_id column against the
# index of `special`; an index-to-index merge would pair row positions with
# row ids and never pick up the intended special values.
sample = sample.join(special, on='row_id', how='left')
# Rows without a special value fall back to the blended prediction; the final
# target is rounded to the nearest integer.
sample['special'] = sample['special'].fillna(sample['congestion']).round().astype(int)
sample = sample.drop(['congestion'], axis=1).rename(columns={'special':'congestion'})
sample.to_csv('submission.csv', index=False)
sample.head()