# Import Libraries

In [None]:
from pathlib import Path
import datetime as dt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

# Notebook-wide plotting defaults: apply the ggplot style sheet first, then
# override the default figure size so the override is not reset by the style.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (10, 10)})

In [None]:
class CFG:
    """Notebook-wide configuration constants."""

    # Random seed; not referenced by any code visible in this notebook yet.
    seed = 42
    # Number of distinct roadway ids that create_roadways cycles through.
    n_roadways = 65

def create_roadways(df, n_roadways=None):
    """Build one roadway id per row of ``df`` by cycling 0..n_roadways-1.

    Row ``i`` receives id ``i % n_roadways``. This presumes the rows arrive in
    repeating groups of ``n_roadways`` roadways in a fixed order -- TODO
    confirm against the data; nothing in this function verifies it.

    The result always has exactly ``len(df)`` entries, so it can be assigned
    as a column even when the row count is not a multiple of ``n_roadways``
    (a plain ``ids * (len(df) // n_roadways)`` would come up short there).

    Args:
        df: Any sized object; only ``len(df)`` is used.
        n_roadways: Number of distinct roadway ids. Defaults to
            ``CFG.n_roadways`` when omitted.

    Returns:
        A list of ``len(df)`` ints drawn from ``range(n_roadways)``.
    """
    if n_roadways is None:
        n_roadways = CFG.n_roadways
    ids = list(range(n_roadways))
    # Integer divmod avoids float division and tells us how many ids are
    # needed to pad the final, possibly incomplete, cycle.
    full_cycles, leftover = divmod(len(df), n_roadways)
    return ids * full_cycles + ids[:leftover]

def preprocess(df):
    """Return a copy of ``df`` with a 'roadway' column and datetime 'time'.

    The input frame is left untouched: ``assign`` works on a new frame. The
    'roadway' ids come from ``create_roadways`` and the existing 'time'
    column is converted with ``pd.to_datetime``.
    """
    return df.assign(
        roadway=create_roadways(df),
        # Callable form so 'time' is converted after 'roadway' is attached,
        # matching the step-by-step order of plain column assignment.
        time=lambda frame: pd.to_datetime(frame['time']),
    )

In [None]:
# Read the train and test CSVs and pass each straight through preprocess
# (adds the 'roadway' column and parses 'time' as datetime).
train = preprocess(pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv'))
test = preprocess(pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv'))

# Preview the first rows of the preprocessed training frame.
train.head()

# EDA

In [None]:
# Histogram of the 'congestion' column over the whole training set.
train['congestion'].plot(kind='hist', color="gray");

In [None]:
# Mean congestion for each roadway id, drawn as a bar chart.
train.groupby('roadway')['congestion'].agg('mean').plot(kind='bar');

In [None]:
# Congestion of one sample roadway (id 0) over time. The index is reset so
# the series is plotted against consecutive row positions.
df_roadway = train.loc[train['roadway'] == 0].reset_index(drop=True)
plt.plot(df_roadway['congestion'], color='blue');

In [None]:
# Mean congestion per 'direction' value, with each bar labelled by its height.
ax = train.groupby('direction')['congestion'].mean().plot(kind='bar')
ax.bar_label(ax.containers[0])
# Keep the x tick labels horizontal.
plt.xticks(rotation=0);

In [None]:
# Row count for each distinct 'x' value, ordered by the value itself
# (frequency ordering is skipped because the index sort decides the order).
train['x'].value_counts(sort=False).sort_index()

In [None]:
# Row count for each distinct 'y' value, ordered by the value itself
# (frequency ordering is skipped because the index sort decides the order).
train['y'].value_counts(sort=False).sort_index()

In [None]:
# Time span of the training data: first and last timestamp, then the duration.
train_start, train_end = train['time'].min(), train['time'].max()
print(train_start, train_end)
print(train_end - train_start)

In [None]:
# Time span of the test data: first and last timestamp, then the duration.
test_start, test_end = test['time'].min(), test['time'].max()
print(test_start, test_end)
print(test_end - test_start)

In [None]:
# Hold out the final 12 hours of the training period for validation: rows
# after this cutoff become the validation set.
holdout_window = dt.timedelta(hours=12)
val_cutoff = train['time'].max() - holdout_window
print('val_cutoff:', val_cutoff)

In [None]:
# Training split: every row at or before the validation cutoff.
X_train = train.loc[train['time'] <= val_cutoff].reset_index(drop=True)
fit_start, fit_end = X_train['time'].min(), X_train['time'].max()
print(fit_start, fit_end)
print(fit_end - fit_start)

In [None]:
# Validation split: every row strictly after the validation cutoff.
X_val = train.loc[train['time'] > val_cutoff].reset_index(drop=True)
val_start, val_end = X_val['time'].min(), X_val['time'].max()
print(val_start, val_end)
print(val_end - val_start)

# Baseline

In [None]:
# Baseline: predict each roadway's mean congestion from the training split
# (rounded to 0 decimals) and score it on the validation split with MAE.
preds = (
    X_train.groupby('roadway')['congestion']
    .mean()
    .rename('y_pred')
    .reset_index()
    .round(0)
)
# Left join keeps every validation row and attaches its roadway's prediction.
df_preds = pd.merge(X_val, preds, on='roadway', how='left')
mae = metrics.mean_absolute_error(
    y_true=df_preds['congestion'],
    y_pred=df_preds['y_pred'],
)
print('MAE:', mae)

In [None]:
# Per-roadway mean congestion over the full training set, rounded to 0
# decimals; this is the value predicted for every test row of that roadway.
preds_test = (
    train.groupby('roadway')['congestion']
    .mean()
    .rename('congestion')
    .reset_index()
    .round(0)
)

# Attach the prediction to each test row via a left join on 'roadway'.
df_sub = pd.merge(test[['row_id', 'roadway']], preds_test, on='roadway', how='left')
df_sub.head()

In [None]:
# Overlay the observed training congestion and the predicted test congestion
# as semi-transparent density histograms sharing the same settings.
hist_kwargs = dict(density=True, alpha=0.5, bins=30)
plt.hist(train['congestion'], label='Observed', **hist_kwargs)
plt.hist(df_sub['congestion'], label='Predicted', **hist_kwargs)
plt.legend();

# Submission

In [None]:
# Write only the 'row_id' and 'congestion' columns, without the index.
df_sub.to_csv('submission.csv', columns=['row_id', 'congestion'], index=False)

In [None]:
# IPython shell escape: print the first lines of the written submission file
# as a quick sanity check of its header and values.
!head submission.csv

# WIP