In [None]:
from pathlib import Path
import datetime as dt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from tqdm.notebook import tqdm

# Notebook-wide plotting defaults: ggplot theme and a 10 x 10 default figure size.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (10, 10)})

In [None]:
# Directory holding the input CSV files; train.csv and test.csv are read from it.
PATH = Path('../input/tabular-playground-series-mar-2022')
!ls {PATH}

In [None]:
class CFG:
    """Notebook-wide constants."""

    # number of distinct roadway ids assigned by `create_roadways`
    n_roadways = 65
    # random seed placeholder
    seed = 42

def create_roadways(df, n_roadways=None):
    """Build a positional roadway id for every row of ``df``.

    The ids are the sequence ``0, 1, ..., n_roadways - 1`` repeated until the
    whole frame is covered, so row ``i`` receives ``i % n_roadways``. Only the
    length of ``df`` is used; whether the ids are meaningful depends on the
    caller passing rows in a repeating roadway order.

    Parameters
    ----------
    df : sized object
        Anything supporting ``len()`` (typically a DataFrame).
    n_roadways : int, optional
        Number of distinct ids. Defaults to ``CFG.n_roadways``.

    Returns
    -------
    list of int
        One id per row, ``len(df)`` entries in total.

    Raises
    ------
    ValueError
        If ``len(df)`` is not a multiple of ``n_roadways``. The cyclic
        labelling cannot cover such a frame, and a shorter list would only
        fail later with a less helpful length-mismatch error.
    """
    if n_roadways is None:
        n_roadways = CFG.n_roadways

    # integer arithmetic only: avoids the float round-trip of int(a / b)
    n_repeats, remainder = divmod(len(df), n_roadways)
    if remainder:
        raise ValueError(
            f'Expected a row count divisible by {n_roadways}, got {len(df)}'
        )
    return list(range(n_roadways)) * n_repeats

def preprocess(df):
    """Return a feature-enriched copy of ``df`` ordered by roadway and time.

    Adds a positional ``roadway`` id, parses ``time`` into datetimes, derives
    ``weekday``, ``weekend`` (1 when weekday >= 5, else 0), ``hour`` and
    ``minute`` from it, then sorts by ``['roadway', 'time']`` and resets the
    index. The input frame is left untouched.
    """
    enriched = df.assign(
        roadway=create_roadways(df),
        time=lambda d: pd.to_datetime(d['time']),
        # later entries may read the columns created by earlier ones
        weekday=lambda d: d['time'].dt.weekday,
        weekend=lambda d: (d['weekday'] >= 5).astype(int),
        hour=lambda d: d['time'].dt.hour,
        minute=lambda d: d['time'].dt.minute,
    )
    return enriched.sort_values(['roadway', 'time']).reset_index(drop=True)

In [None]:
# read both CSV files and run them through the same preprocessing
train = preprocess(pd.read_csv(PATH / 'train.csv'))
test = preprocess(pd.read_csv(PATH / 'test.csv'))

train

In [None]:
# 'roadway' ids are assigned cyclically by row position, so equal counts alone
# prove nothing: also check that every id maps to a single (x, y, direction).
locations_per_roadway = train.groupby('roadway')[['x', 'y', 'direction']].nunique()
assert (locations_per_roadway == 1).all().all(), \
    'a roadway id covers more than one (x, y, direction) combination'

# every roadway appears the same number of times
train['roadway'].value_counts().unique()

## EDA

### Congestion distribution

In [None]:
# histogram of the target column
train['congestion'].plot(kind='hist');

### Average congestion, per roadway

In [None]:
# one bar per roadway id, height = mean congestion of that roadway
train.groupby('roadway')['congestion'].mean().plot(kind='bar');

In [None]:
# congestion of a sample roadway (id 0) along time;
# rows are already time-ordered within a roadway after preprocessing
df_roadway = train.loc[train['roadway'] == 0].reset_index(drop=True)
plt.plot(df_roadway['congestion']);

### Average congestion, per direction

In [None]:
# mean congestion per `direction` value, with the value printed on each bar
# and horizontal tick labels
ax = train.groupby('direction')['congestion'].agg('mean').plot(kind='bar', rot=0)
ax.bar_label(ax.containers[0]);

Some directions have a lower average congestion, such as northwest (NW).

In [None]:
# number of rows per distinct `x` value, ordered by the value itself
train['x'].value_counts().sort_index()

In [None]:
# number of rows per distinct `y` value, ordered by the value itself
train['y'].value_counts().sort_index()

## Validation strategy

In [None]:
# first / last timestamp of the training data and the span between them
train_start, train_end = train['time'].min(), train['time'].max()
print(train_start, train_end)
print(train_end - train_start)

In [None]:
# first / last timestamp of the test data and the span between them
test_start, test_end = test['time'].min(), test['time'].max()
print(test_start, test_end)
print(test_end - test_start)

In [None]:
# rows later than this timestamp (the last 12 hours of train) form the validation set
val_cutoff = train['time'].max() - pd.Timedelta(hours=12)
print(f'val_cutoff: {val_cutoff}')

In [None]:
# fitting part of the split: everything up to and including the cutoff
X_train = train.loc[train['time'] <= val_cutoff].reset_index(drop=True)
fit_times = X_train['time']
print(fit_times.min(), fit_times.max())
print(fit_times.max() - fit_times.min())

In [None]:
# hold-out part of the split: everything strictly after the cutoff
X_val = train.loc[train['time'] > val_cutoff].reset_index(drop=True)
holdout_times = X_val['time']
print(holdout_times.min(), holdout_times.max())
print(holdout_times.max() - holdout_times.min())

## Baseline

In [None]:
def pred_roadway_by_groups(df_train, df_test, roadway, group_vars, q):
    """Predict congestion for one roadway from per-group quantiles.

    For the rows of ``df_train`` belonging to ``roadway``, the ``q``-quantile
    of ``congestion`` is computed within each ``group_vars`` group, rounded to
    the nearest integer and attached to the matching rows of ``df_test`` as a
    new ``y_pred`` column.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Must contain ``roadway`` and every column in ``group_vars``;
        ``df_train`` must also contain ``congestion``.
    roadway : value compared against the ``roadway`` column
    group_vars : list of column names to group and join on
    q : quantile passed to ``quantile`` (0.5 gives the median)

    Returns
    -------
    DataFrame
        The ``df_test`` rows of this roadway plus ``y_pred``. The join is a
        left join, so groups absent from the training rows get a missing
        ``y_pred`` instead of being dropped.
    """
    train_rows = df_train.loc[df_train['roadway'] == roadway]
    test_rows = df_test.loc[df_test['roadway'] == roadway]

    group_quantiles = train_rows.groupby(group_vars)['congestion'].quantile(q=q)
    y_pred = group_quantiles.round(0).astype(int).rename('y_pred')

    return test_rows.merge(y_pred, on=group_vars, how='left')

In [None]:
# Hold-out evaluation of the baseline.
# For every roadway two median-per-group predictors are fitted on X_train and
# scored on X_val: one grouped by exact weekday, one by the weekend flag.
# The predictions of the better one (lower MAE) are kept for that roadway.
oof_parts = []
for roadway in tqdm(range(CFG.n_roadways)):
    preds_weekday = pred_roadway_by_groups(
        X_train,
        X_val,
        roadway,
        ['weekday', 'hour', 'minute'],
        0.5
    )
    mae_weekday = metrics.mean_absolute_error(preds_weekday['congestion'], preds_weekday['y_pred'])

    preds_weekend = pred_roadway_by_groups(
        X_train,
        X_val,
        roadway,
        ['weekend', 'hour', 'minute'],
        0.5
    )
    mae_weekend = metrics.mean_absolute_error(preds_weekend['congestion'], preds_weekend['y_pred'])

    # ties go to the weekday grouping
    oof_parts.append(preds_weekday if mae_weekday <= mae_weekend else preds_weekend)

# parts are appended in roadway order, the same order X_val is sorted in,
# so the stacked frame can take over X_val's index directly
df_oof = pd.concat(oof_parts, axis=0)
df_oof.index = X_val.index
mae = metrics.mean_absolute_error(df_oof['congestion'], df_oof['y_pred'])
r2 = metrics.r2_score(df_oof['congestion'], df_oof['y_pred'])
print('MAE:', mae)
print('R2:', r2)

In [None]:
# the stacked predictions must line up row by row with the validation frame
for column in ('row_id', 'congestion'):
    assert df_oof[column].eq(X_val[column]).all()

In [None]:
# observed (x axis) against predicted (y axis) congestion on the hold-out rows
plt.scatter(x=df_oof['congestion'], y=df_oof['y_pred'])
plt.title(f'R2: {round(r2, 4)}');

In [None]:
# residuals (observed minus predicted) over time
plt.scatter(df_oof['time'], df_oof['congestion'].sub(df_oof['y_pred']));

## Submission

In [None]:
# Test-set predictions.
# For every roadway, pick the grouping that scores best on the hold-out split
# and refit it on the full training data. The hold-out MAE is recomputed here
# per roadway: reusing `mae_weekday` / `mae_weekend` from the validation cell
# would apply the comparison of the last validated roadway to all of them.
group_options = {
    'weekday': ['weekday', 'hour', 'minute'],
    'weekend': ['weekend', 'hour', 'minute'],
}

sub_parts = []
for roadway in tqdm(range(CFG.n_roadways)):
    # hold-out MAE of each candidate grouping for this roadway
    val_mae = {}
    for name, group_vars in group_options.items():
        val_preds = pred_roadway_by_groups(X_train, X_val, roadway, group_vars, 0.5)
        val_mae[name] = metrics.mean_absolute_error(val_preds['congestion'], val_preds['y_pred'])

    # ties go to the weekday grouping, as in the validation loop
    best = 'weekday' if val_mae['weekday'] <= val_mae['weekend'] else 'weekend'

    # refit the winner on all training rows and predict the test rows
    sub_parts.append(
        pred_roadway_by_groups(train, test, roadway, group_options[best], 0.5)
    )

# parts are appended in roadway order, the same order `test` is sorted in
df_sub = pd.concat(sub_parts, axis=0)
df_sub.index = test.index
df_sub = df_sub.rename(columns={'y_pred': 'congestion'})
df_sub

In [None]:
# the submission rows must be in the same order as the test frame
assert df_sub['row_id'].eq(test['row_id']).all()

In [None]:
# overlay the normalised distributions of observed and predicted congestion
hist_style = dict(density=True, alpha=0.5, bins=30)
plt.hist(train['congestion'], label='Observed', **hist_style)
plt.hist(df_sub['congestion'], label='Predicted', **hist_style)
plt.legend();

In [None]:
# summary statistics of observed and predicted congestion, side by side
pd.DataFrame({
    'observed': train['congestion'].describe(),
    'predicted': df_sub['congestion'].describe(),
})

In [None]:
# write only the id and the prediction, without the DataFrame index
df_sub.to_csv('submission.csv', columns=['row_id', 'congestion'], index=False)

In [None]:
!head submission.csv