In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import sklearn
from sklearn import (
    metrics,model_selection,
    preprocessing,impute,
    compose,pipeline,
    feature_selection,inspection,
    ensemble,linear_model,neural_network,tree,neighbors,
)
import datetime as dt
import os

In [None]:
# Both frames are ordered by the same keys so that later cells can rely on a
# consistent row order.
sort_keys = 'time x y direction'.split()

# Training data: the first CSV column is read as the index and then discarded
# in favour of a plain 0..n-1 index.
train_raw = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv',
    index_col=0,
    parse_dates=['time'],
)
train_raw = train_raw.reset_index(drop=True)

# Test data: the first CSV column is kept as the index (it is written out
# again with the submission).
test_raw = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/test.csv',
    index_col=0,
    parse_dates=['time'],
)

# Working copies; the raw frames stay untouched.
train = train_raw.sort_values(sort_keys).reset_index(drop=True).copy()
test = test_raw.sort_values(sort_keys).copy()
test_index = test.index

# Name of the column to predict (two aliases) and the scoring identifier.
yyy = 'congestion'
target = yyy
scoring = 'neg_mean_absolute_error'

train_raw.shape, test_raw.shape

* Encode `direction` as an integer from 0 to 7 (inclusive) and store it in a new column, `dire`.
* Extract periodic (cyclical) time features: minute, hour, month, day of month and weekday, plus the plain `date` and `clock` (time of day).

In [None]:
# Map each direction code to its position in this fixed list (0..7).
direnc = {
    code: position
    for position, code in enumerate('nb ne eb se sb sw wb nw'.upper().split())
}

# New column name -> attribute of the `.dt` accessor on `time` that fills it.
# Insertion order determines the order in which the columns are appended.
time_parts = {
    'date': 'date',      # calendar date
    'clock': 'time',     # time of day
    'm60': 'minute',
    'h24': 'hour',
    'm12': 'month',
    'd31': 'day',
    'd7': 'weekday',     # Monday == 0
}

# Both frames are extended in place with the same derived columns.
for frame in (train, test):
    frame['dire'] = frame['direction'].map(direnc)
    stamp = frame['time'].dt
    for column, attribute in time_parts.items():
        frame[column] = getattr(stamp, attribute)
train

* Train only on Monday-afternoon data (weekday 0, hour ≥ 12).
* Add two new columns, `median` and `mean`, holding the median and mean congestion for each `(direction, x, y, hour, minute)` combination. For every row, these statistics are computed from all *other* dates, excluding the row's own date (out-of-fold), to prevent data leakage.

In [None]:
# Out-of-fold historical statistics for the Monday-afternoon subset.
#
# For every date in `ab`, the median and mean congestion per
# (clock, x, y, direction) are computed from all OTHER dates and attached to
# that date's rows, so a row never sees its own target value.
ab = train[(train.d7==0) & (train.h24>=12)].copy()

# Loop-invariant settings, defined once instead of on every iteration.
group = 'clock x y direction'.split()
aggcol = ['median', 'mean']

oof_parts = []
# Group by the scalar key 'date' rather than a one-element list: iterating a
# groupby built from a length-1 list warns in pandas 1.5 and yields tuple keys
# in pandas 2. The key itself is not needed here.
for _, held_out in ab.groupby('date'):
    # Every row that does not belong to the held-out date.
    other_days = ab[~ab.index.isin(held_out.index)]
    fold_stats = other_days.groupby(group).congestion.agg(aggcol)
    # Attach the statistics by key, not by row position. The result keeps
    # `held_out`'s index, so no assumption about sort order is needed.
    oof_parts.append(held_out[group].join(fold_stats, on=group)[aggcol])

agg = pd.concat(oof_parts).sort_index()
for cname in agg.columns:
    ab[cname] = agg[cname]  # aligned on the shared row index

# A NaN here means some (clock, x, y, direction) combination of a held-out
# date never occurs on any other date (or the source data has missing values).
assert ab.notna().all().all(), 'missing values after adding out-of-fold statistics'

# Baseline MAE when predicting with the out-of-fold median / mean alone.
print(metrics.mean_absolute_error(ab.congestion, ab['median']))
print(metrics.mean_absolute_error(ab.congestion, ab['mean']))
ab

* Choose which features will be used for training.
* `HistGradientBoostingRegressor` is the chosen model.
* Train on the whole training set (cross-validation was already performed in another notebook).

In [None]:
%%time
xxx = ['x', 'y', 'dire', 'm60', 'h24', 'm12', 'd31', 'median', 'mean']
est = ensemble.HistGradientBoostingRegressor(
    loss='absolute_error',
    categorical_features=[0,1,2],
    random_state=0,
    max_iter=1362,
    early_stopping=False,
)
est.fit(ab[xxx], ab[yyy])

Generate the same features for `test.csv` as we did for `train.csv`.

In [None]:
# Per-(clock, x, y, direction) median and mean congestion over the whole
# training subset `ab`; these become the `median` / `mean` features of `test`.
aggcol = ['median', 'mean']
group = 'clock x y direction'.split()
agg = ab.groupby(group)['congestion'].agg(aggcol).reset_index()

# The aggregates are later copied into `test` by row position, so both frames
# must have the same length and the same key values row by row.
assert len(test) == len(agg)
agg_keys = agg[group].reset_index(drop=True)
test_keys = test[group].reset_index(drop=True)
assert agg_keys.eq(test_keys).to_numpy().all()
agg

In [None]:
# Copy each aggregate column into `test` by row position (raw values, no
# index alignment).
for stat_name in aggcol:
    test[stat_name] = agg[stat_name].to_numpy()
test

In [None]:
# Predict with the fitted model on the same feature columns used for training.
test_features = test[xxx]
pred = est.predict(test_features)
pred

In [None]:
# Store the predictions in the target column of `test`, row for row.
test[yyy] = pd.Series(pred, index=test.index, name=yyy)
test

In [None]:
# Round the predictions once, write them to disk (the index of `test` becomes
# the first CSV column), then display the same values.
submission = test[yyy].round()
submission.to_csv('submission.csv')
submission