The idea of using other kinds of means comes from the "Advanced Mean Approaches" notebook by wti200: https://www.kaggle.com/wti200/advanced-mean-approaches

In [None]:
import pandas as pd
from scipy.stats.mstats import gmean, hmean

# Directory holding the competition files; all three CSVs are read with pandas defaults.
DATA_DIR = "../input/tabular-playground-series-mar-2022"
train, test, sample_submission = (
    pd.read_csv(f"{DATA_DIR}/{file_stem}.csv")
    for file_stem in ("train", "test", "sample_submission")
)

In [None]:
# Tag each frame so the combined table can later be split back into train/test rows.
train["is_train"] = 1
test["is_train"] = 0

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it the
# labels of train and test both start at 0, so label-based selection is ambiguous.
# NOTE(review): the name `all` shadows Python's builtin all(); it is kept because the
# following cells refer to the combined frame by this name.
all = pd.concat([train, test], axis=0, ignore_index=True)
all["time"] = pd.to_datetime(all["time"])

In [None]:
# Road identifier (x and y concatenated as text) and the calendar parts of `time`.
road_key = all["x"].astype(str) + all["y"].astype(str)
stamp = all["time"].dt

# Columns are added in one pass, in the same order as they are consumed later.
all = all.assign(
    **{
        "x+y": road_key,
        "x+y+direction": road_key + all["direction"].astype(str),
        "hour": stamp.hour,
        "minute": stamp.minute,
        "weekday": stamp.dayofweek,
        # Minutes elapsed since midnight.
        "hour+minute": stamp.hour * 60 + stamp.minute,
        "month": stamp.month,
    }
)


*Remove April and May values*

# Calculate the Medians and Means

In [None]:
# One group per road+direction and weekly time slot.
group_keys = ["x+y+direction", 'weekday', 'hour', 'minute']

# All four central values are computed in a single pass over the training rows only,
# so that they are derived from the same observations.
# NOTE(review): this assumes test rows carry no congestion values, in which case the
# median and mean are unchanged by excluding them — confirm.
slot_stats = (
    all[all.is_train == 1]
    .groupby(group_keys)
    .congestion
    .agg(
        congestion_median='median',
        congestion_mean='mean',
        congestion_hmean=hmean,   # harmonic mean (scipy)
        congestion_gmean=gmean,   # geometric mean (scipy)
    )
    .reset_index()
)

# Left merge keeps every row of `all`; slots absent from training get NaN statistics.
all = all.merge(slot_stats, on=group_keys, how='left')

# Validation Set

The last three Monday afternoons of September are used as the validation set.

In [None]:
train_idxes = all[all.is_train == 1].index

# Inclusive (start, end) timestamps of the three validation windows.
validation_windows = [
    ('1991-09-23 12:00', '1991-09-23 23:40'),
    ('1991-09-16 12:00', '1991-09-16 23:40'),
    ('1991-09-09 12:00', '1991-09-09 23:40'),
]

# Union of the index labels of every row whose time falls inside any window.
val_idxes = None
for window_start, window_end in validation_windows:
    lower = pd.to_datetime(window_start)
    upper = pd.to_datetime(window_end)
    inside = (all['time'] >= lower) & (all['time'] <= upper)
    window_idxes = all[inside].index
    val_idxes = window_idxes if val_idxes is None else val_idxes.union(window_idxes)

# Training labels that remain once the validation rows are taken out.
train_after_dropped_idxes = train_idxes.difference(val_idxes)

In [None]:
import numpy as np

# Absolute error of each per-slot central value against the observed congestion.
error_sources = {
    "median_mae": "congestion_median",
    "gmean_mae": "congestion_gmean",
    "hmean_mae": "congestion_hmean",
    "mean_mae": "congestion_mean",
}
for error_col, stat_col in error_sources.items():
    all[error_col] = np.abs(all["congestion"] - all[stat_col])

# Mean absolute error per road/direction and time slot over the validation rows.
# - val_idxes holds index labels, so rows are selected with .loc rather than .iloc.
# - The columns are selected with a list; selecting several groupby columns with a
#   bare tuple raises in pandas >= 2.0.
error_cols = list(error_sources)
maes = all.loc[val_idxes].groupby(["x+y+direction", "weekday", "hour+minute"])[error_cols].mean()
maes = maes.rename(columns={col: f"val_{col}" for col in error_cols})

# The grouping keys are index levels of `maes`; merge matches them by name against
# the identically named columns of `all`.
all = all.merge(maes, on=["x+y+direction", "weekday", "hour+minute"], how='left')

# Pick the Central Mean with the Least MAE

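For every road, pick the central value with the least MAE among the means. (Note: `decide_mean` below returns the smallest non-zero value among the mean, harmonic mean, geometric mean and median; it does not read the validation MAE columns.)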

In [None]:
def decide_mean(row):
    """Return the smallest non-zero central value stored on ``row``.

    Parameters
    ----------
    row : object with attributes ``congestion_mean``, ``congestion_hmean``,
        ``congestion_gmean`` and ``congestion_median`` (e.g. a DataFrame row).

    Returns
    -------
    float
        The minimum of the non-zero, non-missing candidates. ``0.0`` when every
        available candidate is zero, and NaN when all candidates are missing.

    NOTE(review): the selection compares the central values themselves; the
    ``val_*_mae`` columns are not consulted here.
    """
    candidates = [
        row.congestion_mean,
        row.congestion_hmean,
        row.congestion_gmean,
        row.congestion_median,
    ]
    # Missing values are dropped first: min() over a list containing NaN depends on
    # where the NaN sits, which would make the result order-dependent.
    available = [value for value in candidates if pd.notna(value)]
    if not available:
        return float("nan")
    # Zeros are skipped, but if nothing else is left the honest answer is zero
    # (min() of an empty list would raise ValueError).
    non_zero = [value for value in available if value != 0]
    if not non_zero:
        return 0.0
    return min(non_zero)
# Index the submission by row_id (the column is kept as well, for the final CSV).
sample_submission = sample_submission.set_index('row_id', drop=False)

# One prediction per test row.
test_rows = all[all.is_train == 0]
picked = test_rows.apply(decide_mean, axis=1)

# Key the predictions by row_id explicitly, so the assignment below aligns on the
# submission index instead of relying on the frame's positional index happening to
# equal row_id.
# NOTE(review): assumes the test rows carry the same row_id column as the sample
# submission — confirm.
picked.index = test_rows['row_id'].to_numpy()
sample_submission["congestion"] = picked

In [None]:
# Preview the first rows only instead of rendering the whole combined frame.
all.head()

# Special Values Correction

*Submission before special values correction for 22-SE*

In [None]:
# Rows inspected before the manual correction; each row_id is listed once.
# NOTE(review): the ids advance by 65 except between 849021 and 849151, where 849086
# is absent — confirm whether 849086 should be part of this selection.
sample_submission.loc[[848891, 848956, 849021, 849151, 849216, 849281, 849346, 849411]]

In [None]:
# row_ids whose prediction is overwritten with the constant 20.
# NOTE(review): the ids advance by 65 except between 849021 and 849151, where 849086
# is absent — confirm whether 849086 should also be corrected.
rows_22_se = [848891, 848956, 849021, 849151, 849216, 849281, 849346, 849411]
sample_submission.loc[rows_22_se, "congestion"] = 20

*Special values for other anomalies*

In [None]:
# Externally supplied override values, indexed by row_id and exposed as a single
# column named 'special'.
special = (
    pd.read_csv('../input/tps-mar-22-special-values/special.csv', index_col="row_id")
    .loc[:, ['congestion']]
    .rename(columns={'congestion': 'special'})
)

# Attach the overrides by index; rows without an override fall back to the
# prediction already stored in 'congestion'.
with_special = sample_submission.merge(special, how='left', left_index=True, right_index=True)
with_special['special'] = with_special['special'].fillna(with_special['congestion'])

# The filled 'special' column becomes the final 'congestion' column.
sample_submission = with_special.drop(columns=['congestion']).rename(columns={'special': 'congestion'})

# astype(int) truncates the fractional part rather than rounding.
# NOTE(review): confirm that truncation is the intended behaviour.
sample_submission['congestion'] = sample_submission['congestion'].astype(int).tolist()

In [None]:
import matplotlib.pyplot as plt
# Compare the distribution of observed congestion on the validation rows with the
# distribution of the submitted predictions.
fig, axes = plt.subplots(1, 1, figsize=(16, 5))

# 101 unit-width bins centred on the integers 0..100, shared by both histograms.
bin_edges = np.linspace(-0.5, 100.5, 102)

# val_idxes holds index labels, so the rows are selected with .loc.
axes.hist(all.loc[val_idxes, "congestion"].values,
          bins=bin_edges, density=True, label='Validation', color='b')

axes.hist(sample_submission['congestion'], bins=bin_edges,
          density=True, rwidth=0.5, label='Test predictions', color='r')
axes.legend()

# fontstyle selects italic text; a bare string passed as fontproperties is parsed as
# a font pattern whose leading token is a family name, so "italic" there does not
# italicise anything.
axes.set_title("Picked Mean", fontsize=12, fontstyle="italic")
axes.set_ylabel('Density', fontstyle="italic")
axes.set_xlabel('Congestion', fontstyle="italic");


In [None]:
# Write only the two submission columns, without the index, then preview the result.
submission_path = "submission_pick_the_mean.csv"
submission_columns = ["row_id", "congestion"]
sample_submission[submission_columns].to_csv(submission_path, sep=",", index=False)
sample_submission.head(10)