In [None]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from itertools import chain, combinations

# Load the raw competition tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-mar-2022/train.csv",
        "../input/tabular-playground-series-mar-2022/test.csv",
    )
)

# ============ Very simple feature engineering =============
def fe(df_train, df_test):
    """Add calendar features and keep only the rows used for modelling.

    Both frames are stacked, tagged with a ``test`` flag (0 = train,
    1 = test), given integer ``month``, ``weekday``, ``hour``, ``day`` and
    ``minute`` columns derived from ``time``, filtered, and split apart again.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with a ``time`` column that ``pd.to_datetime`` can parse.
        Neither input frame is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``: the filtered rows of each input, including the
        ``test`` flag and the calendar columns, with ``time`` converted to
        datetime. Both are independent copies, so the caller can add columns
        to them without triggering ``SettingWithCopyWarning``.
    """
    # assign() tags copies, leaving the caller's frames untouched.
    df = pd.concat([df_train.assign(test=0), df_test.assign(test=1)])
    df['time'] = pd.to_datetime(df['time'])
    df['month'] = df['time'].dt.month.astype(int)
    df['weekday'] = df['time'].dt.weekday.astype(int)
    df['hour'] = df['time'].dt.hour.astype(int)
    df['day'] = df['time'].dt.day.astype(int)
    df['minute'] = df['time'].dt.minute.astype(int)
    # Keep rows with hour >= 12, weekday code <= 5 and month in good_months.
    # pandas numbers weekdays Monday=0 ... Sunday=6, so `<= 5` drops Sunday only.
    # NOTE(review): confirm that keeping Saturday is intended.
    good_months = [4, 5, 6, 9]
    keep = (
        (df['hour'] >= 12)
        & (df['weekday'] <= 5)
        & (df['month'].isin(good_months))
    )
    df = df.loc[keep, :]
    is_test = df['test'] == 1
    return df.loc[~is_test, :].copy(), df.loc[is_test, :].copy()
 
df_train, df_test = fe(df_train, df_test)

# ============ Split only some Mondays at hour >= 12 =============
# (month, day) of each validation day; the position in the list is the fold id.
month_day = [(4, 29), (5, 20), (6, 17), (9, 9), (9, 23)]
# assign() returns a new frame, so the .loc writes below never target a slice
# of another frame (avoids SettingWithCopyWarning). Rows that keep fold == -1
# belong to no validation fold.
df_train = df_train.assign(fold=-1)
for k, (month, day) in enumerate(month_day):
    index = (
        (df_train["day"] == day)
        & (df_train["month"] == month)
        & (df_train["hour"] >= 12)
    )
    df_train.loc[index, "fold"] = k

n_splits = len(month_day)

# Fail early with a readable message if a validation day matched no rows;
# an empty fold would otherwise only surface later inside the CV loop.
empty_folds = sorted(set(range(n_splits)) - set(df_train["fold"].unique()))
assert not empty_folds, f"no validation rows for fold(s) {empty_folds}"

df_train.head()

In [None]:
# Candidate grouping keys. Every subset with at least 4 keys is scored as a
# "group-by median" model using the folds stored in df_train["fold"].
groupby_list = ["month", "weekday", "hour", "minute", "x", "y", "direction"]
gropuby_list = groupby_list  # original (misspelled) name, kept as an alias

# The train/validation splits do not depend on the grouping keys, so slice
# them once instead of once per key combination. Only the columns the search
# reads are kept, to limit memory.
search_columns = groupby_list + ["congestion"]
fold_splits = []
for k in range(n_splits):
    val_index = df_train["fold"] == k
    fold_splits.append(
        (df_train.loc[~val_index, search_columns], df_train.loc[val_index, search_columns])
    )

best_group = None
best_score = float("inf")
# Iterate over all possible combinations of groupby_list (size 4 and up)
for r in range(4, len(groupby_list) + 1):
    for groupby in combinations(groupby_list, r):
        groupby = list(groupby)

        score_list = []
        for train, val in fold_splits:
            y_val = val["congestion"].values

            # Model (Train): median congestion per key combination
            df_grouped = train.groupby(groupby)["congestion"].median()
            # Eval on the fold: one complete MultiIndex key (tuple) per row.
            # .loc raises KeyError if a validation key never occurs in train.
            keys = list(val[groupby].itertuples(index=False, name=None))
            y_pred = df_grouped.loc[keys].values
            y_pred = np.clip(y_pred, 0, 100)
            # Compute Fold - MAE
            score_list.append(metrics.mean_absolute_error(y_val, y_pred))

        # Compute CV MAE (mean over folds) and keep the best grouping so far
        score = np.mean(score_list)
        if score < best_score:
            best_score = score
            best_group = groupby
            print(f"Best MAE: {best_score:.6f}, {best_group}")

In [None]:
# Write submission
# Fit the best model on all training rows: median congestion per best_group key.
df_grouped = df_train.groupby(best_group)["congestion"].median()

# Inference: look up the fitted median for every test row's key combination.
test_keys = df_test[best_group].values.tolist()
test_pred = df_grouped[test_keys].values

# One row per test row_id, predictions clipped to [0, 100].
df_sub = pd.DataFrame(
    {
        "row_id": df_test.row_id,
        "congestion": np.clip(test_pred, 0, 100),
    }
)
df_sub.to_csv("submission.csv", index=False)