# Simple solution without any machine learning

Inspired by some discussions in the forum, this notebook tries to solve the competition with a simple approach: calculate the mean congestion value for each time slot and use this value as the prediction. I compute the mean congestion values on the training data set, grouping the data by the columns 'x', 'y', 'direction', 'day_of_week', 'hour' and 'minute'. After that, these mean congestion values are added to the test data set. As the mean value I use the arithmetic mean, the geometric mean and the median.

I also add the idea from huseyincotel and his notebook https://www.kaggle.com/huseyincot/without-machine-learning-pick-the-means to get the best congestion value for each row: calculate the absolute difference between the real congestion value and the different mean values. The mean value with the lowest difference is then used for the submission.

Thanks https://www.kaggle.com/wti200 for the idea of Geometric and Harmonic means.

In [None]:
import numpy as np
import pandas as pd
import holidays

import seaborn as sns

from fastai.imports import *
from fastai.tabular.core import *
from scipy.stats.mstats import gmean
from sklearn.metrics import mean_absolute_error

In [None]:
# Directory that holds the competition CSV files.
path = Path('../input/tabular-playground-series-mar-2022')
# NOTE(review): BASE_PATH is presumably the fastai/fastcore hook that shortens
# how paths are displayed - confirm against the fastai docs.
Path.BASE_PATH = path
# Show the directory contents as the cell output (`ls` is not a stdlib
# pathlib method; presumably provided by the fastai star imports).
path.ls()

Load the data

In [None]:
# Column names shared by the rest of the notebook.
dep_var = 'congestion'   # target column
date_var = 'time'        # timestamp column

# Train and test are indexed by row_id; the sample submission keeps its
# default index with row_id as a regular column.
train_df = pd.read_csv(path / 'train.csv').set_index("row_id")
test_df = pd.read_csv(path / 'test.csv').set_index("row_id")
sample_submission = pd.read_csv(path / 'sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train_df.head()

The following functions are used to add extended features such as the minute, hour, day of week, etc. to the data sets. I use these features later to group the data or to select a subset of the provided training data.

In [None]:
def add_holiday_info(df):
    """Add an ``is_holiday`` column (1/0) to ``df`` and return it.

    The ``date_var`` column is first converted in place by ``make_date``.
    A row is flagged with 1 when the calendar date of its timestamp is a
    key of ``holidays.UnitedStates(years=1991)``; otherwise it gets 0.
    """
    make_date(df, date_var)

    # Holiday dates for 1991, as ISO strings for the lookup below.
    holiday_dates = {
        str(entry[0]) for entry in holidays.UnitedStates(years=1991).items()
    }

    flags = []
    for day in df[date_var].dt.date:
        # Keep only the date part of the string representation.
        day_text = str(day).split()[0]
        flags.append(1 if day_text in holiday_dates else 0)

    df['is_holiday'] = flags
    return df

In [None]:
# Add the `is_holiday` flag to both data sets.
train_df = add_holiday_info(train_df)
test_df = add_holiday_info(test_df)

In [None]:
def add_time_features(df):
    """Add calendar features derived from the ``date_var`` column.

    The column is converted in place by ``make_date``; the new columns are
    ``day_of_week``, ``day_of_year`` (zero-based), ``month``, ``hour`` and
    ``minute``. The same (mutated) frame is returned.
    """
    make_date(df, date_var)
    stamps = df[date_var].dt

    df['day_of_week'] = stamps.dayofweek
    # Shift so that the first day of the year is 0.
    df['day_of_year'] = stamps.dayofyear - 1

    for part in ('month', 'hour', 'minute'):
        df[part] = getattr(stamps, part)

    return df


In [None]:
# Add the calendar feature columns to both data sets.
train_df = add_time_features(train_df)
test_df = add_time_features(test_df)

This function calculates the different congestion mean values and the absolute differences on the training data and adds these new features to the training and test data sets.

In [None]:
def add_congestion_stats(df1, df2):
    """Add per-group congestion statistics and their errors to both frames.

    The rows of ``df1`` (training data, must contain ``dep_var``) are grouped
    by ``x, y, direction, day_of_week, hour, minute``. For every group the
    arithmetic mean, the median and the geometric mean of ``dep_var`` are
    computed and left-merged into ``df1`` and ``df2`` as ``congestion_mean``,
    ``congestion_median`` and ``congestion_geo_mean``.

    Error columns ``mae_mean``, ``mae_median`` and ``mae_geo_mean``:

    * ``df1``: absolute difference between each row's real congestion and
      the corresponding group statistic.
    * ``df2``: the test data has no real congestion, so each row receives the
      mean absolute error its group showed on the training data. (The
      previous code subtracted the training target from the test statistics,
      aligned only by row position, which compared unrelated rows.)

    Groups of ``df2`` that do not occur in ``df1`` get NaN in all new columns.

    Returns:
        The tuple ``(df1, df2)`` of new, merged frames. As with the previous
        implementation, the merge replaces the original index by a fresh
        RangeIndex while keeping the row order of the inputs.
    """
    keys = ['x', 'y', 'direction', 'day_of_week', 'hour', 'minute']
    error_columns = {
        'congestion_mean': 'mae_mean',
        'congestion_median': 'mae_median',
        'congestion_geo_mean': 'mae_geo_mean',
    }

    # One row per group holding all three statistics.
    grouped_target = df1.groupby(by=keys)[dep_var]
    stats = pd.DataFrame({
        'congestion_mean': grouped_target.mean(),
        'congestion_median': grouped_target.median(),
        'congestion_geo_mean': grouped_target.apply(gmean),
    }).reset_index()

    df1 = df1.merge(stats, how='left', on=keys)
    df2 = df2.merge(stats, how='left', on=keys)

    # Row-wise absolute errors are only defined where the target is known.
    for stat_col, err_col in error_columns.items():
        df1[err_col] = np.abs(df1[dep_var] - df1[stat_col])

    # The test rows inherit the average training error of their group.
    group_errors = (
        df1.groupby(by=keys)[list(error_columns.values())].mean().reset_index()
    )
    df2 = df2.merge(group_errors, how='left', on=keys)

    return df1, df2


Select only Monday data from the passed dataframe. I can limit the selection to p.m. values and/or holidays

In [None]:
def get_all_mondays(df, is_holiday=0, only_pm_values=False):
    """Return the rows of ``df`` that fall on a Monday.

    Args:
        df: frame with ``day_of_week``, ``is_holiday`` and ``hour`` columns.
        is_holiday: value the ``is_holiday`` column must equal.
        only_pm_values: when true, keep only rows with ``hour`` greater
            than 11.
    """
    is_monday = df['day_of_week'].eq(0)
    holiday_matches = df['is_holiday'].eq(is_holiday)
    selection = is_monday & holiday_matches

    if only_pm_values:
        selection = selection & df['hour'].gt(11)

    return df.loc[selection]

Select only Mondays in September 1991

In [None]:
def get_september_mondays(df, only_pm_values=False):
    """Return the rows of ``df`` that fall on a Monday in month 9.

    Args:
        df: frame with ``day_of_week``, ``month`` and ``hour`` columns.
        only_pm_values: when true, keep only rows with ``hour`` greater
            than 11.
    """
    is_monday = df['day_of_week'].eq(0)
    in_september = df['month'].eq(9)
    selection = is_monday & in_september

    if only_pm_values:
        selection = selection & df['hour'].gt(11)

    return df.loc[selection]

Select the proper training data set

In [None]:
# Train on Mondays with is_holiday == 0 (the function default), all hours.
training_df = get_all_mondays(train_df, only_pm_values=False)
# Alternative selection: only the Mondays in September.
# training_df = get_september_mondays(train_df, only_pm_values=False)

# Cell output: the zero-based day-of-year range covered by the selection.
training_df['day_of_year'].min(), training_df['day_of_year'].max()

In [None]:
# Attach the per-group congestion statistics and the mae_* columns to both frames.
training_df, test_df = add_congestion_stats(training_df, test_df)

This function returns the congestion estimate (mean, median or geometric mean) with the lowest absolute error for a row

In [None]:
def get_best_congestion(row):
    """Pick the congestion estimate with the smallest error for one row.

    ``row`` must provide ``congestion_mean``, ``congestion_median``,
    ``congestion_geo_mean`` and the matching ``mae_*`` values. The median is
    chosen only when its error is strictly lower than both others; otherwise
    the geometric mean is chosen when its error is strictly lower than the
    mean's; otherwise the arithmetic mean is used. A chosen value of 0 is
    replaced by the arithmetic mean.
    """
    fallback = row['congestion_mean']
    median_err = row['mae_median']
    geo_err = row['mae_geo_mean']
    mean_err = row['mae_mean']

    if median_err < geo_err and median_err < mean_err:
        chosen = row['congestion_median']
    elif geo_err < mean_err:
        chosen = row['congestion_geo_mean']
    else:
        chosen = fallback

    # Never predict an exact zero from the median / geometric mean.
    return fallback if chosen == 0 else chosen

In [None]:
# Inspect the first test rows together with the newly added columns.
test_df.head()

In [None]:
# Row-wise choice of the best estimate; the resulting Series is assigned by
# index label, so test_df and sample_submission must share the same index.
sample_submission[dep_var] = test_df.apply(get_best_congestion, axis=1)
sample_submission.head()

In [None]:
# Plot the predicted congestion over the test timestamps.
plt.figure(figsize = (16,6))
# x comes from test_df while y is the prediction column of the submission frame.
# NOTE(review): this assumes both frames have the same row order/index - confirm.
sns.lineplot(data = test_df , x ="time",  y = sample_submission[dep_var], label = dep_var, ci=None)
plt.title("The predictions")
plt.show()

Let's do the 'special value' handling

In [None]:
# Index the submission by row_id (drop=False keeps row_id as a column too)
# so that single rows can be addressed by their id.
sample_submission = sample_submission.set_index('row_id', drop=False)
# Cell output: the current predictions of the hand-picked rows.
# NOTE(review): 848956 is listed twice, while 849086 (which would continue the
# step-65 sequence of the other ids) is absent - confirm the intended id list.
sample_submission.loc[[848891,848956,848956,849021,849151,849216,849281,849346,849411]]

In [None]:
# Overwrite the prediction of the hand-picked rows with the constant 20.
# NOTE(review): 848956 is listed twice, while 849086 (which would continue the
# step-65 sequence of the other ids) is absent - confirm the intended id list.
sample_submission.loc[[848891,848956,848956,849021,849151,849216,849281,849346,849411],dep_var] = 20
# Write the submission file without the index.
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Cell output: the same hand-picked rows again, to check their values.
sample_submission.loc[[848891,848956,848956,849021,849151,849216,849281,849346,849411]]

In [None]:
# IPython shell escape: list the working directory (e.g. to check that submission.csv exists).
!ls -la