In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import datetime as dt

In [None]:
# Read the training set and turn its 'time' column into real timestamps.
df_train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
df_train = df_train.assign(time=pd.to_datetime(df_train['time']))
print(df_train.head())

# Drop the row_id column from the training frame and preview it again.
df_train = df_train.drop(columns=['row_id'])
print(df_train.head())

# The test set is loaded as-is here.
df_test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')

In [None]:
# Summary statistics of the training frame, shown as this cell's output.
df_train.describe()

In [None]:
# Data-quality checks: per-column missing-value counts (reported through both
# isnull() and isna()), followed by the number of fully duplicated rows.
null_counts = df_train.isnull().sum()
print(null_counts)
na_counts = df_train.isna().sum()
print(na_counts)
duplicate_count = df_train.duplicated().sum()
print(f'Duplicates: {duplicate_count}')

In [None]:
# Distribution of the target and of the three raw features, one panel each.
fig, ax = plt.subplots(2, 2, figsize=(15, 15))
ax[0, 0].hist(df_train['congestion'], bins=100)
ax[0, 0].set_xlabel('Congestion')

ax[0, 1].hist(df_train['x'])
ax[0, 1].set_xlabel('x')

ax[1, 0].hist(df_train['y'])
ax[1, 0].set_xlabel('y')

# One bin per distinct direction value.
ax[1, 1].hist(df_train['direction'], bins=df_train['direction'].nunique(), align='right')
ax[1, 1].set_xlabel('direction')

# Every panel is a plain count histogram; label the y axis so each stands alone.
for panel in ax.ravel():
    panel.set_ylabel('Count')

# plt.show() (as in the other plotting cells) instead of fig.show(): on a
# non-GUI notebook backend fig.show() only emits a "cannot show the figure"
# warning.
plt.show()

# TODO
* Reweighting of congestion (target)?

In [None]:
# Calendar features taken from the timestamp: 'day' is the day of the week,
# 'month' is the month number.
df_train = df_train.assign(
    day=df_train['time'].dt.dayofweek,
    month=df_train['time'].dt.month,
)

In [None]:
# Row counts per weekday, then per month.
for feature in ('day', 'month'):
    print(df_train[feature].value_counts())

In [None]:
# 2-D histograms of congestion against weekday (left) and month (right).
fig, ax = plt.subplots(1, 2, figsize=(15, 7))

# 'day' and 'month' are integers, so use unit-wide bins centred on each value.
# The previous month range of [0, 11] split into 12 bins dropped month 12 and
# put bin edges at non-integer positions that did not line up with the ticks.
weekday_edges = np.arange(-0.5, 7.5, 1.0)   # 7 bins centred on 0..6
month_edges = np.arange(0.5, 13.5, 1.0)     # 12 bins centred on 1..12

ax[0].hist2d(df_train['congestion'], df_train['day'], bins=[50, weekday_edges], cmap=mpl.cm.Blues)
ax[0].set_xlabel('Congestion')
ax[0].set_ylabel('Weekday')

ax[1].hist2d(df_train['congestion'], df_train['month'], bins=[50, month_edges], cmap=mpl.cm.Blues, range=([0, 100], [0.5, 12.5]))
ax[1].set_xlabel('Congestion')
ax[1].set_ylabel('Month')

plt.show()

# Where do the congestion spikes come from?

In [None]:
# One panel per weekday showing the congestion distribution, stacked by
# travel direction.
fig, ax = plt.subplots(3, 3, figsize=(30, 30))
unique_directions = df_train['direction'].unique()

# Maps the integer stored in the 'day' column to a panel title.
title_map = {
    0 : 'Monday',
    1 : 'Tuesday',
    2 : 'Wednesday',
    3 : 'Thursday',
    4 : 'Friday',
    5 : 'Saturday',
    6 : 'Sunday',
}

ax = ax.ravel()
for i in range(7):
    day_view = df_train[df_train['day'] == i]
    # stacked=True only takes effect when all datasets go into ONE hist call;
    # drawing each direction in its own call just overplots the bars and
    # hides whichever directions were drawn first.
    congestion_by_direction = [
        day_view.loc[day_view['direction'] == direction, 'congestion'].to_numpy()
        for direction in unique_directions
    ]
    ax[i].hist(congestion_by_direction, label=list(unique_directions), bins=50, stacked=True)
    ax[i].legend(loc='best')
    ax[i].set_xlabel('Congestion')
    ax[i].set_ylabel('Count/bin')
    ax[i].set_title(title_map[i])

# The 3x3 grid has nine axes but only seven weekdays: hide the empty frames.
for unused_axis in ax[len(title_map):]:
    unused_axis.set_visible(False)

ax = ax.reshape(3, 3)
plt.show()

* Are the spikes limited to NW and SE traffic?
* What is special about these values?
* The day of week and travel direction already seem like a good enough grouping to form sensible predictions of congestion
* Try submitting the mean of these groupings

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so none
# of it executes. If enabled it would import the third-party `holidays`
# package and add a 0/1 'national_holiday' column to df_train, set by whether
# each timestamp is found in holidays.US().
"""import holidays
holiday_list = holidays.US()
def is_holiday(x):
    return int(x in holiday_list)
df_train['national_holiday'] = df_train['time'].apply(is_holiday)"""

In [None]:
# Disabled plot: the code below is wrapped in a string literal, so none of it
# executes. If enabled it would draw an annotated seaborn heatmap of
# df_train.corr().
"""fig, ax = plt.subplots(1, 1, figsize=(10, 10))
sns.heatmap(df_train.corr(), cmap=sns.color_palette("vlag", as_cmap=True), square=True, ax=ax, annot=True)
plt.show()"""

# Congestion probably also has a high correlation with time of day, let's check

In [None]:
# Hour of day taken from the timestamp, stored as its own column.
df_train = df_train.assign(hour=df_train['time'].dt.hour)

In [None]:
# 2-D histogram of congestion (y) against hour of day (x): 24 x 50 bins.
fig, ax = plt.subplots(figsize=(7, 7))
ax.hist2d(df_train['hour'], df_train['congestion'], bins=(24, 50))
ax.set(xlabel='Time', ylabel='Congestion')
plt.show()

## Naive model: mean congestion grouped by direction, day of week, hour, and road position (x, y)

In [None]:
# Naive model: the mean congestion observed in training for every
# (hour, direction, x, y, day) combination.
# Only the target column is aggregated. Averaging the whole frame also
# averaged the timestamp and month columns, which are never used downstream.
# The double brackets keep the result a DataFrame with a 'congestion' column,
# so lookups of the form naive_model.loc[...]['congestion'] keep working.
naive_model = df_train.groupby(['hour', 'direction', 'x', 'y', 'day'])[['congestion']].mean()

In [None]:
# Print the grouped-mean table built by the groupby cell as a sanity check.
print(naive_model)

## Adding features to testing

In [None]:
# Give the test frame the same time-derived columns as the training frame.
# assign() evaluates its keyword arguments in order, so the lambdas for
# hour/day/month see the already-converted 'time' column.
df_test = df_test.assign(
    time=lambda frame: pd.to_datetime(frame['time']),
    hour=lambda frame: frame['time'].dt.hour,
    day=lambda frame: frame['time'].dt.dayofweek,
    month=lambda frame: frame['time'].dt.month,
)

In [None]:
# Predict each test row as the training mean of its
# (hour, direction, x, y, day) group.
#
# This is done with one vectorised left join instead of a Python loop over
# df_test.iloc[i] followed by DataFrame.append, which was removed in pandas 2.0.
group_keys = ['hour', 'direction', 'x', 'y', 'day']

# Flatten the grouped means into ordinary columns: the keys plus 'congestion'.
group_means = naive_model['congestion'].reset_index()

# A left join keeps every test row in its original order; many_to_one asserts
# that each key combination appears at most once in the lookup table.
predictions = df_test[['row_id'] + group_keys].merge(
    group_means, on=group_keys, how='left', validate='many_to_one'
)

# A test row whose group never occurred in training has no mean to look up.
# The per-row .loc lookup raised KeyError in that case; keep failing loudly
# rather than writing NaN predictions.
unmatched = predictions['congestion'].isna()
if unmatched.any():
    raise KeyError(
        f'{int(unmatched.sum())} test rows have no matching '
        f'{tuple(group_keys)} group in naive_model'
    )

# Submission format: row_id plus the mean rounded to an integer.
predictions = predictions[['row_id', 'congestion']].assign(
    congestion=lambda frame: frame['congestion'].round().astype(int)
)
print(predictions)

In [None]:

# Write the predictions frame to submission.csv, leaving out the DataFrame index.
predictions.to_csv('submission.csv', index=False)