---
In this notebook I perform a short analysis of the traffic volumes. I will keep adding content; it's a work in progress.

   <a id="toc"></a>
   
   - [1. EDA - Short Insights](#1)
   - [2. Local Cross Validation Strategy](#2)
   - [3. Advanced Central Tendency Approach](#3)

---

<a id="1"></a>
### **EDA - short insights**

In [None]:
# import libraries
import pandas as pd
import numpy as np

from datetime import datetime, time
import matplotlib.pyplot as plt
import matplotlib 
from matplotlib import gridspec
import seaborn as sns
from scipy.stats.mstats import gmean, hmean

import warnings
warnings.filterwarnings("ignore")


# Matplotlib defaults shared by every figure in this notebook
plt.style.use("Solarize_Light2")
plt.rcParams.update({
    "figure.autolayout": True,
    "figure.figsize": (20, 10),
    "axes.labelweight": "bold",
    "axes.labelsize": "large",
    "axes.titleweight": "bold",
    "axes.titlesize": 20,
    "axes.titlepad": 10,
})

# Load the competition files: `row_id` becomes the index, `time` is parsed as datetime
read_options = dict(index_col='row_id', parse_dates=['time'])
train_raw = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv', **read_options)
test_raw = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv', **read_options)



In [None]:
# Timeline of 30 September 1991: where the training data stops and where the
# test window starts and ends.
dates = [datetime(1991, 9, 30, 0, 0, 0), train_raw['time'].max(), test_raw['time'].min(), test_raw['time'].max()]

# x-range of the figure: the whole day plus one hour of padding on the left
min_date = datetime(1991, 9, 29, 23, 0, 0)
max_date = datetime(1991, 9, 30, 23, 59, 0)

captions = ["", 'End train period', 'Start test period', 'End test period']
# each label shows the time of day on the first line and its caption below
labels = ['{0:%H:%M}\n{1}'.format(d, l) for l, d in zip(captions, dates)]

fig, ax = plt.subplots(figsize=(15, 2), constrained_layout=False)
ax.set_ylim(-2, 1.75)
ax.set_xlim(min_date, max_date)
ax.axhline(0, xmin=0.05, xmax=0.95, c='deeppink', zorder=1)

# two stacked scatters draw each event as a dot with a lighter halo
ax.scatter(dates, np.zeros(len(dates)), s=120, c='palevioletred', zorder=2)
ax.scatter(dates, np.zeros(len(dates)), s=30, c='darkmagenta', zorder=3)

# alternate labels and stems above/below the axis so neighbouring events do not overlap
is_above = np.arange(len(dates)) % 2 == 0
label_offsets = np.where(is_above, 1.0, -1.5)
stems = np.where(is_above, 0.9, -0.9)

for label, date, offset in zip(labels, dates, label_offsets):
    ax.text(date, offset, label, ha='center', fontfamily='serif', fontweight='bold', color='black', fontsize=10)

# `use_line_collection` is no longer passed: it was deprecated and later removed
# from Matplotlib, and stems are drawn as a LineCollection by default anyway.
markerline, stemline, baseline = ax.stem(dates, stems)
plt.setp(markerline, marker=',', color='darkmagenta')
plt.setp(stemline, color='darkmagenta')

# hide lines around chart
for spine in ["left", "top", "right", "bottom"]:
    ax.spines[spine].set_visible(False)

# hide tick labels
ax.set_xticks([])
ax.set_yticks([])

ax.set_title('30 September 1991', fontweight="normal", fontfamily='serif', fontsize=12, color='black');


In [None]:
# Function to extract time components
def time_components(df):
    """Return a copy of `df` extended with calendar fields of its `time` column.

    The columns `year`, `month`, `day`, `hour`, `minute` and `weekday` are
    read from the `.dt` accessor of `df['time']` and added in that order.
    The frame passed in is not modified.
    """
    enriched = df.copy()

    for field in ('year', 'month', 'day', 'hour', 'minute', 'weekday'):
        enriched[field] = getattr(enriched['time'].dt, field)

    return enriched

# Add time components
train = time_components(train_raw)
test = time_components(test_raw)

# Prepare data for plot
weekday = {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thur", 4: "Fri", 5: "Sat", 6: "Sun"}
train["day_week"] = train["weekday"].map(weekday)

# Average congestion per (weekday, hour), ordered Monday 0h ... Sunday 23h.
# The numeric `weekday` is kept as a key only so the rows can be sorted chronologically.
df_plot = (
    train.groupby(["day_week", "weekday", "hour"], as_index=False)["congestion"]
    .mean()
    .sort_values(by=["weekday", "hour"])
)
df_plot["hour"] = df_plot["hour"].astype(int)
df_plot["weekday-hour"] = df_plot["day_week"] + "-" + df_plot["hour"].astype(str)

# plot
fig = plt.figure(constrained_layout=False)
spec = gridspec.GridSpec(ncols=2, nrows=2, figure=fig)

# top-left: number of rows per direction
ax1 = fig.add_subplot(spec[0, 0])
sns.countplot(y="direction", ax=ax1, data=train)
ax1.set(xlabel="Number of rides", ylabel="")
ax1.set_title('Destinations')

# top-right: one bar per integer congestion value; the over-represented levels
# are painted red. (`palette` is ignored by histplot when no `hue` is given, so
# the bars are recoloured explicitly instead.)
spike_levels = [15, 20, 21, 29, 34]
ax2 = fig.add_subplot(spec[0, 1])
sns.histplot(x="congestion", ax=ax2, data=train, discrete=True, color='#ffd700')
for bar in ax2.patches:
    if round(bar.get_x() + bar.get_width() / 2) in spike_levels:
        bar.set_facecolor('r')
ax2.set(xlabel="Congestion values", ylabel="")
ax2.set_title('Congestion Histogram')

# bottom: average congestion for every hour of the week
ax3 = fig.add_subplot(spec[1, :])
sns.lineplot(x="weekday-hour", y="congestion", data=df_plot, color="g", ax=ax3)

days = ["Mon-0", "Tue-0", "Wed-0", "Thur-0", "Fri-0", "Sat-0", "Sun-0"]
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# a vertical rule and a caption at the first hour of each day
for day_start, day_name in zip(days, weekdays):
    ax3.axvline(x=day_start, color='k', alpha=0.3)
    ax3.text(day_start, 56, day_name, fontsize=12, verticalalignment='top', rotation="horizontal", color="k", fontweight="bold")

ax3.set_xlim(-1, "Sun-23")
# ticks are placed at the hours called out in the takeaways below
ax3.xaxis.set_ticks(["Mon-8","Mon-16", "Tue-8","Tue-17", "Wed-8","Wed-17", "Thur-8", "Thur-17", "Fri-8","Fri-17", "Sat-15", "Sun-14"])
ax3.set_xticklabels(["8.00 AM", "16.00 PM", "8.00 AM","17.00 PM", "8.00 AM","17.00 PM", "8.00 AM","17.00 PM","8.00 AM", "17.00 PM","15.00 PM", "14.00 PM"])
ax3.set_title('Hourly Congestion Level')
ax3.set(ylabel="Average congestion", xlabel=None)

plt.setp(ax3.get_xticklabels(), rotation=0, weight='bold');

* **Takeaways**
    - The second plot has some interesting observations at `[15, 20, 21, 29, 34]`. We have to dig deeper here to understand why these congestion levels are overrepresented. When using a Random Forest, one must adjust for this with the `sample_weight` parameter while fitting. I will get back to this at a later moment.
    - The third plot details the normal traffic congestion pattern on a daily basis, for each hour of the day. Congestion is seen to peak around 8:00 AM and 5:00 PM (17:00).
    - In the second plot we can see that congestion level `0` is overrepresented compared to the congestion levels `1, 2, 3,...`. Could this be road closures, for example? We have to dig into the data and see what is happening. 

---

<a id="2"></a>

### **Local Cross Validation Strategy**

The second step after EDA in every Kaggle competition is to build a reliable local validation strategy. By reliable I mean a `local CV` score that correlates with the `LB` score, because then we can use our local CV score to evaluate experiments or to tune (hyper)parameters. There are two questions that I usually try to answer. 

1. How to split the data in `train` and `validation` (there are a lot of different strategies) and 
2. Once a strategy is chosen, does the `LB` score move in the same direction as the `local CV` score? If the answer is yes, then the relationship between your local folds is probably the same as the relationship between Kaggle's train and test sets. If not, try another CV strategy; and if you cannot find a reliable `local CV`, it is probably time to stop taking part in the competition, because you might be highly disappointed after the final shake-up. 
    

For this competition I have the following CV strategy:
    
 - *Train/Validation split* : All `Mondays` from `12 noon` until `12 midnight` are selected. September 2, which is Labour Day, is excluded from the analysis because it had a negative impact on the performance. The remaining Mondays of September are used as `3 validation` folds and the Mondays of the other months are used as training data.
 - *Local CV and LB agreement*: See next section where **Advanced Central Tendency Approach** is applied to predict congestion.

In [None]:
# Mean congestion of every Monday-afternoon time step, used to compare the
# Mondays with each other (and to spot the holidays).
# .copy() gives an independent frame, so adding "date" does not write into a slice of `train`.
train_monday = train.loc[(train["weekday"]==0) & (train["hour"]>=12),:].copy()
train_monday["date"] = train_monday["time"].dt.date

# one row per timestamp: congestion averaged over all roadways
mean_train_monday = train_monday.groupby("time", as_index=False)["congestion"].mean()
mean_train_monday = time_components(mean_train_monday)

# categorical x label of the form "month-day hour:minute", e.g. "4-1 12:0"
mean_train_monday["weekday_hour"] = mean_train_monday["month"].astype(str) + "-" + mean_train_monday["day"].astype(str) + " " \
    + mean_train_monday["hour"].astype(str)+ ":" + mean_train_monday["minute"].astype(str)
mean_train_monday = mean_train_monday.sort_values(by=["time"])

from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,AutoMinorLocator)
fig = plt.figure(constrained_layout=False)
spec = gridspec.GridSpec(ncols=1, nrows=2, figure=fig)


ax1 = fig.add_subplot(spec[0,0])
sns.scatterplot(x="weekday_hour", y="congestion", data=mean_train_monday, color="g", ax=ax1)

# first Monday (12:00) of each month: tick position, month caption and tick label
mondays = ["4-1 12:0","5-6 12:0","6-3 12:0","7-1 12:0","8-5 12:0","9-2 12:0"]
months = ["April", "May", "June", "July", "August", "September"]
xticklabels = ["April-1", "May-6", "June-3", "July-1", "August-5", "September-2"]

ax1.xaxis.set_ticks(mondays)
ax1.set_xticklabels(xticklabels)

ax1.set_xlim("4-1 12:0", "9-30 12:0")


for monday, month in zip(mondays, months):
    ax1.axvline(x=monday, color='k', alpha=0.3)
    # `fontweight` (not `fontproperties`) is the keyword that makes the caption bold
    ax1.text(monday, 61, month, fontsize=12, verticalalignment='top', rotation="horizontal", color="k", fontweight="bold")

plt.setp(ax1.get_xticklabels(), rotation=45, weight='bold');

# The shaded rectangles are positioned by hand in figure coordinates, so they
# only line up with the holidays for this figure size and layout.
fig.patches.extend([plt.Rectangle((0.845,0.545),0.04,0.15,
                                  fill=True, color='r', alpha=0.2, zorder=1000,
                                  transform=fig.transFigure, figure=fig)])
ax1.annotate("Labour Day", ("9-2 12:0", 43), ("8-19 12:0", 40), arrowprops={"arrowstyle": "simple"},
             fontfamily="cursive", fontsize=15)

fig.patches.extend([plt.Rectangle((0.3259,0.545),0.04,0.265,
                                  fill=True, color='r', alpha=0.2, zorder=1000,
                                  transform=fig.transFigure, figure=fig)])
ax1.annotate("Memorial Day", ("6-3 12:0", 43), ("6-10 12:0", 40), arrowprops={"arrowstyle": "simple"},
             fontfamily="cursive", fontsize=15)

ax1.set_title('Monday Congestion (Hourly-Minute)')
ax1.set(ylabel="Mean congestion", xlabel=None);

* Takeaways
    - The month of `August` has the highest congestion level, and it looks pretty consistent across all of the Mondays of this month.
    - The first Monday of `September` is `Labour Day`, which is why the congestion level is significantly lower.
    - For the month of `May`, the last Monday is `Memorial Day`, which is why congestion levels are relatively low.
    - My general observation is that congestion levels are more or less the same for the Mondays within the same month, excluding holidays of course.

---
<a id="3"></a>

### **Advanced Central Tendency Approach**

In this section I would like to see how central tendency approaches like the `geometric mean`, `harmonic mean`, `arithmetic mean`, `median` and `mode` perform. I validate my results against the afternoon `congestion` level for `9, 16` and `23 September`. `Labour Day` is removed, which increased the performance. I also removed `Memorial Day`, but that did not improve the performance. After calculating the above-mentioned measures, I also took a linear ensemble of some of them. The coefficients were calculated with the help of linear regression.

In [None]:
# Rebuild both feature frames from the raw data so this section starts from a clean state
train, test = (time_components(raw) for raw in (train_raw, test_raw))

In [None]:
# Local validation of the central-tendency predictors: fit on the Monday
# afternoons before September, score on the September Mondays (without Labour Day).
from sklearn import linear_model

slot_keys = ['x', 'y', 'direction', 'hour', 'minute']
measures = ["median", "g_mean", "h_mean", "mean", "mode"]

# .copy() gives an independent frame, so adding "date" does not write into a slice of `train`
train_monday = train.loc[(train["weekday"]==0) & (train["hour"]>=12),:].copy()
train_monday["date"] = train_monday["time"].dt.date

monday_dates = list(train_monday['date'].unique())
train_period = [day for day in monday_dates if (day.month!=9)]
val_period = [day for day in monday_dates if (day.month==9 and day.day!=2)]

train_X = train_monday.loc[train_monday['date'].isin(train_period),:]
train_X = train_X.loc[(train_X["congestion"]>=0) & (train_X["congestion"]<=100),:]

train_val = train_monday.loc[train_monday['date'].isin(val_period),:]

# Central measures of congestion for every roadway (x, y, direction) and time of day.
# Median and mode use all rows; the three means only use rows with congestion > 0.
per_slot = train_X.groupby(slot_keys)["congestion"]
per_slot_positive = train_X.loc[train_X["congestion"]>0,:].groupby(slot_keys)["congestion"]

# Every statistic is named before it is joined. The earlier chain of merges on
# five columns that were all called "congestion" relied on automatic suffixes
# and produced duplicate `congestion_x` / `congestion_y` columns on the way.
central_measures = (
    per_slot.median().rename("median").to_frame()
    .join(per_slot_positive.apply(hmean).rename("h_mean"))
    .join(per_slot_positive.apply(gmean).rename("g_mean"))
    .join(per_slot_positive.mean().rename("mean"))
    .join(per_slot.agg(lambda s: s.value_counts().index[0]).rename("mode"))
)

# train set error performance
train_score = train_X.merge(central_measures, how="left", left_on=slot_keys, right_on=slot_keys)
for measure in measures:
    train_score[f"{measure}_mae"] = np.abs(train_score["congestion"] - train_score[measure])

X = train_score[["median", "g_mean"]]
y = train_score["congestion"]

# The ensemble below is a pure weighted sum, so the regression is fitted without
# an intercept; otherwise a fitted constant would be silently dropped from it.
lm = linear_model.LinearRegression(fit_intercept=False)
model = lm.fit(X,y)
predictions = lm.predict(X)
print(f'Median linear weight:{np.round(lm.coef_[0],4)}\n')
print(f'Geometric Mean linear weight:{np.round(lm.coef_[1],4)}\n')

train_score["ensemble"] = lm.coef_[0]*train_score["median"] + lm.coef_[1]*train_score["g_mean"]
train_score["ensemble_mae"] = np.abs(train_score["congestion"] - train_score["ensemble"])

# validation performance
val_score = train_val.merge(central_measures, how="left", left_on=slot_keys, right_on=slot_keys)
for measure in measures:
    val_score[f"{measure}_mae"] = np.abs(val_score["congestion"] - val_score[measure])

val_score["ensemble"] = lm.coef_[0]*val_score["median"] + lm.coef_[1]*val_score["g_mean"]
val_score["ensemble_mae"] = np.abs(val_score["congestion"] - val_score["ensemble"])

# mean absolute error of every approach on each validation Monday
val_score[["date", "median_mae", "g_mean_mae", "h_mean_mae", "mean_mae", "mode_mae", "ensemble_mae"]].groupby(["date"]).mean()

---
* **Takeaways**
    - The `geometric` and `harmonic` means outperform the other approaches.
    - The ensemble of the `median` and `geometric mean` gives the best `local CV` and `LB` performance.
    - The `geometric mean` approach scored `4.967` on the public LB and the `ensemble` scored `4.940`, which is in line with the local CV. 
---

*Stay tuned for:*

- Error Analysis of Central Tendency Approach
- Tree Based application
- Analysis of `0` congestion
- ...

---
### Submission

In [None]:
# Refit the central-tendency ensemble on all Monday afternoons (except Labour
# Day) and attach its predictions to the test rows.
from sklearn import linear_model

slot_keys = ['x', 'y', 'direction', 'hour', 'minute']
labour_day = datetime(1991, 9, 2).date()

# .copy() gives an independent frame, so adding "date" does not write into a slice of `train`
train_monday = train.loc[(train["weekday"]==0) & (train["hour"]>=12),:].copy()
train_monday["date"] = train_monday["time"].dt.date

# Drop the Labour Day Monday. The previous filter (`day.month != 2`) removed
# nothing, because none of the Mondays used here falls in February.
train_period = [day for day in list(train_monday['date'].unique()) if day != labour_day]

train_X = train_monday.loc[train_monday['date'].isin(train_period),:]
train_X = train_X.loc[(train_X["congestion"]>=0) & (train_X["congestion"]<=100),:]

# Central measures of congestion for every roadway (x, y, direction) and time of day.
# Median and mode use all rows; the three means only use rows with congestion > 0.
per_slot = train_X.groupby(slot_keys)["congestion"]
per_slot_positive = train_X.loc[train_X["congestion"]>0,:].groupby(slot_keys)["congestion"]

# Every statistic is named before it is joined, which avoids the duplicate
# `congestion_x` / `congestion_y` columns of suffix-based merges.
central_measures = (
    per_slot.median().rename("median").to_frame()
    .join(per_slot_positive.apply(hmean).rename("h_mean"))
    .join(per_slot_positive.apply(gmean).rename("g_mean"))
    .join(per_slot_positive.mean().rename("mean"))
    .join(per_slot.agg(lambda s: s.value_counts().index[0]).rename("mode"))
)

# in-sample absolute errors, kept as diagnostics
df_score = train_X.merge(central_measures, how="left", left_on=slot_keys, right_on=slot_keys)
for measure in ["median", "g_mean", "h_mean", "mean", "mode"]:
    df_score[f"{measure}_mae"] = np.abs(df_score["congestion"] - df_score[measure])

## Without a constant
X = df_score[["median", "g_mean"]]
y = df_score["congestion"]

# fit_intercept=False matches the weighted sum below, which has no constant term
lm = linear_model.LinearRegression(fit_intercept=False)
model = lm.fit(X,y)
predictions = lm.predict(X)

df_score["ensemble"] = lm.coef_[0]*df_score["median"] + lm.coef_[1]*df_score["g_mean"]
df_score["ensemble_mae"] = np.abs(df_score["congestion"] - df_score["ensemble"])

test = time_components(test_raw).reset_index()

# One prediction row per (x, y, direction, hour, minute). The measures are
# constant within a slot, so they are taken straight from `central_measures`.
slot_predictions = central_measures.assign(
    ensemble=lm.coef_[0]*central_measures["median"] + lm.coef_[1]*central_measures["g_mean"]
)[['g_mean', 'h_mean', "median", "mean", "ensemble"]].reset_index()

submission = test.merge(slot_predictions, how="left", on=slot_keys)

In [None]:
# Compare the distribution of the ensemble predictions with the observed
# Monday-afternoon congestion in the training data.
fig, axes = plt.subplots(1,1, figsize=(16,5))

# 101 unit-wide bins centred on the integers 0..100
bins = np.linspace(-0.5, 100.5, 102)
is_monday_afternoon = ((train.time.dt.weekday == 0) & (train.time.dt.hour >= 12)).values

axes.hist(train.congestion[is_monday_afternoon],
          bins=bins, density=True, label='Train', color='b')

# narrower bars (rwidth=0.5) keep the train histogram visible behind the predictions
axes.hist(submission['ensemble'], bins,
          density=True, rwidth=0.5, label='Test predictions', color='r')
axes.legend()

# `fontstyle` is the keyword for italics; a bare string passed as
# `fontproperties` is interpreted as a font family name instead.
axes.set_title("Ensemble (Geometric Mean & Median)", fontsize=12, fontstyle="italic")
axes.set_ylabel('Density', fontstyle="italic")
axes.set_xlabel('Congestion', fontstyle="italic")


plt.show()

In [None]:
# Keep only the ensemble prediction, rename it to the target column, round it
# to whole congestion levels and write the file indexed by row_id.
submission = (
    submission[['row_id', 'ensemble']]
    .rename(columns={"ensemble": "congestion"})
    .set_index("row_id")
    .assign(congestion=lambda frame: frame['congestion'].round().astype(int))
)
submission.to_csv("submission.csv", index=True)

submission.head(5)