# Clustering morning commutes
Intuitively, it makes sense that days with similar morning congestion will have similar congestion levels for the rest of the day. I therefore tried clustering the morning commutes, pooling the afternoon/evening commutes of the days in each cluster, and using the pooled values as predictions.

In [None]:
import numpy as np 
import pandas as pd 
import re
import datetime
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import time
from joblib import Parallel, delayed
import seaborn as sns

# Directory prefix (with trailing slash) that the train/test CSV file names are appended to.
input_dir = '/kaggle/input/tabular-playground-series-mar-2022/'

In [None]:
##Feature engineering
def handle_dates(df, time_mapping=None):
    """Add parsed date/time feature columns to ``df`` in place and return it.

    The raw ``time`` column is parsed into a new ``datetime`` column; ``time``
    is then overwritten with the time-of-day, and ``time_number``, ``date``
    and ``weekday`` (Monday == 0) columns are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time`` column that ``pd.to_datetime`` can parse.
    time_mapping : dict, optional
        Maps each ``datetime.time`` value to its integer slot number. When
        omitted, it is built from the notebook-level ``train`` frame by
        numbering its distinct times in order of first appearance, so
        ``train`` must either be ``df`` itself or have already been passed
        through this function.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, mutated.

    Raises
    ------
    KeyError
        If ``df`` contains a time-of-day that is missing from the mapping.
    """
    parsed = pd.to_datetime(df['time'])
    df['datetime'] = parsed
    # The .dt accessors replace the per-row Python loops of the original.
    df['time'] = parsed.dt.time
    if time_mapping is None:
        # Backward-compatible default: derive the slot numbers from the
        # global training frame, exactly as before.
        time_mapping = {t: ii for ii, t in enumerate(train.time.unique())}
    # Explicit lookup (rather than Series.map) so an unknown time still
    # raises KeyError instead of silently becoming NaN.
    df['time_number'] = [time_mapping[t] for t in df['time']]
    df['date'] = parsed.dt.date
    df['weekday'] = parsed.dt.weekday
    return df

In [None]:
##Load and transform data
# `train` has to be bound before handle_dates is first called, because
# handle_dates reads the global `train` to number the time slots.
train = pd.read_csv(f'{input_dir}train.csv')
train = handle_dates(train)
test = pd.read_csv(f'{input_dir}test.csv')
test = handle_dates(test)

print(f'Train shape: {train.shape}, Test shape:{test.shape}')

In [None]:
##Pivot the data to be a single row per day
def _loc_dir_time_keys(frame):
    """Return one key per row: x, y, direction and time_number joined with no separator."""
    return [
        str(x) + str(y) + direction + str(t)
        for x, y, direction, t in zip(
            frame['x'], frame['y'], frame['direction'], frame['time_number']
        )
    ]


train.loc[:, 'loc_dir_time'] = _loc_dir_time_keys(train)
test.loc[:, 'loc_dir_time'] = _loc_dir_time_keys(test)

# One row per (date, weekday), one column per location/direction/time key;
# cells hold the congestion aggregated by pivot_table's default (mean).
Xy = (
    train.loc[:, ['loc_dir_time', 'weekday', 'congestion', 'date']]
    .pivot_table(values='congestion', index=['date', 'weekday'], columns=['loc_dir_time'])
    .reset_index()
)

In [None]:
##Save the 30th of September for the final prediction
# Plain boolean masks are used instead of DataFrame.query so the comparison
# does not depend on how the query parser resolves the unprefixed `datetime`
# name inside the expression string.
final_day = datetime.date(1991, 9, 30)
FINAL = Xy[Xy['date'] == final_day]
# Everything strictly before the held-out day stays available for clustering.
Xy = Xy[Xy['date'] < final_day]

In [None]:
##Some days have missing timestamps for certain location and direction combinations
##At the moment I am just imputing using the median value for that location, direction, time, day combination
##However, I highly suspect this isn't the best method as it may well be causing data leakage in the later analysis
##Fixing this is on my todo list
# Move `date` into the index *before* computing the per-weekday medians, so the
# groupby transform only sees the numeric congestion columns. Taking a median
# over the object-dtype `date` column is not something newer pandas releases
# silently skip. The filled result is the same frame, indexed by date.
Xy = Xy.set_index('date')
Xy = Xy.fillna(Xy.groupby('weekday').transform('median'))

In [None]:
##Split the data into independent and dependent data by using all times from noon-midnight as dependent variables and midnight-noon as independent variables
cols = Xy.columns
# The first column is skipped (it is not a location/direction/time key); for
# the rest, the time slot number is the run of digits that follows the
# upper-case direction letters.
time_number = [int(re.findall('(?<=[A-Z])[0-9]+', name)[0]) for name in cols[1:]]

# Slots below 36 are the predictors, the remaining slots are the targets.
is_morning = [t < 36 for t in time_number]
is_afternoon = [not flag for flag in is_morning]

# The leading False drops the first (non-key) column of Xy.
X = Xy.loc[:, [False] + is_morning]
y = Xy.loc[:, [False] + is_afternoon]

# All days are used for training (a weekday-only filter, Xy.weekday<5, was left disabled).
X_train, y_train = X, y
yt = y_train.values

# FINAL still carries two leading non-key columns, hence two leading Falses.
X_final = FINAL.loc[:, [False, False] + is_morning]

In [None]:
##Create some functions for checking the results
##Try raw predictions, rounding, and taking the floor of them
def get_mae(A):
    """Average a collection of absolute-error arrays.

    Parameters
    ----------
    A : iterable of array-like
        One array of absolute errors per item.

    Returns
    -------
    tuple
        ``(overall, per_item)`` where ``per_item`` is the list of each
        array's mean and ``overall`` is the mean of that list.
    """
    per_item = list(map(np.mean, A))
    return np.mean(per_item), per_item
    
def get_scores(n,c1,c2,res, mae, PP, pooling_method):
    """Score one set of predictions as raw, rounded and floored values.

    Parameters
    ----------
    n, c1, c2 : int
        Neighbour count and clustering-window bounds; only recorded in the
        result rows.
    res : list
        Running list of result rows; three rows
        ``[n, c1, c2, label, score, pooling_method]`` are appended.
    mae : list of three lists
        Running per-item scores for raw / rounded / floor; one list is
        appended to each.
    PP : sequence of array-like
        Predictions, one entry per row of the global ``yt``.
    pooling_method : str
        Label stored with each result row.

    Returns
    -------
    tuple
        ``(res, [raw, rounded, floor], mae)``.
    """
    variants = (
        ('raw', lambda pred: pred),
        ('rounded', np.round),
        ('floor', np.floor),
    )

    overall_scores = []
    for slot, (label, transform) in enumerate(variants):
        # Absolute error of every prediction against the matching row of yt.
        abs_errors = [np.abs(yt[row, :] - transform(pred)) for row, pred in enumerate(PP)]
        overall, per_item = get_mae(abs_errors)
        res.append([n, c1, c2, label, overall, pooling_method])
        mae[slot].append(per_item)
        overall_scores.append(overall)

    return res, overall_scores, mae

def update(best_score,
           best_params,
           score,
           name,
           method,
           n=None,
           cutoff=None,
           max_cutoff=None):
    """Keep track of the lowest score seen so far and its parameters.

    Parameters
    ----------
    best_score : float
        Lowest score found so far.
    best_params : list
        Parameters that produced ``best_score``.
    score : float
        Candidate score (lower is better).
    name : str
        Label of the adjustment applied to the predictions.
    method : str
        Label of the pooling method.
    n, cutoff, max_cutoff : optional
        Neighbour count and window bounds of the candidate. Previously these
        were read from same-named globals, which raised ``NameError`` when
        they were not defined. If an argument is left as ``None`` the
        same-named global is still used when it exists, otherwise ``None``
        is recorded.

    Returns
    -------
    tuple
        ``(best_score, best_params)``, replaced by the candidate only when
        ``score`` is strictly lower than ``best_score``.
    """
    if score < best_score:
        # Fall back to the notebook globals for callers that relied on them.
        module_scope = globals()
        if n is None:
            n = module_scope.get('n')
        if cutoff is None:
            cutoff = module_scope.get('cutoff')
        if max_cutoff is None:
            max_cutoff = module_scope.get('max_cutoff')
        best_score = score
        best_params = [n, cutoff, max_cutoff, name, method]
    return best_score, best_params

In [None]:
##Try clustering using:
## 1. different cluster sizes
## 2. different amounts of morning data (i.e. midnight-1am may not be as important as it's not as recent as the other times)
## 3. pooling the neighbours by mean or median

def myfunc(arg):
    """Score nearest-neighbour pooling for one clustering window.

    Parameters
    ----------
    arg : tuple
        ``(c1, c2, max_clust)``: inclusive first and last time slot of the
        window used to compare days, and the number of neighbours requested
        from the search.

    Returns
    -------
    tuple
        ``(res, mae)`` as accumulated by ``get_scores`` for every neighbour
        count from 1 to ``max_clust - 1`` with both mean and median pooling.
    """
    c1, c2, max_clust = arg

    # Restrict the predictor columns (slots below 36) to the [c1, c2] window.
    in_window = [c1 <= t <= c2 for t in time_number if t < 36]
    X_window = X_train.loc[:, in_window]

    NN = NearestNeighbors(n_neighbors=max_clust, metric='manhattan')
    NN.fit(X_window)
    distances, indices = NN.kneighbors(X_window)

    res = []
    mae = [[], [], []]
    for n in range(1, max_clust):
        # Position 0 of each neighbour list is skipped: the query rows are the
        # fitted rows themselves, so the closest match is normally the day itself.
        pooled_mean = [np.mean(yt[nbrs[1:(n + 1)], :], axis=0) for nbrs in indices]
        res, scores, mae = get_scores(n, c1, c2, res, mae, pooled_mean, 'mean')

        pooled_median = [np.median(y_train.iloc[nbrs[1:(n + 1)], :], axis=0) for nbrs in indices]
        res, scores, mae = get_scores(n, c1, c2, res, mae, pooled_median, 'median')
    return res, mae

##################################################
##Commented out as it takes a looooong time to run
##################################################
#results = res = Parallel(n_jobs=4)(
#                 map(delayed(myfunc), [(c1,c2, 182) for c1 in range(36) for c2 in range(c1,36)]))

# Every (start, end) window with 0 <= start <= end <= 35, each searched with 50 neighbours.
window_grid = [(c1, c2, 50) for c1 in range(36) for c2 in range(c1, 36)]
results = res = Parallel(n_jobs=4)(delayed(myfunc)(window) for window in window_grid)

In [None]:
##Extract results

#res contains all the results with scores for each day aggregated by mean
res = [row for rows, _ in results for row in rows]

#mae contains all the non-aggregated scores for each day for each condition tried
##Going to find the optimum parameter set for:
    ##Non-national holiday days as they look different
holidays = [datetime.date(1991, 4, 1), datetime.date(1991, 5, 27), datetime.date(1991, 9, 2)]
non_holidays = [day not in holidays for day in Xy.index]
    ##Weekdays as they are likely to be more similar
days_to_use = np.flatnonzero((Xy.weekday.values < 5) & non_holidays)

# Re-average the per-day scores over the selected days only. The loop order
# (result, scoring call, adjustment variant) matches the row order of `res`.
mae = [
    np.mean([per_day[variant][call][day] for day in days_to_use])
    for _, per_day in results
    for call in range(len(per_day[0]))
    for variant in range(3)
]

In [None]:
##Extract parameters
def _result_column(position):
    """Return the ``position``-th field of every row in ``res``."""
    return [row[position] for row in res]


# Row layout: [n, start cutoff, end cutoff, adjustment, score, pooling method].
n_clust = _result_column(0)
start_cutoff = _result_column(1)
end_cutoff = _result_column(2)
adjust = _result_column(3)
# The score stored in each row is replaced by the filtered-day average in `mae`.
score = mae
method = _result_column(5)

In [None]:
#Plot a heatmap of the best score for each window used to cluster days
rev_time_mapping = dict(enumerate(train.time.unique()))

# Scores are negated so that taking the max per window keeps the lowest MAE.
window_scores = pd.DataFrame({
    'score': [-s for s in score],
    'start_cutoff': [rev_time_mapping[s] for s in start_cutoff],
    'end_cutoff': [rev_time_mapping[e] for e in end_cutoff],
})
temp_df = window_scores.groupby(['start_cutoff', 'end_cutoff']).max().reset_index()
heat_grid = pd.pivot_table(temp_df, values='score', index='start_cutoff', columns='end_cutoff')

fig = plt.figure(figsize=(10, 7))
sns.heatmap(heat_grid)
plt.xlabel('End of time window', fontsize=20)
plt.ylabel('Start of time window', fontsize=20)
plt.show()

In [None]:
#Look at how individual parameters vary the score
fig, ax = plt.subplots(1, 3, figsize=(30, 10))

# The two window panels share everything except the data and the x label.
tick_positions = range(0, 36, 5)
window_panels = (
    (ax[0], start_cutoff, 'Start of clustering window'),
    (ax[1], end_cutoff, 'End of clustering window'),
)
for panel, cutoffs, xlabel in window_panels:
    panel.scatter(cutoffs, score)
    panel.set_xlabel(xlabel, fontsize=20)
    panel.set_ylabel('MAE', fontsize=20)
    # Label every fifth slot with its time of day.
    tick_labels = [rev_time_mapping[c] for c in np.unique(cutoffs) if c in tick_positions]
    panel.set_xticks(ticks=tick_positions, labels=tick_labels)

ax[2].scatter(n_clust, score)
ax[2].set_xlabel('Number of neighbours to pool together', fontsize=20)
ax[2].set_ylabel('MAE', fontsize=20)

plt.show()

In [None]:
# Compare the score distributions per adjustment method and per pooling method.
fig, ax = plt.subplots(1, 2, figsize=(15, 7))

strip_panels = zip(ax, (adjust, method), ('Adjustment method', 'Pooling method'))
for panel, groups, xlabel in strip_panels:
    sns.stripplot(x=groups, y=score, ax=panel)
    panel.set_xlabel(xlabel, fontsize=20)
    panel.set_ylabel('MAE', fontsize=20)

plt.show()

In [None]:
##Find optimum parameter set
# Position of the lowest score; every parameter list is aligned with `score`.
best_param_position = np.argmin(score)
n_neighbours = n_clust[best_param_position]
min_cutoff = start_cutoff[best_param_position]
max_cutoff = end_cutoff[best_param_position]
# From here on `adjust` and `method` hold the single best label (and then the
# matching callable) instead of the full per-result lists.
adjust = adjust[best_param_position]
method = method[best_param_position]

# Turn the adjustment label into the function applied to the pooled prediction.
if adjust == 'floor':
    adjust = np.floor
elif adjust == 'rounded':
    adjust = np.round
else:
    # 'raw': leave predictions untouched. The identity function must actually
    # be assigned; a bare lambda expression would leave `adjust` as a string
    # and the later adjust(...) call would fail.
    adjust = lambda x: x

# Turn the pooling label into the matching numpy reducer.
if method == 'median':
    method = np.median
else:
    method = np.mean

In [None]:
##Find prediction for 30th September
# Same window restriction as in the search: predictor slots within [min_cutoff, max_cutoff].
window_mask = [min_cutoff <= t <= max_cutoff for t in time_number if t < 36]

NN = NearestNeighbors(n_neighbors=n_neighbours, metric='manhattan')
NN.fit(X_train.loc[:, window_mask])
distances, indices = NN.kneighbors(X_final.loc[:, window_mask])

# Pool the target columns of the neighbouring days, then apply the chosen adjustment.
pooled = adjust(method(y_train.iloc[indices[0], :], axis=0))

# One-row wide frame -> long (loc_dir_time, congestion) table that can be joined onto test.
wide = pd.DataFrame(dict(zip(y_train.columns, pooled)), index=[0])
sub = pd.melt(wide, value_name='congestion', var_name='loc_dir_time')
test.merge(sub).loc[:, ['row_id', 'congestion']].to_csv('submission.csv', index=False)

# Future steps
1. Fix the imputation method used prior to analysis
2. See how well clustering works for each day individually (unlikely, but hopeful it works best for Mondays!)