**PLEASE UPVOTE https://www.kaggle.com/sytuannguyen/tps-mar-2022-eda-model**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['axes.facecolor'] = 'green'
import plotly.express as px
import seaborn as sns

from sklearn.linear_model import LinearRegression as lr
from sklearn.ensemble import RandomForestRegressor as rfr, ExtraTreesRegressor as etr
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor as lgb

import warnings
warnings.simplefilter('ignore')

In [None]:
# Load the training table with `row_id` as the index and parse the timestamps.
train = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv',
    index_col='row_id',
)
train['time'] = pd.to_datetime(train['time'])

# `time_id` counts the 20-minute slots elapsed since the start of the year.
minutes_into_year = (
    (train['time'].dt.dayofyear - 1) * 24 * 60
    + train['time'].dt.hour * 60
    + train['time'].dt.minute
)
train['time_id'] = (minutes_into_year / 20).astype(int)

In [None]:
# Bare expression: shows the training frame as the cell output in a notebook.
train

In [None]:
# Test rows (indexed by `row_id`) with timestamps parsed to datetimes.
test = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/test.csv',
    index_col='row_id',
)
test['time'] = pd.to_datetime(test['time'])

# Submission template whose `congestion` column is overwritten further down.
submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/sample_submission.csv'
)

In [None]:
# Compass direction -> textual '[dx, dy]' step vector (parsed back when the
# roadway arrows are drawn).
direction_vectors = {
    'NB': '[0, 1]',
    'NE': '[1, 1]',
    'EB': '[1, 0]',
    'SE': '[1, -1]',
    'SB': '[0, -1]',
    'SW': '[-1, -1]',
    'WB': '[-1, 0]',
    'NW': '[-1, 1]',
}
train['dir_xy'] = train['direction'].map(direction_vectors)

In [None]:
# One row per (x, y) location holding the distinct direction vectors seen there.
loc_dir = train.groupby(['x', 'y'])['dir_xy'].unique().reset_index()
loc_dir['num_dir'] = loc_dir['dir_xy'].apply(len)
print(loc_dir)

# Every (location, direction) pair is one roadway.
total_roadways = loc_dir['num_dir'].sum()
print(f'ROADWAYS NUMBER: {total_roadways}')

In [None]:
# Bare expression: shows the training frame (now including `dir_xy`) as the
# cell output in a notebook.
train

In [None]:
# Draw every roadway as an arrow starting at its (x, y) location.
plt.figure(figsize=(10, 7), facecolor='green')

for _, row in loc_dir.iterrows():
    # All arrows of one location share the same origin: shape (2, num_dir).
    origin = np.repeat(np.array([[row.x], [row.y]]), row.num_dir, axis=-1)
    # Parse each '[dx, dy]' string explicitly; eval() would execute arbitrary
    # code and is not needed for two integers.
    vectors = np.array([
        [int(part) for part in text.strip('[]').split(',')]
        for text in row.dir_xy
    ])

    plt.quiver(*origin, vectors[:, 0], vectors[:, 1], scale=15, color='deeppink')

plt.xlim(-0.3, 2.3)
plt.ylim(-0.5, 3.5)
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# The helper column was only needed for the roadway map.
train = train.drop(columns='dir_xy')

In [None]:
# Number of recorded timestamps per roadway.
rows_per_roadway = train.groupby(['x', 'y', 'direction'])['time'].count()
print(rows_per_roadway)

# Count of 20-minute slots between the first and last observation, inclusive.
slot_span = int(train['time_id'].max() - train['time_id'].min() + 1)
print(f'TOTAL TIME INSTANTS WITHIN THE OBSERVATION PERIOD: {slot_span}')

In [None]:
# Complete 20-minute grid covering the observation period.
time = pd.DataFrame(
    pd.date_range('1991-04-01 00:00:00', '1991-09-30 11:40:00', freq='20min'),
    columns=['time'],
)

# Re-index every roadway onto the full grid so that gaps become explicit rows.
dfs = []
for (x, y, direction), roadway in train.groupby(['x', 'y', 'direction']):
    filled = roadway.merge(time, on='time', how='right')
    # The right-merge leaves the roadway keys as NaN on the inserted rows;
    # restore them so sorting and per-roadway filtering keep those rows in place.
    filled['x'] = x
    filled['y'] = y
    filled['direction'] = direction
    dfs.append(filled)

train_new = pd.concat(dfs, axis=0).sort_values(['time', 'x', 'y', 'direction']).reset_index()
# After the fix only measurement columns (e.g. congestion) can be missing.
print(f'MISSING ROW:\n{train_new.isna().sum()}')

In [None]:
# Histogram of congestion over the whole training set.
plt.figure(figsize=(10, 5))
ax = train['congestion'].hist(bins=100, color='yellow')
ax.set_xlabel('CONGESTION', fontsize=16)
ax.set_ylabel('COUNT', fontsize=16)
plt.show()

In [None]:
# Morning vs afternoon congestion histograms, drawn twice with the overlay
# order swapped so that neither distribution stays hidden behind the other.
plt.figure(figsize=(15, 5))

halves = {
    'MORNING': (train.loc[train.time.dt.hour < 12, 'congestion'], 'yellow'),
    'AFTERNOON': (train.loc[train.time.dt.hour >= 12, 'congestion'], 'cyan'),
}
draw_orders = (('MORNING', 'AFTERNOON'), ('AFTERNOON', 'MORNING'))

for position, draw_order in enumerate(draw_orders, start=1):
    plt.subplot(1, 2, position)
    for label in draw_order:
        values, colour = halves[label]
        values.hist(bins=100, color=colour, label=label)
    plt.xlabel('CONGESTION', fontsize=16)
    plt.ylabel('COUNT', fontsize=16)
    plt.legend()

plt.show()

In [None]:
# Same comparison with the morning values shifted up by 5, again drawn in
# both overlay orders.
plt.figure(figsize=(15, 5))

halves = {
    'MORNING + 5': (train.loc[train.time.dt.hour < 12, 'congestion'] + 5, 'magenta'),
    'AFTERNOON': (train.loc[train.time.dt.hour >= 12, 'congestion'], 'red'),
}
draw_orders = (('MORNING + 5', 'AFTERNOON'), ('AFTERNOON', 'MORNING + 5'))

for position, draw_order in enumerate(draw_orders, start=1):
    plt.subplot(1, 2, position)
    for label in draw_order:
        values, colour = halves[label]
        values.hist(bins=100, color=colour, label=label)
    plt.xlabel('CONGESTION', fontsize=16)
    plt.ylabel('COUNT', fontsize=16)
    plt.legend()

plt.show()

In [None]:
# One figure per location, one congestion histogram per direction present.
all_directions = train.direction.unique()

for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 12))
        at_location = (train.x == x) & (train.y == y)

        for idx, direction in enumerate(all_directions):
            roadway = train[at_location & (train.direction == direction)]
            if roadway.empty:
                # This direction does not exist at this location.
                continue

            plt.subplot(4, 2, idx + 1)
            roadway.congestion.hist(bins=100, color='lime')
            plt.title(f'X={x} Y={y} DIRECTION={direction}', fontsize=14)
            plt.xlabel('CONGESTION', fontsize=14)
            plt.ylabel('COUNT', fontsize=14)
            plt.tight_layout()

        plt.show()
        print('\n\n')

In [None]:
# Mean congestion per timestamp for each of the 12 grid locations, then the
# correlation between those per-location series.
grid = [(x, y) for x in range(3) for y in range(4)]
columns = [f'X{x}Y{y}' for x, y in grid]
dfs = [
    train[(train.x == x) & (train.y == y)].groupby('time').congestion.mean().to_list()
    for x, y in grid
]

# Rows are timestamps, columns are locations.
location_congestions = pd.DataFrame(np.array(dfs).T, columns=columns)

plt.figure(figsize=(12, 10))
sns.heatmap(location_congestions.corr(), cmap='spring', annot=True)
plt.title('CORRELATION BETWEEN 12 LOCATIONS OF CONGESTION', fontsize=14)
plt.xticks(rotation=90, fontsize=14)
plt.yticks(rotation=0, fontsize=14)

plt.show()

In [None]:
# Location-to-location correlation restricted to a single time of day:
# 08:00 on the left panel, 08:20 on the right panel.
plt.figure(figsize=(18, 8))

panels = [
    (0, "CORRELATION BETWEEN LOCATIONS AT 8 O'CLOCK"),
    (20, "CORRELATION BETWEEN LOCATIONS AT 8:20 H"),
]

for position, (minute, title) in enumerate(panels, start=1):
    at_slot = (train.time.dt.hour == 8) & (train.time.dt.minute == minute)

    dfs = []
    columns = []
    for x in range(3):
        for y in range(4):
            df = train[(train.x == x) & (train.y == y) & at_slot]
            dfs.append(df.groupby('time').congestion.mean().tolist())
            columns.append(f'X{x}Y{y}')

    # Rows are days (one value per day at this slot), columns are locations.
    location_congestions = pd.DataFrame(np.array(dfs).T, columns=columns)

    plt.subplot(1, 2, position)
    sns.heatmap(location_congestions.corr(), cmap='winter', annot=True)
    plt.title(title, fontsize=14)
    plt.xticks(rotation=90, fontsize=10)
    plt.yticks(rotation=0, fontsize=10)
    plt.tight_layout()

plt.show()

In [None]:
# Index of the 20-minute slot within the day (3 slots per hour).
minutes_into_day = train['time'].dt.hour * 60 + train['time'].dt.minute
train['daytime_id'] = (minutes_into_day / 20).astype(int)

In [None]:
# Correlation between locations X1Y2 and X1Y3 for every 20-minute slot of the day.
daytime_ids = train.daytime_id.unique()

# Filter each location once instead of re-scanning `train` for every slot.
loc_X1Y2 = train[(train.x == 1) & (train.y == 2)]
loc_X1Y3 = train[(train.x == 1) & (train.y == 3)]

corrs = []
for daytime_id in daytime_ids:
    series_a = loc_X1Y2[loc_X1Y2.daytime_id == daytime_id].groupby('time').congestion.mean()
    series_b = loc_X1Y3[loc_X1Y3.daytime_id == daytime_id].groupby('time').congestion.mean()
    # Series.corr pairs the values by timestamp, so a gap in one location
    # cannot shift the pairing (the positional version required equal lengths).
    corrs.append(series_a.corr(series_b))

plt.figure(figsize=(10, 7))
# Three slots per hour -> divide by 3 to express the x axis in hours.
plt.plot(daytime_ids / 3.0, corrs, 'darkviolet', linewidth=3)
plt.xlabel('HOUR', fontsize=14)
plt.ylabel('CORRELATION COEFFICIENT', fontsize=14)
plt.title('CORRELATION BETWEEN LOCATIONS X1Y2 and X1Y3', fontsize=16)
plt.text(8.5, 0.85, '8am', color='violet', fontsize=14)
plt.text(18, 0.65, '5pm', color='violet', fontsize=14)

plt.show()

In [None]:
# Correlation between the directions available at each location.
plt.figure(figsize=(12, 36))
for x in range(3):
    for y in range(4):
        # Keep each series together with its own direction name so the heatmap
        # labels cannot drift away from the data (previously the labels came
        # from a separately ordered `unique()` call).
        series_by_direction = {}
        for direction in train.direction.unique():
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                # Index by time so directions are aligned on the timestamp.
                series_by_direction[direction] = df.set_index('time').congestion

        roadway_congestions = pd.DataFrame(series_by_direction)

        plt.subplot(6, 2, 4 * x + y + 1)
        sns.heatmap(roadway_congestions.corr(), cmap='autumn', annot=True)
        plt.title(f'CORRELATION BETWEEN {len(series_by_direction)} DIRECTIONS AT X{x}Y{y}', fontsize=12)
        plt.tight_layout()

plt.show()

In [None]:
plt.rcParams['axes.facecolor'] = 'maroon'

# Mean congestion per calendar month. Bar positions come from the grouped
# index, so each bar is guaranteed to sit on its own month regardless of the
# row order of `train`.
monthly_mean = train.groupby(train.time.dt.month).congestion.mean()

plt.figure(figsize=(10, 7))
plt.bar(monthly_mean.index, monthly_mean, color='red')
plt.title('MEAN CONGESTION PER MONTH', fontsize=14)
plt.xlabel('MONTH', fontsize=12)
plt.ylabel('CONGESTION', fontsize=12)
plt.ylim(45, 50)
plt.show()

In [None]:
# Mean congestion per month, one subplot per roadway, one figure per location.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 24))
        for idx, direction in enumerate(train.direction.unique()):
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                # Compute the grouped mean once; its index gives bar positions
                # that always match the bar heights.
                monthly_mean = df.groupby(df.time.dt.month).congestion.mean()
                rounded = monthly_mean.round().astype(int)

                plt.subplot(4, 2, idx + 1)
                plt.bar(monthly_mean.index, monthly_mean, color='deeppink')
                plt.title(f'MEAN CONGESTION PER MONTH AT X{x}Y{y} {direction} DIRECTION', fontsize=10)
                plt.xlabel('MONTH', fontsize=10)
                plt.ylabel('CONGESTION', fontsize=10)
                # Zoom the y axis to one unit around the observed range.
                plt.ylim(rounded.min() - 1, rounded.max() + 1)

        plt.show()
        print('\n\n')

In [None]:
plt.rcParams['axes.facecolor'] = 'dimgray'

# Mean congestion per day of the month. Bar positions come from the grouped
# index so they always match the bar heights.
day_of_month_mean = train.groupby(train.time.dt.day).congestion.mean()

plt.figure(figsize=(10, 7))
plt.bar(day_of_month_mean.index, day_of_month_mean, color='aquamarine')
plt.title('MEAN MONTHLY CONGESTION', fontsize=16)
plt.xlabel('DAY OF MONTH', fontsize=14)
plt.ylabel('CONGESTION', fontsize=14)
plt.ylim(45, 50)

plt.show()

In [None]:
# Mean congestion per day of the month, one subplot per roadway.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 24))
        for idx, direction in enumerate(train.direction.unique()):
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                # Grouped mean computed once; its index supplies the bar
                # positions so they cannot be misaligned with the heights.
                day_of_month_mean = df.groupby(df.time.dt.day).congestion.mean()
                rounded = day_of_month_mean.round().astype(int)

                plt.subplot(4, 2, idx + 1)
                plt.bar(day_of_month_mean.index, day_of_month_mean, color='indigo')
                plt.title(f'MEAN MONTHLY CONGESTION X{x}Y{y} {direction} DIRECTION', fontsize=12)
                plt.xlabel('DAY OF MONTH', fontsize=10)
                plt.ylabel('CONGESTION', fontsize=10)
                plt.ylim(rounded.min() - 1, rounded.max() + 1)

        plt.show()
        print('\n\n')

In [None]:
# Mean congestion per ISO week. `Series.dt.week` no longer exists in current
# pandas, so the week number is taken from `dt.isocalendar()`.
week_of_year = train.time.dt.isocalendar().week.astype(int)
weekly_mean = train.groupby(week_of_year).congestion.mean()

plt.figure(figsize=(10, 7))
plt.bar(weekly_mean.index, weekly_mean, color='darkmagenta')
plt.title('MEAN CONGESTION PER WEEK', fontsize=16)
plt.xlabel('WEEK', fontsize=14)
plt.ylabel('CONGESTION', fontsize=14)
plt.ylim(45, 50)

plt.show()

In [None]:
# Mean congestion per ISO week, one subplot per roadway.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 24))
        for idx, direction in enumerate(train.direction.unique()):
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                # `dt.week` was removed from pandas; use the ISO calendar week.
                week_of_year = df.time.dt.isocalendar().week.astype(int)
                weekly_mean = df.groupby(week_of_year).congestion.mean()
                rounded = weekly_mean.round().astype(int)

                plt.subplot(4, 2, idx + 1)
                plt.bar(weekly_mean.index, weekly_mean, color='crimson')
                plt.title(f'MEAN CONGESTION PER WEEK AT X{x}Y{y} {direction} DIRECTION', fontsize=12)
                plt.xlabel('WEEK', fontsize=10)
                plt.ylabel('CONGESTION', fontsize=10)
                plt.ylim(rounded.min() - 1, rounded.max() + 1)

        plt.show()
        print('\n\n')

In [None]:
# Mean congestion per weekday (0 = Monday ... 6 = Sunday). Bar positions come
# from the grouped index so they always match the tick labels below.
weekday_mean = train.groupby(train.time.dt.weekday).congestion.mean()

plt.figure(figsize=(10, 7))
plt.bar(weekday_mean.index, weekday_mean, color='red')
plt.title('MEAN WEEKLY CONGESTION', fontsize=16)
plt.ylabel('CONGESTION', fontsize=14)
plt.xticks(range(7), ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY'], fontsize=14, rotation=90)
plt.ylim(40, 50)

plt.show()

In [None]:
# Mean congestion per weekday, one subplot per roadway.
weekday_names = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']

for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 24))
        for idx, direction in enumerate(train.direction.unique()):
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                # Grouped mean computed once; its index (0 = Monday) places
                # each bar under the matching tick label.
                weekday_mean = df.groupby(df.time.dt.weekday).congestion.mean()
                rounded = weekday_mean.round().astype(int)

                plt.subplot(4, 2, idx + 1)
                plt.bar(weekday_mean.index, weekday_mean, color='deeppink')
                plt.title(f'MEAN WEEKLY CONGESTION AT X{x}Y{y} {direction} DIRECTION', fontsize=14)
                plt.ylabel('CONGESTION', fontsize=10)
                plt.ylim(rounded.min() - 1, rounded.max() + 1)
                plt.xticks(range(7), weekday_names, fontsize=10, rotation=90)
                plt.tight_layout()

        plt.show()
        print('\n\n')

In [None]:
# Mean congestion per day of the year, with Sundays highlighted.
day_of_year = train.time.dt.dayofyear
daily_mean = train.groupby(day_of_year).congestion.mean()
# Take the weekday from the actual dates rather than `day % 7`, which only
# identifies Sundays for one particular calendar year.
weekday_of_day = train.time.dt.weekday.groupby(day_of_year).first()
bar_colors = ['deeppink' if weekday == 6 else 'crimson' for weekday in weekday_of_day]

plt.figure(figsize=(10, 7))
plt.bar(daily_mean.index, daily_mean, color=bar_colors)
plt.title('MEAN CONGESTION PER DAY', fontsize=14)
plt.xlabel('DAY OF YEAR (SUNDAY - LIGHT MAGENTA)', fontsize=12)
plt.ylabel('CONGESTION', fontsize=12)
plt.ylim(40, 50)

plt.show()

In [None]:
# Mean congestion per day of the year, one subplot per roadway, Sundays highlighted.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 24))
        for idx, direction in enumerate(train.direction.unique()):
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                day_of_year = df.time.dt.dayofyear
                daily_mean = df.groupby(day_of_year).congestion.mean()
                rounded = daily_mean.round().astype(int)
                # Colours are derived from this roadway's own days (not from
                # the whole of `train`) and from the real weekday, so every
                # bar gets the colour belonging to its date.
                weekday_of_day = df.time.dt.weekday.groupby(day_of_year).first()
                bar_colors = ['aquamarine' if weekday == 6 else 'darksLategray' for weekday in weekday_of_day]

                plt.subplot(4, 2, idx + 1)
                plt.bar(daily_mean.index, daily_mean, color=bar_colors)
                plt.title(f'MEAN CONGESTION PER DAY AT X{x}Y{y} {direction} DIRECTION', fontsize=12)
                plt.xlabel('DAY OF YEAR (SUNDAY - LIGHT CYAN)', fontsize=12)
                plt.ylabel('CONGESTION', fontsize=12)
                plt.ylim(rounded.min() - 1, rounded.max() + 1)
                plt.tight_layout()

        plt.show()
        print('\n\n')

In [None]:
plt.rcParams['axes.facecolor'] = 'silver'

# Mean congestion for each 20-minute slot of the day. The x values come from
# the grouped index (plain numbers) instead of a Series of one-element arrays.
daytime_mean = train.groupby(train.daytime_id).congestion.mean()

plt.figure(figsize=(10, 7))
# Three slots per hour -> divide by 3 to express the x axis in hours.
plt.plot(daytime_mean.index / 3.0, daytime_mean, 'lime', linewidth=5)
plt.title('MEAN DAILY CONGESTION', fontsize=16)
plt.xlabel('HOUR', fontsize=14)
plt.ylabel('CONGESTION', fontsize=14)

plt.show()

In [None]:
# Average daily profile per roadway, overlaid with the roadway's last 36 rows.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 24))
        for idx, direction in enumerate(train.direction.unique()):
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                # Grouped index gives plain numeric x values aligned with the means.
                daytime_mean = df.groupby(df.daytime_id).congestion.mean()
                # Last 36 recorded slots of this roadway (labelled as
                # September 30th; assumes no gaps at the end of the series).
                latest = df.iloc[-36:]

                plt.subplot(4, 2, idx + 1)
                plt.plot(daytime_mean.index / 3.0, daytime_mean, 'lime', linewidth=3, label='DAILY AVERAGE')
                plt.plot(latest.daytime_id / 3.0, latest.congestion, 'darkolivegreen', linewidth=3, label='MONDAY SEPTEMBER 30TH')
                plt.title(f'MEAN DAILY CONGESTION AT X{x}Y{y} {direction} DIRECTION', fontsize=12)
                plt.xlabel('HOUR', fontsize=10)
                plt.ylabel('CONGESTION', fontsize=10)
                plt.legend()
        plt.show()

        print('\n\n')

In [None]:
# Keep only the Monday rows (pandas weekday 0).
df = train.loc[train.time.dt.weekday == 0]

In [None]:
# Monday mean profile against the all-days median profile.
monday_mean = df.groupby(df.daytime_id).congestion.mean()
all_days_median = train.groupby(train.daytime_id).congestion.median()

plt.figure(figsize=(10, 7))
# Grouped indexes give plain numeric x values; three slots per hour.
plt.plot(monday_mean.index / 3.0,
         monday_mean,
         'cyan',
         linewidth=3,
         label='MONDAYS MEAN')

# This curve is a median, so the legend now says so.
plt.plot(all_days_median.index / 3.0,
         all_days_median,
         'lime', linewidth=3,
         label='ALL DAYS MEDIAN')
plt.title('MONDAY CONGESTION MEAN', fontsize=16)
plt.xlabel('HOUR', fontsize=14)
plt.ylabel('CONGESTION', fontsize=14)
plt.legend()

plt.show()

In [None]:
# Monday median profile per roadway, overlaid with the last 36 Monday rows.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 24))
        for idx, direction in enumerate(train.direction.unique()):
            df = train[(train.time.dt.weekday == 0) & (train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                monday_median = df.groupby(df.daytime_id).congestion.median()
                # Last 36 Monday slots of this roadway (labelled as September
                # 30th; assumes no gaps at the end of the series).
                latest = df.iloc[-36:]

                plt.subplot(4, 2, idx + 1)
                # The curve is a median, so the legend says MEDIAN.
                plt.plot(monday_median.index / 3.0, monday_median, 'darkred', linewidth=3, label='MONDAY MEDIAN')
                plt.plot(latest.daytime_id / 3.0, latest.congestion, 'deeppink', linewidth=3, label='MONDAY SEPTEMBER 30TH')
                plt.title(f'X{x}Y{y} {direction} DIRECTION', fontsize=12)
                plt.xlabel('HOUR', fontsize=10)
                plt.ylabel('CONGESTION', fontsize=10)
                plt.legend()

        plt.show()
        print('\n\n')

In [None]:
# Compare, per roadway, the Monday one week before the test day with the
# last 36 recorded Monday slots.
# Day of year exactly seven days before the first test timestamp.
previous_monday_day = test.time.dt.dayofyear.iloc[0] - 7

for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12, 24))
        for idx, direction in enumerate(train.direction.unique()):
            df = train[(train.time.dt.weekday == 0) & (train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                # Build the mask from `df` itself; a mask built on `train` has
                # a different index and only works through implicit reindexing.
                previous_monday = df[df.time.dt.dayofyear == previous_monday_day]
                latest = df.iloc[-36:]

                plt.subplot(4, 2, idx + 1)
                plt.plot(previous_monday.daytime_id / 3.0,
                         previous_monday.congestion,
                         'darkcyan',
                         linewidth=3,
                         label='MONDAY SEPTEMBER 23TH')
                # The second curve used to carry the same label as the first.
                plt.plot(latest.daytime_id / 3.0, latest.congestion, 'turquoise', linewidth=3, label='MONDAY SEPTEMBER 30TH')
                plt.title(f'X{x}Y{y} {direction} DIRECTION', fontsize=12)
                plt.xlabel('HOUR', fontsize=10)
                plt.ylabel('CONGESTION', fontsize=10)
                plt.legend()

        plt.show()
        print('\n\n')

In [None]:
# Replace missing congestion values with the overall mean congestion.
mean_congestion = train_new['congestion'].mean()
train_new['congestion'] = train_new['congestion'].fillna(mean_congestion)

In [None]:
# Cut the flat congestion series into 365 consecutive chunks of 2340 values
# and store each chunk as one column (2340 = 36 slots x 65 values per slot,
# matching the slicing used in the MAE cell below).
chunks = np.array(train_new.congestion).reshape(365, 2340)
congestion = pd.DataFrame(chunks.T)

# Even columns are labelled "morning", odd columns "afternoon".
chunk_ids = congestion.columns
congestion_morning = congestion[chunk_ids[chunk_ids % 2 == 0]]
congestion_afternoon = congestion[chunk_ids[chunk_ids % 2 == 1]]

# Every 14th chunk starting at column 1 (two chunks per day -> once a week).
congestion_monday_afternoon = congestion[chunk_ids[chunk_ids % 14 == 1]].round().astype(int)
# Rolling mean over up to 10 consecutive rows.
congestion_monday_afternoon_smoothed = congestion_monday_afternoon.rolling(10, min_periods=1).mean()

In [None]:
# Correlation between the last 10 Monday-afternoon columns, raw and smoothed.
plt.figure(figsize=(18, 7))

panels = [
    (congestion_monday_afternoon, 'CORRELATION BETWEEN LAST 10 MONDAY AFTERNOONS'),
    (congestion_monday_afternoon_smoothed, 'SMOOTHED'),
]
for position, (frame, title) in enumerate(panels, start=1):
    last_ten = frame[frame.columns[-10:]]

    plt.subplot(1, 2, position)
    sns.heatmap(last_ten.corr(), cmap='hsv', annot=True)
    plt.title(title, fontsize=12)

plt.show()

In [None]:
# MAE of every Monday-afternoon column against the last one, computed per
# block of 65 rows and averaged over the 36 blocks.
df = congestion_monday_afternoon
reference = df[df.columns[-1]].astype(int)

scores = []
for idx in df.columns:
    candidate = df[idx].astype(int)
    block_errors = [
        mean_absolute_error(candidate[start:start + 65], reference[start:start + 65])
        for start in range(0, 36 * 65, 65)
    ]
    scores.append(sum(block_errors) / 36.0)

plt.figure(figsize=(10, 7))
plt.bar(range(len(scores)), scores, color='deeppink')
plt.xlabel('MONDAY AFTERNOONS', fontsize=14)
plt.ylabel('MAE VS MONDAY SEPTEMBER 23TH', fontsize=14)

plt.show()

In [None]:
# MAE of each Monday afternoon against the row-wise median of all afternoons.
df = congestion_afternoon
df_monday = congestion_monday_afternoon

# The median profile is the same for every column, so compute it once.
median_profile = np.median(df, axis=1).round().astype(int)
scores = [
    mean_absolute_error(df_monday[idx].astype(int), median_profile)
    for idx in df_monday.columns
]

plt.figure(figsize=(10, 7))
plt.bar(range(len(scores)), scores, color='springgreen')
plt.xlabel('MONDAY AFTERNOONS', fontsize=14)
plt.ylabel('MAE VS ALL AFTERNOONS MEDIAN', fontsize=14)

plt.show()

In [None]:
# MAE of every afternoon column against the last afternoon column.
df = congestion_afternoon
reference = df[df.columns[-1]].astype(int)
scores = [mean_absolute_error(df[idx].astype(int), reference) for idx in df.columns]

plt.figure(figsize=(10, 7))
plt.bar(range(len(scores)), scores, color='cyan')
plt.xlabel('AFTERNOONS', fontsize=14)
plt.ylabel('MAE VS AFTERNOON SEPTEMBER 29TH', fontsize=14)

plt.show()

In [None]:
# MAE of every afternoon column against the row-wise median of all afternoons.
df = congestion_afternoon
median_profile = np.median(df, axis=1).round().astype(int)
scores = [mean_absolute_error(df[idx].astype(int), median_profile) for idx in df.columns]

plt.figure(figsize=(10, 7))
plt.bar(range(len(scores)), scores, color='aquamarine')
plt.xlabel('AFTERNOONS', fontsize=14)
plt.ylabel('MAE VS ALL AFTERNOONS MEDIAN', fontsize=14)

plt.show()

In [None]:
# Afternoons whose MAE against the median profile exceeds 7 count as outliers.
df = congestion_afternoon
median_profile = np.median(df, axis=1).round().astype(int)

outliers = [
    idx for idx in df.columns
    if mean_absolute_error(df[idx].astype(int), median_profile) > 7
]
# Two columns per day, offset by 91 to obtain the day of the year.
outlier_days = [idx // 2 + 91 for idx in outliers]

print(f'% OUTLIERS: {len(outliers) / len(df.columns) * 100}')

In [None]:
# Repeat the MAE-vs-median comparison with the outlier afternoons removed.
df = congestion_afternoon.drop(columns=outliers)
median_profile = np.median(df, axis=1).round().astype(int)
scores = [mean_absolute_error(df[idx].astype(int), median_profile) for idx in df.columns]

plt.figure(figsize=(10, 7))
plt.bar(range(len(scores)), scores, color='red')
plt.xlabel('AFTERNOONS', fontsize=14)
plt.ylabel('MAE VS ALL AFTERNOONS MEDIAN', fontsize=14)

plt.show()

In [None]:
# Baseline: predict each roadway's median congestion per time-of-day slot.
# Float placeholder so that float predictions can be written without an
# implicit dtype change; -1 marks rows that never receive a prediction.
test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in train.direction.unique():
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                roadway_rows = (test.x == x) & (test.y == y) & (test.direction == direction)
                # Last 36 slots of the day, i.e. daytime_id 36..71.
                medians = df.groupby(df.daytime_id).congestion.median().tolist()[-36:]
                # .loc writes into `test` itself; chained `test.congestion[mask] = ...`
                # may write to a temporary copy and be silently lost.
                test.loc[roadway_rows, 'congestion'] = medians

In [None]:
# Round to integers, clamp into [0, 100] and write the submission file.
# clip() replaces the chained-assignment clamping, which may act on a copy.
submission['congestion'] = test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('dailyMedian.csv', index=False)
submission.head()

In [None]:
# Baseline: per-roadway median per time-of-day slot using Monday-Friday only.
# Float placeholder; -1 marks rows that never receive a prediction.
test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in train.direction.unique():
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                roadway_rows = (test.x == x) & (test.y == y) & (test.direction == direction)
                # Group the filtered frame by its own column rather than by a
                # key taken from the unfiltered frame.
                weekdays = df[df.time.dt.weekday <= 4]
                medians = weekdays.groupby('daytime_id').congestion.median().tolist()[-36:]
                # .loc writes into `test` itself (chained assignment may not).
                test.loc[roadway_rows, 'congestion'] = medians

In [None]:
# Round to integers, clamp into [0, 100] and write the submission file.
# clip() replaces the chained-assignment clamping, which may act on a copy.
submission['congestion'] = test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('dailyMedianExcludeWeekend.csv', index=False)
submission.head()

In [None]:
# Baseline: per-roadway median per time-of-day slot, ignoring outlier days.
# Float placeholder; -1 marks rows that never receive a prediction.
test['congestion'] = -1.0

# The outlier mask does not depend on the roadway, so build it once.
regular_day = ~train.time.dt.dayofyear.isin(outlier_days)

for x in range(3):
    for y in range(4):
        for direction in train.direction.unique():
            df = train[(train.x == x) & (train.y == y) & (train.direction == direction) & regular_day]

            if df.shape[0] > 0:
                roadway_rows = (test.x == x) & (test.y == y) & (test.direction == direction)
                medians = df.groupby(df.daytime_id).congestion.median().tolist()[-36:]
                # .loc writes into `test` itself (chained assignment may not).
                test.loc[roadway_rows, 'congestion'] = medians

In [None]:
# Round to integers, clamp into [0, 100] and write the submission file.
# clip() replaces the chained-assignment clamping, which may act on a copy.
submission['congestion'] = test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('dailyMedianRemoveOutliers.csv', index=False)
submission.head()

In [None]:
# Baseline: per-roadway mean per time-of-day slot computed over Mondays only.
# Float placeholder; -1 marks rows that never receive a prediction.
test['congestion'] = -1.0

is_monday = train.time.dt.weekday == 0

for x in range(3):
    for y in range(4):
        for direction in train.direction.unique():
            df = train[is_monday & (train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                roadway_rows = (test.x == x) & (test.y == y) & (test.direction == direction)
                means = df.groupby(df.daytime_id).congestion.mean().tolist()[-36:]
                # .loc writes into `test` itself (chained assignment may not).
                test.loc[roadway_rows, 'congestion'] = means

In [None]:
# Round to integers, clamp into [0, 100] and write the submission file.
# clip() replaces the chained-assignment clamping, which may act on a copy.
submission['congestion'] = test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('MondayMean.csv', index=False)
submission.head()

In [None]:
# Baseline: reuse the smoothed values from the same weekday one week earlier.
# Float placeholder; -1 marks rows that never receive a prediction.
test['congestion'] = -1.0

# Day of year exactly seven days before the first test timestamp.
reference_day = test.time.dt.dayofyear.iloc[0] - 7
on_reference_day = train.time.dt.dayofyear == reference_day

for x in range(3):
    for y in range(4):
        for direction in train.direction.unique():
            df = train[on_reference_day & (train.x == x) & (train.y == y) & (train.direction == direction)]

            if df.shape[0] > 0:
                roadway_rows = (test.x == x) & (test.y == y) & (test.direction == direction)
                # 10-slot rolling mean over that day; keep its last 36 values.
                smoothed = df.congestion.rolling(10).mean().iloc[-36:].tolist()
                # .loc writes into `test` itself (chained assignment may not).
                test.loc[roadway_rows, 'congestion'] = smoothed

In [None]:
# Round to integers, clamp into [0, 100] and write the submission file.
# clip() replaces the chained-assignment clamping, which may act on a copy.
submission['congestion'] = test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('23SeptSmooth.csv', index=False)
submission.head()

In [None]:
# Per-roadway linear regression across days: learn to map the afternoons of
# days 0..n-2 onto the afternoon of day n-1, then shift the window forward by
# one day to predict the next afternoon.
# Float placeholder; -1 marks rows that never receive a prediction.
test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in train.direction.unique():
            df = train_new[(train_new.x == x) & (train_new.y == y) & (train_new.direction == direction)]
            if df.shape[0] == 0:
                continue

            # One column per complete day (72 slots); build the frame in one
            # go instead of inserting columns one at a time.
            full_days = {
                f'day_{day}': group.congestion.tolist()
                for day, group in df.groupby(df.time.dt.dayofyear)
                if len(group.congestion) == 72
            }
            df_new = pd.DataFrame(full_days)

            # Only the last 36 rows (second half of the day) are used.
            afternoons = df_new.iloc[-36:]
            # Plain arrays: the train and test windows have different column
            # names, which scikit-learn rejects when given DataFrames.
            X_train = afternoons.iloc[:, :-1].to_numpy()
            y_train = afternoons.iloc[:, -1].to_numpy()
            X_test = afternoons.iloc[:, 1:].to_numpy()

            model = lr()
            model.fit(X_train, y_train)

            roadway_rows = (test.x == x) & (test.y == y) & (test.direction == direction)
            # .loc writes into `test` itself (chained assignment may not).
            test.loc[roadway_rows, 'congestion'] = model.predict(X_test).tolist()

In [None]:
# Round to integers, clamp into [0, 100] and write the submission file.
# clip() replaces the chained-assignment clamping, which may act on a copy.
submission['congestion'] = test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('DailyLinearRegression.csv', index=False)
submission.head()