In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# About the Data
train.csv - the training set, comprising measurements of traffic congestion across 65 roadways from April through September of 1991.

row_id - a unique identifier for this instance
time - the 20-minute period in which each measurement was taken
x - the east-west midpoint coordinate of the roadway
y - the north-south midpoint coordinate of the roadway
direction - the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.
congestion - congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100.
test.csv - the test set; you will make hourly predictions for roadways identified by a coordinate location and a direction of travel on the day of 1991-09-30.

In [None]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
plt.rcParams['axes.facecolor']='black'
import seaborn as sns
import plotly.express as px

In [None]:
import warnings
warnings.simplefilter('ignore')

# Now Load The Data

In [None]:
# Read the training set and parse its timestamp column.
df_train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
df_train['time'] = pd.to_datetime(df_train['time'])

# time_id counts the 20-minute slots elapsed since 00:00 on 1 January.
minutes_into_year = (
    (df_train['time'].dt.dayofyear - 1) * 24 * 60
    + df_train['time'].dt.hour * 60
    + df_train['time'].dt.minute
)
df_train['time_id'] = (minutes_into_year / 20).astype(int)

# Use row_id as the index instead of keeping it as a column.
df_train = df_train.set_index('row_id', drop=True)

# Now Check the first 5 rows

In [None]:
# Display the first five rows of the training frame as the cell output.
df_train.head()

# Now load the test and submission datasets

In [None]:
# Test set: row_id becomes the index and the timestamps are parsed.
df_test = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/test.csv',
    index_col='row_id',
)
df_test['time'] = pd.to_datetime(df_test['time'])

# Sample submission file; its `congestion` column is what gets filled in and saved.
submission = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')

# Roadways
There are 65 roadways, spread across 12 locations.

In [None]:
# Text form of the [x, y] step that each direction code stands for.
direction_vectors = {
    'EB': '[1,0]',
    'NB': '[0,1]',
    'SB': '[0,-1]',
    'WB': '[-1,0]',
    'NE': '[1,1]',
    'SW': '[-1,-1]',
    'NW': '[-1,1]',
    'SE': '[1,-1]',
}
df_train['dir_xy'] = df_train['direction'].map(direction_vectors)

In [None]:
# Distinct direction vectors observed at every (x, y) location.
loc_dir = df_train.groupby(['x', 'y'])['dir_xy'].unique().reset_index()
loc_dir['num_dir'] = loc_dir['dir_xy'].apply(len)
print(loc_dir)

# A roadway is one direction at one location, so the per-location counts add up to the total.
total_roadways = loc_dir['num_dir'].sum()
print(f'Number of roadways: {total_roadways}')

In [None]:
plt.figure(figsize=(10,7),facecolor="black")

# Draw one arrow per roadway, anchored at its location and pointing along its direction.
for _, location in loc_dir.iterrows():
    # Parse strings such as '[1,-1]' explicitly; eval() is not needed for this fixed format
    # and would execute whatever text happened to be stored in the column.
    vectors = np.array([
        [int(part) for part in text.strip('[]').split(',')]
        for text in location.dir_xy
    ])
    # Repeat the location once per direction so every arrow shares the same origin
    origin = np.repeat(np.array([[location.x], [location.y]]), len(vectors), axis=-1)
    plt.quiver(*origin, vectors[:, 0], vectors[:, 1], scale=15, color='white')

plt.xlim(-0.5,2.5)
plt.ylim(-0.5,3.5)
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# Remove the helper column that held the direction vectors.
df_train = df_train.drop(columns=['dir_xy'])

# Now Time

In [None]:
# Number of rows recorded for each (x, y, direction) roadway.
rows_per_roadway = df_train.groupby(['x', 'y', 'direction'])['time'].count()
print(rows_per_roadway)

# Number of 20-minute slots between the first and the last time_id, both included.
slot_span = int(df_train['time_id'].max() - df_train['time_id'].min() + 1)
print(f'Total time instants within the observation period: {slot_span}')

In [None]:
# Regular 20-minute grid from the first slot of 1 April to 11:40 on 30 September.
time = pd.DataFrame({'time': pd.date_range('1991-04-01 00:00:00', '1991-09-30 11:40:00', freq='20Min')})

# Right-join every roadway onto the grid: a slot with no measurement becomes a row of NaNs.
dfs = [
    roadway.merge(time, on='time', how='right')
    for _, roadway in df_train.groupby(['x', 'y', 'direction'])
]

# Stack the roadways back together and report how many values each column is missing.
df_train_new = pd.concat(dfs, axis=0).sort_values(['time', 'x', 'y', 'direction']).reset_index()
print(f'Missing row:\n{df_train_new.isna().sum()}')

# Target Congestion Level

# Histogram

In [None]:
# Histogram of the target over the whole training set.
plt.figure(figsize=(12, 6))
ax = df_train['congestion'].hist(bins=100, color='yellow')
ax.set_xlabel('Congestion', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
plt.show()

# Histograms for each roadway

In [None]:
# One figure per visited location, one histogram panel per direction present there.
all_directions = df_train.direction.unique()
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(14,14))
        for panel, direction in enumerate(all_directions, start=1):
            roadway = df_train[(df_train.x == x) & (df_train.y == y) & (df_train.direction == direction)]
            if roadway.empty:
                continue

            # The panel slot follows the direction's position in the full direction list
            plt.subplot(4, 2, panel)
            roadway.congestion.hist(bins=100, color='yellow')
            plt.title(f'x={x}, y={y}, direction={direction}', fontsize=16)
            plt.xlabel('Congestion', fontsize=16)
            plt.ylabel('Count', fontsize=16)
            plt.tight_layout()

        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# Now Check Correlation between 12 locations

In [None]:
# Mean congestion per timestamp for each of the 3 x 4 locations. The per-location
# results are kept as time-indexed Series so the DataFrame constructor aligns them on
# the timestamp; stacking plain lists would pair values by position and would break
# (or silently mis-pair) as soon as two locations had different sets of timestamps.
location_means = {}
for x in range(3):
    for y in range(4):
        df = df_train[(df_train.x == x) & (df_train.y==y)]
        location_means[f'x{x}y{y}'] = df.groupby('time').congestion.mean()
location_congestions = pd.DataFrame(location_means)

plt.figure(figsize=(12,10))
sns.heatmap(location_congestions.corr(), annot=True)
plt.title('Correlation between the congestion of 12 locations', fontsize=16)
plt.xticks(rotation=90, fontsize=16)
plt.yticks(rotation=90, fontsize=16)
plt.show()

# Correlation between different directions at each location

In [None]:
# One correlation heatmap per location, comparing the directions present at that location.
plt.figure(figsize=(16,36))
for x in range(3):
    for y in range(4):
        # Key every series by its own direction. Labelling the columns afterwards with a
        # separately computed unique() list can attach the wrong direction name to a
        # column, because that list need not come out in the same order as the data.
        by_direction = {}
        for direction in df_train.direction.unique():
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                by_direction[direction] = df.congestion.tolist()

        roadway_congestions = pd.DataFrame(by_direction)

        plt.subplot(6,2,4*x+y+1)
        sns.heatmap(roadway_congestions.corr(), annot=True)
        plt.title(f'Correlation between {len(by_direction)} directions at x={x}, y={y}', fontsize=16)
        plt.tight_layout()
plt.show()

# Average congestion per month

In [None]:
# Group once and take the bar positions from the result's own index, so each bar is
# guaranteed to sit at the month its height belongs to (unique() returns values in
# order of appearance, groupby returns them sorted).
monthly_mean = df_train.groupby(df_train.time.dt.month).congestion.mean()

plt.figure(figsize=(14,7))
plt.bar(monthly_mean.index, monthly_mean.values, color='y')
plt.title('Mean congestion per month', fontsize=16)
plt.xlabel('Month', fontsize=16)
plt.ylabel('Congestion', fontsize=16)
plt.ylim(45,50)
plt.show()

# Average Congestion per month for each roadway

In [None]:
# Mean congestion per month, one panel per direction present at the location.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12,24))
        for idx, direction in enumerate(df_train.direction.unique()):
            # extract data for each location and direction
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Computed once; the index supplies bar positions that match the heights
                monthly_mean = df.groupby(df.time.dt.month).congestion.mean()
                rounded = monthly_mean.round().astype(int)

                plt.subplot(4,2,idx+1)
                plt.bar(monthly_mean.index, monthly_mean.values, color='y')
                plt.title(f'Mean congestion per month at x={x}, y={y}, direction={direction}')
                plt.xlabel('Month', fontsize=16)
                plt.ylabel('Congestion', fontsize=16)
                # Zoom the y axis to one unit around the rounded extremes
                plt.ylim(rounded.min()-1, rounded.max()+1)
        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# Average Monthly Congestion

In [None]:
# Group once and take the bar positions from the result's own index, so each bar is
# guaranteed to sit at the day of month its height belongs to (unique() returns values
# in order of appearance, groupby returns them sorted).
day_of_month_mean = df_train.groupby(df_train.time.dt.day).congestion.mean()

plt.figure(figsize=(14,7))
plt.bar(day_of_month_mean.index, day_of_month_mean.values, color='y')
plt.title('Mean monthly congestion', fontsize=16)
plt.xlabel('Day of month', fontsize=16)
plt.ylabel('Congestion', fontsize=16)
plt.ylim(45,50)
plt.show()

# Average Monthly congestion for each roadway

In [None]:
# Mean congestion per day of month, one panel per direction present at the location.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12,24))
        for idx, direction in enumerate(df_train.direction.unique()):
            # extract data for each location and direction
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Computed once; the index supplies bar positions that match the heights
                day_of_month_mean = df.groupby(df.time.dt.day).congestion.mean()
                rounded = day_of_month_mean.round().astype(int)

                plt.subplot(4,2,idx+1)
                plt.bar(day_of_month_mean.index, day_of_month_mean.values, color='y')
                plt.title(f'Mean monthly congestion at x={x}, y={y}, direction={direction}')
                plt.xlabel('Day of month', fontsize=16)
                plt.ylabel('Congestion', fontsize=16)
                # Zoom the y axis to one unit around the rounded extremes
                plt.ylim(rounded.min()-1, rounded.max()+1)
        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# Average Congestion Per week

In [None]:
# Series.dt.week no longer exists in pandas 2.x; isocalendar().week is its replacement.
# It is cast to a plain integer so it can be used directly as bar positions.
week_of_year = df_train.time.dt.isocalendar().week.astype(int)

# Group once and reuse the index as x, so positions and heights always correspond.
weekly_mean = df_train.groupby(week_of_year).congestion.mean()

plt.figure(figsize=(14,7))
plt.bar(weekly_mean.index, weekly_mean.values, color='y')
plt.title('Mean congestion per week', fontsize=16)
plt.xlabel('Week', fontsize=16)
plt.ylabel('Congestion', fontsize=16)
plt.ylim(45,50)
plt.show()

# Average Congestion per week for each roadway

In [None]:
# Mean congestion per calendar week, one panel per direction present at the location.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12,24))
        for idx, direction in enumerate(df_train.direction.unique()):
            # extract data for each location and direction
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Series.dt.week no longer exists in pandas 2.x; use isocalendar().week,
                # cast to a plain integer so it can serve as bar positions.
                week_of_year = df.time.dt.isocalendar().week.astype(int)
                weekly_mean = df.groupby(week_of_year).congestion.mean()
                rounded = weekly_mean.round().astype(int)

                plt.subplot(4,2,idx+1)
                plt.bar(weekly_mean.index, weekly_mean.values, color='y')
                plt.title(f'Mean congestion per week at x={x}, y={y}, direction={direction}')
                plt.xlabel('Week', fontsize=16)
                plt.ylabel('Congestion', fontsize=16)
                # Zoom the y axis to one unit around the rounded extremes
                plt.ylim(rounded.min()-1, rounded.max()+1)
        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# Average Weekly Congestion

In [None]:
# Group once and take the bar positions from the result's own index (0 = Monday ...
# 6 = Sunday), so each bar always lines up with the tick label set below.
weekday_mean = df_train.groupby(df_train.time.dt.weekday).congestion.mean()

plt.figure(figsize=(14,7))
plt.bar(weekday_mean.index, weekday_mean.values, color='y')
plt.title('Mean weekly congestion', fontsize=16)
plt.ylabel('Congestion', fontsize=16)
plt.xticks(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], fontsize=16, rotation=90)
plt.ylim(40,50)
plt.show()

# Average weekly congestion for each roadway

In [None]:
# Mean congestion per weekday, one panel per direction present at the location.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12,24))
        for idx, direction in enumerate(df_train.direction.unique()):
            # extract data for each location and direction
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Computed once; index 0..6 (Monday..Sunday) matches the tick labels below
                weekday_mean = df.groupby(df.time.dt.weekday).congestion.mean()
                rounded = weekday_mean.round().astype(int)

                plt.subplot(4,2,idx+1)
                plt.bar(weekday_mean.index, weekday_mean.values, color='y')
                plt.title(f'Mean weekly congestion at x={x}, y={y}, direction={direction}')
                plt.ylabel('Congestion', fontsize=16)
                # Zoom the y axis to one unit around the rounded extremes
                plt.ylim(rounded.min()-1, rounded.max()+1)
                plt.xticks(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], fontsize=16, rotation=90)
                plt.tight_layout()
        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# Average congestion per day

In [None]:
# Group once; positions, heights and colours are all derived from the same index so
# they cannot fall out of step with each other.
daily_mean = df_train.groupby(df_train.time.dt.dayofyear).congestion.mean()

# Days whose day-of-year is 6 modulo 7 are coloured as Sundays. This shortcut is tied to
# the calendar of the data: the notebook treats 30 Sept (day 273, i.e. 0 modulo 7) as a
# Monday, which puts Sundays at 6 modulo 7.
bar_colors = ['r' if (int(day) % 7) == 6 else 'y' for day in daily_mean.index]

plt.figure(figsize=(14,7))
plt.bar(daily_mean.index, daily_mean.values, color=bar_colors)
plt.title('Mean congestion per day', fontsize=16)
plt.xlabel('Day of year (Sunday is marked by red color)', fontsize=16)
plt.ylabel('Congestion', fontsize=16)
plt.ylim(40,50)
plt.show()

# Average congestion per day for each roadway

In [None]:
# Mean congestion per day of year, one panel per direction present at the location.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12,24))
        for idx, direction in enumerate(df_train.direction.unique()):
            # extract data for each location and direction
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                daily_mean = df.groupby(df.time.dt.dayofyear).congestion.mean()
                rounded = daily_mean.round().astype(int)
                # Colours are built from this roadway's own days. Building them from the
                # full data set yields a list of a different length whenever the roadway
                # lacks some days. Day-of-year 6 modulo 7 is treated as Sunday, which
                # relies on the notebook's premise that 30 Sept (day 273) is a Monday.
                bar_colors = ['r' if (int(day) % 7) == 6 else 'y' for day in daily_mean.index]

                plt.subplot(4,2,idx+1)
                plt.bar(daily_mean.index, daily_mean.values, color=bar_colors)
                plt.title(f'Mean congestion per day at x={x}, y={y}, direction={direction}')
                plt.xlabel('Day of year (Sunday is marked by red color)', fontsize=16)
                plt.ylabel('Congestion', fontsize=16)
                # Zoom the y axis to one unit around the rounded extremes
                plt.ylim(rounded.min()-1, rounded.max()+1)
                plt.tight_layout()
        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# Average daily congestion

In [None]:
# Index of the 20-minute slot within the day: 0 for 00:00 up to 71 for 23:40.
minutes_into_day = df_train['time'].dt.hour * 60 + df_train['time'].dt.minute
df_train['daytime_id'] = (minutes_into_day / 20).astype(int)

In [None]:
# Mean congestion for each 20-minute slot of the day. The slot ids come straight from the
# grouped index (three slots per hour, hence / 3.0); grouping the id column and calling
# unique() instead produces a Series of one-element arrays, which only plots by accident.
slot_mean = df_train.groupby(df_train.daytime_id).congestion.mean()

plt.figure(figsize=(14,7))
plt.plot(slot_mean.index / 3.0, slot_mean.values, 'y', linewidth=5)
plt.title('Mean daily congestion', fontsize=16)
plt.xlabel('Hour', fontsize=16)
plt.ylabel('Congestion', fontsize=16)
plt.show()

# Average daily congestion for each roadway

In [None]:
# Daily profile per roadway: mean over all days versus the roadway's last 36 rows.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12,24))
        for idx, direction in enumerate(df_train.direction.unique()):
            # extract data for each location and direction
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Slot ids are read from the grouped index (three slots per hour) rather
                # than from a Series of one-element arrays built with unique().
                slot_mean = df.groupby(df.daytime_id).congestion.mean()
                # The roadway's final 36 rows, labelled below as Monday 30 Sept
                last_rows = df.iloc[-36:]

                plt.subplot(4,2,idx+1)
                plt.plot(slot_mean.index / 3.0, slot_mean.values, 'y', linewidth=3, label='Daily average')
                plt.plot(last_rows.daytime_id/3.0, last_rows.congestion, 'r', linewidth=3, label='The Monday 30 Sept')
                plt.title(f'Mean daily congestion at x={x}, y={y}, direction={direction}')
                plt.xlabel('Hour', fontsize=16)
                plt.ylabel('Congestion', fontsize=16)
                plt.legend()
        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# Average Monday congestion

In [None]:
# Keep only the rows recorded on a Monday (pandas weekday 0).
is_monday = df_train['time'].dt.weekday == 0
df = df_train[is_monday]


In [None]:
# Compare the Monday profile with the all-days profile, using the mean for both. The
# all-days curve is labelled 'Mean of all days', so computing it with median() made the
# legend wrong and the two curves not comparable.
monday_mean = df.groupby(df.daytime_id).congestion.mean()
all_days_mean = df_train.groupby(df_train.daytime_id).congestion.mean()

plt.figure(figsize=(14,7))
# Slot ids come from the grouped index; three slots per hour, hence / 3.0
plt.plot(monday_mean.index / 3.0, monday_mean.values, 'y', linewidth=3, label='Mean of Mondays')
plt.plot(all_days_mean.index / 3.0, all_days_mean.values, 'orange', linewidth=3, label='Mean of all days')
plt.title('Mean of Monday congestion', fontsize=16)
plt.xlabel('Hour', fontsize=16)
plt.ylabel('Congestion', fontsize=16)
plt.legend()
plt.show()

# Average Monday congestion for each roadway

In [None]:
# Monday profile per roadway: mean over all Mondays versus the last 36 Monday rows.
for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12,24))
        for idx, direction in enumerate(df_train.direction.unique()):
            # extract data for each location and direction
            df = df_train[(df_train.time.dt.weekday==0) & (df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # The curve is labelled 'Mean Monday', so it is computed with mean();
                # the earlier median() did not match its own legend entry.
                monday_mean = df.groupby(df.daytime_id).congestion.mean()
                # The final 36 Monday rows of this roadway, labelled below as 30 Sept
                last_rows = df.iloc[-36:]

                plt.subplot(4,2,idx+1)
                plt.plot(monday_mean.index / 3.0, monday_mean.values, 'y', linewidth=3, label='Mean Monday')
                plt.plot(last_rows.daytime_id/3.0, last_rows.congestion, 'r', linewidth=3, label='The Monday 30 Sept')
                plt.title(f'x={x}, y={y}, direction={direction}')
                plt.xlabel('Hour', fontsize=16)
                plt.ylabel('Congestion', fontsize=16)
                plt.legend()
        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# The nearest Monday 23 Sept

In [None]:
# Day of year exactly one week before the first test timestamp; constant for all panels.
previous_week_day = df_test.time.dt.dayofyear.iloc[0] - 7

for x in range(3):
    for y in range(4):
        plt.figure(figsize=(12,24))
        for idx, direction in enumerate(df_train.direction.unique()):
            # extract data for each location and direction
            df = df_train[(df_train.time.dt.weekday==0) & (df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Build the mask from the subset itself. A mask taken from the full
                # frame has a different index and forces pandas to realign it.
                week_before = df[df.time.dt.dayofyear == previous_week_day]
                # The final 36 Monday rows of this roadway, labelled below as 30 Sept
                last_rows = df.iloc[-36:]

                plt.subplot(4,2,idx+1)
                plt.plot(week_before.daytime_id/3.0, week_before.congestion, 'y', linewidth=3, label='The Monday 23 Sept')
                plt.plot(last_rows.daytime_id/3.0, last_rows.congestion, 'r', linewidth=3, label='The Monday 30 Sept')
                plt.title(f'x={x}, y={y}, direction={direction}')
                plt.xlabel('Hour', fontsize=16)
                plt.ylabel('Congestion', fontsize=16)
                plt.legend()
        plt.show()

        print('\n\n')
        # NOTE(review): this break ends the y loop after its first pass, so only y=0 is
        # drawn for each x -- confirm that limiting the output this way is intended.
        break

# Correlation between the days

In [None]:
# Fill every missing congestion value with the mean of the observed ones.
overall_mean = df_train_new['congestion'].mean()
df_train_new['congestion'] = df_train_new['congestion'].fillna(overall_mean)

In [None]:
# Lay the congestion values out as 365 consecutive chunks of 2340 values, then
# transpose so that every chunk becomes one column.
n_chunks, chunk_size = 365, 2340
chunked = np.array(df_train_new.congestion).reshape(n_chunks, chunk_size)
congestion = pd.DataFrame(chunked.T)

# Even-numbered columns are treated as mornings, odd-numbered ones as afternoons.
morning_cols = [col for col in congestion.columns if col % 2 == 0]
afternoon_cols = [col for col in congestion.columns if col % 2 == 1]
congestion_morning = congestion[morning_cols]
congestion_afternoon = congestion[afternoon_cols]

# Every 14th column starting at column 1 is taken as a Monday afternoon.
monday_afternoon_cols = [col for col in congestion.columns if col % 14 == 1]
congestion_monday_afternoon = congestion[monday_afternoon_cols].round().astype(int)

# Rolling mean over 10 consecutive rows; shorter windows are allowed at the start.
congestion_monday_afternoon_smoothed = congestion_monday_afternoon.rolling(10, min_periods=1).mean()

In [None]:
# Side-by-side correlation heatmaps of the last ten Monday-afternoon columns,
# raw on the left and smoothed on the right.
panels = [
    (congestion_monday_afternoon, 'Correlation between the last 10 Monday afternoons'),
    (congestion_monday_afternoon_smoothed, 'Smoothed'),
]

plt.figure(figsize=(18,7))
for position, (frame, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.heatmap(frame[frame.columns[-10:]].corr(), annot=True)
    plt.title(panel_title, fontsize=16)

plt.show()

# Import ML Libs

In [None]:
from sklearn.linear_model import LinearRegression as lr
from sklearn.ensemble import RandomForestRegressor as rfr, ExtraTreesRegressor as etr
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor as lgb

import warnings
warnings.simplefilter('ignore')

# Score between the congestion of the Monday afternoons

In [None]:
# MAE of every Monday-afternoon column against the last one, computed per block of
# 65 rows (36 blocks) and then averaged over the blocks.
df = congestion_monday_afternoon
reference = df[df.columns[-1]]
scores = []
for col in df.columns:
    block_errors = [
        mean_absolute_error(
            df[col][start:start + 65].astype(int),
            reference[start:start + 65].astype(int),
        )
        for start in range(0, 36 * 65, 65)
    ]
    scores.append(sum(block_errors) / 36.0)

plt.figure(figsize=(10,7))
plt.bar(range(len(scores)), scores, color='y')
plt.xlabel('The Monday afternoons', fontsize=16)
plt.ylabel('MAE vs the Monday afternoon 23 Sept', fontsize=16)
plt.show()

# Score between the congestion of the afternoons

In [None]:
# MAE of every afternoon column against the last afternoon column.
df = congestion_afternoon
latest = df[df.columns[-1]].astype(int)
scores = [mean_absolute_error(df[col].astype(int), latest) for col in df.columns]

plt.figure(figsize=(10,7))
plt.bar(range(len(scores)), scores, color='y')
plt.xlabel('The afternoons', fontsize=16)
plt.ylabel('MAE vs the afternoon 29 Sept', fontsize=16)
plt.show()

# Score between the congestion of the afternoons w.r.t. the median congestion

In [None]:
# MAE of every afternoon column against the row-wise median of all afternoons.
df = congestion_afternoon

# The median profile does not depend on the column being scored, so compute it once
# instead of once per column.
median_profile = np.median(df, axis=1).round().astype(int)
scores = [mean_absolute_error(df[idx].astype(int), median_profile) for idx in df.columns]

plt.figure(figsize=(10,7))
plt.bar(range(len(scores)),scores, color='y')
plt.xlabel('The afternoons', fontsize=16)
plt.ylabel('MAE vs the median of all the afternoons', fontsize=16)
plt.show()

# The outliers

In [None]:
# Flag afternoons whose MAE against the median afternoon profile exceeds 7.
df = congestion_afternoon

# The median profile is the same for every column, so compute it once up front.
median_profile = np.median(df, axis=1).round().astype(int)

outliers = [
    idx for idx in df.columns
    if mean_absolute_error(df[idx].astype(int), median_profile) > 7
]
# Two columns per day, so idx // 2 is the day offset from 1 April; adding 91 turns it
# into a day of year (1 April is day 91 of a non-leap year).
outlier_days = [idx // 2 + 91 for idx in outliers]
print(f'% outliers: {len(outliers)/len(df.columns)*100}')

In [None]:
# Same MAE-versus-median plot, restricted to the afternoons not flagged as outliers.
df = congestion_afternoon[[idx for idx in congestion_afternoon.columns if idx not in outliers]]

# The median profile of the remaining columns is loop-invariant; compute it once.
median_profile = np.median(df, axis=1).round().astype(int)
scores = [mean_absolute_error(df[idx].astype(int), median_profile) for idx in df.columns]

plt.figure(figsize=(10,7))
plt.bar(range(len(scores)),scores, color='y')
plt.xlabel('The afternoons', fontsize=16)
plt.ylabel('MAE vs the median of all the afternoons', fontsize=16)
plt.show()

# Simple baselines without Machine Learning

# Daily average

In [None]:
# Placeholder prediction; a float so the column can take the medians without a dtype change.
df_test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Median congestion per slot of the day; the last 36 slots are the
                # second half of the day.
                slot_median = df.groupby(df.daytime_id).congestion.median()
                # Write through .loc: the chained form `df_test.congestion[mask] = ...`
                # may assign to a temporary copy and leave df_test untouched.
                in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
                df_test.loc[in_test, 'congestion'] = slot_median.tolist()[-36:]

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('dailyMedian.csv', index=False)
submission.head()

# Daily average, exclude weekend

In [None]:
# Placeholder prediction; a float so the column can take the medians without a dtype change.
df_test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Monday to Friday only (weekday 0-4), then the median per slot of the day
                weekdays = df[df.time.dt.weekday<=4]
                slot_median = weekdays.groupby(weekdays.daytime_id).congestion.median()
                # Write through .loc: the chained form `df_test.congestion[mask] = ...`
                # may assign to a temporary copy and leave df_test untouched.
                in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
                df_test.loc[in_test, 'congestion'] = slot_median.tolist()[-36:]

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('dailyMedianExcludeWeekend.csv', index=False)
submission.head()

# Daily average, remove outliers

In [None]:
# Placeholder prediction; a float so the column can take the medians without a dtype change.
df_test['congestion'] = -1.0

# Rows recorded on a flagged outlier day are left out for every roadway.
on_regular_day = ~df_train.time.dt.dayofyear.isin(outlier_days)

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train[(df_train.x == x) & (df_train.y==y) & (df_train.direction==direction) & on_regular_day]

            if df.shape[0]>0:
                # Median congestion per slot of the day; the last 36 slots are the
                # second half of the day.
                slot_median = df.groupby(df.daytime_id).congestion.median()
                # Write through .loc: the chained form `df_test.congestion[mask] = ...`
                # may assign to a temporary copy and leave df_test untouched.
                in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
                df_test.loc[in_test, 'congestion'] = slot_median.tolist()[-36:]

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('dailyMedianRemoveOutliers.csv', index=False)
submission.head()

# Average of all Mondays

In [None]:
# Placeholder prediction; a float so the column can take the means without a dtype change.
df_test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train[(df_train.time.dt.weekday==0) & (df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # Mean congestion per slot of the day over all Mondays; the last 36
                # slots are the second half of the day.
                slot_mean = df.groupby(df.daytime_id).congestion.mean()
                # Write through .loc: the chained form `df_test.congestion[mask] = ...`
                # may assign to a temporary copy and leave df_test untouched.
                in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
                df_test.loc[in_test, 'congestion'] = slot_mean.tolist()[-36:]

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('MondayMean.csv', index=False)
submission.head()

# The nearest Monday

In [None]:
# Placeholder prediction; a float so the column can take the means without a dtype change.
df_test['congestion'] = -1.0

# Day of year exactly one week before the first test timestamp; the same for every roadway.
previous_week_day = df_test.time.dt.dayofyear.iloc[0] - 7

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train[(df_train.time.dt.dayofyear==previous_week_day) & (df_train.x == x) & (df_train.y==y) & (df_train.direction==direction)]

            if df.shape[0]>0:
                # 10-row rolling mean over that day, keeping its last 36 values
                smoothed = df.congestion.rolling(10).mean()[-36:]
                # Write through .loc: the chained form `df_test.congestion[mask] = ...`
                # may assign to a temporary copy and leave df_test untouched.
                in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
                df_test.loc[in_test, 'congestion'] = smoothed.tolist()

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('23SeptSmooth.csv', index=False)
submission.head()

# Daily Linear Regression

In [None]:
# Placeholder prediction; a float so the column can take model outputs without a dtype change.
df_test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train_new[(df_train_new.x == x) & (df_train_new.y==y) & (df_train_new.direction==direction)]
            if df.shape[0] == 0:
                continue

            # One column per day, keeping only days without missing data (all 72 slots).
            # Building the frame in one go avoids inserting columns one at a time.
            df_new = pd.DataFrame({
                f'day_{day}': group.congestion.tolist()
                for day, group in df.groupby(df.time.dt.dayofyear)
                if len(group.congestion) == 72
            })

            # Rows are the last 36 slots of the day. Training maps every kept day except
            # the last onto the last kept day; prediction slides that window forward by
            # one day. Plain arrays are passed because the shifted window carries
            # different column names, which recent scikit-learn versions reject when the
            # model was fitted on a DataFrame.
            X_train = df_new.iloc[-36:, :-1].to_numpy()
            y_train = df_new.iloc[-36:, -1].to_numpy()
            X_test = df_new.iloc[-36:, 1:].to_numpy()

            model = lr()
            model.fit(X_train, y_train)

            # Write through .loc: the chained form `df_test.congestion[mask] = ...`
            # may assign to a temporary copy and leave df_test untouched.
            in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
            df_test.loc[in_test, 'congestion'] = model.predict(X_test).tolist()

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('DailyLinearRegression.csv', index=False)
submission.head()

# Now Random Forest Regressor

In [None]:
# Placeholder prediction; a float so the column can take model outputs without a dtype change.
df_test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train_new[(df_train_new.x == x) & (df_train_new.y==y) & (df_train_new.direction==direction)]
            if df.shape[0] == 0:
                continue

            # One column per day, keeping only days without missing data (all 72 slots).
            # Building the frame in one go avoids inserting columns one at a time.
            df_new = pd.DataFrame({
                f'day_{day}': group.congestion.tolist()
                for day, group in df.groupby(df.time.dt.dayofyear)
                if len(group.congestion) == 72
            })

            # Rows are the last 36 slots of the day. Training maps every kept day except
            # the last onto the last kept day; prediction slides that window forward by
            # one day. Plain arrays are passed because the shifted window carries
            # different column names, which recent scikit-learn versions reject when the
            # model was fitted on a DataFrame.
            X_train = df_new.iloc[-36:, :-1].to_numpy()
            y_train = df_new.iloc[-36:, -1].to_numpy()
            X_test = df_new.iloc[-36:, 1:].to_numpy()

            # Fixed seed so that re-running the notebook reproduces the same submission
            model = rfr(random_state=42)
            model.fit(X_train, y_train)

            # Write through .loc: the chained form `df_test.congestion[mask] = ...`
            # may assign to a temporary copy and leave df_test untouched.
            in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
            df_test.loc[in_test, 'congestion'] = model.predict(X_test).tolist()

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('DailyRandomForestRegression.csv', index=False)
submission.head()

# Now Extra Tree

In [None]:
# Placeholder prediction; a float so the column can take model outputs without a dtype change.
df_test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train_new[(df_train_new.x == x) & (df_train_new.y==y) & (df_train_new.direction==direction)]
            if df.shape[0] == 0:
                continue

            # One column per day, keeping only days without missing data (all 72 slots).
            # Building the frame in one go avoids inserting columns one at a time.
            df_new = pd.DataFrame({
                f'day_{day}': group.congestion.tolist()
                for day, group in df.groupby(df.time.dt.dayofyear)
                if len(group.congestion) == 72
            })

            # Rows are the last 36 slots of the day. Training maps every kept day except
            # the last onto the last kept day; prediction slides that window forward by
            # one day. Plain arrays are passed because the shifted window carries
            # different column names, which recent scikit-learn versions reject when the
            # model was fitted on a DataFrame.
            X_train = df_new.iloc[-36:, :-1].to_numpy()
            y_train = df_new.iloc[-36:, -1].to_numpy()
            X_test = df_new.iloc[-36:, 1:].to_numpy()

            # Fixed seed so that re-running the notebook reproduces the same submission
            model = etr(random_state=42)
            model.fit(X_train, y_train)

            # Write through .loc: the chained form `df_test.congestion[mask] = ...`
            # may assign to a temporary copy and leave df_test untouched.
            in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
            df_test.loc[in_test, 'congestion'] = model.predict(X_test).tolist()

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('DailyExtraTreeRegression.csv', index=False)
submission.head()

# Now LGBM Regressor

In [None]:
# Placeholder prediction; a float so the column can take model outputs without a dtype change.
df_test['congestion'] = -1.0

for x in range(3):
    for y in range(4):
        for direction in df_train.direction.unique():
            # extract data for each roadway
            df = df_train_new[(df_train_new.x == x) & (df_train_new.y==y) & (df_train_new.direction==direction)]
            if df.shape[0] == 0:
                continue

            # One column per day, keeping only days without missing data (all 72 slots).
            # Building the frame in one go avoids inserting columns one at a time.
            df_new = pd.DataFrame({
                f'day_{day}': group.congestion.tolist()
                for day, group in df.groupby(df.time.dt.dayofyear)
                if len(group.congestion) == 72
            })

            # Rows are the last 36 slots of the day. Training maps every kept day except
            # the last onto the last kept day; prediction slides that window forward by
            # one day. Plain arrays are passed so that the differing column names of the
            # two windows cannot trip a feature-name check at predict time.
            X_train = df_new.iloc[-36:, :-1].to_numpy()
            y_train = df_new.iloc[-36:, -1].to_numpy()
            X_test = df_new.iloc[-36:, 1:].to_numpy()

            model = lgb()
            model.fit(X_train, y_train)

            # Write through .loc: the chained form `df_test.congestion[mask] = ...`
            # may assign to a temporary copy and leave df_test untouched.
            in_test = (df_test.x == x) & (df_test.y==y) & (df_test.direction==direction)
            df_test.loc[in_test, 'congestion'] = model.predict(X_test).tolist()

In [None]:
# Round to whole numbers and clamp to the valid 0-100 range in one expression; the
# chained `submission.congestion[mask] = ...` form may write to a temporary copy.
submission['congestion'] = df_test.congestion.round().astype(int).clip(0, 100).tolist()
submission.to_csv('DailyLGBMRegression.csv', index=False)
submission.head()

# This notebook was inspired by the notebook linked below. Thank you to its author for sharing such an amazing notebook.
https://www.kaggle.com/sytuannguyen/tps-mar-2022-eda-model

# Thanks! If you like this notebook, please upvote it.