In [None]:
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar

import plotly.express as px

In [None]:
# Both competition files are read with their `row_id` column as the index.
INPUT_DIR = '../input/tabular-playground-series-mar-2022'
train = pd.read_csv(f'{INPUT_DIR}/train.csv', index_col='row_id')
test = pd.read_csv(f'{INPUT_DIR}/test.csv', index_col='row_id')



def get_time_based_features(df, calendar):
    """Add time-derived feature columns to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column in ``%Y-%m-%d %H:%M:%S`` format and
        ``x`` / ``y`` columns. The frame is modified in place.
    calendar : holiday calendar
        Object exposing ``holidays(start=..., end=...)``, for example
        ``USFederalHolidayCalendar()``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``time`` parsed to datetime and the
        columns ``hour``, ``dayofweek``, ``is_month_end``, ``minute``,
        ``weekend``, ``xy`` and ``holiday`` added.
    """
    df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["time"].dt.hour
    df["dayofweek"] = df["time"].dt.weekday  # Monday == 0 ... Sunday == 6
    df["is_month_end"] = df["time"].dt.is_month_end
    df["minute"] = df["time"].dt.minute
    df["weekend"] = df["dayofweek"] > 4  # Saturday (5) or Sunday (6)
    df["xy"] = df["x"].astype("str") + "_" + df["y"].astype("str")

    # The calendar returns holidays as midnight timestamps. Comparing the raw
    # timestamps would only flag rows stamped exactly 00:00:00, and a `start`
    # bound with a time-of-day would drop a holiday on the first day of data.
    # Work on calendar dates (timestamps normalized to midnight) instead.
    days = df["time"].dt.normalize()
    holidays = calendar.holidays(start=days.min(), end=days.max())
    df["holiday"] = days.isin(holidays)
    return df

calendar = USFederalHolidayCalendar()

# get_time_based_features writes its new columns onto the frame it is given,
# so the return value is not needed. Calling it from a loop also means the
# cell ends without a trailing expression, so no DataFrame is echoed.
for frame in (train, test):
    get_time_based_features(frame, calendar)

In [None]:
# Preview the first five rows, including the newly added feature columns.
train.head(n=5)

## Directions

Let's look at the directions in which the most traffic is happening

In [None]:
# Number of training rows per travel direction, most frequent first.
train['direction'].value_counts()

In [None]:
# Count once and give the columns real names so the axes and legend are
# labeled 'direction' / 'samples' instead of the generic 'x' / 'y'.
direction_counts = (
    train['direction']
    .value_counts()
    .rename_axis('direction')
    .reset_index(name='samples')
)

px.bar(
    direction_counts,
    x='direction',
    y='samples',
    color='direction',
    title='Number of samples per direction',
)

- We see that East, West, North and South have almost equal number of samples
- The diagonal directions NE / SW have similar distributions, with almost two-thirds as many samples as the primary directions
- The other diagonal directions, NW / SE, also have similar distributions, with almost one-third as many samples as the NE / SW diagonal

## Let's look at the average congestion along different directions

In [None]:
# Congestion distribution per direction, with an embedded box plot.
px.violin(
    train,
    x='direction',
    y='congestion',
    color='direction',
    box=True,
)

Let's look at the data on Monday from 1200 to 2400 hrs

In [None]:
# Mondays (dayofweek == 0) from 12:00 onwards.
is_monday_afternoon = (train['dayofweek'] == 0) & (train['hour'] >= 12)

px.violin(
    train[is_monday_afternoon],
    x='direction',
    y='congestion',
    color='direction',
    box=True,
)

- It looks like we have some outliers in most of the directions, especially around very low or very high congestion.
- It makes sense to look at the data at different time intervals or days

In [None]:
# Split congestion into 10 equal-width bins, then show, for each weekday,
# the fraction of its rows that falls into each bin (each row sums to 1).
congestion_bins = pd.cut(train['congestion'], bins=10)
pd.crosstab(train['dayofweek'], congestion_bins, normalize='index')

In [None]:
# Share of each weekday's rows that falls into each of 10 equal-width
# congestion bins (every row of `temp` sums to 1).
temp = pd.crosstab(train['dayofweek'], pd.cut(train['congestion'], bins=10), normalize='index')

# Round to the nearest percent before casting: a bare astype('int') truncates,
# so 9.9% would be displayed as 9.
# NOTE(review): the x tick values 10..100 assume congestion spans roughly
# 0-100 so that each bin is about 10 wide - confirm against the data.
px.imshow(
    (temp * 100).round().astype('int').values,
    text_auto=True,
    y=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    x=np.arange(1, 11) * 10,
    labels={'x': 'congestion bin', 'y': 'day of week', 'color': '% of day'},
)

In [None]:
# Share of each congestion bin's rows that falls on each weekday
# (every column of `temp` sums to 1).
temp = pd.crosstab(train['dayofweek'], pd.cut(train['congestion'], bins=10), normalize='columns')

# Round to the nearest percent before casting: a bare astype('int') truncates,
# so 9.9% would be displayed as 9.
# NOTE(review): the x tick values 10..100 assume congestion spans roughly
# 0-100 so that each bin is about 10 wide - confirm against the data.
px.imshow(
    (temp * 100).round().astype('int').values,
    text_auto=True,
    y=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    x=np.arange(1, 11) * 10,
    labels={'x': 'congestion bin', 'y': 'day of week', 'color': '% of bin'},
)

## Average congestion by day and hour of the day

In [None]:
# Mean congestion for every (day of week, hour) cell.
px.density_heatmap(
    train,
    x='dayofweek',
    y='hour',
    z='congestion',
    histfunc='avg',
    text_auto=True,
)

## Average congestion on Monday along different directions and hour of the day
The test set only has data on Monday, and only between 1200 and 2400 hrs, so we investigate how Monday traffic varies across directions and times of day.

In [None]:
# Mean congestion per (direction, hour) on Mondays (dayofweek == 0).
monday_rows = train[train['dayofweek'] == 0]

px.density_heatmap(
    monday_rows,
    x='direction',
    y='hour',
    z='congestion',
    histfunc='avg',
    text_auto=True,
)

In [None]:
# Same view restricted to Mondays (dayofweek == 0) from 12:00 onwards.
monday_pm_mask = (train['dayofweek'] == 0) & (train['hour'] >= 12)

px.density_heatmap(
    train[monday_pm_mask],
    x='direction',
    y='hour',
    z='congestion',
    histfunc='avg',
    text_auto=True,
)

- North bound vehicles are the most affected by traffic around 1500 - 1700 Hrs
- Northwest bound vehicles are the least affected by traffic.