In [None]:
# Importing packages

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from urllib.request import urlopen
from PIL import Image

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the competition's train and test tables into DataFrames.
train, test = (
    pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/train.csv"),
    pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/test.csv"),
)

In [None]:
# Show the first rows with a uniform cell background and a black border
# around the table.
(
    train.head()
    .style
    .applymap(lambda _cell: "background-color: #809e99")
    .set_table_styles([{'selector': '', 'props': [('border', '2px solid black')]}])
)

## About the Data


| Column      | Description |
| ----------- | ----------- |
| row_id | a unique identifier for this instance|
| time  | the 20-minute period in which each measurement was taken|
| x | the east-west midpoint coordinate of the roadway|
| y | the north-south midpoint coordinate of the roadway|
| direction |the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.|
| congestion  | congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100|

In [None]:
# Converting to datetime
#
# Both frames are parsed so that the min/max printed below are computed on
# real timestamps; previously only `train` was converted and the test range
# came from comparing raw strings.
train['time'] = pd.to_datetime(train['time'])
test['time'] = pd.to_datetime(test['time'])

print('Train time duration : ', train['time'].min(), train['time'].max())
print('Test time duration : ', test['time'].min(), test['time'].max())

# EDA

In [None]:
def EDA(df):
    """Print a plain-text exploratory summary of a DataFrame.

    The report contains, in order: shape, column index, ``df.info()``
    (dtypes and non-null counts), the unique values of every ``object``
    column, ``df.describe()``, per-column memory usage and the number of
    duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    rule = '\n------------------------------------------------------------------------------------\n'

    def bold(text):
        # ANSI escape codes: switch bold on, then reset.
        return '\033[1m' + text + '\033[0m'

    print(bold('EXPLORATORY DATA ANALYSIS :') + '\n')
    print(bold('Shape of the data (rows, columns):'))
    print(df.shape, rule)

    print(bold('All columns from the dataframe :'))
    print(df.columns, rule)

    print(bold('Datatypes and Missing values:'))
    # df.info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    df.info()
    print(rule)

    for col in df.columns:
        if df[col].dtype == 'object':
            uniques = df[col].unique()
            print(bold('Total Unique values in {} :'.format(col)), len(uniques))
            print('\t' + bold('Categories in {} :'.format(col)), uniques)
    print(rule)

    print(bold('Summary statistics for the data :'))
    print(df.describe(), rule)

    print(bold('Memory used by the data :'))
    print(df.memory_usage(), rule)

    print(bold('Number of duplicate values :'))
    print(df.duplicated().sum())
          
# Print the exploratory summary for the training frame.
EDA(train)

In [None]:
# Rebind instead of dropping in place, and tolerate an already-missing
# column, so re-running this cell does not raise KeyError.
train = train.drop(columns='row_id', errors='ignore')

# Variable distribution

In [None]:
# Histogram of the target column on the notebook's dark theme.
tick_font = dict(size=15, family='monospace', color='#d4c43b')

fig = go.Figure()
fig.add_trace(go.Histogram(x=train.congestion))

fig.update_xaxes(tickfont=tick_font,
                 tickmode='array',
                 ticklen=6,
                 showline=False,
                 showgrid=False,
                 ticks='outside')

fig.update_yaxes(tickfont=tick_font,
                 tickmode='array',
                 showline=False,
                 showgrid=False,
                 ticks='outside')

# Thin black outline around each bar, slightly transparent fill.
fig.update_traces(marker_line_color='black',
                  marker_line_width=1.2,
                  opacity=0.6)

fig.update_layout(
    font=dict(color='#d4c43b', family='monospace'),
    title=dict(text='Target(Congestion) distribution',
               x=0.5, y=0.985,
               font=dict(size=22)),
    plot_bgcolor='#384543',
    paper_bgcolor='#384543',
    showlegend=False,
)

fig.show()

In [None]:
data = train.copy()

# Converting 'congestion' to a categorical variable.
# include_lowest=True closes the first bin on the left, so rows whose
# congestion is exactly 0 fall into '0-20' instead of becoming NaN and
# silently disappearing from the bar chart.
category = pd.cut(data.congestion,
                  bins=[0, 20, 40, 60, 80, 100],
                  labels=['0-20', '20-40', '40-60', '60-80', '80-100'],
                  include_lowest=True)

data['congestion'] = category

# Subplots
fig = make_subplots(
    rows=2, cols=2, subplot_titles=("column 'x'", "column 'y'", "direction", "congestion"))

# One bar chart of value counts per column: (column name, subplot row, subplot col).
# add_trace(row=, col=) replaces the deprecated append_trace, and each
# value_counts() is computed once instead of twice.
panels = [('x', 1, 1), ('y', 1, 2), ('direction', 2, 1), ('congestion', 2, 2)]
for column, row, col in panels:
    counts = data[column].value_counts()
    fig.add_trace(go.Bar(x=counts.index.values, y=counts.to_numpy()),
                  row=row, col=col)

fig.update_xaxes(
        tickfont = dict(size=15, family = 'monospace', color ='#d4c43b'),
        tickmode = 'array',
        tickangle = 60,
        ticklen = 6,
        showline = False,
        showgrid = False,
        ticks = 'outside')

fig.update_yaxes(
        tickfont = dict(size=15, family = 'monospace', color ='#d4c43b'),
        tickmode = 'array',
        showline = False,
        showgrid = False,
        ticks = 'outside')

fig.update_traces(
                  marker_line_color='black',
                  marker_line_width= 1.2,
                  opacity=0.6,
                  )

# Update title and height
fig.update_layout(height = 800,
                  font = dict(color='#d4c43b', family = 'monospace'),
                  title = dict(text = 'Variable Distribution',
                               x = 0.5, y =0.985,
                               font = dict(size = 22, color ='#4dd43b',
                               )),
                  plot_bgcolor='#384543',
                  paper_bgcolor = '#384543',
                  showlegend = False)

fig.show()

# Congestion Trends

In [None]:
# Mean congestion per calendar day, with 'time' restored as a regular column.
train_daily = (
    train.set_index('time')
    .groupby([pd.Grouper(freq='D')])[['congestion']]
    .mean()
    .reset_index()
)

In [None]:
# Line chart of the daily mean congestion on the notebook's dark theme.
tick_font = dict(size=15, family='monospace', color='#d4c43b')

fig = px.line(train_daily,
              x="time",
              y='congestion',
              color_discrete_sequence=['#4dd43b'])

fig.update_xaxes(tickfont=tick_font,
                 tickmode='array',
                 ticklen=6,
                 showline=False,
                 showgrid=False,
                 ticks='outside')

fig.update_yaxes(tickfont=tick_font,
                 tickmode='array',
                 showline=False,
                 showgrid=False,
                 ticks='outside')

fig.update_traces(marker_line_color='black',
                  marker_line_width=1.2,
                  opacity=0.6)

fig.update_layout(
    font=dict(color='#d4c43b', family='monospace'),
    title=dict(text='Daily-Congestion trend',
               x=0.5, y=0.985,
               font=dict(size=22)),
    plot_bgcolor='#384543',
    paper_bgcolor='#384543',
    showlegend=False,
)

fig.show()

In [None]:
# Mean congestion per hour, with 'time' restored as a regular column.
train_hourly = (
    train.set_index('time')
    .groupby([pd.Grouper(freq='h')])[['congestion']]
    .mean()
    .reset_index()
)

In [None]:
# Overlay of hourly means (points) and daily means (line).
tick_font = dict(size=15, family='monospace', color='#d4c43b')

# (source frame, draw mode, legend label, colour) for each trace, in draw order.
series_specs = [
    (train_hourly, 'markers', 'Mean Hourly-Congestion', "#eb4431"),
    (train_daily, 'lines', 'Mean Daily-Congestion', "#d4c43b"),
]

fig = go.Figure()
for frame, mode, label, colour in series_specs:
    fig.add_trace(go.Scatter(x=frame["time"],
                             y=frame['congestion'],
                             mode=mode,
                             name=label,
                             marker=dict(color=colour)))

fig.update_xaxes(tickfont=tick_font,
                 tickmode='array',
                 ticklen=6,
                 showline=False,
                 showgrid=False,
                 ticks='outside')

fig.update_yaxes(tickfont=tick_font,
                 tickmode='array',
                 showline=False,
                 showgrid=False,
                 ticks='outside')

fig.update_traces(marker_line_color='black',
                  marker_line_width=1.2,
                  opacity=0.6)

# Legend is centred horizontally and anchored by its bottom edge at the top
# of the plotting area.
fig.update_layout(
    font=dict(color='#d4c43b', family='monospace'),
    title=dict(text='Hourly-Congestion trend',
               x=0.5, y=0.985,
               font=dict(size=22)),
    plot_bgcolor='#384543',
    paper_bgcolor='#384543',
    legend=dict(yanchor="bottom", y=1,
                xanchor="center", x=0.5),
)

fig.show()

# Correlation

In [None]:
# One-hot encode 'direction' into one indicator column per direction.
data = pd.get_dummies(data=train, columns=['direction'])

In [None]:
df1 = data.copy()

plt.figure(figsize = (16, 10), dpi = 150)

# Pearson correlation over numeric and boolean columns only. Non-numeric
# columns (e.g. a datetime 'time' column) are excluded explicitly, because
# recent pandas versions no longer drop them silently inside corr().
corr = df1.select_dtypes(include=['number', 'bool']).corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True


sns.heatmap(corr,
            mask = mask,
            cmap = 'YlGn_r',
            vmax=.3,
            annot = True,
            linewidths = 0.5,
            fmt = ".2f",
            alpha = 0.6)

hfont = {'fontname':'monospace'}
plt.xticks(**hfont)
plt.yticks(**hfont)

plt.title('Correlation : Pearson',
          family = 'monospace',
          fontsize = 20,
          weight = 'semibold',
          color = '#964545')

plt.show()

### A little preprocessing... thanks to INVERSION

In [None]:
from math import sin, cos, pi

# Circular encoding of the travel direction: the eight labels sit at
# multiples of pi/4 starting from 'NB', and the axis-aligned directions use
# exact 0.0 / 1.0 / -1.0 values.
sin_vals = dict(
    NB=0.0,
    NE=sin(1 * pi/4),
    EB=1.0,
    SE=sin(3 * pi/4),
    SB=0.0,
    SW=sin(5 * pi/4),
    WB=-1.0,
    NW=sin(7 * pi/4),
)

cos_vals = dict(
    NB=1.0,
    NE=cos(1 * pi/4),
    EB=0.0,
    SE=cos(3 * pi/4),
    SB=-1.0,
    SW=cos(5 * pi/4),
    WB=0.0,
    NW=cos(7 * pi/4),
)

# Integer code 0..7 for each label, in the same order as above.
encoded_vals = {
    label: code
    for code, label in enumerate(['NB', 'NE', 'EB', 'SE', 'SB', 'SW', 'WB', 'NW'])
}

# Keep handles on the raw columns so every derived feature is computed from
# the original values, before 'direction' is overwritten and 'time' dropped.
heading = train['direction']
stamps = train['time']
calendar_parts = ('year', 'month', 'day', 'hour', 'minute', 'weekday')

train = train.assign(
    sin=heading.map(sin_vals),
    cos=heading.map(cos_vals),
    direction=heading.map(encoded_vals),
    **{part: getattr(stamps.dt, part) for part in calendar_parts},
).drop('time', axis='columns')

In [None]:
# Separate the target column from the feature columns.
y = train['congestion'].copy()
X = train.drop(columns='congestion')

# Potential time-dependent data, so no random splitting:
# the first 80% of rows are used for fitting, the remaining 20% for evaluation.
num = int(len(X) * 0.8)

X_train, X_test = X.iloc[:num], X.iloc[num:]
y_train, y_test = y.iloc[:num], y.iloc[num:]

# Model : Standard Regressor

In [None]:
import catboost as cb
from sklearn.metrics import mean_squared_error, r2_score

# Gradient-boosted regressor trained on RMSE loss.
model_params = dict(
    loss_function='RMSE',
    iterations=200,
    learning_rate=0.1,
    depth=6,
)
model = cb.CatBoostRegressor(**model_params)
model.fit(X_train, y_train, logging_level='Silent')

# Score on the held-out tail of the data.
pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)

In [None]:
# Report the hold-out metrics rounded to two decimals.
print("Testing performance")
print(f'RMSE: {rmse:.2f}')
print(f'R2: {r2:.2f}')

In [None]:
# Model feature importances, labelled by column name and sorted ascending.
importance = pd.Series(model.feature_importances_,
                       index=X_train.columns).sort_values()

# Polar chart: one angular position per feature, radius = importance.
polar_trace = go.Scatterpolargl(r=importance.values,
                                theta=importance.index,
                                opacity=0.6)

fig = go.Figure()
fig.add_trace(polar_trace)

# This call sets the final trace opacity to 0.8, replacing the 0.6 above.
fig.update_traces(marker_line_color='black',
                  marker_line_width=2,
                  opacity=0.8)

fig.update_layout(
    font=dict(color='#d4c43b', family='monospace'),
    title=dict(text='Feature Importance<br>concerning Congestion',
               x=0.5, y=0.965,
               font=dict(size=22)),
    paper_bgcolor='#384543',
)
fig.show()

# SHAP : Model Explanation

In [None]:
import shap

In [None]:
plt.figure(figsize = (10, 6), dpi = 120)

# SHAP values for every row of the hold-out features.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Label features in X_test's own column order. `importance.index` has been
# re-sorted by importance value, so using it here attached the wrong names
# to the SHAP columns.
shap.summary_plot(shap_values,
                  X_test,
                  feature_names = X_test.columns)

In [None]:
shap.initjs()
# shap_values was computed on X_test, so the feature values displayed next
# to shap_values[0] must come from X_test's first row (X.iloc[0] is a
# training row and does not match these SHAP values).
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

In [None]:
# NOTE(review): plt.subplots returns a (figure, axes) tuple, so `fig` holds
# that tuple rather than a Figure object.
fig = plt.subplots(figsize=(6,6), dpi=100)

# Waterfall chart for the SHAP values at index 50 of `shap_values`, starting
# from the explainer's expected value and showing at most 20 features.
# NOTE(review): this goes through shap's private `_waterfall` module — confirm
# it is available in the installed shap version.
ax__= shap.plots._waterfall.waterfall_legacy(explainer.expected_value, 
                                             shap_values[50],
                                             feature_names = X_train.columns,
                                             max_display = 20)

In [None]:
# Bar-style SHAP summary of the hold-out features.
plt.figure(figsize=(8, 6), dpi=120)
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=X_test.columns,
    plot_type="bar",
)

### Thanks for reading
#### Still in progress...