In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id = "table-of-contents"></a>
# Table of Contents

- [1 Introduction](#1)
    - [1.1 Problem Statement](#1.1)
    - [1.2 Data Dictionary](#1.2)
- [2 Preparations](#2)
    - [2.1 Importing Packages](#2.1)
    - [2.2 Loading The Dataset](#2.2)
- [3 Getting Basic Understanding of The Data](#3)
    - [3.1 Seeing the data and shape](#3.1)
    - [3.2 Statistics](#3.2)
    - [3.3 Number of Unique Values in Each Column](#3.3)
    - [3.4 Distribution of target Variable](#3.4)
- [4 Changing The Data Type Of Variables for Analysis and Space Saving](#4)
    - [4.1 Dropping The `id` Columns](#4.1)
    - [4.2 Changing The Data Type Of variables](#4.2)
- [5 Univariate Analysis](#5)
    - [5.1 Distribution of Categorical Variables](#5.1)
- [6 Bivariate Analysis](#6)
    - [6.1 Distribution of Target w.r.t. Categorical Variables](#6.1)
- [7 Model Building](#7)
    - [7.1 Data Preprocessing](#7.1)
    - [7.2 Predicting with Baseline Model](#7.2)
    - [7.3 Feature Engineering](#7.3)
    - [7.4 Setting Up a Cross-Validation Strategy](#7.4)
    - [7.5 Final Predictions](#7.5)


<a id="1"></a>
# 1. Introduction

In this competition, you'll forecast twelve-hours of traffic flow in a major U.S. metropolitan area. Time, space, and directional features give you the chance to model interactions across a network of roadways.

<a id="1.1"></a>
### 1.1 Problem Statement

For the March edition of the 2022 Tabular Playground Series you're challenged to forecast twelve-hours of traffic flow in a U.S. metropolis. The time series in this dataset are labelled with both location coordinates and a direction of travel -- a combination of features that will test your skill at spatio-temporal forecasting within a highly dynamic traffic network.

<a id = "1.2"></a>
### 1.2 Data Dictionary

**row_id**     - a unique identifier for this instance

**time**       - the 20-minute period in which each measurement was taken

**x**          - the east-west midpoint coordinate of the roadway

**y**          - the north-south midpoint coordinate of the roadway

**direction**  - the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.

**congestion** `target variable` - congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100.

---

<a id="2"></a>
# 2. Preparations

Importing packages and loading the data that will be used in the analysis and modelling process. 

[back to top](#table-of-contents)
<a id="table-of-contents"></a>

<a id="2.1"></a>


### 2.1 Importing Packages

In [None]:
#### Data Manipulation
import pandas as pd
import numpy as np
import warnings

# Notebook display options: show at most 10 rows but never truncate columns.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
# Silence ALL warnings for the rest of the session (this also hides
# deprecation warnings raised by the plotting and modelling libraries).
warnings.filterwarnings('ignore')

#### Data Visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib import ticker
import seaborn as sns
sns.set(style = 'white')

############## Libraries for Machine Learning Modeling ###############

# Data Preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Shared encoder/scaler instances created once for the whole notebook.
# NOTE(review): `scaler` is not referenced by any cell shown here.
le = LabelEncoder()
scaler = MinMaxScaler()

# Model Building
from sklearn.model_selection import train_test_split, KFold

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error


<a id = "2.2"></a>

### 2.2 Loading The Dataset

In [None]:
# Read the competition's train, test and sample-submission CSV files (in that order).
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-mar-2022/train.csv",
        "../input/tabular-playground-series-mar-2022/test.csv",
        "../input/tabular-playground-series-mar-2022/sample_submission.csv",
    )
)

---
<a id = '3'></a>
# 3. Getting Basic Understanding of The Dataset

[back to top](#table-of-contents)
<a id="table-of-contents"></a>


<a id = "3.1"></a>
### 3.1 Seeing the data and shape

In [None]:
# Preview the first two rows of the training frame.
train.head(2)

In [None]:
# Preview the first two rows of the test frame.
test.head(2)

In [None]:
# Report the dimensions of both frames. The "features" count for train is every
# column except the last one, which is printed as the target's name.
print(
    f"Shape of the train set is: {train.shape}",
    f"The train set has {len(train.columns) - 1} features and 1 target variable: {train.columns[-1]}",
    "",
    "",
    f"Shape of the test set is: {test.shape}",
    f"The test set has {len(test.columns)} features",
    sep="\n",
)

# Name of the column to predict; reused by later cells.
target = 'congestion'

<a id = "3.2"></a>

### 3.2 Statistics

In [None]:
# Transposed summary statistics of the training frame, rendered with in-cell bars
# and a red gradient over the `std` and `50%` columns.
(
    train.describe()
    .T
    .style
    .bar(color='#eeb977')
    .background_gradient(subset=['std', '50%'], cmap='Reds', axis=1)
)

In [None]:
# Transposed summary statistics of the test frame, styled the same way as the
# training-frame table (bars plus a red gradient over `std` and `50%`).
(
    test.describe()
    .T
    .style
    .bar(color='#eeb977')
    .background_gradient(subset=['std', '50%'], cmap='Reds', axis=1)
)

<a id = "3.3"></a>
### 3.3 Number of Unique Values in Each Column

In [None]:
# Side-by-side count of distinct values per column in train and test,
# ordered by the train count and styled with bars and a red gradient.
(
    pd.concat([train.nunique(), test.nunique()], axis=1, keys=['train', 'test'])
    .sort_values(by='train')
    .style
    .bar(color='#eeb977')
    .background_gradient(cmap='Reds', axis=1)
)

<a id = "3.4"></a>

### 3.4 Distribution of Target variable

In [None]:
# Distribution of the target: a KDE curve in the first row and a horizontal
# box plot in the second row of the same figure.
background_color = "#faf9f4"

fig = plt.figure(figsize=(22, 12))
fig.patch.set_facecolor(background_color)
# The grid has three rows although only two are populated; the title offset
# used at the bottom of this cell is tuned to this layout, so it is kept.
gs = fig.add_gridspec(3, 1)

# Fig 1: kernel density estimate of the target
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)
ax0.grid(color='gray', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
# Pass the series as `x=`: newer seaborn releases treat the first positional
# argument as `data`. `fill=True` replaces the deprecated `shade=True`.
sns.kdeplot(x=train["congestion"], color="#eeb977", fill=True, ax=ax0, zorder=3)
ax0.ticklabel_format(style='plain')  # no scientific notation on the tick labels
ax0.set_xlabel("")
ax0.set_ylabel("")

# Fig 2: box plot of the target
ax1 = fig.add_subplot(gs[1, 0])
ax1.set_facecolor(background_color)
ax1.grid(color='gray', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
# `x=` keeps the box horizontal regardless of the seaborn version.
sns.boxplot(x=train["congestion"], color="#77acee", ax=ax1, zorder=3)
ax1.set_xlabel("")
ax1.set_ylabel("")

# The title is attached to the second axes and pushed upwards (y = 2.29 in
# axes coordinates) so that it is drawn above the first plot.
_ = ax1.set_title('Congestion Distribution', fontsize=30, y=2.29, x=0.5, fontweight='bold', fontfamily='serif', color="#323232")

---
<a id="4"></a>
# 4. Pre-Analysis Data Processing

[back to top](#table-of-contents)
<a id="table-of-contents"></a>

<a id = "4.1"></a>
### 4.1 Dropping The `id` Columns

In [None]:
# Remove the `row_id` identifier column from both frames.
# Rebinding the result (rather than `inplace=True`) together with
# `errors='ignore'` lets this cell be re-run after the column is already gone
# instead of failing with a KeyError.
train = train.drop(columns='row_id', errors='ignore')
test = test.drop(columns='row_id', errors='ignore')

<a id = "4.2"></a>
### 4.2 Changing The Data Type

In [None]:
# Convert the `time` column of both frames to pandas datetimes.
for frame in (train, test):
    frame['time'] = pd.to_datetime(frame['time'])

In [None]:
# Store the coordinate columns as unsigned 8-bit integers in both frames and the
# target as a signed 8-bit integer in train.
cols = ['x', 'y']

for frame in (train, test):
    frame[cols] = frame[cols].astype('uint8')

train[target] = train[target].astype('int8')

# NOTE(review): the original author states this reduces memory usage by more
# than 50%; that figure is not measured anywhere in this notebook.

---
<a id = '5'></a>
# 5. Univariate Analysis

[back to top](#table-of-contents)
<a id="table-of-contents"></a>

In [None]:
# Columns treated as categorical in the analysis.
# NOTE(review): the plotting cells shown here hard-code these names rather than
# reading this list.
cat_cols = ['x', 'y', 'direction']

<a id = '5.1'></a>
### 5.1 Distribution of Categorical Variables

In [None]:
# For each categorical column draw a count plot (left) and a pie chart (right).
background_color = "#faf9f4"
category_colors = ["#eeb977", "#8ebaf1", "#77eeb9", "#e8ee77", "#b977ee", "#ee77ac"]

fig = plt.figure(figsize = (25, 35))
fig.patch.set_facecolor(background_color) # figure background color
gs = fig.add_gridspec(3, 2)

# Axes are created row by row: (ax0, ax1) on the first row, (ax2, ax3) on the
# second and (ax4, ax5) on the third.
ax0, ax1, ax2, ax3, ax4, ax5 = [
    fig.add_subplot(gs[row, column]) for row in range(3) for column in range(2)
]

for ax in (ax0, ax1, ax2, ax3, ax4, ax5):
    ax.set_facecolor(background_color) # axes background color
    for s in ['right', 'top']:
        ax.spines[s].set_visible(False)

# column name -> [count-plot axes, pie-chart axes]
axis = { 'x' : [ax0, ax1],
         'y' : [ax2, ax3],
         'direction' : [ax4, ax5],
}

for col, (count_ax, pie_ax) in axis.items():
    # Count plot
    sns.countplot(ax = count_ax, data = train, x = col, zorder=2, linewidth=0, alpha=1, saturation=1, palette=category_colors)
    count_ax.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
    count_ax.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)
    count_ax.set_ylabel('')
    count_ax.set_xlabel('')
    count_ax.tick_params(labelsize=10, width=0.5, length=1.5)

    # Write each bar's height (the row count) centred on the bar's top edge; the
    # trailing newline in the label lifts the text just above the bar.
    for p in count_ax.patches:
        count_label = f'{p.get_height()}\n'
        count_ax.text(p.get_x() + p.get_width()/2, p.get_height(), count_label, ha='center', va='center', fontsize = 10)

    # Pie Plot
    train[col].value_counts().plot.pie(autopct='%1.1f%%', colors = category_colors, ax = pie_ax)
    pie_ax.set_ylabel('')
    pie_ax.set_xlabel('')

# Column captions, positioned in data coordinates below each count plot.
_ = ax0.text(2.5, -25000, 'x', fontsize = 30, fontweight='bold', fontfamily='serif', color='#323232')
_ = ax2.text(3.5, -23000, 'y', fontsize = 30, fontweight='bold', fontfamily='serif', color='#323232')
_ = ax4.text(6.5, -15000, 'Direction', fontsize = 30, fontweight='bold', fontfamily='serif', color='#323232')

# The title is attached to the current axes and offset so that it appears at the top of the figure.
_ = plt.title('Distribution of Categorical Variables',fontsize=40, y = 3.5, x = -0.15, fontweight='bold', fontfamily='serif', color="#323232")

---
<a id = '6'></a>
# 6. Bivariate Analysis

[back to top](#table-of-contents)
<a id="table-of-contents"></a>

<a id = "6.1"></a>
### 6.1 Distribution of Target w.r.t. Categorical Variables

In [None]:
# Box plots of the target for every level of each categorical column
# (one panel per column, laid out side by side).
background_color = "#faf9f4"
category_colors = ["#eeb977", "#8ebaf1", "#77eeb9", "#e8ee77", "#b977ee", "#ee77ac"]

fig = plt.figure(figsize = (20, 10))
fig.patch.set_facecolor(background_color) # figure background color
gs = fig.add_gridspec(1, 3)

ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[0, 2])

for col, ax in zip(['x', 'y', 'direction'], [ax0, ax1, ax2]):
    ax.set_facecolor(background_color) # axes background color
    sns.boxplot(data = train, x = col, y = target, ax = ax, palette = category_colors)
    ax.set_ylabel('')
    ax.set_xlabel('')

# Panel captions, positioned in data coordinates below each plot.
_ = ax0.text(1, -15, 'X', fontsize = 20, fontweight='bold', fontfamily='serif', color='#323232')
_ = ax1.text(1.5, -15, 'Y', fontsize = 20, fontweight='bold', fontfamily='serif', color='#323232')
_ = ax2.text(2, -15, 'Direction', fontsize = 20, fontweight='bold', fontfamily='serif', color='#323232')

# The title previously repeated the univariate figure's wording; it now names
# the target, which is what these panels plot against each column.
_ = plt.title('Distribution of Congestion by Categorical Variables',fontsize=30, y = 1.03, x = -0.7, fontweight='bold', fontfamily='serif', color="#323232")

---
<a id = '7'></a>
# 7. Machine Learning

[back to top](#table-of-contents)
<a id="table-of-contents"></a>

Here, we will try to describe the relationship between the independent variables (features) and the dependent variable (target) using machine learning models.

<a id = "7.1"></a>
## 7.1 Data Preprocessing

In [None]:
# Stack train on top of test and renumber the rows 0..n-1 so both can be
# feature-engineered together.
df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# Basic Datetime Feature Retrival
df['Year'] = df['time'].dt.year
df['Month'] = df['time'].dt.month
df['Day'] = df['time'].dt.day
df['Hour'] = df['time'].dt.hour
df['Minute'] = df['time'].dt.minute
df['dayofweek'] = df['time'].dt.hour

In [None]:
# Label Encoding
df['direction'] = df[['direction']].apply(le.fit_transform)

In [None]:
# Separating Train - Test and Creating Training and Validation Sets

train_proc, test_proc = df[:train.shape[0]], df[train.shape[0]:].reset_index(drop = True)

target = 'congestion'
date = 'time'

features = [col for col in df.columns if col not in ([target, date])]

<a id = '7.2'></a>
## 7.2 Predicting with Baseline Model


In [None]:
# Hold out 20% of the processed training rows (random split, fixed seed) for validation.
trn, val = train_test_split(train_proc, test_size=0.2, random_state=1999)

# Model inputs and targets for the training part ...
X_trn, y_trn = trn[features], trn[target]
# ... and for the validation part.
X_val, y_val = val[features], val[target]

# Inputs of the test rows that will be predicted for the submission.
X_test = test_proc[features]



In [None]:
# Baseline candidates keyed by display name; every model that accepts a seed
# receives the same one.
seed = 1999

model_dict = {
    'Linear Regression': LinearRegression(),
    'DecisionTree Regressor': DecisionTreeRegressor(random_state=seed),
    'Random Forest Regressor': RandomForestRegressor(random_state=seed),
    'LGBM Regressor': LGBMRegressor(random_state=seed),
    'XGB Regressor': XGBRegressor(random_state=seed),
    'Catboost Regressor': CatBoostRegressor(random_state=seed, verbose=False),
}


In [None]:
def model_evaluation(X_train, X_test, y_train, y_test, model, model_name):
    """
    Fit a model on the training split, score it on the validation split,
    print the score and return it.

    Parameters
    ----------
    X_train : Training data for the ML model
    X_test : Validation data for the ML model
    y_train : Target values of the training data
    y_test : Target values of the validation data
    model : Machine learning model exposing ``fit`` and ``predict``
    model_name : Name of the model, used only in the printed banner

    Returns
    -------
    The mean absolute error between ``y_test`` and the model's predictions
    for ``X_test``, so callers can collect and compare scores instead of
    reading them off the printed output.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    MAE = mean_absolute_error(y_test, y_pred)
    print('======================================{}======================================='.format(model_name))
    print()
    print('Mean Absolute Error is : {}'.format(MAE))
    print()
    return MAE

In [None]:
%%time
# Fit and score every baseline model on the same train/validation split;
# each call prints a banner with the model's name and its mean absolute error.
for model_name,model in model_dict.items():
    model_evaluation(X_trn, X_val, y_trn, y_val, model, model_name)

**Using Complete Dataset For Predictions**

In [None]:
%%time
model = XGBRegressor(objective='reg:linear', random_state = 1999)

_ = model.fit(train_proc[features], train_proc[target])

preds = model.predict(test_proc[features])

In [None]:
# Put the predictions into the sample-submission frame and save it without the index.
# NOTE(review): assumes the rows of `ss` are in the same order as the test rows — confirm.
submission_file = 'Baselin_XGB.csv'
ss[target] = preds
ss.to_csv(submission_file, index=False)

---
# Stay Tuned For Next Steps:
- Advanced Feature Engineering
- Building a Cross-Validation Strategy
- Hyperparameter Tuning
- Ensembling, Stacking, Blending

# ** Upvote The Notebook If You Like The Content **