## Problem Statement

For the March edition of the 2022 Tabular Playground Series you're challenged to forecast twelve hours of traffic flow in a U.S. metropolis. The time series in this dataset are labelled with both location coordinates and a direction of travel -- a combination of features that will test your skill at spatio-temporal forecasting within a highly dynamic traffic network.

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

## Loading Datasets
The train and test datasets are loaded along with the sample submission file.

In [None]:
# Read the three competition CSVs in one pass: train, test, sample submission.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-mar-2022/train.csv',
        '/kaggle/input/tabular-playground-series-mar-2022/test.csv',
        '/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv',
    )
)

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Preview the first five test rows.
df_test.head(n=5)

In [None]:
# Keep an untouched copy of the test set: `row_id` is later dropped from
# `df_test`, but it is still needed to build the submission file.
df_testing = pd.read_csv(
    filepath_or_buffer='/kaggle/input/tabular-playground-series-mar-2022/test.csv'
)

In [None]:
# Preview the first five rows of the sample submission.
df_submission.head(n=5)

## Preprocessing the dataset


In [None]:
# Count missing values per column, first for train and then for test.
for frame in (df_train, df_test):
    print(frame.isna().sum())

In [None]:
# Show the dtype of every column, first for train and then for test.
for frame in (df_train, df_test):
    print(frame.dtypes)

In [None]:
# Summary statistics of the training frame's columns.
df_train.describe()

In [None]:
# Remove the `row_id` column from both frames (in place).
for frame in (df_train, df_test):
    frame.drop(columns='row_id', inplace=True)

In [None]:
# Cast the `x` / `y` coordinate columns to int in both frames.
for frame in (df_train, df_test):
    for coord in ('x', 'y'):
        frame[coord] = frame[coord].astype(int)

In [None]:
# Parse the `time` column into datetime values in both frames.
time_format = "%Y-%m-%d %H:%M:%S"
for frame in (df_train, df_test):
    frame['time'] = pd.to_datetime(frame['time'], format=time_format)

In [None]:
# Display the training frame after the preprocessing steps above.
df_train

## Exploratory Data Analysis

In [None]:
# Histogram + KDE of the target on an explicit Axes.
sns.set_style("dark")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(9, 8))

# `sns.distplot` is deprecated; `histplot` with kde=True and stat="density"
# is its replacement (kde_kws cut=3 mirrors distplot's KDE extent).
sns.histplot(
    df_train['congestion'],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    color="b",
    ax=ax,
)
ax.xaxis.grid(False)
ax.set(ylabel="Values")
ax.set(xlabel="Target")
ax.set(title="Target distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Figure-level histogram of the target.
sns.displot(data=df_train['congestion'])

In [None]:
# Violin plot of the target. Pass the values as `x=` explicitly: the meaning of
# the first positional argument differs between seaborn releases.
sns.violinplot(x=df_train['congestion'])

In [None]:
# Box plot of the target.
plt.boxplot(x=df_train['congestion'])

In [None]:
# Correlation heatmap of the training features.
# Only numeric columns are correlated: `direction` still holds strings at this
# point and `time` is a datetime, and `.corr()` raises on non-numeric columns
# in pandas >= 2.0.
corr = df_train.select_dtypes(include='number').corr()
plt.subplots(figsize=(14,10))
sns.heatmap(corr, vmax=0.9, cmap="viridis", square=True)

In [None]:
# Correlation heatmap of the test features.
# Only numeric columns are correlated: `direction` still holds strings at this
# point and `time` is a datetime, and `.corr()` raises on non-numeric columns
# in pandas >= 2.0.
corr = df_test.select_dtypes(include='number').corr()
plt.subplots(figsize=(15,12))
sns.heatmap(corr, vmax=0.9, cmap="inferno", square=True)

## Format Data for the Model

In [None]:
# Separate the label from the features: `pop` returns the `congestion`
# column and removes it from `df_train` in place.
target = df_train.pop("congestion")
df_train

In [None]:
# Combining train and test dataset (train rows first, then test rows).
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and keeps each frame's original index, as `append` did.
df = pd.concat([df_train, df_test])

In [None]:
# Timestamp data: make sure `time` in the combined frame holds datetimes.
parsed_time = pd.to_datetime(df['time'], format="%Y-%m-%d %H:%M:%S")
df['time'] = parsed_time
df

In [None]:
# Month of each timestamp, via the `.dt` accessor (same approach as the
# weekday feature) rather than a DatetimeIndex round trip.
df['month'] = df['time'].dt.month
df

In [None]:
# Day of the week of each timestamp (`.dt.weekday` is an alias of `.dt.dayofweek`).
df["weekday"] = df['time'].dt.weekday
df

In [None]:
# Hour of day of each timestamp. `time` has already been parsed to datetime
# in the cells above, so it does not need to be parsed again here.
df['hour'] = df['time'].dt.hour
df

In [None]:
## Replace direction with numbers
# Assign the result back instead of calling `replace(..., inplace=True)` on the
# selected column: that chained in-place form warns on pandas 2.x and does not
# modify `df` at all under Copy-on-Write.
direction_codes = {'EB':1, 'NB':2, 'SB':3, 'WB':4, 'NE':5, 'SW':6, 'NW':7,'SE':8}
# astype(int) gives a numeric dtype and fails loudly if any label was not mapped.
df['direction'] = df['direction'].replace(direction_codes).astype(int)
df

In [None]:
# Distribution of the weekday feature across the combined frame.
sns.displot(data=df['weekday'])

In [None]:
# Distribution of the encoded direction feature across the combined frame.
sns.displot(data=df['direction'])

In [None]:
## Dropping unnecessary columns
# The raw timestamp is removed now that month / weekday / hour were derived from it.
del df['time']


In [None]:
# Display the combined feature frame that is split back into train / test below.
df

## Input Split

In [None]:
# Undo the train/test concatenation by position: the first len(df_train)
# rows of `df` are the training rows, the remainder are the test rows.
n_train = len(df_train)
y = target
X = df.iloc[:n_train]
X_test = df.iloc[n_train:]

In [None]:
# Split into training and validation sets (90% / 10%).
from sklearn.model_selection import train_test_split

# NOTE(review): stratify=y treats every distinct congestion value as a class;
# confirm this is intended for a regression target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    shuffle=True,
    stratify=y,
)
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X_test))

In [None]:
# Disabled hyper-parameter grid search, kept for reference only.
# NOTE(review): `depth` and `iterations` do not appear to be LightGBM parameter
# names (LightGBM uses `max_depth` and `n_estimators`) — confirm and rename
# before re-enabling this cell.
# import lightgbm as lgb
# from sklearn.model_selection import GridSearchCV
# parameters = {'depth'         : [4,6,8,10,12,14,18,20],
#                   'learning_rate' : [0.005, 0.01, 0.035, 0.05, 0.1, 0.15, 0.2],
#                   'iterations'    : [300, 800, 1000, 1800, 3000, 4100, 5000]
#                  }
# LGB = lgb.LGBMRegressor()

# grid = GridSearchCV(estimator=LGB, param_grid = parameters, cv = 3, n_jobs=-1)
# grid.fit(X, y)
# CBR
# print(" Results from Grid Search " )
# print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
# print("\n The best score across ALL searched params:\n", grid.best_score_)
# print("\n The best parameters across ALL searched params:\n", grid.best_params_)

In [None]:
# Disabled quick baseline fit on the full training data, kept for reference only.
# NOTE(review): `depth` / `iterations` do not appear to be LightGBM parameter
# names (see `max_depth` / `n_estimators`) — confirm before re-enabling.
# LGB_Test = lgb.LGBMRegressor(depth=10, iterations=100, learning_rate=0.1)

# LGB_Test.fit(X,y)

In [None]:
import lightgbm as lgb

# LightGBM regressor fitted on the training split.
# Fix: the original passed `max_dept=-1`, a typo that LightGBM treats as an
# unknown parameter; the intended keyword is `max_depth` (-1 = no depth limit).
# NOTE(review): `min_child_samples` and `min_data_in_leaf` are aliases of the
# same LightGBM parameter and are given different values (20 vs 100) —
# confirm which one is intended and drop the other.
LGB = lgb.LGBMRegressor(
    random_state=33,
    n_estimators=4800,
    min_data_per_group=5,
    boosting_type='gbdt',
    num_leaves=246,
    max_depth=-1,
    learning_rate=0.005,
    subsample_for_bin=200000,
    lambda_l1=1.074622455507616e-05,
    lambda_l2=2.0521330798729704e-06,
    n_jobs=-1,
    cat_smooth=1.0,
    importance_type='split',
    metric='rmse',
    min_child_samples=20,
    min_gain_to_split=0.0,
    feature_fraction=0.5,
    bagging_freq=6,
    min_sum_hessian_in_leaf=0.001,
    min_data_in_leaf=100,
    bagging_fraction=0.82063411,
)

LGB.fit(X_train, y_train)

In [None]:
# Predict on the held-out validation split.
pred_LGB = LGB.predict(X=X_val)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the validation predictions.
mse_LGB = mean_squared_error(y_val, pred_LGB)
rmse_LGB = np.sqrt(mse_LGB)
rmse_LGB

In [None]:
# Predict on the test rows and convert to non-negative integers.
raw_preds = LGB.predict(X_test)
# Round to the nearest integer before casting: `astype(int)` alone truncates
# toward zero (e.g. 47.9 -> 47), which biases the predictions low.
# Negative predictions are floored at 0, as in the original cell.
preds = np.clip(np.rint(raw_preds), 0, None).astype(int)
preds

In [None]:
# Pair each test `row_id` (from the untouched test copy) with its prediction
# and write the submission file.
submission_columns = {
    'row_id': df_testing['row_id'],
    'congestion': preds,
}
output = pd.DataFrame(submission_columns)
output.to_csv('Kaggle_Playground.csv', index=False)