**Referenced from: [HERE](https://www.kaggle.com/code/abdullahkocak/fresh-duck-kaggler-tps-mar22)**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder #Handling Categorical Data
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns #For Data Visualization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Loading Data**

In [None]:
# Both CSVs are read with the same options: 'row_id' becomes the index and
# the 'time' column is parsed into datetimes.
csv_options = {"index_col": "row_id", "parse_dates": ['time']}
train = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/train.csv", **csv_options)
test = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/test.csv", **csv_options)

**Handling Categorical Variables and Feature Engineering**

In [None]:
def changetime(df):
    """Derive calendar and time-of-day columns from ``df['time']``.

    The columns ``month``, ``day``, ``weekday``, ``weekend``, ``hours``,
    ``minute`` and ``afternoon`` are written onto ``df`` itself (in place),
    in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like ``time`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns added.
    """
    stamp = df['time'].dt

    # Insertion order of this dict fixes the order of the new columns.
    derived = {
        'month': stamp.month,
        'day': stamp.day,
        'weekday': stamp.weekday,
        # weekday numbers 5 and above are flagged as weekend
        'weekend': (stamp.weekday >= 5).astype(int),
        'hours': stamp.hour,
        'minute': stamp.minute,
        # 1 from 12:00 onwards, 0 before
        'afternoon': (stamp.hour >= 12).astype(int),
    }
    for column_name, values in derived.items():
        df[column_name] = values

    return df

In [None]:
# changetime writes the new columns onto the frame it receives and returns it,
# so both frames gain the time features here.
train, test = changetime(train), changetime(test)
train.head()

In [None]:
# Build a road identifier per row by concatenating x, y and direction as text.
# NOTE(review): no separator is used between x and y, so the key is only
# unambiguous if their string forms cannot run together — confirm against the data.
for frame in (train, test):
    frame['road'] = frame['x'].astype(str) + frame['y'].astype(str) + frame['direction']

# Fit the encoder on the training roads, then apply the same mapping to test.
le = LabelEncoder()
train['road'] = le.fit_transform(train['road'])
test['road'] = le.transform(test['road'])

# Drop the raw location columns that were combined into 'road'.
location_columns = ['x', 'y', 'direction']
train = train.drop(location_columns, axis=1)
test = test.drop(location_columns, axis=1)
train.road.nunique()

**Checking Null Values**

In [None]:
# Display the column labels of the training frame after feature engineering.
train.columns

In [None]:
# Count missing cells across the whole frame. `.isna().sum().sum()` totals the
# NaN cells, whereas `.isna().any().sum()` would only count how many columns
# contain at least one NaN, which does not match the printed label.
print("Missing values in Train dataset:", train.isna().sum().sum())
print("Missing values in Test dataset:", test.isna().sum().sum())

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

**Mean, Median, Min, Max congestions**

In [None]:
# Stack train on top of test (row-wise) so the features below are computed on both at once.
# Columns that exist in only one of the frames are filled with NaN for the other frame's rows.
tt = pd.concat([train,test], axis=0)

In [None]:
keys = ['weekday', 'hours', 'minute', 'road']

# Mean / median / min / max of congestion for every (weekday, hours, minute, road)
# slot, computed in a single groupby pass over the target column only.
# Aggregating the whole frame four times (and merging four times) is wasteful and
# also aggregates unrelated columns such as 'time'.
stat_columns = {
    'mean': 'mean congestion',
    'median': 'median congestion',
    'min': 'min congestion',
    'max': 'max congestion',
}
congestion_stats = (
    tt.groupby(by=keys)['congestion']
    .agg(list(stat_columns))
    .rename(columns=stat_columns)
    .reset_index()
)

# Left merge keeps every row of tt (in order) and attaches the four statistics
# of the slot the row belongs to.
tt = tt.merge(congestion_stats, how='left', on=keys)

In [None]:
# Preview the combined frame with the per-slot congestion statistics attached.
tt.head()

In [None]:
# Rows whose hour is in [6, 12).
morning_mask = (tt['hours'] >= 6) & (tt['hours'] < 12)
tt_morning = tt[morning_mask]

# Per (month, day, road): median morning congestion, truncated to int.
# Despite the column name 'morning_avg', the statistic is a median.
day_road_keys = ['month', 'day', 'road']
morning_avg = (
    tt_morning.groupby(by=day_road_keys)
    .congestion.median()
    .astype(int)
    .rename('morning_avg')
    .reset_index()
)

# Attach the value to every row of the same day and road; rows without a
# matching morning group get NaN.
tt = tt.merge(morning_avg, on=day_road_keys, how='left')

In [None]:
# Preview the combined frame after adding the 'morning_avg' column.
tt.head()

**Lag Function**

In [None]:
# Lag features: lag_k holds the congestion observed on the same road at the same
# clock time k days earlier (k = 1..7).
# Only (time, road, congestion) are needed to build them and none of these change
# inside the loop, so take them once instead of copying the whole, growing frame
# on every iteration.
history = tt[['time', 'road', 'congestion']]

for delta in range(1, 8):
    name = f'lag_{delta}'
    # Shifting the timestamp forward by `delta` days makes a past observation
    # line up with the row that should receive it as a lag.
    dy = history.assign(
        time=history['time'] + pd.Timedelta(delta, unit="d")
    ).rename(columns={'congestion': name})
    tt = tt.merge(dy, on=['time', 'road'], how='left')

# Fills every remaining NaN in every column (not only the lag columns) with the
# median of 'congestion'.
# NOTE(review): this also fills 'congestion' itself wherever it is NaN — confirm
# that is intended before relying on that column downstream.
tt = tt.fillna(tt["congestion"].median())
tt.head()

**Handling Outliers**

In [None]:
# Box plot of the training target on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=train['congestion'], ax=ax)

In [None]:
def outlier_detection(df, column, whisker=1.5):
    """Return the index labels of rows that fall outside the IQR fences.

    For each column name in ``column`` the fences are
    ``q1 - whisker * IQR`` and ``q3 + whisker * IQR`` where ``IQR = q3 - q1``.
    Values strictly below the lower fence or strictly above the upper fence
    are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : iterable of str
        Names of the columns to inspect (pass a list, even for one column).
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range to get the fence distance.

    Returns
    -------
    list
        Index labels of the outlying rows, concatenated column by column
        (a label appears once per column in which it is an outlier).
    """
    outlier_indices = []

    for x in column:
        q1 = np.percentile(df[x], 25)
        q3 = np.percentile(df[x], 75)

        fence_distance = (q3 - q1) * whisker
        lower_fence = q1 - fence_distance
        # The upper fence is measured from the third quartile; measuring it
        # from q1 flags ordinary high values as outliers.
        upper_fence = q3 + fence_distance

        outlier_list_col = df[(df[x] < lower_fence) | (df[x] > upper_fence)].index
        outlier_indices.extend(outlier_list_col)

    return outlier_indices

In [None]:
# Index labels of the rows of tt whose 'congestion' value is flagged by outlier_detection.
outlier_indices = outlier_detection(tt, ["congestion"])

In [None]:
# Replace each flagged congestion value with the 'median congestion' of its row.
# A single .loc assignment writes into tt itself; the chained form
# tt['congestion'][i] = ... assigns through an intermediate Series, which raises
# SettingWithCopyWarning and is not guaranteed to update tt.
# .to_numpy() makes the assignment positional, so repeated labels in
# outlier_indices cannot cause an index-alignment error.
tt.loc[outlier_indices, 'congestion'] = tt.loc[outlier_indices, 'median congestion'].to_numpy()

**Data Visualization**

In [None]:
# Pairwise correlations between the target and the two hour-based features,
# annotated with two decimals.
heatmap_columns = ['congestion', 'hours', 'afternoon']
correlations = train[heatmap_columns].corr()
sns.heatmap(correlations, annot=True, fmt=".2f")

In [None]:
# Bar plot of congestion per encoded road on a very wide (100x10 inch) figure.
fig, ax = plt.subplots(figsize=(100, 10))
sns.barplot(x=train['road'], y=train['congestion'], ax=ax)

In [None]:
# Frequency of every congestion value that occurs in the training data.
congestion_counts = train.congestion.value_counts().sort_index()

plt.figure(figsize=(10, 6))
# Use the observed values as x positions. A hard-coded range(0, 101) only lines
# up with the counts when every integer from 0 to 100 occurs in the data, and
# raises a shape mismatch otherwise.
plt.bar(congestion_counts.index, congestion_counts.values, width=1, color="red")
plt.ylabel('Frequency')
plt.xlabel('Congestion')

**Creating a model**

In [None]:
# Display all columns of the combined frame before choosing the model inputs.
tt.columns

In [None]:
# tt was built as train rows followed by test rows, so split it back by position.
# Assumes the merges above kept the row count and order of tt — TODO confirm.
n_train_rows = len(train)
x_train, x_test = tt[:n_train_rows], tt[n_train_rows:]
y_train = x_train["congestion"]

In [None]:
# Despite its name, `features` lists the columns to REMOVE from the model inputs
# (the target itself plus columns that are not fed to the model).
features = ["time", "congestion", 'weekend', "weekday", "month", "day", "morning_avg"]

# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects
# with a TypeError. Re-binding the result, rather than using inplace=True on a
# slice of tt, avoids SettingWithCopyWarning.
x_train = x_train.drop(columns=features)
x_test = x_test.drop(columns=features)

In [None]:
# Display the columns that remain as model inputs.
x_train.columns

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor with library defaults, except that MAE is set as the evaluation metric.
cat_base = CatBoostRegressor(eval_metric='MAE')

In [None]:
# Train the regressor on the selected input columns against the congestion target.
cat_base.fit(x_train,y_train)

In [None]:
# Predict for the test rows and round to the nearest whole number
# (the values stay floating point after rounding).
raw_predictions = cat_base.predict(x_test)
preds = pd.DataFrame(raw_predictions, columns=['preds']).round()
preds.head()

In [None]:
# Load the sample submission; its 'row_id' column is reused for the output file below.
so = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv')
so.head()

In [None]:
# Pair each row_id from the sample submission with the prediction at the same
# index label. Assumes both frames have the same length and default index — TODO confirm.
submission_columns = {
    'row_id': so.row_id,
    'congestion': preds['preds'],
}
output = pd.DataFrame(submission_columns)
output.head()

In [None]:
# Write the submission to the current working directory, without the frame's index column.
output.to_csv('submission.csv', index=False)