# 0. Introduction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import warnings
warnings.filterwarnings('ignore')

In [None]:
#variables

# Name of the unique row-identifier column; it is dropped from both frames before modelling.
ID = 'row_id'
# Name of the column the model is trained to predict.
TARGET = 'congestion'
# Random-seed constant (intended for reproducible stochastic steps).
RANDOM_SEED= 42

In [None]:
# Both CSVs live in the same competition folder; `time` is parsed to datetime on load.
# `infer_datetime_format` is intentionally omitted: it is deprecated in pandas 2.x
# (format inference is now the default) and the parsed result is the same without it.
DATA_DIR = '/kaggle/input/tabular-playground-series-mar-2022'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv', parse_dates=['time'])
test_df = pd.read_csv(f'{DATA_DIR}/test.csv', parse_dates=['time'])

In [None]:
# NOTE(review): 66 rows presumably shows one complete timestamp group (the EDA
# section reports 65 rows per timestamp) plus the first row of the next — confirm.
train_df.head(66)

**PS:** If you wonder why train_df.head() does not show time in "hh:mm:ss" format, read [this](https://www.kaggle.com/competitions/tabular-playground-series-mar-2022/discussion/312688) post.

In [None]:
train_df.shape

In [None]:
train_df.info()

In [None]:
train_df.head()

# 1. EDA

In [None]:
train_df.groupby(TARGET)[TARGET].count()

Drop the unique ID column from both datasets.

In [None]:
# Remove the row identifier from both frames. `errors='ignore'` keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column is gone.
train_df = train_df.drop(columns=[ID], errors='ignore')
test_df = test_df.drop(columns=[ID], errors='ignore')

In [None]:
train_df.direction.unique(), train_df.x.unique(), train_df.y.unique()

So, there are 8 directions. Let's label-encode them.

In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_direction(df):
    """Replace the ``direction`` column of ``df`` with integer labels, in place.

    A new ``LabelEncoder`` is fitted on the frame's own ``direction`` values
    on every call.

    Returns the same (mutated) DataFrame that was passed in.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df.direction)
    df['direction'] = codes
    return df

In [None]:
# Label-encode `direction` in place on both frames, then preview the training data.
for frame in (train_df, test_df):
    encode_direction(frame)

train_df.head()

In [None]:
test_df.head()

In [None]:
# Number of training rows recorded at each distinct timestamp.
train_df.groupby('time')['time'].count()

So, there are 65 observations for every 20-minute timestamp, and 13,059 such timestamps in total from April to September. The training data ends on 30 September at 11:40 a.m., and the test data starts at 12:00 p.m. We need a model that can tell these dates apart and, in particular, capture how congestion varies with the time of day.

# 2. Feature Engineering

In [None]:
def feature_eng(df):
    """Add calendar features derived from the ``time`` column, in place.

    New columns: ``day`` (day of week), ``hour`` and ``minute``.
    Returns the same DataFrame that was passed in.
    """
    timestamps = df.time.dt
    for column, attribute in (('day', 'dayofweek'), ('hour', 'hour'), ('minute', 'minute')):
        df[column] = getattr(timestamps, attribute)
    return df

In [None]:
train_df.head()

In [None]:
# Add the day / hour / minute columns to both frames.
train_df, test_df = feature_eng(train_df), feature_eng(test_df)

In [None]:
# The raw timestamp is no longer needed once its parts have been extracted.
for frame in (train_df, test_df):
    frame.drop(columns=['time'], inplace=True)

In [None]:
train_df.head()

Thanks to [@ambrosm](https://www.kaggle.com/ambrosm) for [this](https://www.kaggle.com/code/ambrosm/tpsmar22-eda-which-makes-sense) EDA analysis. Based on his notebook, I wanted to create a model that trains on 6.5 days of the week and predicts the Monday-afternoon traffic, so I used the Monday p.m. data as the validation set.

In [None]:
# Hold out every Monday-afternoon row (day == 0, hour >= 12) as the validation set.
mon_pm_data = (train_df.day == 0) & (train_df.hour >= 12)

# `.copy()` makes both splits independent frames, so columns added to them later
# are written unambiguously instead of relying on SettingWithCopyWarning being silenced.
X_train = train_df[~mon_pm_data].copy()
X_valid = train_df[mon_pm_data].copy()

In [None]:
X_valid.shape, X_train.shape, train_df.shape

In [None]:
# Every column except the target is used as a model input.
features = [name for name in train_df.columns if name != TARGET]

In [None]:
def std_feature_eng(df, cols=None):
    """Add row-wise summary statistics (``mean``, ``std``, ``min``, ``max``) in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is mutated and also returned.
    cols : list of str, optional
        Columns the statistics are computed over. Defaults to the module-level
        ``features`` list, which is what existing callers rely on.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the four statistic columns added or overwritten.
    """
    derived = ('mean', 'std', 'min', 'max')
    source_cols = features if cols is None else cols
    # Never feed the derived columns back into themselves: once ``features`` has
    # been extended with them, a re-run would otherwise compute statistics of
    # statistics and silently change the values.
    base_cols = [col for col in source_cols if col not in derived]
    # Snapshot the inputs first so every statistic sees the same set of columns.
    base = df[base_cols]
    df['mean'] = base.mean(axis=1)
    df['std'] = base.std(axis=1)
    df['min'] = base.min(axis=1)
    df['max'] = base.max(axis=1)
    return df

In [None]:
# Append the row-wise summary statistics to every frame that is later fed to the model.
X_train = std_feature_eng(X_train)
X_valid = std_feature_eng(X_valid)
train_df = std_feature_eng(train_df)
test_df = std_feature_eng(test_df)

# Register the new columns exactly once: a plain `extend` would append duplicates
# when this cell is re-run, and `test_df[features]` would then select each
# statistic column twice.
for stat_col in ('mean', 'std', 'min', 'max'):
    if stat_col not in features:
        features.append(stat_col)

In [None]:
# Move the target out of each split: `pop` returns the column and removes it in place.
y_train = X_train.pop(TARGET)
y_valid = X_valid.pop(TARGET)

In [None]:
train_df.shape, X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, test_df.shape

In [None]:
# Full training set for the CV run: take the target out of `train_df`; `X` is the
# same object as `train_df`, now without the target column.
y = train_df.pop(TARGET)
X = train_df
X.shape, y.shape

# 3. Model

In [None]:
# sample_train_df =train_df.iloc[748835:848835,[1,5]]

# from statsmodels.tsa.stattools import adfuller

# def isTargetStationary():
#     print("Observations of Dickey-fuller test")
#     dftest = adfuller(sample_train_df['congestion'],autolag='AIC')
#     dfoutput=pd.Series(dftest[0:4],index=['Test Statistic','p-value','#lags used','number of observations used'])
#     for key,value in dftest[4].items():
#         dfoutput['critical value (%s)'%key]= value
#     print(dfoutput)

**How to find whether target is stationary in time series?**

There are a few methods to check the stationarity of the data. I used the Dickey-Fuller test, based on [this](https://www.analyticsvidhya.com/blog/2021/04/how-to-check-stationarity-of-data-in-python/#h2_5) article.

I randomly selected 100k examples (since running this test on the entire training data set takes a long time). The test statistic is lower than the critical values at every significance level (learn more by expanding the above code cell). So in this case, we can reject the null hypothesis and conclude that our data is stationary.

* Test Statistic                -6.529604e+00
* p-value                        9.945940e-09
* #lags used                     6.400000e+01
* number of observations used    9.993500e+04
* critical value (1%)           -3.430415e+00
* critical value (5%)           -2.861569e+00
* critical value (10%)          -2.566785e+00

*PS: In the interest of execution time, I ran the test separately and got this result.*

**Since the target is stationary, we can use an XGBoost model. Learn more [here](https://towardsdatascience.com/xgboost-for-time-series-youre-gonna-need-a-bigger-boat-9d329efa6814)**

In [None]:
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, test_df.shape

In [None]:
import xgboost as xgb

from sklearn.metrics import mean_absolute_error

# Baseline regressor. `random_state` pins XGBoost's internal RNG to the notebook-wide
# seed so that repeated runs are reproducible.
# NOTE(review): `eval_metric` receives the scikit-learn MAE function, yet the cells
# below read the 'rmse' history from `evals_result()` — confirm which metric is intended.
models = {"XGB": xgb.XGBRegressor(eval_metric=mean_absolute_error, random_state=RANDOM_SEED)}

In [None]:
%%time 

# Fetch the regressor and fit it on the training split, passing both the training
# and the validation data as evaluation sets.
xgbr = models.get('XGB')

evaluation_sets = [(X_train, y_train), (X_valid, y_valid)]
xgbr.fit(X_train, y_train, eval_set=evaluation_sets, verbose=False)

In [None]:
# Per-round metric history recorded during fitting; 'validation_0' is the first eval set.
eval_result = xgbr.evals_result()
train_rmse_history = eval_result['validation_0']['rmse']
training_rounds = range(len(train_rmse_history))

In [None]:
import matplotlib.pyplot as plt

def plot_curve(rounds, eval_result, metric='rmse'):
    """Scatter-plot the training and validation learning curves.

    Parameters
    ----------
    rounds : sequence
        X positions (boosting iterations); must have the same length as the
        metric history being plotted.
    eval_result : dict
        Output of ``evals_result()``. ``'validation_0'`` is drawn as the
        training curve and ``'validation_1'`` as the validation curve.
    metric : str, optional
        Key of the metric history to plot; also used (upper-cased) as the
        y-axis label. Defaults to ``'rmse'``.
    """
    train_scores = eval_result['validation_0'][metric]
    valid_scores = eval_result['validation_1'][metric]
    # Use the `rounds` argument rather than the global `training_rounds`: the
    # global only matches the first model, and a history of different length
    # would make `scatter` fail with an x/y size mismatch.
    plt.scatter(x=rounds, y=train_scores, label='Training Error')
    plt.scatter(x=rounds, y=valid_scores, label='Validation Error')
    plt.grid(True)
    plt.xlabel('Iteration')
    # Label the axis with the metric that is actually plotted.
    plt.ylabel(metric.upper())
    plt.title('Training Vs Validation Error')
    plt.legend()
    plt.show()

In [None]:
plot_curve(training_rounds,eval_result)

In [None]:
# Score the fitted model on the held-out validation split.
y_pred = xgbr.predict(X_valid)
predictions = list(map(round, y_pred))  # rounded copy; the MAE below uses the raw predictions
mae = mean_absolute_error(y_valid, y_pred)

print("Mean absolute error={}".format(round(mae, 6)))

**3.1 Hyperparameter Tuning using XGBoost**

In [None]:
def buildandTrainModelwithCV(alg, X, y,predictors,useTrainCV, cv_folds, early_stopping_rounds):
    """Optionally tune the number of boosting rounds with CV, then fit ``alg`` on ``X``/``y``.

    Parameters
    ----------
    alg : XGBoost scikit-learn estimator
        Modified in place: ``n_estimators`` (when CV is used) and ``eval_metric``
        are set on it, and it is fitted.
    X, y : training features and target.
    predictors : unused; kept so that existing callers keep working.
    useTrainCV : bool
        When truthy, run ``xgb.cv`` with early stopping and set ``n_estimators``
        to the number of rounds it kept.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    dict
        ``alg.evals_result()``: the RMSE history, where ``'validation_0'`` is
        ``(X, y)`` and ``'validation_1'`` is ``(X_valid, y_valid)``.

    Notes
    -----
    Reads the module-level ``X_valid`` and ``y_valid`` for the second evaluation
    set and for the validation score in the report.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(data=X, label=y)

        # Newer XGBoost releases leave `n_estimators` as None until fitting; fall
        # back to the estimator's own default instead of passing None to `xgb.cv`.
        n_rounds = alg.get_params().get('n_estimators')
        if n_rounds is None:
            n_rounds = alg.get_num_boosting_rounds()

        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=n_rounds,
            nfold=cv_folds,
            metrics='rmse',
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round that survived early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Set the metric on the estimator instead of passing `eval_metric` to `fit`:
    # that `fit` argument was deprecated in XGBoost 1.6 and removed in 2.0, and it
    # conflicts with an `eval_metric` already given to the constructor.
    alg.set_params(eval_metric='rmse')
    alg.fit(X, y, eval_set=[(X, y), (X_valid, y_valid)], verbose=False)

    # Score both sets; the figure previously printed as "Validation" was in fact
    # computed on the training data.
    train_mae = mean_absolute_error(y, alg.predict(X))
    valid_mae = mean_absolute_error(y_valid, alg.predict(X_valid))

    # Print model report:
    print("\nModel Report")
    print("Training Mean absolute error={}".format(round(train_mae,6)))
    print("Validation Mean absolute error={}".format(round(valid_mae,6)))

    return alg.evals_result()

In [None]:
# Work on a fresh, unfitted copy with identical hyper-parameters. `models.get('XGB')`
# returns the very same object as `xgbr`, so tuning it directly would silently
# overwrite the baseline model fitted earlier.
from sklearn.base import clone

xgbr_cv = clone(models.get('XGB'))

In [None]:
%%time
# Positional arguments: useTrainCV=True, cv_folds=5, early_stopping_rounds=50.
# Fits on the full training frame (X, y) and returns the per-round metric history.
eval_result=buildandTrainModelwithCV(xgbr_cv, X, y, features, True, 5, 50)

In [None]:
# One x position per boosting round recorded for the CV-tuned model.
cv_train_history = eval_result['validation_0']['rmse']
plot_cv_rounds = range(len(cv_train_history))
plot_curve(plot_cv_rounds, eval_result)

In [None]:
from xgboost import plot_importance
plot_importance(xgbr_cv)

In [None]:
# Rounded validation predictions from the CV-tuned model.
y_pred = xgbr_cv.predict(X_valid)
y_pred = list(map(round, y_pred))

# Attach the truth and the signed / absolute errors to a copy of the validation features.
X_valid_copy = X_valid.copy()
X_valid_copy[TARGET] = y_valid
X_valid_copy['error'] = y_valid - y_pred
X_valid_copy['abs_error'] = X_valid_copy['error'].abs()

# Average target and errors per (x, y, direction, hour, minute) group.
group_keys = ['x', 'y', 'direction', 'hour', 'minute']
error_by_day = X_valid_copy.groupby(group_keys)[[TARGET, 'error', 'abs_error']].mean()

In [None]:
error_by_day.sort_values('error', ascending=True).head(10)

In [None]:
# Best-predicted groups: smallest mean absolute error. Despite the name
# `error_by_day`, each row is an (x, y, direction, hour, minute) group, not a day.
error_by_day.sort_values('abs_error', ascending=True).head(10)

In [None]:
# Worst-predicted groups: largest mean absolute error. Despite the name
# `error_by_day`, each row is an (x, y, direction, hour, minute) group, not a day.
error_by_day.sort_values('abs_error', ascending=False).head(10)

# 4. Submission

In [None]:
# Predict on the test features (same column list the model was trained with),
# then round each prediction to the nearest integer.
raw_test_predictions = xgbr_cv.predict(test_df[features])
test_predictions = [round(score) for score in raw_test_predictions]

In [None]:
sub_df = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv')

In [None]:
sub_df[TARGET]=test_predictions
sub_df.head()

In [None]:
sub_df.to_csv('submission.csv', index=False)