In [None]:
# General imports:- 

import numpy as np; 
from scipy.stats import iqr;
import pandas as pd;
from re import findall;

import matplotlib.pyplot as plt;
%matplotlib inline
import seaborn as sns;

from warnings import filterwarnings;
from termcolor import colored;

from holidays import CountryHoliday;

np.random.seed(10);

In [None]:
# Model specific imports:-

from sklearn.base import BaseEstimator, TransformerMixin;
from sklearn.preprocessing import LabelEncoder, FunctionTransformer;
from sklearn.pipeline import Pipeline;
from sklearn_pandas import DataFrameMapper;
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, TimeSeriesSplit;
from sklearn.metrics import r2_score, mean_squared_error;

from sklearn.tree import DecisionTreeRegressor;
from sklearn.ensemble import RandomForestRegressor;
from xgboost import XGBRegressor;
from lightgbm import LGBMRegressor;
from catboost import CatBoostRegressor;

# Tabular Playground Series- March 2022

* This is a panel-data regression problem, tackled here with ensemble methods and time-series analysis
* The data visualization and pre-processing steps are also outlined

In [None]:
# Reading the competition files (train, test and sample submission):-
xytrain = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv',
                      encoding= 'utf8', parse_dates= True);
xtest = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv',
                    encoding= 'utf8', parse_dates= True);
sub_fl = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv',
                     encoding= 'utf8', parse_dates= True);

print(colored(f"Train data set length = {len(xytrain):,.0f}; test set length = {len(xtest):,.0f}",
              color= 'blue', attrs= ['bold', 'dark']));

# Each preview is a (heading, frame) pair; they are shown in this exact order:-
for _heading, _preview in [(f"\nSample submission file\n", sub_fl.head(5)),
                           (f"\nTraining data-set head\n", xytrain.head(5)),
                           (f"\nTraining data-set tail\n", xytrain.tail(5)),
                           (f"\nTest data-set head\n", xtest.head(5))]:
    print(colored(_heading, color= 'blue', attrs= ['bold', 'dark']));
    display(_preview);
del _heading, _preview;

## Section1:- Data processing and visualization:-

* This section aims to elicit information from the data that can be used to select an algorithm for the model development.
* It also aims to plot the necessary columns and gain further data-insights

In [None]:
# Data processing and insights:-
# DataFrame.info() prints its report itself and returns None, so it is called directly:
# wrapping it in display() would render a stray "None" under each report.
print(colored(f"\nTraining data-set information\n", color = 'blue', attrs= ['bold', 'dark']));
xytrain.info();

print(colored(f"\nTest data-set information\n", color = 'blue', attrs= ['bold', 'dark']));
xtest.info();

print(colored(f"\nTraining data-set numerical attribute description\n", color = 'blue', attrs= ['bold', 'dark']));
display(xytrain.describe().transpose().style.format('{:,.0f}'));

# Direction column insights (congestion summary statistics per direction):-
print(colored(f"\nTraining data-set direction attribute insights\n", color = 'blue', attrs= ['bold', 'dark']));
display(xytrain.groupby(['direction'])[['congestion']].describe().style.format('{:,.1f}'));

In [None]:
# Creating date-parts for further analysis and visualization:-
def Create_DateParts(df: pd.DataFrame):
    """
    This function creates date-parts from the time column for visualization and further analysis.

    The 'time' column is converted to datetime and the following columns are added, in this order:
    Date, Year_Nb, Month_Nb, Day_Nb, Hour_Nb, Minute_Nb, DayofWeek, Week_Nb (ISO week, int32),
    is_AM (1 when hour < 12, else 0) and is_Holiday (1 when the calendar day is a US holiday, else 0).

    Input- df (dataframe):- analysis dataframe with a 'time' column; it is modified in place
    Returns- df (dataframe):- the same dataframe object with the extra columns
    """

    # Changing the data-type of the 'time' column from 'object' to pd.Datetime:-
    df['time'] = pd.to_datetime(df['time'])
    tm = df['time'].dt

    # Holiday calendar is built for every year present in the data rather than a fixed year:-
    yrs = sorted(int(yr) for yr in tm.year.dropna().unique())
    hol_dates = pd.to_datetime(list(CountryHoliday(country= 'US', years= yrs).keys()))

    df['Date'] = tm.date
    df['Year_Nb'] = tm.year
    df['Month_Nb'] = tm.month
    df['Day_Nb'] = tm.day
    df['Hour_Nb'] = tm.hour
    df['Minute_Nb'] = tm.minute
    df['DayofWeek'] = tm.day_of_week
    df['Week_Nb'] = tm.isocalendar().week.astype(np.int32)
    df['is_AM'] = np.where(tm.hour < 12, 1, 0)

    # Timestamps are truncated to midnight before matching: comparing the raw timestamps
    # against holiday dates would flag only the 00:00 reading of each holiday:-
    df['is_Holiday'] = np.where(tm.normalize().isin(hol_dates), 1, 0)

    return df

In [None]:
# Data processing pipeline: encode the 'direction' labels, then derive the date-part features:-

# Step 1- LabelEncoder applied to 'direction' (output prefixed 'Enc_'); 'row_id' is dropped:-
_lbl_enc_step = DataFrameMapper(features= [(['direction'], LabelEncoder(), {'prefix': 'Enc_'})],
                                default= None, drop_cols= ['row_id'],
                                input_df= True, df_out= True);

# Step 2- Create_DateParts wrapped as a transformer:-
_dt_part_step = FunctionTransformer(Create_DateParts);

Data_Xformer = Pipeline(steps= [('LblEnc', _lbl_enc_step), ('DtPartGen', _dt_part_step)]);

In [None]:
# Fitting the pipeline on the training features, then transforming the train and test sets:-
filterwarnings('ignore')
Data_Xformer.fit(xytrain.drop(columns= 'congestion'), xytrain['congestion'].to_numpy());

# The original text 'direction' column is appended back to each transformed set for the plots:-
mdl_xytrain = pd.concat((Data_Xformer.transform(xytrain), xytrain[['direction']]), axis= 1);
mdl_xtest = pd.concat((Data_Xformer.transform(xtest), xtest[['direction']]), axis= 1);

_hdr_fmt = dict(color= 'blue', attrs= ['bold', 'dark']);

print(colored(f"\nColumns in the train set after pipeline and adjustment are:-\n", **_hdr_fmt));
print(colored(f"{list(mdl_xytrain.columns)}", color= 'blue'));

print(colored(f"\nData-types after data processing in the train-set are:-\n", **_hdr_fmt));
print(colored(f"{mdl_xytrain.dtypes}", color = 'blue'));
del _hdr_fmt;

#### Basic inferences after initial data preprocessing:-

* The data does not have any null values, null treatment step is not required
* The data contains spatial attributes (direction, x, y) and time attributes (time). Both of them need to be considered during model development
* Data visualization will help in gaining more insights into the congestion grouped by direction

In [None]:
# Analyzing traffic flow across all directions:-
# Box-plot of congestion per direction, built from a time x direction pivot of the train set.
fig, ax = plt.subplots(1,1, figsize= (10,8));
sns.boxplot(data = mdl_xytrain[['time', 'direction', 'congestion']].\
            pivot_table(index= 'time', values= 'congestion', columns= 'direction'), palette= 'Blues', ax= ax);
ax.set_title(f"Distribution analysis for traffice flow across all directions", color = 'tab:blue', fontsize= 14);
ax.grid(visible=True, which='major', axis='both', color = 'grey', linestyle= '--', linewidth = 0.50);
# Text properties passed to set_yticks only apply when labels are given as well (and are
# rejected otherwise by recent matplotlib), so the label size is set through tick_params:-
ax.set_yticks(range(0,110,10));
ax.tick_params(axis= 'y', labelsize= 8);
ax.set_ylabel(f"Congestion Level", color= 'tab:blue', fontsize= 10);
ax.set_xlabel(f"Direction", color= 'tab:blue', fontsize= 10);
plt.show();

In [None]:
def Plot_LineGraph(df:pd.DataFrame, per_lbl: str):
    """
    This function plots one line-plot per column of a pivot table created from the xytrain data.
    The subplots are stacked vertically and share the x-axis (the dataframe index).

    Inputs-
    df (dataframe):- Analysis dataframe; each column is drawn on its own subplot against df.index
    per_lbl (string):- Period string for axes title

    Returns- None (the figure is shown with plt.show()); nothing is drawn for a dataframe without columns
    """

    # Columns are taken from the df argument itself, not from a notebook-level variable:-
    dir_cols = list(df.columns)
    if not dir_cols:
        return

    # One row per column, 5 inches each (8 columns give a 15 x 40 figure).
    # squeeze=False keeps a 2-D axes array even for a single column:-
    fig, axes = plt.subplots(len(dir_cols), 1, figsize= (15, 5 * len(dir_cols)), sharex= True, squeeze= False)
    for cur_ax, dir_lbl in zip(axes.ravel(), dir_cols):
        sns.lineplot(y= df.loc[:, dir_lbl].values, x= df.index, color= 'tab:blue', linewidth= 1.50, ax= cur_ax)
        cur_ax.set_title(f"{per_lbl} congestion for {dir_lbl} direction", fontsize= 10, color= 'tab:blue')
        cur_ax.grid(visible= True, color= 'grey', linewidth= 0.75, linestyle= '--')
    plt.show()

In [None]:
# Mean congestion per calendar date (rows) and direction (columns), one line-plot per direction:-
_ = pd.pivot_table(mdl_xytrain, index= 'Date', columns= 'direction', values= 'congestion', aggfunc= np.mean);
Plot_LineGraph(per_lbl= 'Daily', df= _);
del _;

In [None]:
# Same daily view restricted to the morning rows (is_AM == 1):-
_ = pd.pivot_table(mdl_xytrain.loc[mdl_xytrain['is_AM'] == 1],
                   index= 'Date', columns= 'direction', values= 'congestion', aggfunc= np.mean);
Plot_LineGraph(per_lbl= 'Daily Morning', df= _);
del _;

In [None]:
# Same daily view restricted to the afternoon rows (is_AM == 0):-
_ = pd.pivot_table(mdl_xytrain.loc[mdl_xytrain['is_AM'] == 0],
                   index= 'Date', columns= 'direction', values= 'congestion', aggfunc= np.mean);
Plot_LineGraph(per_lbl= 'Daily Afternoon', df= _);
del _;

In [None]:
# Mean congestion per week number (rows) and direction (columns), one line-plot per direction:-
_ = pd.pivot_table(mdl_xytrain, index= 'Week_Nb', columns= 'direction', values= 'congestion', aggfunc= np.mean);
Plot_LineGraph(per_lbl= 'Weekly', df= _);
del _;

In [None]:
# Weekly view restricted to the morning rows (is_AM == 1):-
_ = pd.pivot_table(mdl_xytrain.loc[mdl_xytrain['is_AM'] == 1],
                   index= 'Week_Nb', columns= 'direction', values= 'congestion', aggfunc= np.mean);
Plot_LineGraph(per_lbl= 'Weekly morning', df= _);
del _;

In [None]:
# Weekly view restricted to the afternoon rows (is_AM == 0):-
_ = pd.pivot_table(mdl_xytrain.loc[mdl_xytrain['is_AM'] == 0],
                   index= 'Week_Nb', columns= 'direction', values= 'congestion', aggfunc= np.mean);
Plot_LineGraph(per_lbl= 'Weekly afternoon', df= _);
del _;

In [None]:
# Displaying monthly average traffic per direction:-
# Styler.format(precision=...) replaces Styler.set_precision, which was deprecated in
# pandas 1.3 and is no longer available in pandas 2.x.
print(colored(f"\nMonthly average traffic congestion per direction\n", color = 'blue', attrs= ['bold', 'dark']));
display(mdl_xytrain.pivot_table(index= 'Month_Nb', values= 'congestion',
                                columns= 'direction', aggfunc= np.mean).\
        style.format(precision= 2).highlight_max(color= 'lightblue', axis=1).highlight_min(color= 'lightgrey', axis=1));


# Monthly table split by morning/ afternoon (the heading names the actual row index, Month_Nb):-
print(colored(f"\nMonthly traffic congestion by direction by morning/ afternoon\n",
              color= 'blue', attrs= ['bold', 'dark']));
mth_ampm = mdl_xytrain.pivot_table(index= 'Month_Nb', columns= ['is_AM', 'direction'],
                                   values= 'congestion', aggfunc= np.mean);
# Flattening the (is_AM, direction) column pairs into '<direction>_AM' / '<direction>_PM' labels:-
mth_ampm.columns = [dir_lbl + ('_PM' if am_flag == 0 else '_AM')
                    for am_flag, dir_lbl in mth_ampm.columns.to_flat_index()];
# Afternoon columns get an 'azure' background to set them apart from the morning ones:-
display(mth_ampm.style.set_properties(**{'background-color': 'azure'},
                                      subset= [col for col in mth_ampm.columns if '_PM' in col]).\
        format(precision= 2).highlight_max(color= 'lightblue', axis= 1).highlight_min(color= 'lightgrey', axis=1)
       );

In [None]:
# Displaying weekday average traffic per direction:-
# Styler.format(precision=...) replaces Styler.set_precision, which was deprecated in
# pandas 1.3 and is no longer available in pandas 2.x.

print(colored(f"\nWeekday average traffic congestion per direction\n", color = 'blue', attrs= ['bold', 'dark']));
display(mdl_xytrain.pivot_table(index= 'DayofWeek', values= 'congestion',
                                columns= 'direction', aggfunc= np.mean).\
        style.format(precision= 2).highlight_max(color= 'lightblue', axis=1).highlight_min(color= 'lightgrey', axis=1));


# Weekday table split by morning/ afternoon (the heading names the actual row index, DayofWeek):-
print(colored(f"\nWeekday traffic congestion by direction by morning/ afternoon\n",
              color= 'blue', attrs= ['bold', 'dark']));
wkday_ampm = mdl_xytrain.pivot_table(index= 'DayofWeek', columns= ['is_AM', 'direction'],
                                     values= 'congestion', aggfunc= np.mean);
# Flattening the (is_AM, direction) column pairs into '<direction>_AM' / '<direction>_PM' labels:-
wkday_ampm.columns = [dir_lbl + ('_PM' if am_flag == 0 else '_AM')
                      for am_flag, dir_lbl in wkday_ampm.columns.to_flat_index()];
# Afternoon columns get an 'azure' background to set them apart from the morning ones:-
display(wkday_ampm.style.set_properties(**{'background-color': 'azure'},
                                        subset= [col for col in wkday_ampm.columns if '_PM' in col]).\
        format(precision= 2).highlight_max(color= 'lightblue', axis=1).highlight_min(color= 'lightgrey', axis=1)
       );

In [None]:
# Checking traffic flow on holidays versus other days:-
# Columns are, per direction: mean congestion for is_Holiday == 0, for is_Holiday == 1,
# and for weekend rows (DayofWeek >= 5). The renaming below relies on that column order.
hol_wkend = pd.concat((mdl_xytrain.pivot_table(index= 'direction', values= 'congestion',
                                               columns= 'is_Holiday', aggfunc= np.mean),
                       mdl_xytrain.query("DayofWeek >=5")[['direction', 'congestion']].\
                       groupby(['direction'])['congestion'].mean()),
                      axis= 1);
hol_wkend.columns= ['Non_Holiday', 'Holiday', 'Weekend'];

print(colored(f"\nMean congestion by direction by holiday/ weekend\n", color= 'blue', attrs= ['bold', 'dark']));
# Styler.format(precision=...) replaces the deprecated/removed Styler.set_precision:-
display(hol_wkend.style.format(precision= 2).highlight_min(color= 'lightgrey', axis=1));

In [None]:
# Checking correlation between the directional traffic flow:-

# Correlation matrix of the time x direction congestion table (one column per direction):-
dir_corr = pd.pivot_table(mdl_xytrain[['time', 'direction', 'congestion']],
                          index= 'time', values= 'congestion', columns= 'direction').corr();

fig, ax = plt.subplots(1,1, figsize= (20,8));
sns.heatmap(dir_corr, ax = ax, cmap= 'Blues', vmin=0.0, vmax=1.0,
            annot= True, fmt='.2%', linewidths=1.0, linecolor='darkblue');
ax.set_title(f'Correlation heatmap for congestion across all directions\n', color= 'tab:blue', fontsize= 14);
# Axis captions are blanked out; the direction tick labels are self-explanatory:-
ax.set_xlabel('');
ax.set_ylabel('');
plt.yticks(rotation= 0, color= 'tab:blue', fontsize= 10);
plt.xticks(color= 'tab:blue', fontsize= 10);
plt.show();
del dir_corr;

# Section 2:- Model development:-

**This section is divided into 2 parts as illustrated below-**

A. If the user chooses the LeaveOneGroupOut CV strategy, then the below steps are executed- 
1. Catboost model is executed.
2. Grid search is not executed
3. Test predictions are stored according to the day number (split by weekday)/ month number (monthly split)
 
B. If the user chooses any other cross-validation strategy, then the below steps are done-

1. Model parameters and output prediction storage objects are initialized. 
2. Model is loaded, one at a time
3. Model is fitted on the train-set 
4. RSquare is printed for the train set
5. Test-set predictions are stored in the predictions output object

In [None]:
# Setting the CV strategy and associated parameters- options (LOGO/ cv object):-
# cv = TimeSeriesSplit(5);
# ftre_split = '';
cv = 'LOGO';
ftre_split = 'DayofWeek';

# Splitting the data into xtrain-ytrain with relevant column removal:-
xtrain = mdl_xytrain.drop(['time', 'Date', 'direction', 'congestion'], axis=1);
ytrain = mdl_xytrain[['congestion']].values.ravel();

# Model master dictionary: label -> [estimator, parameter grid for GridSearchCV].
# Every entry carries a grid (an empty dict means "search nothing, use the defaults"),
# because the grid-search loop reads element [1] of each entry.
# NOTE(review): CatBoost documents a maximum tree depth of 16, so the depths 18 and 20
# in the CatBoost grid are presumably rejected at fit time - confirm and trim if so.
mdl_mst_dict = \
{
'Dtree': [DecisionTreeRegressor(), {'max_depth': range(8, 20, 2)}],
'XgBoost': [XGBRegressor(), {}],
'RandomForest': [RandomForestRegressor(random_state= 10), 
                 {'n_estimators': range(100, 410, 100), 'max_depth': range(12, 21, 2)}],
'LGBM': [LGBMRegressor(), {}],
'CatBoost': [CatBoostRegressor(iterations= 500, loss_function= 'MAE', verbose=100, early_stopping_rounds= 5),
             {'max_depth': range(12, 21,2), 'learning_rate': [0.05, 0.08, 0.12]}] 
};

In [None]:
# Implementing CatBoost regressor without grid-search for CV strategy = Leave One Group Out:-
filterwarnings('ignore');

# The test feature matrix does not depend on the split or the model, so it is built once:-
xtest_mdl = mdl_xtest.drop(['time', 'Date', 'direction'], axis=1);

if cv == 'LOGO':
    # Developing output storage object (one prediction column per held-out group):-
    n_splits = len(mdl_xytrain.loc[:, ftre_split].unique());
    mdl_pred_prf = pd.DataFrame(data= None, index= xtest.row_id, columns= ['Col'+str(i) for i in range(0,n_splits,1)]);
    print(colored(f"Splits = {n_splits} based on {ftre_split}\n", color = 'blue'));
    
    print(colored(f"CATBOOST REGRESSOR\n", color = 'green', attrs= ['bold']));
    
    # Groups come from the configured split feature so that the number of folds always
    # matches n_splits (and hence the prediction columns created above):-
    grp_vals = xtrain[ftre_split].values;
    for nb, (trn_idx, dev_idx) in enumerate(LeaveOneGroupOut().split(xtrain, ytrain, groups= grp_vals)):
        print(colored(f"\nCurrent split is by {ftre_split} number {nb}\n", color= 'red', attrs = ['bold']));

        mdl = CatBoostRegressor(iterations= 500, loss_function= 'MAE', verbose=100, early_stopping_rounds= 5,
                                max_depth = 15, learning_rate = 0.05);

        # Fit on the retained groups, score on both the fitted rows and the held-out group:-
        mdl.fit(xtrain.iloc[trn_idx], ytrain[trn_idx]); 
        ytrain_pred = mdl.predict(xtrain.iloc[trn_idx]);
        ydev_pred = mdl.predict(xtrain.iloc[dev_idx]);
        ytest_pred = mdl.predict(xtest_mdl);

        print(colored(f"Train RSquare = {r2_score(ytrain[trn_idx], ytrain_pred):.2%}", color= 'blue'));
        print(colored(f"Dev-set RSquare = {r2_score(ytrain[dev_idx], ydev_pred):.2%}\n", color= 'blue'));

        mdl_pred_prf['Col'+str(nb)] = ytest_pred;        
        del ytrain_pred, ydev_pred, ytest_pred;     

else:
    # Implementing model grid for all other CV strategies:-
    mdl_pred_prf = pd.DataFrame(data= None, index= xtest.row_id, columns= mdl_mst_dict.keys());
    print(colored(f"Models used are\n {list(mdl_mst_dict.keys())}", color = 'blue'));
    
    print(colored(f"REGRESSOR WITH GRID SEARCH\n", color = 'green', attrs= ['bold']));
    
    for mdl_lbl, mdl_param in mdl_mst_dict.items():
        print(colored(f"\nCurrent model is {mdl_lbl}\n", color= 'red', attrs = ['bold']));

        # An entry that lists only the estimator is searched with an empty grid (defaults):-
        mdl_grid = mdl_param[1] if len(mdl_param) > 1 else {};
        grid = GridSearchCV(mdl_param[0], param_grid= mdl_grid, scoring='neg_mean_absolute_error', cv= cv); 
        grid.fit(xtrain, ytrain);  
        ytrain_pred = grid.predict(xtrain);
        ytest_pred = grid.predict(xtest_mdl);

        print(colored(f"Best parameters\n{grid.best_params_}\n", color= 'blue'));  
        print(colored(f"Train RSquare = {r2_score(ytrain, ytrain_pred):.2%}", color= 'blue')); 

        mdl_pred_prf[mdl_lbl] = ytest_pred;
        del ytrain_pred, ytest_pred;     

# Section3:- Submission file preparation

In [None]:
# Listing the prediction columns that can be chosen for the submission file:-
print(colored(f"Select final column from\n{list(mdl_pred_prf.columns)}",
              attrs= ['dark', 'bold'], color = 'blue'));

In [None]:
# Chosen prediction column is written out as 'congestion' next to the row_id index:-
col_nm = 'Col2';
(mdl_pred_prf[[col_nm]]
 .rename(columns= {col_nm: 'congestion'})
 .reset_index()
 .to_csv('Submission.csv', index= False));