# ***[Tabular Playground Series - Mar 2022] PyCaret***

<img src="https://rightcode.co.jp/wp-content/uploads/2020/06/pycaret_logo_001.png" width="500">

# Import the libraries

In [None]:
%%capture
!pip install pycaret[full]

import pandas as pd
import numpy as np 
from pycaret.regression import *

import warnings
warnings.filterwarnings("ignore")

# Import train data and test data

In [None]:
# Read the competition training set and preview its first rows.
train_path = '../input/tabular-playground-series-mar-2022/train.csv'
train_df = pd.read_csv(train_path)
train_df.head()

In [None]:
# Read the competition test set and preview its first rows.
test_path = '../input/tabular-playground-series-mar-2022/test.csv'
test_df = pd.read_csv(test_path)
test_df.head()

# Feature engineering

In [None]:
def data_preprocessing(df):
    """Return a copy of ``df`` with calendar and roadway features added.

    The input frame must provide a ``time`` column parseable by
    ``pd.to_datetime``, ``x`` and ``y`` columns, and a string ``direction``
    column. The input frame itself is not modified.

    Added / replaced columns, in order:
      * ``time``      - parsed to datetime
      * ``month``     - month number of ``time``
      * ``hour``      - hour of ``time``
      * ``wkday``     - ``dt.weekday`` of ``time`` (Monday == 0)
      * ``moment``    - 20-minute slot index counted from 12:00
                        (negative before noon)
      * ``road``      - ``x`` + ``y`` + ``direction`` concatenated as text
      * ``road_time`` - ``road`` followed by ``moment`` as text
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    out = df.copy()

    stamp = pd.to_datetime(out['time'])
    out['time'] = stamp
    out['month'] = stamp.dt.month
    out['hour'] = stamp.dt.hour
    out['wkday'] = stamp.dt.weekday
    # Three 20-minute slots per hour, with slot 0 starting at 12:00.
    out['moment'] = (stamp.dt.hour - 12) * 3 + stamp.dt.minute // 20
    out['road'] = out['x'].astype(str) + out['y'].astype(str) + out['direction']
    out['road_time'] = out['road'] + out['moment'].astype(str)

    return out

In [None]:
# Run the shared feature engineering on both splits (train first, then test).
train_df, test_df = (data_preprocessing(frame) for frame in (train_df, test_df))

#### Delete data that is not relevant to prediction (official holidays, Friday–Sunday, months before May, and the `time` / `row_id` columns)

In [None]:
# Rows whose calendar date matches one of the three dates the notebook
# treats as official holidays.
holiday_pattern = '1991-05-27|1991-07-04|1991-09-02'
on_holiday = train_df['time'].dt.date.astype(str).str.contains(holiday_pattern)

# Keep the non-holiday rows, then drop the columns that are not model inputs.
train_df = train_df.loc[~on_holiday].drop(columns=['time', 'row_id'])
test_df = test_df.drop(columns=['row_id', 'time'])

# Keep only weekday codes 0-3 (Monday-Thursday) and months after April.
keep_rows = train_df['wkday'].lt(4) & train_df['month'].gt(4)
train_df = train_df[keep_rows]

#### Create a median-congestion column (`avg`) grouped by `road_time` (the roadway plus its 20-minute time slot)

In [None]:
# Median congestion for every road/time-slot key, computed from the
# filtered training rows.
median_by_slot = train_df.groupby('road_time')['congestion'].median()
mapper_avg = median_by_slot.to_dict()

# Attach that median to both splits as the 'avg' feature; keys that do not
# occur in the training rows map to NaN.
for frame in (train_df, test_df):
    frame['avg'] = frame['road_time'].map(mapper_avg)

In [None]:
# Keep only rows with a non-negative 'moment'; with moment defined as
# (hour - 12) * 3 + minute // 20 this means times from 12:00 onwards.
train_df = train_df.loc[train_df['moment'].ge(0)]

In [None]:
# Preview the first rows of the filtered training frame.
train_df.head()

In [None]:
# Preview the first rows of the engineered test frame.
test_df.head()

# Modeling (Pycaret)

### 1. Setup the regressor

In [None]:
# PyCaret experiment configuration, gathered in one place.
setup_options = dict(
    data=train_df,
    target='congestion',          # column to predict
    session_id=999,               # fixed seed value passed to PyCaret
    data_split_shuffle=True,
    create_clusters=False,
    # Group k-fold with 'wkday' as the grouping column and 4 folds.
    fold_strategy='groupkfold',
    fold_groups='wkday',
    fold=4,
    use_gpu=False,
    silent=True,
    # 'road_time' was only needed to build the 'avg' feature.
    ignore_features=['road_time'],
    n_jobs=-1,
)
reg = setup(**setup_options)

### 2. Compare models

In [None]:
# Estimator ids left out of the comparison.
skipped_models = ['lar',  'rf', 'et', 'gbr', 'xgboost']
# Rank the remaining candidates by MAE and keep the three best.
top3 = compare_models(exclude=skipped_models, sort='MAE', n_select=3)

### 3. Ensemble top3 models

In [None]:
# Combine the three selected models into a single blended estimator.
blender = blend_models(estimator_list=top3)

In [None]:
# Produce the final estimator used for test-set prediction.
# NOTE(review): per the PyCaret docs this refits on the full data including
# the hold-out split - confirm against the installed PyCaret version.
final = finalize_model(estimator=blender)

### 4. Prediction

In [None]:
# Predict congestion for the test rows and round to whole numbers.
test_df['pred'] = (predict_model(final, data=test_df)['Label']).round()

# Per (time slot, roadway) clipping bounds: the 15% and 70% congestion
# quantiles of the training rows from September onwards.
clip_keys = ['moment', 'x', 'y', 'direction']
sep = train_df[train_df['month'] >= 9]
sep_congestion = sep.groupby(clip_keys)['congestion']
quantiles = pd.DataFrame({
    'lower': sep_congestion.quantile(0.15),
    'upper': sep_congestion.quantile(0.7),
})

# Match the bounds to the test rows by key rather than by position, so the
# result does not depend on the row order of test_df or on every
# (moment, x, y, direction) group being present in `sep`.
bounds = test_df[clip_keys].join(quantiles, on=clip_keys)
# Rows without a matching group get infinite bounds, i.e. are left unclipped.
lower = bounds['lower'].fillna(-np.inf)
upper = bounds['upper'].fillna(np.inf)

test_df['pred'] = test_df['pred'].clip(lower, upper)

#### For roadways with more than two frequently repeated congestion values (each seen over 200 times in the training data), replace each prediction with the nearest of those values

In [None]:
# Snap predictions to congestion values that occur often on the same road.
for road_id in set(test_df['road']):
    # How often each congestion value occurs on this road in the training rows.
    value_counts = train_df.loc[train_df['road'] == road_id, 'congestion'].value_counts()
    frequent = value_counts[value_counts > 200]

    # Only act when the road has more than two such frequent values.
    if len(frequent) <= 2:
        continue

    frequent_values = list(frequent.index)
    on_road = test_df['road'] == road_id
    # Replace each prediction on this road by the closest frequent value.
    test_df.loc[on_road, 'pred'] = test_df.loc[on_road, 'pred'].map(
        lambda pred: min(frequent_values, key=lambda value: abs(value - pred))
    )

In [None]:
# Fill the sample submission with the final predictions (aligned on the row
# index) and write it out without the index column.
sample_path = '../input/tabular-playground-series-mar-2022/sample_submission.csv'
submission = pd.read_csv(sample_path).assign(congestion=test_df['pred'])
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame that was written to 'submission.csv'.
submission

# Reference Notebook

[TPS_2022_03_LGBM][1]

[1]:https://www.kaggle.com/code/kotrying?kernelSessionId=91336591