In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# Importing the needed lib
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing The Libraries

In [None]:
# Import darts libraries
!pip install pycaret[full]
from pycaret.regression import *
import warnings
warnings.filterwarnings("ignore")


# Loading the train and test data into dataframes

In [None]:
# few modifications to https://www.kaggle.com/code/kotrying/tps-2022-03-lgbm
train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv', index_col='row_id')
test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv', index_col='row_id')
train['time'] = pd.to_datetime(train['time'])

# Build one row filter for the training data:
#   * not on one of the three listed official holidays,
#   * weekday 0-3 (Monday to Thursday),
#   * month number greater than 4.
timestamps = train['time']
on_holiday = timestamps.dt.date.astype(str).str.contains('1991-05-27|1991-07-04|1991-09-02')
mon_to_thu = timestamps.dt.weekday < 4
after_april = timestamps.dt.month > 4
train = train[~on_holiday & mon_to_thu & after_april]

In [None]:
def pre_process(df):
    """Add calendar and roadway features to ``df`` in place; returns None.

    Reads the columns ``time``, ``x``, ``y`` and ``direction``
    (``direction`` is concatenated as-is, so it is assumed to hold strings).

    Columns written:
      month, day, wkday -- month number, day of year and weekday of ``time``
      am                -- True when 6 < hour < 12
      time              -- overwritten with (hour - 12) * 3 + minute / 20,
                           i.e. the 20-minute slot offset from 12:00
      xydir             -- x, y and direction joined into one string key
      xydirday          -- ``xydir`` followed by the day of year
      all               -- ``xydir`` followed by the slot offset
    """
    stamp = pd.to_datetime(df['time'])
    hour = stamp.dt.hour

    df['month'] = stamp.dt.month
    df['day'] = stamp.dt.dayofyear
    df['am'] = (hour < 12) & (hour > 6)
    df['wkday'] = stamp.dt.weekday
    df['time'] = (hour - 12) * 3 + stamp.dt.minute / 20

    # Same assignment order as before so the resulting column order is unchanged.
    roadway = df['x'].astype(str) + df['y'].astype(str) + df['direction']
    df['xydirday'] = roadway + df['day'].astype(str)
    df['xydir'] = roadway
    df['all'] = roadway + df['time'].astype(str)
    
# Add the engineered columns to both frames (pre_process mutates its argument).
for frame in (train, test):
    pre_process(frame)

# Median congestion for each value of the 'all' key (roadway + time-of-day slot),
# as a plain dict {key: median}.
mapper_avg = train.groupby('all')['congestion'].median().to_dict()

In [None]:
# Attach the per-key median congestion from mapper_avg as the 'avg' feature.
for frame in (train, test):
    frame['avg'] = frame['all'].map(mapper_avg)

# Keep only training rows whose slot offset is non-negative
# (time was rewritten by pre_process as an offset from 12:00).
train = train.loc[train['time'] >= 0]

display(train.head(2), test.head(2))

# Training and forecasting

In [None]:
# Configure the PyCaret regression experiment on the prepared training frame.
reg = setup(
    # data and target
    data=train,
    target='congestion',
    ignore_features=['all', 'day', 'xydirday'],
    # preprocessing options
    data_split_shuffle=True,
    create_clusters=False,
    pca=True,
    pca_components=10,
    # cross-validation: 10 folds, grouped by the 'xydir' column
    fold_strategy='groupkfold',
    fold_groups='xydir',
    fold=10,
    # run-time settings
    session_id=999,
    use_gpu=False,
    silent=True,
    n_jobs=-1,
)

In [None]:
# Compare the candidate regressors (minus the excluded ids), rank them by MAE
# and keep the three best.
top3 = compare_models(
    exclude=['lar', 'rf', 'et', 'gbr', 'xgboost'],
    n_select=3,
    sort='MAE',
)

In [None]:
# Combine the three selected models into a single blended estimator.
blender = blend_models(estimator_list=top3)

In [None]:
# Finalize the blended estimator; the result is the model used for prediction below.
final = finalize_model(estimator=blender)

In [None]:
# Predict with the finalized model and round to whole congestion values.
test['pred'] = predict_model(final, data=test)['Label'].round()

# Idea from https://www.kaggle.com/code/ambrosm/tpsmar22-generalizing-the-special-values
# Clip every prediction to the 15%-70% congestion quantile band observed for the
# same (time, x, y, direction) slot in training rows with day >= 246.
band_keys = ['time', 'x', 'y', 'direction']
sep = train[(train.day >= 246) & (train.time >= 0)]
sep_congestion = sep.groupby(band_keys).congestion
bands = pd.DataFrame({
    'lower': sep_congestion.quantile(0.15),
    'upper': sep_congestion.quantile(0.7),
})
# Look the bounds up by key instead of relying on position: passing the raw
# `.values` arrays to clip() is only correct when `test` has exactly one row per
# group, sorted the same way as the groupby result.
test_bands = test[band_keys].join(bands, on=band_keys)
# A slot with no matching training rows gets an unbounded side, i.e. no clipping.
lower = test_bands['lower'].fillna(float('-inf'))
upper = test_bands['upper'].fillna(float('inf'))
test['pred'] = test['pred'].clip(lower=lower, upper=upper)

# For roadways that have more than two congestion values each occurring more
# than 200 times in training, snap every prediction to the nearest of those
# frequent values. Both thresholds are experimental.
for xydir in set(test.xydir):
    xydir_counts = train.loc[train.xydir == xydir, 'congestion'].value_counts()
    frequent_values = list(xydir_counts[xydir_counts > 200].index)
    if len(frequent_values) > 2:
        on_roadway = test.xydir == xydir
        test.loc[on_roadway, 'pred'] = test.loc[on_roadway, 'pred'].map(
            lambda pred: min(frequent_values, key=lambda value: abs(value - pred))
        )

# From here on, the 'congestion' column of `test` holds the post-processed predictions.
test = test.rename(columns={'pred': 'congestion'})

# QC of the data and model 

In [None]:
# Install deepchecks and import the pieces used for the QC run:
# `full_suite` (the suite factory) and `Dataset` (the data wrapper).
# NOTE(review): the install is unpinned; confirm the installed release still
# exposes `deepchecks.suites` and `deepchecks.base`, and pin it if so.
!pip install deepchecks --user
from deepchecks.suites import full_suite
from deepchecks.base import Dataset

In [None]:
# Now the magic
train_dataset = Dataset(train, label = 'congestion', cat_features = ['xydirday','xydir','all'])
test_dataset = Dataset(test, label = 'congestion', cat_features = ['xydirday','xydir','all'])
suite = full_suite()
suite.run(train_dataset = train_dataset, test_dataset = test_dataset, model = final)

# Postprocessing for Submission

In [None]:
# Build the two-column submission frame (row_id taken from the index of `test`),
# write it to disk and preview the first rows.
sub = pd.DataFrame({
    'row_id': test.index,
    'congestion': test['congestion'].values,
})
sub.to_csv('submission.csv', index=False)
sub.head()