# introduction

I would like to share the code that achieved a score of 4.941.  
Congestion is predicted from three date/time features plus the road name.  
I use LightGBM, with Optuna to search the hyperparameters.

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation

import optuna.integration.lightgbm as lgb_o
import optuna

import warnings
# Silence UserWarning messages and limit Optuna's logger to WARNING and above
# so the notebook output stays readable.
warnings.simplefilter('ignore', UserWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# import data

In [None]:
# Load the competition files and tag every row with its origin:
# train == 1 for training rows, train == 0 for test rows.
train_o = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv').assign(train=1)
test_o = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv').assign(train=0)

# Stack train on top of test under a fresh 0..n-1 index so that features
# can be engineered once for both.
alldata = pd.concat([train_o, test_o], ignore_index=True)

# create features

In [None]:
# Date & time features, computed with the vectorised ``.dt`` accessor instead
# of a Python-level lambda per row.
alldata['time'] = pd.to_datetime(alldata['time'])
timestamps = alldata['time'].dt
alldata['dow'] = timestamps.dayofweek  # dow: day of week (Monday == 0)
alldata['doy'] = timestamps.dayofyear  # doy: day of year
alldata['hm'] = timestamps.hour * 60 + timestamps.minute  # hm: minutes since midnight
# Calendar date as a midnight timestamp (same result as date() + to_datetime).
alldata['date'] = timestamps.normalize()

In [None]:
# road: identifier of a roadway, built by gluing direction, x and y together
# without a separator (e.g. direction 'EB', x 0, y 1 -> 'EB01').
x_text = alldata['x'].astype(str)
y_text = alldata['y'].astype(str)
alldata['road'] = alldata['direction'] + x_text + y_text
alldata.columns

# define category features

In [None]:
# One-hot encode the road identifier (columns are named 'rd_<road>').
# The dtype is pinned to uint8: pandas >= 2.0 defaults to bool dummies, and
# bool columns mixed with the integer date features turn ``.values`` into an
# object array instead of a numeric matrix.
rd_dummy = pd.get_dummies(alldata['road'], prefix='rd', dtype='uint8')
dummies = rd_dummy
categorical_features = dummies.columns

# prepare data for training/validation/test

In [None]:
# Assemble the modelling table: target first, then the date/time features,
# then the one-hot road columns.
data_cong = pd.concat([alldata[['congestion', 'doy', 'dow', 'hm']], dummies], axis=1)

# The validation window (both ends inclusive).
valid_s = pd.to_datetime('1991-09-23 12:00')
valid_f = pd.to_datetime('1991-09-23 23:40')

# Row labels of the three splits: training rows strictly before the window,
# training rows inside the window, and all test rows.
is_train = alldata['train'] == 1
train_index = alldata.index[is_train & (alldata['time'] < valid_s)]
valid_index = alldata.index[is_train & alldata['time'].between(valid_s, valid_f)]
test_index = alldata.index[alldata['train'] == 0]

# Everything from 'doy' onwards is a feature; 'congestion' is the target.
features = data_cong.loc[:, 'doy':]
target = data_cong['congestion']

X_train = features.loc[train_index].to_numpy()
y_train = target.loc[train_index].astype(int).to_numpy()
X_valid = features.loc[valid_index].to_numpy()
y_valid = target.loc[valid_index].astype(int).to_numpy()
X_test = features.loc[test_index].to_numpy()

# Column names matching the columns of the X_* matrices.
feature_names = features.columns.to_list()

print('categorical features')
print(categorical_features)
print('-'*30)
print('feature names')
print(feature_names)

# search best hyper parameters

In [None]:
# Wrap the numpy matrices in LightGBM datasets: dtrain holds the training
# split, dvalid the held-out validation window.
dtrain = lgb_o.Dataset(data=X_train, label=y_train)
dvalid = lgb_o.Dataset(data=X_valid, label=y_valid)

In [None]:
%%time

# Optuna study that minimises the cross-validated RMSE.
study = optuna.create_study(direction='minimize')
# Three consecutive (unshuffled) folds over the training rows.
kf = KFold(n_splits=3)

# Fixed LightGBM settings; the tuner searches the remaining hyper parameters.
params = dict(
    objective='regression',
    metric='rmse',
    verbosity=-1,
    boosting_type='gbdt',
    random_seed=0,
    learning_rate=0.1,
)

tuner_options = dict(
    study=study,
    folds=kf,
    num_boost_round=1000,
    verbose_eval=False,
    early_stopping_rounds=250,
    optuna_seed=0,
    show_progress_bar=True,
    # Budget in seconds (19800 s = 5.5 hours); the author does not expect
    # the search to actually reach it.
    time_budget=19800,
    return_cvbooster=True,
)

tuner = lgb_o.LightGBMTunerCV(params, dtrain, **tuner_options)
tuner.run()

In [None]:
def display_tuning_results(model):
    """Print the best parameters and best score found by a tuner.

    ``model`` is any object exposing ``best_params`` and ``best_score``
    attributes; nothing is returned.
    """
    separator = '-' * 30
    print(
        'best parameters',
        model.best_params,
        separator,
        f'best score: {model.best_score}',
        sep='\n',
    )

# Show the best parameter set and CV score found by the Optuna tuner.
display_tuning_results(tuner)

# training with the searched hyper parameters

In [None]:
# Train the final model with the tuned hyper parameters.
# The search ran up to 1000 boosting rounds with early stopping (patience 250),
# whereas lgb.train defaults to only 100 rounds. Use the same budget here and
# let the held-out validation window decide where to stop, so the final model
# is fitted under the conditions the parameters were tuned for.
tunedmodel = lgb.train(
    tuner.best_params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dvalid],
    callbacks=[
        early_stopping(stopping_rounds=250, verbose=False),
        log_evaluation(period=0),  # period 0 disables per-iteration logging
    ],
)

# Predict congestion for the train/validation splits, rounded to whole
# numbers like the integer target.
pred_train = np.round(tunedmodel.predict(X_train))
pred_valid = np.round(tunedmodel.predict(X_valid))

In [None]:
# Score the rounded predictions on the train and validation splits.
r2_train = r2_score(y_train, pred_train)
r2_valid = r2_score(y_valid, pred_valid)

# RMSE is the square root of the mean squared error.
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
rmse_valid = np.sqrt(mean_squared_error(y_valid, pred_valid))

# MAE is already in target units, so no square root is taken
# (taking one would report sqrt(MAE) and distort the ratio below).
mae_train = mean_absolute_error(y_train, pred_train)
mae_valid = mean_absolute_error(y_valid, pred_valid)

# RMSE/MAE ratio: grows when a few large errors dominate.
rmse_mae_train = rmse_train / mae_train
rmse_mae_valid = rmse_valid / mae_valid

print('R2')
print(f'train: {r2_train}')
print(f'valid: {r2_valid}')
print('-'*30)
print('RMSE - Root Mean Squared Error')
print(f'train: {rmse_train}')
print(f'valid: {rmse_valid}')
print('-'*30)
print('MAE - Mean Absolute Error')
print(f'train: {mae_train}')
print(f'valid: {mae_valid}')
print('-'*30)
print('RMSE/MAE')
print(f'train: {rmse_mae_train}')
print(f'valid: {rmse_mae_valid}')

# prediction of test data

In [None]:
# Predict congestion for the test rows and round to whole numbers.
pred_test = np.round(tunedmodel.predict(X_test))

# create submission file

In [None]:
# Training rows dated after 1 September 1991 are the reference for clipping.
# The copy is taken after the filter so the column assignments below act on
# an independent frame.
sep = alldata.query('train==1 & date>"1991-09-01"').copy()
sep['hour'] = sep['time'].dt.hour
sep['minute'] = sep['time'].dt.minute

# Keep weekday afternoons only, excluding Labor Day (2 September 1991).
sep = sep.query('hour>=12 & dow<5 & date!="1991-09-02"')

# 15% / 70% congestion quantiles per (time slot, roadway) group.
by_slot_and_road = sep.groupby(['hour', 'minute', 'x', 'y', 'direction']).congestion
lower = by_slot_and_road.quantile(0.15).values
upper = by_slot_and_road.quantile(0.7).values

# Write the model predictions into the test rows.
test_rows = alldata.query('train==0').index
alldata.loc[test_rows, 'congestion'] = pred_test

# Clip every prediction into the [lower, upper] band.
# NOTE(review): lower/upper are plain arrays in groupby-sorted key order, so
# the clip is positional. This presumably relies on the test rows being
# ordered by time, x, y, direction with exactly one row per group -- confirm.
sub_data = alldata.loc[test_rows, ['row_id', 'congestion']].rename(
    columns={'row_id': 'row_Id', 'congestion': 'congestion_before_clipping'}
)
# Read from sub_data: the original referenced `submitted_file` here, which is
# only defined further down and raised a NameError.
sub_data['congestion'] = sub_data['congestion_before_clipping'].clip(lower, upper)

# Write the submission file.
submitted_file = sub_data.loc[:, ['row_Id', 'congestion']].astype(int)
submitted_file.to_csv('submition.csv', index=False)