# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load Data

In [None]:
# Load the raw training CSV into `df`; the exploration cells that follow inspect it.
df = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the raw training frame.
df.info()

In [None]:
# For every column, print a separator, the column name and its distinct values.
for column in df.columns:
    print('-----------------------', column, df[column].unique(), sep='\n')

In [None]:
# Show summary statistics of the raw training frame as the cell output.
df.describe()

In [None]:
def feat_eng(test = False, data_dir = '../input/tabular-playground-series-mar-2022'):
    """Load one split of the competition data and turn it into model features.

    The ``time`` column is expanded into ``year``/``month``/``day``/``hour``/
    ``minute`` columns, and the compass ``direction`` is encoded as two signed
    components: ``direction0`` is the east(+1)/west(-1) part and ``direction1``
    the north(+1)/south(-1) part. The raw ``row_id``, ``time`` and
    ``direction`` columns are dropped afterwards.

    Args:
        test: If true, read ``test.csv`` and return the features only;
            otherwise read ``train.csv`` and also return the target.
        data_dir: Directory that contains ``train.csv`` and ``test.csv``.
            The default is the Kaggle input folder, so existing calls that
            pass only ``test`` behave as before.

    Returns:
        The feature DataFrame when ``test`` is true, otherwise a
        ``(features, target)`` tuple where ``target`` is the ``congestion``
        Series and ``features`` no longer contains that column.

    Raises:
        KeyError: If ``direction`` contains a value that has no encoding.
    """
    file_name = 'test.csv' if test else 'train.csv'
    df = pd.read_csv(f'{data_dir}/{file_name}')

    # Calendar parts of the timestamp; the raw column itself is dropped below.
    timestamp = pd.to_datetime(df['time'])
    for part in ('year', 'month', 'day', 'hour', 'minute'):
        df[part] = getattr(timestamp.dt, part)

    # (east/west, north/south) components of each compass direction.
    dir_map = {
        'EB': (1, 0),
        'NB': (0, 1),
        'SB': (0, -1),
        'WB': (-1, 0),
        'NE': (1, 1),
        'SW': (-1, -1),
        'NW': (-1, 1),
        'SE': (1, -1),
    }

    # A dict-based map would silently turn unmapped values into NaN, so fail
    # loudly instead, keeping the KeyError callers would have seen before.
    unknown = set(df['direction'].unique()) - set(dir_map)
    if unknown:
        raise KeyError(f'no encoding for direction value(s): {sorted(map(str, unknown))}')

    # Mapping through a dict avoids calling a Python lambda once per row.
    df['direction0'] = df['direction'].map({name: vec[0] for name, vec in dir_map.items()})
    df['direction1'] = df['direction'].map({name: vec[1] for name, vec in dir_map.items()})

    df = df.drop(columns=['row_id', 'time', 'direction'])
    if test:
        return df

    # pop() removes the target from the features and returns it in one step.
    y = df.pop('congestion')
    return df, y

In [None]:
# Build the model inputs: test features, plus training features with their target.
test_x = feat_eng(test=True)
train_x, train_y = feat_eng()

In [None]:
# Display the engineered test features as the cell output.
test_x

In [None]:
def SMAPE(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computed as ``200 * mean(|y_true - y_pred| / (|y_true| + |y_pred|))``,
    so the result lies in ``[0, 200]`` for finite inputs. A pair where both
    the actual and the predicted value are zero contributes 0 rather than
    NaN from a 0/0 division.

    Args:
        y_true: Actual values (scalar or array-like).
        y_pred: Predicted values, broadcastable against ``y_true``. Inputs
            are compared by position, not by pandas index.

    Returns:
        The score as a single ``float`` (NaN for empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Use |y_true| so negative actuals cannot shrink or zero the denominator.
    scale = np.abs(y_true) + np.abs(y_pred)
    # scale == 0 only when both values are 0, where the error is 0 as well;
    # dividing by 1 there yields the intended 0 without a runtime warning.
    ratio = abs_error / np.where(scale == 0, 1.0, scale)
    return float(np.mean(ratio) * 200)

In [None]:
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import KFold,TimeSeriesSplit
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Train Model

In [None]:
# 5-fold cross-validation; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits = 5, shuffle = True, random_state = 70)

# The LightGBM settings do not depend on the fold, so they are built once
# instead of being re-created on every iteration.
lgb_param = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'bagging_fraction': 0.9,
    'bagging_freq': 20,
    'colsample_bytree': 0.9,
    'metric': 'rmse',
    'min_child_weight': 0.01,
    'zero_as_missing': True,
    'objective': 'regression',
    'device': 'gpu',
}

# Train one booster per fold and report its validation SMAPE.
# NOTE: `model` is rebound on every iteration, so once the loop finishes it
# refers to the booster trained on the last fold only.
for fold, (tr_idx, va_idx) in enumerate(kf.split(train_x), start = 1):
    print(f'--------fold:{fold}--------')
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # NOTE(review): `silent`, `early_stopping_rounds` and `verbose_eval` are
    # deprecated in newer LightGBM releases — confirm against the installed version.
    train_set1 = lgb.Dataset(tr_x, tr_y, silent=False, params={'verbose': -1})
    valid_set1 = lgb.Dataset(va_x, va_y, silent=False, params={'verbose': -1})
    # Stop early once the validation RMSE has not improved for 100 rounds.
    model = lgb.train(params = lgb_param, train_set = train_set1 , num_boost_round=5000, early_stopping_rounds=100,verbose_eval=500, valid_sets=valid_set1)

    val_pred = model.predict(va_x)
    # The target reaches this loop untransformed (no log is applied before
    # training), so the metric is computed on the values as they are; passing
    # them through np.exp would score numbers unrelated to the real congestion.
    # np.mean collapses a per-row result into one score per fold and leaves an
    # already-scalar score unchanged.
    print(f' SMAPE: {np.mean(SMAPE(va_y, val_pred))}')

# Submission

In [None]:
# Predict on freshly engineered test features and write the submission file.
# `model` is whatever the training loop bound last, i.e. the final fold's booster.
x = feat_eng(test = True)
pred = model.predict(x)
# Start from the sample submission and overwrite its target column with the predictions.
submission = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv').assign(congestion = pred)
submission.to_csv('./submission.csv', index = False)