## Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import sys
import warnings
import zipfile

import numpy as np # linear algebra
import optuna
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Exploring the dataset

In [None]:
# Folder that holds the competition files (train, test and sample submission)
data_dir = os.path.join(
    '/kaggle/input',
    'tabular-playground-series-mar-2022',
)

In [None]:
# Locate the training and testing CSV files inside data_dir
train_path, test_path = (
    os.path.join(data_dir, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Load both files into dataframes (read with the latin1 encoding)
df_train, df_test = (
    pd.read_csv(csv_path, encoding='latin1') for csv_path in (train_path, test_path)
)

In [None]:
# Preview the first rows of the training data
df_train.head()

In [None]:
# Dimensions of the training data as (rows, columns)
df_train.shape

In [None]:
# Preview the first rows of the testing data
df_test.head()

In [None]:
# Dimensions of the testing data as (rows, columns)
df_test.shape

In [None]:
# Report whether any cell of the training data is null

if df_train.isnull().values.any():
    print("There are null values in Training data")
else:
    print('There are no null values in Training data')

In [None]:
# Report whether any cell of the testing data is null

if df_test.isnull().values.any():
    print("There are null values in Testing data")
else:
    print('There are no null values in Testing data')

In [None]:
# Summary of df_train as loaded from the CSV (before any type conversion)

df_train.info()

In [None]:
# Summary of df_test as loaded from the CSV (before any type conversion)

df_test.info()

## Preparing the dataset for training

In [None]:
# Parse the 'time' column of both dataframes into datetime64[ns]

for frame in (df_train, df_test):
    frame['time'] = pd.to_datetime(frame['time'])

In [None]:
# Summary of df_train again, to confirm 'time' was converted to datetime64[ns]

df_train.info()

In [None]:
# Summary of df_test again, to confirm 'time' was converted to datetime64[ns]

df_test.info()

In [None]:
# Splitting 'time' column into individual columns of 'dayofweek', 'hour' and 'minute'
# And dropping 'time' column

def _split_time_column(frame):
    """Replace the datetime 'time' column of `frame` by 'dayofweek', 'hour' and 'minute'.

    The dataframe is modified in place. The three new columns are appended in
    that order and 'time' is removed afterwards.

    If 'time' is no longer present the frame is left untouched, so re-running
    the cell does not raise a KeyError on the already-dropped column.
    """
    if 'time' not in frame.columns:
        return
    stamps = frame['time'].dt
    frame['dayofweek'] = stamps.dayofweek
    frame['hour'] = stamps.hour
    frame['minute'] = stamps.minute
    frame.drop(columns='time', inplace=True)


# Same transformation for the training and the testing data
for _frame in (df_train, df_test):
    _split_time_column(_frame)

In [None]:
# Preview df_train with 'dayofweek', 'hour' and 'minute' in place of 'time'
df_train.head()

In [None]:
# Preview df_test with 'dayofweek', 'hour' and 'minute' in place of 'time'
df_test.head()

In [None]:
# Target: the 'congestion' column of the training data
Y = df_train['congestion']
# Features: every remaining column except the 'row_id' identifier
X = df_train.drop(columns=['row_id', 'congestion'])

In [None]:
# Display the feature frame (df_train without 'row_id' and 'congestion')
X

In [None]:
# Display the target (the 'congestion' column of df_train)
Y

## Direction Encoding

In [None]:
# 'direction' holds string labels, which have to be turned into integers to train the model.
# Lookup table from each direction label to its integer code (position in the tuple below).
# NOTE: the name `dict` shadows the Python built-in; it is kept because the
# encoding cells further down look the table up under this name.
dict = {
    label: code
    for code, label in enumerate(('EB', 'NB', 'NE', 'NW', 'SB', 'SE', 'SW', 'WB'))
}

In [None]:
# Replace every 'direction' label in X by its integer code from the lookup table

X['direction'] = X['direction'].map(lambda label: dict[label])

In [None]:
# Display X after 'direction' has been replaced by integer codes
X

## Splitting the training data

In [None]:
# Hold out 25% of the labelled training data for evaluation (we have enough data to do so).
# X_test / Y_test are this hold-out part, not the competition test set in df_test.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Give df_test the same columns as X: encode 'direction' and remove 'row_id'
df_test['direction'] = df_test['direction'].map(lambda label: dict[label])
df_test.drop(columns=['row_id'], inplace=True)

In [None]:
# Preview df_test after encoding 'direction' and dropping 'row_id'
df_test.head()

## Training the model with random parameters

In [None]:
# Baseline regressor with hand-picked parameters, fitted on the training split.
#
# Changes with respect to the first version of this cell:
# - 'reg:linear' is the deprecated name of the squared-error objective; the
#   current name 'reg:squarederror' is used instead.
# - `alpha` is XGBoost's alias for `reg_alpha`, so passing alpha=10 together
#   with reg_alpha=2 set the same L1 penalty twice with conflicting values.
#   Only the explicitly named reg_alpha=2 is kept.
# - `verbose` is not an XGBRegressor parameter; the log level is `verbosity`.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=1,
    colsample_bylevel=1,
    learning_rate=0.3,
    max_depth=15,
    min_child_weight=1,
    n_estimators=100,
    reg_alpha=2,
    verbosity=1,
)

xg_reg.fit(X_train, Y_train)

## Finding mean absolute error

In [None]:
# Mean absolute error of the baseline model on the hold-out split.
# The predictions are converted to integers before scoring; they are rounded to
# the nearest integer first, because a bare astype(int) truncates toward zero
# and therefore shifts every positive prediction downwards.
preds = xg_reg.predict(X_test)
preds = np.rint(preds).astype(int)
mae = np.abs(Y_test-preds).mean()
mae

## Finding the best parameters with Optuna

In [None]:
# Defining the objective

def objective(trial,data=X,target=Y):
    """Optuna objective: train an XGBoost regressor with sampled parameters and return its MAE.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    data : feature frame to split; defaults to X as it exists when this cell runs.
    target : target values to split; defaults to Y as it exists when this cell runs.

    Returns
    -------
    The mean absolute error of the fitted model on the 25% hold-out split.

    Notes
    -----
    The same hold-out split drives early stopping and produces the returned
    score, so the score is somewhat optimistic.
    """
    # Fixed random_state: every trial is scored on the same split
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25,random_state=0)
    param = {
        # 'tree_method':'gpu_hist',  # uncomment to train on the GPU and speed up the search
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        # 0.001 used to be listed twice (as 1e-3 and as 0.001); the duplicate choice is removed
        'learning_rate': trial.suggest_categorical('learning_rate', [1e-5,1e-4,1e-3,0.002,0.003,0.004,0.005,0.006,0.007,0.008,0.009,0.01,0.04,0.08,0.12,0.20, 0.24, 0.30, 0.38, 0.42, 0.48, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0]),
        'n_estimators': trial.suggest_int('n_estimators', 1, 300),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20,25,30,35,40]),
        'random_state': trial.suggest_categorical('random_state', [24,48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    # early_stopping_rounds is given to the estimator (XGBoost >= 1.6) because
    # passing it to fit() is deprecated. It is kept out of `param` so that it
    # is not a sampled value and does not appear in the trial's parameters.
    model = xgb.XGBRegressor(**param, early_stopping_rounds=100)

    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],verbose=False)

    preds = model.predict(test_x)

    return mean_absolute_error(test_y, preds)

In [None]:
# Search for the parameters that minimise the objective (150 trials).
# The sampler is seeded so that repeated runs propose the same sequence of
# trials; TPESampler is the sampler Optuna uses by default, so only the seed is new.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=150)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

## k-fold Cross Validation

In [None]:
# 5 fold cross validation: shuffled folds with a fixed seed
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)
xgb_preds = []

In [None]:
# Custom evaluation function (despite its name it reports the mean absolute error)

def gini_xgb(preds, dtrain):
    """Return [('mae', value)] for predictions `preds` against the labels of `dtrain`.

    `dtrain` only needs a get_label() method returning the true values;
    the value is the mean of the absolute differences between labels and preds.
    """
    truth = dtrain.get_label()
    abs_errors = np.abs(truth - preds)
    return [('mae', abs_errors.mean())]

In [None]:
# Train one model per cross-validation fold with the best parameters found by
# the study, collect out-of-fold predictions for the training rows and average
# the models' predictions on the test data.

train = np.array(X)
test = np.array(df_test)
target_train = df_train['congestion'].values

# Take the fold count from the splitter instead of repeating a literal 5, so
# the progress bar and the averaging stay correct if the fold setup changes.
n_folds = kf.get_n_splits()
# The best parameters do not change between folds: read them once
xgb_params = study.best_trial.params

test_preds = np.zeros((len(test)))
oof_preds = np.zeros((len(train)))
for train_index, val_index in tqdm(kf.split(X), total=n_folds):
    train_X, valid_X = train[train_index], train[val_index]
    train_y, valid_y = target_train[train_index], target_train[val_index]
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(train_X, train_y)
    # Each training row is predicted by the one model that did not see it
    oof_preds[val_index] = model.predict(valid_X)
    # Every fold contributes an equal share to the final test prediction
    test_preds += model.predict(test) / n_folds
# Out-of-fold mean absolute error over the whole training set
print(np.abs(oof_preds-target_train).mean())

## Submission

In [None]:
# Fill the sample submission with the averaged test predictions and save it.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order as test.csv - confirm.
sample_path = os.path.join(data_dir, 'sample_submission.csv')
df_sam = pd.read_csv(sample_path, encoding='latin1').assign(congestion=test_preds)
df_sam.to_csv('submission-2.csv', index=False)