In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in traversal order.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import Libraries

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import KFold

### Load Data

In [None]:
# Read the train and test CSV files, parsing the 'time' column as datetimes
# so that the .dt accessor can be used on it afterwards.
train, test = (
    pd.read_csv(csv_path, parse_dates=['time'])
    for csv_path in (
        '../input/tabular-playground-series-mar-2022/train.csv',
        '../input/tabular-playground-series-mar-2022/test.csv',
    )
)

### Data Preprocessing

In [None]:
# Split the 'time' timestamp of the training frame into one column per
# datetime component (columns are added in the order listed here).
for time_part in ('year', 'month', 'day', 'hour', 'minute', 'weekday'):
    train[time_part] = getattr(train['time'].dt, time_part)

In [None]:
# Split the 'time' timestamp of the test frame into one column per
# datetime component (columns are added in the order listed here).
for time_part in ('year', 'month', 'day', 'hour', 'minute', 'weekday'):
    test[time_part] = getattr(test['time'].dt, time_part)


In [None]:
# Remove the row identifier and the raw timestamp from both frames.
unused_columns = ['row_id', 'time']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [None]:
# Learn the direction -> integer mapping from the training frame only, then
# apply that same mapping to both frames.
enc = LabelEncoder().fit(train['direction'])
train['direction'] = enc.transform(train['direction'])
test['direction'] = enc.transform(test['direction'])

In [None]:
# Target column and the remaining predictor columns of the training frame.
y_data = train['congestion']
x_data = train.drop(columns='congestion')

# The test frame is used as-is; x_test is an alias of it, not a copy.
x_test = test

### Model Training

In [None]:
def objective(trial):
    """Optuna objective: fit one XGBRegressor and return its hold-out MAE.

    Hyperparameters are sampled from ``trial``; ``booster``, ``random_state``
    and ``enable_categorical`` are fixed. The module-level ``x_data`` /
    ``y_data`` are split 70/30 with a fixed seed, so every trial is scored
    on the same validation rows.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        Mean absolute error of the fitted model on the validation split
        (lower is better).
    """
    param_grid = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
        # suggest_float(step=...) replaces the deprecated suggest_discrete_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'subsample': trial.suggest_categorical('subsample', [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'booster': 'gbtree',
        # suggest_float without a step replaces the deprecated suggest_uniform.
        'gamma': trial.suggest_float('gamma', 1.0, 10.0),
        'reg_alpha': trial.suggest_int('reg_alpha', 50, 100),
        'reg_lambda': trial.suggest_int('reg_lambda', 50, 100),
        'random_state': 42,
        'enable_categorical': True,
    }

    # Fixed random_state: identical train/validation rows for every trial.
    x_train_, x_val, y_train_, y_val = train_test_split(x_data, y_data, test_size=0.3, random_state=50)

    # NOTE(review): 'gpu_hist' / 'gpu_predictor' require a GPU runtime; confirm
    # these options are still accepted by the installed XGBoost version.
    xgb_model = XGBRegressor(**param_grid, tree_method='gpu_hist', predictor='gpu_predictor')

    xgb_model.fit(x_train_, y_train_, verbose=False)
    y_pred = xgb_model.predict(x_val)
    return mean_absolute_error(y_val, y_pred)

In [None]:
# Time budget for the hyperparameter search, in seconds (30 minutes).
train_time = 30 * 60

# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# The search stops on a wall-clock timeout, so the number of completed trials
# can still differ between runs.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42), study_name='XGBRegressor')
study.optimize(objective, timeout=train_time)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))

In [None]:
# trial.params only holds the values Optuna sampled, so the settings that the
# objective kept fixed are restored here; otherwise the final models would
# differ from the configuration that was scored during tuning. Building a new
# dict also avoids mutating the trial's own params.
xgb_params = {
    **trial.params,
    'booster': 'gbtree',
    'random_state': 42,
    'enable_categorical': True,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
}


In [None]:
# Convert features and target to plain NumPy arrays so they can be indexed
# with the positional fold indices. np.asarray leaves an existing array
# untouched, so re-running this cell is safe (.values would raise
# AttributeError on the second run).
x_data = np.asarray(x_data)
y_data = np.asarray(y_data)

### Best params training and prediction

In [None]:
n_split = 10
kfold = KFold(n_split)

# Out-of-fold predictions for the training rows, and one row of test
# predictions per fold.
val_pred = np.zeros(y_data.shape)
y_test = np.zeros((n_split, x_test.shape[0]))

# The fold models are fitted on bare arrays, so predict on a bare array as
# well instead of a DataFrame carrying column names. Converted once, outside
# the loop.
x_test_values = np.asarray(x_test)

for i, (train_index, val_index) in enumerate(kfold.split(x_data)):
    # train model
    print("fold {} training".format(i))
    model = XGBRegressor(**xgb_params)
    model.fit(x_data[train_index], y_data[train_index])

    # predict val and test
    val_pred[val_index] = model.predict(x_data[val_index])
    val_score = mean_absolute_error(y_data[val_index], val_pred[val_index])
    print("fold {} validation mae score {}".format(i, val_score))

    y_test[i] = model.predict(x_test_values)

# Every training row has been predicted exactly once by a model that did not
# see it, so this is the overall out-of-fold error.
print("overall out-of-fold mae score {}".format(mean_absolute_error(y_data, val_pred)))

In [None]:
# Collapse the per-fold test predictions into one prediction per test row.
y_test = np.mean(y_test, axis=0)

In [None]:
# Round the averaged predictions to the nearest integer and store them as ints.
y_test = np.round(y_test).astype(int)

In [None]:
# Load the sample submission, overwrite its 'congestion' column with the
# predictions, and write the result without the index column.
submission = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')
submission['congestion'] = y_test
submission.to_csv('submission.csv', index=False)

Thanks for reading!