In [None]:
import numpy as np
import pandas as pd
import warnings
import gc
import holidays

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBRegressor
from xgboost import plot_importance

warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
def evaluate_model(model, x, y):
    """Score ``model`` on ``(x, y)`` with mean absolute error.

    Args:
        model: Object exposing ``predict(x)``.
        x: Features passed to ``model.predict``.
        y: True target values compared against the predictions.

    Returns:
        dict: ``{'mae': <mean absolute error>}``.
    """
    predictions = model.predict(x)
    return {'mae': mean_absolute_error(y, predictions)}

In [None]:
def preprocess_dates(df, date_column='time'):
    """Return a copy of ``df`` with calendar features derived from ``date_column``.

    The date column is converted with ``pd.to_datetime`` in the copy; the
    input frame is left untouched.

    Added columns (in this order): ``weekday``, ``quarter``, ``day_of_year``,
    ``is_month_start`` (int8), ``is_month_end`` (int8), ``month``.
    """
    out = df.copy()
    out[date_column] = pd.to_datetime(out[date_column])
    stamps = out[date_column].dt

    # Dict order fixes the order in which the columns are appended.
    calendar_features = {
        'weekday': stamps.weekday,
        'quarter': stamps.quarter,
        'day_of_year': stamps.day_of_year,
        'is_month_start': stamps.is_month_start.astype("int8"),
        'is_month_end': stamps.is_month_end.astype("int8"),
        'month': stamps.month,
    }
    for feature_name, values in calendar_features.items():
        out[feature_name] = values
    return out

In [None]:
def preprocess_holidays(df, date_column='time', years=None):
    """Return a copy of ``df`` with an ``is_holiday`` flag for US holidays.

    Args:
        df: Input frame; it is not modified.
        date_column: Name of the column holding the timestamps. Values are
            parsed with ``pd.to_datetime``, so strings are accepted too.
        years: Iterable of years to build the holiday calendar for. When
            ``None`` (default) the years present in ``date_column`` are used.

    Returns:
        A copy of ``df`` with an integer ``is_holiday`` column: 1 when the
        calendar day of the timestamp is a US holiday, otherwise 0.
    """
    df = df.copy()
    timestamps = pd.to_datetime(df[date_column])

    if years is None:
        # Cover every year that occurs in the data instead of one fixed year.
        years = sorted({int(year) for year in timestamps.dt.year.dropna().unique()})

    holiday_us = holidays.CountryHoliday(country='US', years=years)
    holiday_days = pd.to_datetime(list(holiday_us.keys()))

    # Compare calendar days, not full timestamps: a membership test against
    # the midnight holiday stamps would only match rows stamped exactly
    # 00:00:00 and miss every other row of the holiday.
    df['is_holiday'] = timestamps.dt.normalize().isin(holiday_days).astype(int)
    return df

In [None]:
def preprocess_timeseries(df, day_period=365.25, month_period=12.0):
    """Return a copy of ``df`` with sine encodings of the seasonal columns.

    The raw calendar values are first mapped onto one full turn
    (``2 * pi * value / period``) so that the sine completes exactly one
    cycle per period. Taking ``sin`` of the raw integers (i.e. treating days
    or months as radians) gives a period of ``2 * pi`` units instead, which
    does not line up with the calendar.

    Args:
        df: Frame containing ``day_of_year`` and ``month`` columns; it is not
            modified.
        day_period: Length of the yearly cycle in days.
        month_period: Length of the yearly cycle in months.

    Returns:
        A copy of ``df`` with ``sin_day_of_year`` and ``sin_month`` added.
    """
    df = df.copy()
    full_turn = 2.0 * np.pi
    df['sin_day_of_year'] = np.sin(full_turn * df['day_of_year'] / day_period)
    df['sin_month'] = np.sin(full_turn * df['month'] / month_period)
    return df

In [None]:
# Random seed shared by the train/test splits and the XGBoost models.
seed = 47

# Reading the dataset

In [None]:
# Load the training data from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/train.csv", sep=',')

In [None]:
# Integer-encode the direction column; the fitted encoder is reused for the
# test set later in the notebook.
le = LabelEncoder()
train_df['direction'] = le.fit_transform(train_df['direction'])

# Feature pipeline: calendar fields -> holiday flag -> sine features.
train_df = (
    train_df
    .pipe(preprocess_dates)
    .pipe(preprocess_holidays)
    .pipe(preprocess_timeseries)
)

# Separate the predictors from the target.
x_train = train_df.drop(columns=['row_id', 'time', 'congestion'])
y_train = train_df['congestion']

# Unshuffled split: the last 20% of the rows become the evaluation set.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    shuffle=False,
    random_state=seed,
)

# XGBoost Baseline

In [None]:
def get_xgb_baseline(params=None):
    """Build an ``XGBRegressor`` with the notebook's default settings.

    Args:
        params: Optional dict of extra or overriding ``XGBRegressor`` keyword
            arguments. ``None`` (default) or an empty dict yields the plain
            baseline. Keys that collide with a default below override it
            instead of raising a duplicate-keyword ``TypeError``.

    Returns:
        An unfitted ``XGBRegressor``.
    """
    settings = {
        'random_state': seed,
        # NOTE(review): 'gpu_hist' / 'gpu_predictor' are the pre-2.0 XGBoost
        # spellings for GPU training; confirm against the installed version.
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        # 'reg:squarederror' is the current name of the deprecated
        # 'reg:linear' objective (same squared-error loss).
        'objective': 'reg:squarederror',
        'verbosity': 0,
    }
    if params:
        settings.update(params)
    return XGBRegressor(**settings)

In [None]:
# Fit the default-parameter model and print its MAE on the evaluation split.
model = get_xgb_baseline()
model.fit(x_train, y_train)
results = evaluate_model(model, x_test, y_test)
print(results)

In [None]:
# Hand-picked settings for a second reference model.
params = dict(
    n_estimators=200,
    max_depth=23,
    subsample=1.0,
    eta=0.3,
    colsample_bytree=1.0,
    gamma=0.0,
    min_child_weight=1,
    reg_alpha=1,
)

# Fit with those settings and print the MAE on the evaluation split.
model = get_xgb_baseline(params)
model.fit(x_train, y_train)
results = evaluate_model(model, x_test, y_test)
print(results)

# Feature Engineering

In [None]:
def geomean(x, axis):
    """Geometric mean of ``x`` along ``axis``, computed as exp(mean(log(x))).

    Zero or negative entries make the logarithm -inf / NaN, so the result is
    only meaningful for strictly positive inputs.
    """
    return np.exp(np.mean(np.log(x), axis=axis))


def harmonic_mean(x, axis):
    """Harmonic mean of ``x`` along ``axis``: count / sum(1 / x).

    ``count`` is the number of elements reduced along ``axis`` (all elements
    when ``axis`` is None). Using ``len(x)`` here would be the row count,
    which is the wrong numerator when reducing a 2-D input along ``axis=1``.
    Zero entries make ``1 / x`` infinite.
    """
    count = np.size(x) if axis is None else np.shape(x)[axis]
    return count / np.sum(1.0 / x, axis=axis)


# Row-wise aggregates tried as one extra feature each; 'none' is the control
# run without any added feature.
funcs = {
    'mean': np.mean,
    'std': np.std,
    'var': np.var,
    'geo_mean': geomean,
    'harmonic_mean': harmonic_mean,
    'median': np.median,
    'none': None,
}

In [None]:
# Parallel lists: names[i] is the aggregate tried, results[i] its MAE.
results, names = [], []

for key, aggregate in funcs.items():
    # Every trial starts again from the untouched base feature matrix.
    x_train = train_df.drop(columns=['row_id', 'time', 'congestion']).copy()
    y_train = train_df['congestion']
    if aggregate is not None:
        x_train[key] = aggregate(x_train, axis=1)

    # NOTE(review): this split uses the default shuffling, unlike the first
    # split of the notebook, which passes shuffle=False.
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.2, random_state=seed
    )

    model = get_xgb_baseline()
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    names.append(key)
    results.append(result['mae'])

for name, score in zip(names, results):
    print('>%s: %f' % (name, score))

# Lowest MAE wins.
index_best = np.argmin(results)
print('Best result is of:', names[index_best], 'with score:', results[index_best])

# Individual parameter Search

In [None]:
# Rebuild the feature matrix with the row-wise geometric mean appended.
# NOTE(review): geo_mean is hard-coded here, independent of which aggregate
# the comparison above reported as best.
x_train = (
    train_df
    .drop(columns=['row_id', 'time', 'congestion'])
    .assign(geo_mean=lambda frame: geomean(frame, axis=1))
)
y_train = train_df['congestion']
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed
)

In [None]:
# Start from an empty hyperparameter dict; the search cells below fill it in one key at a time.
params = {}

# 1 - Testing different numbers of estimators

In [None]:
# Candidate tree counts, each evaluated with a fresh model.
trees = [
    50, 100, 150, 200, 250, 300, 350, 400, 450, 500,
    600, 700, 800, 900, 950, 975, 1000, 1025, 1050, 1100, 1150,
    1290, 1295, 1300, 1305, 1310, 1315, 1325,
    2000, 3000, 4000, 5000, 10000,
]
results_trees = {}

for n_trees in trees:
    params['n_estimators'] = n_trees
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    mae = evaluate_model(model, x_test, y_test)['mae']
    results_trees[n_trees] = mae
    print('n_estimators:', n_trees, 'mae:', mae)

# Keep the tree count with the lowest MAE.
best_nestimator = min(results_trees, key=results_trees.get)
print('\nBest n_estimators:', best_nestimator, 'MAE score:', results_trees[best_nestimator])

# 2 - Testing different max_depth

In [None]:
# Lock in the best tree count, then scan tree depths 1..9.
params['n_estimators'] = best_nestimator
max_depths = list(range(1, 10))
results_max_depths = {}

for depth in max_depths:
    params['max_depth'] = depth
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    mae = evaluate_model(model, x_test, y_test)['mae']
    results_max_depths[depth] = mae
    print('max_depth:', depth, 'mae:', mae)

# Keep the depth with the lowest MAE.
best_max_depth = min(results_max_depths, key=results_max_depths.get)
print('\nBest max_depth:', best_max_depth, 'MAE score:', results_max_depths[best_max_depth])


# 3 - Testing different subsamples

In [None]:
# Lock in the best depth, then scan row-subsampling ratios 0.1 .. 1.0.
params['max_depth'] = best_max_depth

# Built from integers and rounded so the grid is exactly 0.1, 0.2, ..., 1.0 as
# plain Python floats. A float-step np.arange yields values such as
# 0.30000000000000004 and its endpoint depends on rounding.
subsamples = [round(0.1 * step, 1) for step in range(1, 11)]
results_subsamples = {}

for subsample in subsamples:
    params['subsample'] = subsample
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_subsamples[subsample] = result['mae']
    print('subsample:', subsample, 'mae:', results_subsamples[subsample])

# Keep the ratio with the lowest MAE.
best_subsample = min(results_subsamples, key=results_subsamples.get)
print('\nBest subsample:', best_subsample, 'MAE score:', results_subsamples[best_subsample])

# 4 - Testing different learning rates

In [None]:
# Lock in the best subsample ratio, then scan learning rates.
params['subsample'] = best_subsample
etas = [0.0001, 0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.12, 0.13, 0.3, 0.5, 1.0]
results_etas = {}

for learning_rate in etas:
    params['eta'] = learning_rate
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    mae = evaluate_model(model, x_test, y_test)['mae']
    results_etas[learning_rate] = mae
    print('eta:', learning_rate, 'mae:', mae)

# Keep the learning rate with the lowest MAE.
best_eta = min(results_etas, key=results_etas.get)
print('\nBest eta:', best_eta, 'MAE score:', results_etas[best_eta])

# 5 - Testing different fractions of features per tree (colsample_bytree)

In [None]:
# Lock in the best learning rate, then scan per-tree column fractions 0.1 .. 1.0.
params['eta'] = best_eta

# Built from integers and rounded so the grid is exactly 0.1, 0.2, ..., 1.0 as
# plain Python floats. A float-step np.arange yields values such as
# 0.30000000000000004 and its endpoint depends on rounding.
colsample_bytrees = [round(0.1 * step, 1) for step in range(1, 11)]
results_colsample_bytrees = {}

for colsample_bytree in colsample_bytrees:
    params['colsample_bytree'] = colsample_bytree
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_colsample_bytrees[colsample_bytree] = result['mae']
    print('colsample_bytree:', colsample_bytree, 'mae:', results_colsample_bytrees[colsample_bytree])

# Keep the fraction with the lowest MAE.
best_colsample_bytree = min(results_colsample_bytrees, key=results_colsample_bytrees.get)
print('\nBest colsample_bytree:', best_colsample_bytree, 'MAE score:', results_colsample_bytrees[best_colsample_bytree])

# 6 - Testing different values for min_child_weight

In [None]:
# Lock in the best column fraction, then scan min_child_weight 1..9.
params['colsample_bytree'] = best_colsample_bytree
min_child_weights = list(range(1, 10))
results_min_child_weight = {}

for child_weight in min_child_weights:
    params['min_child_weight'] = child_weight
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    mae = evaluate_model(model, x_test, y_test)['mae']
    results_min_child_weight[child_weight] = mae
    print('min_child_weight:', child_weight, 'mae:', mae)

# Keep the value with the lowest MAE.
best_min_child_weight = min(results_min_child_weight, key=results_min_child_weight.get)
print('\nBest min_child_weight:', best_min_child_weight, 'MAE score:', results_min_child_weight[best_min_child_weight])

# 7 - Testing different values for gamma

In [None]:
# Lock in the best min_child_weight, then scan gamma values.
results_gamma = {}
params['min_child_weight'] = best_min_child_weight
gammas = [0.01, 0.02, 0.03, 0.1, 0.3, 0.5, 1, 1.1, 1.5, 2, 5, 7, 9, 10]

for gamma in gammas:
    params['gamma'] = gamma
    model = get_xgb_baseline(params)
    model.fit(x_train, y_train)
    result = evaluate_model(model, x_test, y_test)
    results_gamma[gamma] = result['mae']
    print('gamma:', gamma, 'mae:', results_gamma[gamma])

# Keep the gamma with the lowest MAE. The stored score is a mean absolute
# error, so it is labelled 'MAE score' like the other search sections.
best_gamma = min(results_gamma, key=results_gamma.get)
print('\nBest gamma:', best_gamma, 'MAE score:', results_gamma[best_gamma])

In [None]:
# Store the last tuned value so `params` holds every selected hyperparameter, then show it.
params['gamma'] = best_gamma
print('Best Hyperparameters:', params)

# Train the model with the best parameters found

In [None]:
# Refit with the selected hyperparameters and print the MAE on x_test / y_test.
# NOTE(review): the same x_test / y_test were used to pick every hyperparameter
# above, so this score is likely optimistic.
model = get_xgb_baseline(params)
model.fit(x_train, y_train)
result = evaluate_model(model, x_test, y_test)
print(result)

<h2>Feature importance plot</h2>

In [None]:
# Plot the feature importances of the final fitted model.
plot_importance(model)

# Submission

In [None]:
# Drop the training-time frames and run a garbage-collection pass before loading the test set.
del train_df, x_train, y_train, x_test, y_test
gc.collect()

In [None]:
# Load the test data from the Kaggle input directory.
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/test.csv", sep=',')

In [None]:
# Reuse the encoder fitted on the training data (transform only, no refit).
test_df['direction'] = le.transform(test_df['direction'])

# Same feature pipeline as for the training frame.
test_df = (
    test_df
    .pipe(preprocess_dates)
    .pipe(preprocess_holidays)
    .pipe(preprocess_timeseries)
)

# Drop the non-feature columns and append the geo_mean feature the model was trained with.
x_test = test_df.drop(columns=['row_id', 'time'])
x_test['geo_mean'] = geomean(x_test, axis=1)

In [None]:
# Pair each test row id with the model's congestion prediction.
row_id = test_df['row_id'].values
target = model.predict(x_test).squeeze()
submission = pd.DataFrame(
    {
        'row_id': row_id,
        'congestion': target,
    }
)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)