<center><h1>Tabular Playground Series - January 2022</h1></center>
<center><h2>XGBoost + Optuna (Time Series)</h2></center>
<center><h2>By Tariq Hussain</h2></center>

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1>Importing and viewing data</h1>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Locations of the competition CSV files.
train_fp = '../input/tabular-playground-series-jan-2022/train.csv'
test_fp = '../input/tabular-playground-series-jan-2022/test.csv'

# Read both splits into DataFrames (train first, then test).
train_df, test_df = pd.read_csv(train_fp), pd.read_csv(test_fp)

In [None]:
# Show the training frame exactly as it was read from the CSV.
train_df

In [None]:
# Summary of the training frame's columns, then a blank separator line.
train_df.info()
print(" ")
# Missing-value count per column (rendered as the cell's output).
train_df.isna().sum()


In [None]:
# Show the test frame exactly as it was read from the CSV.
test_df

In [None]:
# Summary of the test frame's columns, then a blank separator line.
test_df.info()
print(" ")
# Missing-value count per column (rendered as the cell's output).
test_df.isna().sum()

In [None]:
# Work on independent copies so the frames read from disk stay untouched.
train_copy, test_copy = train_df.copy(), test_df.copy()

<h1>Data preprocessing and encoding</h1>
<h3>Converting the object-typed <code>date</code> column to datetime with <code>pd.to_datetime</code> and extracting day, month, year and weekday</h3>

In [None]:
def convert_dates(df):
    """Parse the ``date`` column and derive calendar features from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``date`` is converted to datetime and the columns
        ``day``, ``month``, ``year`` and ``weekday`` (Monday is 0) are added.
        The frame passed in is left unmodified.
    """
    # Copy first: assigning columns on ``df`` itself would silently alter the
    # caller's frame and make the returned object an alias of the input.
    out = df.copy()
    out['date'] = pd.to_datetime(out['date'])

    out['day'] = out['date'].dt.day
    out['month'] = out['date'].dt.month
    out['year'] = out['date'].dt.year
    out['weekday'] = out['date'].dt.weekday
    return out

# Derive the calendar features for both splits (train first, then test).
train, test = convert_dates(train_copy), convert_dates(test_copy)

In [None]:
# One-hot encode every object-typed column. The column list is taken from the
# training frame and reused for the test frame.
cat_cols = list(train.select_dtypes('object').columns)
train, test = (pd.get_dummies(frame, columns=cat_cols) for frame in (train, test))

In [None]:
#test1 = test_copy.drop(columns=['row_id'])

In [None]:
# Feature matrix: everything except the identifier, the timestamp and the target.
X = train.drop(columns=['row_id', 'date', 'num_sold'])
# Take the target from the same frame as the features so the rows stay paired.
y = train['num_sold']

# Keep exactly the training feature columns, in training order. Unlike an
# in-place drop this cell can be re-run safely; a dummy column missing from the
# test split is zero-filled and a column unknown to the model is discarded,
# instead of surfacing later as a feature mismatch at predict time.
test = test.reindex(columns=X.columns, fill_value=0)

In [None]:
# Inspect the feature matrix built from the training frame.
display(X)

In [None]:
# Disabled experiment: the code below sits inside a string literal and never runs.
# NOTE(review): `set(X[col]) == set(X[col])` compares a column with itself, so it
# is always true; presumably the right-hand side was meant to be the test frame —
# confirm before re-enabling.
'''
# All categorical columns
object_cols = [col for col in X.columns if X[col].dtype == "object"]

# Columns that can be safely label encoded
good_label_cols = [col for col in object_cols if 
                   set(X[col]) == set(X[col])]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)
'''

In [None]:
# Disabled label-encoding experiment: the code below sits inside a string literal
# and never runs.
# NOTE(review): it indexes 'country', 'store' and 'product' on X and test; if those
# are the object columns that were one-hot encoded earlier, they no longer exist in
# either frame — confirm before re-enabling.
'''
from sklearn.preprocessing import LabelEncoder

country_le = LabelEncoder()
X['country_le'] = country_le.fit_transform(X['country'])
test['country_le'] = country_le.transform(test['country'])

store_le = LabelEncoder()
X['store_le'] = store_le.fit_transform(X['store'])
test['store_le'] = store_le.transform(test['store'])

product_le = LabelEncoder()
X['product_le'] = product_le.fit_transform(X['product'])
test['product_le'] = product_le.transform(test['product'])
'''

In [None]:
#from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score

#X_train, X_val, y_train, y_val = train_test_split(X2, y, random_state=0, test_size=0.2)

<h1>Optuna and hyperparameter tuning</h1>

In [None]:
#'''
import optuna
from optuna.samplers import TPESampler
from hyperopt import STATUS_OK,Trials,fmin,hp,tpe
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import log_loss, accuracy_score, mean_absolute_error, r2_score, roc_auc_score
from optuna.integration import XGBoostPruningCallback
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score

def smape(actual, predicted):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|predicted - actual| / ((|actual| + |predicted|) / 2)) * 100``.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    predicted : array-like
        Predicted values, broadcastable against ``actual``.

    Returns
    -------
    float
        The SMAPE score. Pairs where both values are zero contribute 0 to the
        mean instead of turning the whole score into ``nan`` via 0/0.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2

    # The denominator is zero only when both values are zero, i.e. the
    # prediction is exact; leave those ratios at 0 rather than dividing.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator), where=denominator != 0)
    return float(np.mean(ratio) * 100)


def objective(trial, X=X, y=y):
    """Optuna objective: fit an XGBoost regressor and return its hold-out SMAPE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X, y
        Features and target. They default to the notebook-level ``X`` and ``y``
        as bound when this function was defined.

    Returns
    -------
    float
        SMAPE of the fitted model on the 20% validation split (lower is better).
    """
    # NOTE(review): train_test_split shuffles by default; if the rows are
    # time-ordered, validation rows can precede training rows — confirm this
    # is intended for the tuning step.
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)

    # Fixed settings are registered as single-choice categoricals so that they
    # also show up in ``study.best_params`` alongside the tuned values.
    params = {
        "objective": trial.suggest_categorical('objective', ["reg:squarederror"]),
        "eval_metric": trial.suggest_categorical('eval_metric', ["mape"]),
        "use_label_encoder": trial.suggest_categorical('use_label_encoder', [False]),
        "n_estimators": trial.suggest_categorical('n_estimators', [40000]),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        "learning_rate": trial.suggest_float('learning_rate', 0.15, 1.0, log=True),
        "subsample": trial.suggest_float('subsample', 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 8),
        "booster": trial.suggest_categorical('booster', ["gbtree"]),
        "gamma": trial.suggest_float('gamma', 0, 100, step=0.1),
        "tree_method": trial.suggest_categorical('tree_method', ["gpu_hist"]),
        "reg_lambda": trial.suggest_float('reg_lambda', 0.1, 100, log=True),
        "reg_alpha": trial.suggest_float('reg_alpha', 0.1, 100, log=True),
        "random_state": trial.suggest_categorical('random_state', [42]),
        "n_jobs": trial.suggest_categorical('n_jobs', [4]),
        "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    model = XGBRegressor(**params)

    # n_estimators is only an upper bound: training stops once the validation
    # metric has not improved for 100 rounds.
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=100,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    yhat = model.predict(X_val)
    return smape(y_val, yhat)

# Seed the sampler so the hyperparameter search does not depend on an unseeded
# random state and can be repeated from a fresh kernel.
study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=100)
print(study.best_params)
#'''

In [None]:
# Report the best objective value, then every parameter of the best trial.
best_value_line = "Best value: {:.5f}".format(study.best_value)
print(best_value_line)
print("Best params:")

for item in study.best_params.items():
    print("{}: {}".format(*item))


In [None]:
# Parameters of the best trial, reused for the final models. Because the objective
# registers its fixed settings (objective, eval_metric, tree_method, ...) as
# single-choice categoricals, they are part of this dict as well.
xgb_params = study.best_params
xgb_params

In [None]:
#from xgboost import XGBRegressor

#xgb = XGBRegressor()

#model = xgb.fit(X_train, y_train)

In [None]:
#preds = model.predict(X_val)

In [None]:
#from sklearn.metrics import mean_absolute_error, r2_score

#print("Mean absolute error: {}\n".format(mean_absolute_error(y_val, preds)))
#print("r2 score: {}".format(r2_score(y_val, preds)))


In [None]:
#final_preds = model.predict(test_le)

<h1>Time Series Cross Validation (TimeSeriesSplit)</h1>

In [None]:
#from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import TimeSeriesSplit

#kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

#preds = []
#scores = []

#%%time
# Time-ordered validation: every fold trains on the rows that come before its
# validation block.
folds = TimeSeriesSplit(10)

preds = np.zeros(len(test))  # running average of the per-fold test predictions
scores = []                  # validation SMAPE of each fold

for fold, (idx_train, idx_valid) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    xgb = XGBRegressor(**xgb_params,
          predictor='gpu_predictor',
          gpu_id=0)

    # Early stopping uses the eval_metric already carried in xgb_params (the one
    # the study was tuned with). Passing a second eval_metric to fit() would
    # override it on older XGBoost releases and is rejected as a duplicate
    # parameter by newer ones.
    xgb.fit(
        X_train,
        y_train,
        early_stopping_rounds=100,
        eval_set=[(X_valid, y_valid)],
        verbose=False
        )

    pred_valid = xgb.predict(X_valid)
    score = smape(y_valid, pred_valid)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}")
    print('||'*40)

    # Each fold contributes an equal share to the final test prediction.
    preds += xgb.predict(test) / folds.n_splits

print(f"Overall Validation Score: {np.mean(scores)}")


<h1>Submission</h1>

In [None]:
# Read the sample submission file; it is used as the template for the output.
sample_fp = '../input/tabular-playground-series-jan-2022/sample_submission.csv'
sample = pd.read_csv(sample_fp)

In [None]:
# Copy of the sample submission with `num_sold` set to the fold-averaged predictions.
submission = sample.assign(num_sold=preds)

submission

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv('submission.csv', index=False)