# XGB Pipeline

This is an XGBoost pipeline with Optuna hyperparameter tuning built in.

Out-of-fold (OOF) predictions are saved to a CSV file alongside the submission file. This will be helpful towards the end of the competition for stacking.

In [None]:
from datetime import datetime
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils import shuffle

import optuna
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import RobustScaler

import gc

# Configuration

In [None]:
# Run identity: both values are appended to the OOF and submission file names.
modelname = 'xgboost'
datenow = format(datetime.now(), '%d%m%Y_%H%M%S')

# Random seed handed to the model, and the number of CV folds (5 or 10).
seed = 42
n_folds = 5

# Early stopping rounds for XGBoost.
early_stopping_rounds = 50

# Switch for the Optuna hyperparameter search; when False, hand-set
# parameters are used instead.
run_optuna_hyperparam_search = True

In [None]:
if run_optuna_hyperparam_search:
    # Optuna mode: the search runs first and its best parameters are then
    # used to build the final XGBoost model.

    n_folds_for_optuna = 1  # how many folds each trial is scored on
    n_trials = 3            # number of trials (raise to 50-100-200 depending
                            # on the n_estimators chosen in objective below)

    def objective(trial):
        """Score one Optuna trial: sample parameters, fit, return validation AUC.

        Every parameter, including constant ones such as ``n_estimators``, is
        declared through ``trial.suggest_*``. The notebook later takes the
        best trial's parameters and passes them straight to ``XGBClassifier``,
        so a constant written as a plain value (e.g. ``'n_estimators': 4000``)
        would not be part of them and would have to be supplied explicitly
        when the final model is created.
        """
        search_space = {
            'n_estimators': trial.suggest_categorical('n_estimators', [4000]),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 5e-1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.99, log=True),
            'subsample': trial.suggest_float('subsample', 0.2, 0.99, log=True),
            'eval_metric': trial.suggest_categorical('eval_metric', ['auc']),
            'use_label_encoder': trial.suggest_categorical('use_label_encoder', [False]),
            'gamma': trial.suggest_categorical('gamma', [0, 0.25, 0.5, 1.0]),
            'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 1.0, 5.0, 10.0, 50.0, 100.0]),
            'tree_method': trial.suggest_categorical('tree_method', ['gpu_hist']),
            'gpu_id': trial.suggest_categorical('gpu_id', [0]),
            'predictor': trial.suggest_categorical('predictor', ['gpu_predictor']),
            'random_state': trial.suggest_categorical('random_state', [seed]),
        }
        candidate = XGBClassifier(**search_space)

        # optimize=True makes fit_n_folds return the mean validation AUC.
        return fit_n_folds(candidate, n=n_folds_for_optuna, optimize=True)

else:
    # Optuna is switched off: use these hand-picked XGBoost parameters.
    params = dict(
        n_estimators=40000,
        learning_rate=0.004,
        max_depth=8,
        colsample_bytree=0.30,
        subsample=0.670,
        eval_metric='auc',
        use_label_encoder=False,
        gamma=1.0,
        reg_lambda=100.0,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        random_state=seed,
    )

# Read Data

In [None]:
# Training data comes from the pre-split stratified fold file matching
# n_folds; test data and the sample submission come from the competition input.
train_df_l = pd.read_csv(
    f'../input/stratifiedkfoldsplits-oct2021/Stratified{n_folds}Fold_OCT2021_TPS.csv',
    index_col='id',
)
test_df_l = pd.read_csv(
    '../input/tabular-playground-series-oct-2021/test.csv',
    index_col='id',
)
submission_df = pd.read_csv(
    '../input/tabular-playground-series-oct-2021/sample_submission.csv'
)

In [None]:
# Label column, kept as a one-element list so selecting it yields a DataFrame.
target = ['target']
# Model inputs: every training column whose name begins with a lowercase 'f'.
features = [name for name in train_df_l.columns if name.startswith('f')]

# Reduce Size

In [None]:
# sources: 
# https://www.kaggle.com/dmitryuarov/tps-soft-voting-xgb-cb-lgbm#Basic-information
# https://www.kaggle.com/rinnqd/reduce-memory-usage
# https://www.kaggle.com/heiswicked/smtm-s-tps-sep-catboost

def reduce_size(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to save memory.

    Each int16/int32/int64 column is cast to the narrowest of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive). Each float16/float32/float64 column is cast to
    float16 or float32 when its min and max fit, otherwise to float64.
    Columns of any other dtype are left untouched.

    NOTE: the float downcast is chosen from the value *range* only; casting
    to float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When True, print the memory usage after the reduction and
            the percentage saved.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        is_int = str(col_type)[:3] == 'int'

        if is_int:
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a column that touches a dtype's limit exactly
            # (e.g. max == 127) still fits in that dtype.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # Nothing fitted (range too wide, inf, or NaN min/max). Floats
            # fall back to float64; integer columns are left as they are.
            if not is_int:
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the percentage against a zero starting size.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"After Reduction : {round(end_mem, 2)}MB")
        print(f"Reduced: {round(saved_pct, 2)}%")

    return df

In [None]:
# Shrink the training frame's numeric columns (reduce_size works in place).
train_df = reduce_size(df=train_df_l)

In [None]:
# Shrink the test frame's numeric columns (reduce_size works in place).
test_df = reduce_size(df=test_df_l)

In [None]:
# Drop the original loader names, then run a garbage-collection pass.
del train_df_l, test_df_l
gc.collect()

# Pre Processing

In [None]:
# scaler = StandardScaler()
# train_df[features] = scaler.fit_transform(train_df[features])
# test_df[features] = scaler.transform(test_df[features])

# Helper Methods

In [None]:
def get_nth_fold(n=0):
    """Split the global ``train_df`` into train/validation parts for fold ``n``.

    Rows whose 'Fold' value equals ``n`` form the validation set; all other
    rows form the training set.

    Returns:
        A tuple ``(X_train, y_train, X_val, y_val)`` of DataFrames, where the
        X frames hold the ``features`` columns and the y frames hold the
        ``target`` column(s).
    """
    fold_column = train_df['Fold']
    fit_rows = train_df.loc[fold_column != n]
    holdout_rows = train_df.loc[fold_column == n]

    return (
        fit_rows[features],
        fit_rows[target],
        holdout_rows[features],
        holdout_rows[target],
    )

In [None]:
def fit_n_folds(model,n=10,optimize=False):
    """Fit ``model`` on folds ``0..n-1`` and score each on its validation split.

    For every fold, the data from ``get_nth_fold`` is used to fit the model
    with early stopping on the validation split (metric 'auc', patience taken
    from the global ``early_stopping_rounds``), and the validation AUC is
    recorded. The same ``model`` object is re-fitted for each fold.

    Args:
        model: Estimator exposing ``fit`` (accepting ``eval_set``,
            ``eval_metric``, ``early_stopping_rounds``, ``verbose``) and
            ``predict_proba``.
        n: Number of folds to process, starting from fold 0.
        optimize: When True (hyperparameter search), only the validation AUC
            is computed: no printing, no train-set scoring, no test
            predictions and no OOF file.

    Returns:
        If ``optimize`` is True, the mean validation AUC over the folds.
        Otherwise a list with one array of test-set positive-class
        probabilities per fold. In that mode the out-of-fold predictions are
        also written to ``oof_{modelname}_{datenow}.csv``.
    """
    report = not optimize

    val_auc_scores = []
    test_preds = []
    final_valid_predictions = {}

    for i in range(n):

        if report:
            print(f'** Processing Fold {i} ***')

        X_train,y_train,X_val,y_val = get_nth_fold(n=i)
        y_train_flat = y_train.values.ravel()
        y_val_flat = y_val.values.ravel()

        model.fit(X_train,y_train_flat, eval_set=[(X_val, y_val_flat)],eval_metric='auc',early_stopping_rounds=early_stopping_rounds,verbose=False)

        y_val_pred = model.predict_proba(X_val)[:,1]
        val_auc_score = roc_auc_score(y_val_flat,y_val_pred)
        val_auc_scores.append(val_auc_score)

        if report:
            # Train-set scoring, OOF bookkeeping and test inference are only
            # needed for the final run; skipping them during the search
            # avoids a full extra prediction pass per trial.
            train_auc_score = roc_auc_score(y_train_flat,model.predict_proba(X_train)[:,1])

            # Key OOF predictions by the validation rows' index values.
            final_valid_predictions.update(dict(zip(y_val.index.values,y_val_pred)))

            test_pred = model.predict_proba(test_df[features])[:,1]
            test_preds.append(test_pred)
            print(f'Fold {i} Train AUC - {train_auc_score},Val AUC - {val_auc_score}')

        # Release this fold's data before the next fold is materialised.
        del X_train, y_train, X_val, y_val, y_train_flat, y_val_flat
        gc.collect()

    if optimize:
        return np.mean(val_auc_scores)

    # One row per validation sample: its index value and predicted probability.
    final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions,orient='index').reset_index()
    final_valid_predictions.to_csv(f'oof_{modelname}_{datenow}.csv',index=0)  # Save OOF File.
    del final_valid_predictions
    _ = gc.collect()
    print(f'Average Val AUC across folds - {np.mean(val_auc_scores)} std - {np.std(val_auc_scores)}')
    return test_preds

# Optuna Hyperparameter tuning

In [None]:
if run_optuna_hyperparam_search:

    # The objective's return value is maximised over n_trials trials.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    print('**BEST TRIAL**', study.best_trial, sep='\n')

    # Take the best trial's parameters as params; the next cell builds the
    # final XGBoost model from them.
    params = study.best_trial.params

# Model Fit and Submission

In [None]:
# Build the final model from params (hand-set or taken from Optuna).
model = XGBClassifier(**params)

# One array of test predictions per fold; their element-wise mean is written
# into every submission column after the first.
preds = fit_n_folds(model, n=n_folds, optimize=False)
submission_df.iloc[:, 1:] = np.stack(preds, axis=0).mean(axis=0)
submission_df.to_csv(f'submission_{modelname}_{datenow}.csv', index=0)