# **Training Notebook for Santander Dataset**

Second Part: https://www.kaggle.com/akhilnasser/santander-customer-transaction-training-2

**Recommended: GPU**

## **1. Required Libraries & Setup**

In [None]:
# Notebook shell escape: show the NVIDIA driver status and the GPU(s) visible to this session
!nvidia-smi

In [None]:
# Notebook shell escape: report the installed CUDA compiler (nvcc) version
!nvcc --version

In [None]:
# General Data Manipulation Libraries
import numpy as np; print('Numpy Version:', np.__version__)
import pandas as pd; print('Pandas Version:', pd.__version__)

# Model & Helper Libraries
import xgboost; print('XGBoost Version:', xgboost.__version__)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold
import torch; print('PyTorch Version:', torch.__version__)

# Plotting Tools
import matplotlib.pyplot as plt
import plotly; print('Plotly Version:', plotly.__version__)
from xgboost import plot_importance
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances

# Hyper-parameter Optimization
import optuna; print('Optuna Version:', optuna.__version__)

In [None]:
# cuDF is imported only when PyTorch reports a usable CUDA device
if torch.cuda.is_available():
    import cudf; print('cuDF Version:', cudf.__version__)

## **2. Short EDA of Data**

In [None]:
# Load Data
input_dir = '/kaggle/input/santander-customer-transaction-prediction/'
# Read with cuDF when a CUDA device is available and with pandas otherwise.
# The conditional expression is lazy, so `cudf` is only referenced on the GPU path.
df_train = (cudf if torch.cuda.is_available() else pd).read_csv(input_dir + '/train.csv')
df_train

In [None]:
# Report the number of rows and columns of the training frame
print(f'There are {len(df_train)} rows and {len(df_train.columns)} columns.')

In [None]:
# Summary statistics of the training frame
df_train.describe()

In [None]:
# Check for NaN values: prints True if at least one cell of the frame is null
print(f'Are there Nan values? {df_train.isnull().values.any()}')

Since the column labels carry no uniquely identifiable characteristics, we now proceed with the rest of the data pipeline.

## **3. Data Preparation**

In [None]:
# Every column except the row identifier and the label is used as a feature
var_colums = [c for c in df_train.columns if c not in ['ID_code','target']]
X = df_train.loc[:, var_colums]
y = df_train.loc[:, 'target']

# We are performing a 80-20 split for Training and Validation.
# A fixed seed keeps the split identical across notebook reruns, so validation scores
# from different runs are measured on the same rows.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1121217)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

## **4. Model Setup & Training**

In [None]:
# View of Xgboost Parameters: show the parameters of a freshly constructed XGBClassifier
xgboost.XGBClassifier().get_params()

### **4.1 XGBoost Parameter Selection**

1. Learning Rate: Weightage of each tree in the XGBoost Classifier.
2. Maximum Depth: The maximum depth of each tree in the XGBoost Classifier.
3. Number of Estimators: The Maximum number of trees to be created.
4. Subsample: The fraction of the training data sampled to build each tree. Each tree is trained on a new subsample of the training data.
5. Colsample By Tree: Percentage of Features to be used while building a tree in the model. Similar to Subsample. Each Tree is trained on a new subset of the original feature space.
6. Evaluation Metric: Evaluation Metric for the model.
7. Use Label Encoder (`use_label_encoder`): Set to `False` here, so the target labels must already be encoded as integers starting with 0. This option is slated for removal in a future XGBoost release.
8. Verbosity: Verbosity of printing messages.
9. Early Stopping Rounds: The stopping Criteria for the training phase. If the Validation score does not improve for the specified number of iterations the training is stopped.

### **4.2 Cross-validation with XGBoost**

Refer: <a href = 'https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f'>Hyperparameter tuning in XGBoost</a>

The cross-validation function splits the training dataset into `nfold` folds and iteratively holds out one of the folds for validation. `cv` returns a table whose rows correspond to the number of boosting trees used. The 4 columns hold the mean and standard deviation of the evaluation metric (AUC here) on the validation folds and on the training folds.

In [None]:
# Model instantiation

# GPU Parameter: use the 'gpu_hist' tree method when CUDA is present, 'auto' otherwise
if torch.cuda.is_available():
    device_method = 'gpu_hist'
else:
    device_method = 'auto'

model_xgboost = xgboost.XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=5000,
    subsample=0.5,
    colsample_bytree=0.5,
    eval_metric='auc',
    use_label_encoder=False,
    tree_method=device_method,
    verbosity=1,
)

# Validation Set (consumed by the later `fit` call)
eval_set = [(X_valid, y_valid)]

# Cross-validation settings: 10 folds, early-stopping patience of 10 rounds
cv_folds = 10
early_stopping_rounds = 10

# Creating the DMatrix from the training split
d_matrix = xgboost.DMatrix(data=X_train, label=y_train)

# Booster-level parameters taken from the sklearn-style wrapper
xgb_param = model_xgboost.get_xgb_params()

# The wrapper's n_estimators is the upper bound on the number of boosting rounds
cvresult = xgboost.cv(
    xgb_param,
    d_matrix,
    num_boost_round=model_xgboost.n_estimators,
    nfold=cv_folds,
    metrics='auc',
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=True,
)

# `cvresult` has one row per boosting round kept, so its length becomes the tree count
model_xgboost.set_params(n_estimators=len(cvresult))

In [None]:
# Training
# Reuse the early-stopping patience configured for cross-validation rather than
# repeating the literal, so both stages stay in sync if the value is changed.
model_xgboost.fit(X_train,
                  y_train,
                  early_stopping_rounds=early_stopping_rounds,
                  eval_set=eval_set,
                  verbose=True)

In [None]:
# Print Results: cross-validation metrics taken from the last row of `cvresult`
print("AUC Train Mean Score: {:.4f} with Standard Deviation {:.4f}\nAUC Valid Mean Score: {:.4f} with Standard Deviation {:.4f}".format(
    *cvresult.iloc[-1][['train-auc-mean', 'train-auc-std', 'test-auc-mean', 'test-auc-std']]
))

In [None]:
# Print Results on the Training and Validation splits
y_train_pred = model_xgboost.predict_proba(X_train)[:,1]
y_valid_pred = model_xgboost.predict_proba(X_valid)[:,1]

# On GPU the labels are cuDF Series; they are copied to host NumPy arrays before being
# handed to scikit-learn's roc_auc_score. The names are rebound on purpose: later cells
# keep using the converted labels.
# - The isinstance guard makes this cell safe to re-run (a NumPy array has no `to_array`).
# - `to_numpy` is the fallback for cuDF releases that no longer provide `to_array`.
if torch.cuda.is_available():
    if not isinstance(y_train, np.ndarray):
        y_train = y_train.to_array() if hasattr(y_train, 'to_array') else y_train.to_numpy()
    if not isinstance(y_valid, np.ndarray):
        y_valid = y_valid.to_array() if hasattr(y_valid, 'to_array') else y_valid.to_numpy()

print("AUC Train: {:.4f}\nAUC Test: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_valid, y_valid_pred)))

### **4.3 Plot of Results of Training**

We have multiple choices for ranking feature Importance.

Refer: <a href = 'https://towardsdatascience.com/be-careful-when-interpreting-your-features-importance-in-xgboost-6e16132588e7'> The Multiple faces of ‘Feature importance’ in XGBoost </a>


* Gain:  Implies the relative contribution of the corresponding feature to the model calculated by taking each feature’s contribution for each tree in the model. A higher value of this metric when compared to another feature implies it is more important for generating a prediction.
* Coverage: The relative number of observations related to this feature, i.e. how many observations are decided by splits on this feature across all constructed trees. Expressed as a percentage of all features’ cover metrics.
* Frequency (R)/Weight (python): Percentage representing the relative number of times a particular feature occurs in the trees of the model. 

*The Gain is the most relevant attribute to interpret the relative importance of each feature.*

In [None]:
# Feature Importance Plot
# The figure must exist before plotting: creating it afterwards opens a second, empty
# figure and the requested size never reaches the importance chart.
plt.figure(figsize = (25, 16))
plot_importance(model_xgboost, max_num_features=15, importance_type='gain', ax=plt.gca())
plt.show()

## **5. Hyper-parameter Optimization**

In [None]:
def objective(trial, X_train, y_train, X_valid, y_valid):
    """Optuna objective: fit one XGBoost classifier and return its validation ROC AUC.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_valid: Hold-out features, used for early stopping and for scoring.
        y_valid: Hold-out labels, used for early stopping and for scoring.

    Returns:
        ROC AUC of the predicted positive-class probabilities on the hold-out data.

    Note:
        The tree method is read from the notebook-level ``device_method``.
    """
    # Model Parameters to be optimized
    xgboost_params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-7, 0.3, log=True),
        "n_estimators": trial.suggest_int(name="n_estimators", low=100, high=2000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "subsample": trial.suggest_categorical(name="subsample", choices=[0.4, 0.5, 0.6]),
        "colsample_bytree": trial.suggest_categorical(name="colsample_bytree", choices=[0.4, 0.5, 0.6]),
        "random_state": 1121217
    }

    # Model Initialisation
    model_xgboost = xgboost.XGBClassifier(eval_metric='auc', use_label_encoder=False,
                                      tree_method = device_method, verbosity=0, **xgboost_params)
    eval_set = [(X_valid, y_valid)]

    # Model Training
    model_xgboost.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_set, verbose=False)

    # Model Prediction
    y_valid_pred = model_xgboost.predict_proba(X_valid)[:,1]

    # roc_auc_score is given host labels: anything that is not already a NumPy array is
    # converted through `to_array` / `to_numpy` when available, so the function does not
    # depend on an earlier cell having converted the labels.
    y_valid_host = y_valid
    if not isinstance(y_valid_host, np.ndarray):
        to_host = getattr(y_valid_host, 'to_array', None) or getattr(y_valid_host, 'to_numpy', None)
        if to_host is not None:
            y_valid_host = to_host()

    # Optimization Metric
    return roc_auc_score(y_valid_host, y_valid_pred)

In [None]:
# Create Study Object for Optuna (direction="maximize" because the objective returns an AUC)
study = optuna.create_study(direction="maximize")
# Optimize: the lambda binds the train/validation splits so Optuna only supplies `trial`
study.optimize(lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), n_trials=100)

In [None]:
# Best objective value (validation ROC AUC) found by the study
print(f"Optimized roc_auc_score: {study.best_value:.5f}")

In [None]:
# List the hyper-parameter values of the best trial, one per line
print("Best params:")

for key, value in study.best_params.items():
    print(f"\t{key}: {value}")

### **5.1 Plots of Results**

In [None]:
# Check if Plotly library is available for the optuna.visualization plots
optuna.visualization.is_available()

In [None]:
# Optimization History Plot for the finished study
plot_optimization_history(study)

In [None]:
# Plot Hyperparameter Importance for the finished study
plot_param_importances(study)