# Assessment of plain LightGBM + mixup on the Santander Customer Transaction Prediction (SCTP) dataset


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from scipy.stats import beta
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import copy

In [None]:
# needed for deterministic output
# Seeds NumPy's global RNG; the mixup cell draws its row shuffles
# (np.random.shuffle) and mixing coefficients (np.random.beta) from it.
SEED = 2
np.random.seed(SEED)

***
## data preparation

In [None]:
# Load the labelled training table of the competition into memory.
TRAIN_CSV = "../input/santander-customer-transaction-prediction/train.csv"
dataset = pd.read_csv(TRAIN_CSV)
# Bare expression: the notebook renders a preview of the frame.
dataset

In [None]:
# Print pandas' structural summary of the raw table (index, columns, dtypes, memory).
dataset.info()

In [None]:
# Share of rows belonging to each target class in the full table.
rows_per_class = dataset.groupby("target")["ID_code"].count()
total_rows = len(dataset)
rows_per_class / total_rows

In [None]:
# dataset stratified split: train 60% - valid 20% - test 20%
#
# A 5-fold stratified splitter yields five disjoint 20% hold-out folds; the
# first one becomes the validation set, the second one the test set, and the
# remaining three folds (60%) form the training set.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
split = skf.split(dataset, dataset.target)
_, valid_index = next(split)
_, test_index = next(split)

# split() returns *positional* row numbers, so rows are selected with .iloc
# rather than label-based .loc/.drop; this stays correct even if `dataset`
# does not carry the default 0..n-1 index.
is_train = np.ones(len(dataset), dtype=bool)
is_train[valid_index] = False
is_train[test_index] = False

train_dset = dataset.iloc[is_train].reset_index(drop=True)
valid_dset = dataset.iloc[valid_index].reset_index(drop=True)
test_dset = dataset.iloc[test_index].reset_index(drop=True)

In [None]:
# Check that stratification kept the class proportions equal across the
# train, validation and test splits (shown in that order).
for part in (train_dset, valid_dset, test_dset):
    display(part.groupby("target")["ID_code"].count() / len(part))

In [None]:
target = "target"
# Every column except the row identifier and the label is a model input.
# Selecting by name (instead of by position) guarantees that neither the id
# nor the target can leak into the features if the column order changes.
input_features = [col for col in dataset.columns if col not in ("ID_code", target)]

***
## LightGBM without mixup

In [None]:
def _as_lgb_dataset(frame):
    """Wrap the feature and label columns of *frame* in an ``lgb.Dataset``.

    The raw arrays are kept alive (``free_raw_data=False``) after the
    dataset has been constructed.
    """
    features = frame[input_features].values
    labels = frame[target].values
    return lgb.Dataset(features, labels, free_raw_data=False)


train_dataset = _as_lgb_dataset(train_dset)
valid_dataset = _as_lgb_dataset(valid_dset)

In [None]:
# Hyper-parameters shared by the baseline and the mixup models.
model_params = dict(
    # cross-entropy accepts labels anywhere in [0, 1], which the soft
    # labels produced by mixup require
    objective = "cross_entropy",
    learning_rate = 0.05,
    num_leaves = 32,
    feature_fraction = 0.8,
    bagging_fraction = 0.8,
    # row bagging is only performed when bagging_freq is non-zero; without
    # it bagging_fraction is silently ignored
    bagging_freq = 1,
    seed = SEED,
    deterministic = True,
    metric = "auc"
)

In [None]:
# Baseline: train on the un-mixed data, evaluating AUC on the validation set.
# Early stopping and periodic logging are configured through callbacks; the
# `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
# accepted by recent LightGBM releases.
model = lgb.train(
    model_params,
    train_dataset,
    valid_sets=[valid_dataset],
    num_boost_round=2000,
    callbacks=[
        # stop once validation AUC has not improved for 50 rounds
        lgb.early_stopping(stopping_rounds=50),
        # print the validation metric every 50 rounds
        lgb.log_evaluation(period=50),
    ],
)

In [None]:
# AUC on validation dataset
# Score stored on the booster by lgb.train for the single validation set
# passed above (keyed "valid_0" since no explicit valid_names were given).
model.best_score["valid_0"]["auc"]

In [None]:
# AUC on test dataset
# Score the baseline model on the held-out test split, which was used for
# neither training nor early stopping.
X_test = test_dset[input_features].values
y_test = test_dset[target].values
preds = model.predict(X_test)
roc_auc_score(y_test, preds)

***
## LightGBM with mixup

In [None]:
# Boosting with mixup: every round draws a fresh random pairing of training
# rows, blends each pair with a coefficient lam ~ Beta(alpha, alpha), and adds
# exactly one tree fitted on the blended rows / soft labels. Early stopping on
# the (un-mixed) validation AUC is handled manually because each round is a
# separate lgb.train call.
num_boosting_rounds = 2000
early_stopping_rounds = 50
verbose_eval = 50
alpha = 0.25

X_valid = valid_dset[input_features].values
y_valid = valid_dset[target].values

# Loop-invariant: extract the raw training arrays once instead of on every round.
X_train = train_dset[input_features].values
y_train = train_dset[target].values

# Number of mixed samples per round: rows are combined pairwise, so half the
# training-set size. With an odd row count one (random) row is left out per round.
n = len(X_train) // 2

best_metric = 0.
best_iteration = 0
best_model = None
no_improvement = 0

model_params["verbosity"] = -1

# No initial model on the first round; afterwards training continues from
# the booster produced by the previous round.
model = None

for iteration in range(num_boosting_rounds):

    # random pairing: the first n shuffled rows are matched with the next n
    index = np.arange(len(X_train))
    np.random.shuffle(index)
    first, second = index[:n], index[n:2 * n]

    # one mixing coefficient per pair, applied to both features and labels
    lam = np.random.beta(alpha, alpha, size=n)
    lam_col = lam.reshape(-1, 1)
    X_mixed = lam_col * X_train[first] + (1 - lam_col) * X_train[second]
    y_mixed = lam * y_train[first] + (1 - lam) * y_train[second]

    train_dataset = lgb.Dataset(
        X_mixed, y_mixed,
        free_raw_data=False
    )

    # add a single boosting round on top of the current model
    model = lgb.train(
        model_params,
        train_dataset,
        num_boost_round=1,
        init_model=model
    )

    y_hat = model.predict(X_valid)
    metric = roc_auc_score(y_valid, y_hat)

    if metric > best_metric:
        best_metric = metric
        best_iteration = iteration
        # snapshot the booster so later rounds cannot alter the best model
        best_model = copy.deepcopy(model)
        no_improvement = 0
    else:
        no_improvement += 1

    if no_improvement == early_stopping_rounds:
        print(f"Eearly stopping. Best score: {best_metric} and reached at: {best_iteration}")
        break

    if iteration%verbose_eval == 0:
        print(f"Iteration: {iteration} - AUC: {metric}")
        

In [None]:
# AUC on test dataset
# Score the best mixup model (the snapshot with the highest validation AUC)
# on the held-out test split.
X_test = test_dset[input_features].values
y_test = test_dset[target].values
preds = best_model.predict(X_test)
roc_auc_score(y_test, preds)

****