In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.optim.lr_scheduler import ExponentialLR
from pytorch_lightning.callbacks import LearningRateMonitor
from sklearn.utils.class_weight import compute_class_weight
import time
import gc
import torchmetrics


# Widen pandas' display limits so wide/long frames and full cell contents are
# shown, and print floats with five decimals.
for option_name, option_value in (
    ('display.max_rows', 150),
    ('display.max_columns', 500),
    ('display.max_colwidth', None),
    ('display.float_format', lambda x: '%.5f' % x),
):
    pd.set_option(option_name, option_value)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_dir, _, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        print(os.path.join(input_dir, input_file))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# def seed_torch(seed):
#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True

# seed_torch(42)

# Seed Python's `random`, NumPy and PyTorch in one call (workers=True also
# derives seeds for DataLoader workers). The function is called through the
# package top level because the `pl.utilities.seed` module path is deprecated.
pl.seed_everything(seed=42, workers=True)

# **Data import**

In [None]:
# Read the competition's train and test tables in full
# (pass nrows=... to read_csv for a quick smoke run).
train, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-nov-2021/train.csv",
        "/kaggle/input/tabular-playground-series-nov-2021/test.csv",
    )
)
train.info(memory_usage="deep")

In [None]:
# Column dtypes, non-null counts and memory usage of the test table,
# mirroring the report printed for the train table above.
test.info(memory_usage="deep")

In [None]:
# Shared colour palette for the plots below
# (note: "sandybrown" is listed twice).
colors = [
    "lightcoral",
    "sandybrown",
    "darkorange",
    "mediumseagreen",
    "lightseagreen",
    "cornflowerblue",
    "mediumpurple",
    "palevioletred",
    "lightskyblue",
    "sandybrown",
    "yellowgreen",
    "indianred",
    "lightsteelblue",
    "mediumorchid",
    "deepskyblue",
]

In [None]:
# Preview the first five rows of the train table.
train.head(n=5)

In [None]:
# Pie chart comparing the number of rows in the train and test tables.
dataset_sizes = [len(train), len(test)]

fig, ax = plt.subplots(figsize=(5, 6))
fig.set_facecolor('white')
pie = ax.pie(
    dataset_sizes,
    labels=["Train dataset", "Test dataset"],
    colors=["salmon", "teal"],
    textprops={"fontsize": 15},
    autopct='%1.1f%%',
)
ax.set_title("Dataset length comparison", fontsize=18)
# Equal aspect ratio keeps the pie circular.
ax.axis("equal")
plt.show();

In [None]:
# Summary statistics for every numeric column of the train table.
train.describe()

In [None]:
# Feature columns are the ones whose name starts with "f".
features = [x for x in train.columns if x.startswith("f")]

# Stack train and test features so cardinality is judged on all available rows.
df = pd.concat([train[features], test[features]], axis=0, ignore_index=True)

# Count distinct values once (this is a full pass over the data) and split the
# features at a cardinality threshold of 30: fewer distinct values are treated
# as categorical, the rest as continuous.
n_unique = df[features].nunique()
unique_values = n_unique >= 30
cat_features = n_unique[~unique_values].index
num_features = n_unique[unique_values].index

print(f"There are {len(cat_features)} categorical features: {cat_features}")
print(f"There are {len(num_features)} continuous features: {num_features}")

In [None]:
# Bar chart of the target class counts, annotated with the absolute count
# (inside the bar) and its share of the train set (above the bar).
# value_counts() is computed once and reused instead of being re-evaluated
# for every plotting call.
target_counts = train["target"].value_counts()
target_percent = target_counts.values / (len(train) / 100)

fig, ax = plt.subplots(figsize=(6, 6))

bars = ax.bar(target_counts.index,
              target_counts.values,
              color=colors,
              edgecolor="black",
              width=0.4)
ax.set_title("Target values distribution", fontsize=20, pad=15)
ax.set_ylabel("Amount of values", fontsize=14, labelpad=15)
ax.set_xlabel("Target value", fontsize=14, labelpad=10)
ax.set_xticks(target_counts.index)
ax.tick_params(axis="both", labelsize=14)
# Percentage label above each bar.
ax.bar_label(bars, [f"{x:2.2f}%" for x in target_percent],
             padding=5, fontsize=15)
# Absolute count just below the top of each bar (negative padding).
ax.bar_label(bars, [f"{x:2d}" for x in target_counts.values],
             padding=-30, fontsize=15)
ax.margins(0.2, 0.12)
ax.grid(axis="y")

plt.show();

The target classes are balanced, which is good.

In [None]:
# Total number of missing cells in the train and test tables, as a pair.
tuple(frame.isna().sum().sum() for frame in (train, test))

There are no missing values in either dataset.

Let's check the distribution of feature values in both datasets.

In [None]:
# Overlaid train/test histograms for every continuous feature, five per row.
df = pd.concat([train[num_features], test[num_features]], axis=0)
columns = df.columns.values

cols = 5
# Ceiling division: the previous `len(columns) // cols + 1` added a fully
# empty row whenever the feature count was an exact multiple of `cols`.
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,65),
                        sharex=False, squeeze=False)

plt.subplots_adjust(hspace = 0.3)

# axs.flat walks the grid row by row, matching the feature order.
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # Hide the unused cells at the end of the last row.
        ax.set_visible(False)
        continue

    column = columns[i]
    # Shared bin range (computed once per feature) so that the train and test
    # histograms use identical bins and are directly comparable.
    value_range = (df[column].min(), df[column].max())
    for frame, color, label in ((train, "deepskyblue", "Train Dataset"),
                                (test, "palevioletred", "Test Dataset")):
        ax.hist(frame[column].values,
                range=value_range,
                bins=40,
                color=color,
                edgecolor="black",
                alpha=0.7,
                label=label)
    ax.set_title(column, fontsize=12, pad=5)
    # Pin the tick positions before relabelling them in thousands ("k").
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels([str(int(tick/1000))+"k" for tick in ax.get_yticks()])
    ax.tick_params(axis="y", labelsize=10)
    ax.tick_params(axis="x", labelsize=10)
    ax.grid(axis="y")
    # One legend on the first subplot is enough for the whole grid.
    if i == 0:
        ax.legend(fontsize=10)

#plt.suptitle("Numerical feature values distribution in both datasets", y=0.99)
plt.show();

As you can see, the feature distributions of the train and test datasets match closely, so the target distribution of the test predictions should probably be the same as in the train dataset.

In [None]:
# Distinct-value count per continuous feature, smallest first.
print("Numerical features with the least amount of unique values:")
unique_counts_sorted = train[num_features].nunique().sort_values()
unique_counts_sorted.head(5)

Let's look at feature correlation.

In [None]:
# Pairwise Pearson correlations between the features (train set only).
df = train[features].corr().round(5)

# Boolean mask covering the diagonal and the upper-right triangle, which
# only duplicates the lower-left triangle of the symmetric matrix.
mask = np.triu(np.ones_like(df, dtype=bool))

# Draw the masked heatmap.
plt.figure(figsize=(16,16))
ax = sns.heatmap(
    df,
    mask=mask,
    annot=False,
    cmap="RdBu",
    annot_kws={"weight": "bold", "fontsize":13},
)
ax.set_title("Feature correlation heatmap", fontsize=17)
plt.setp(ax.get_xticklabels(), weight="normal", rotation=90,
         rotation_mode="anchor", ha="right")
plt.setp(ax.get_yticklabels(), weight="normal", rotation=0,
         rotation_mode="anchor", ha="right")
plt.show();

There is very weak linear correlation between the features.

# **Data preprocessing**

In [None]:
# Standardise every feature column in place: the scaler is re-fitted on each
# train column and the same statistics are then applied to the test column.
s_scaler = StandardScaler()
for col in features:
    train_column = train[col].to_numpy().reshape(-1, 1)
    test_column = test[col].to_numpy().reshape(-1, 1)
    train[col] = s_scaler.fit_transform(train_column)
    test[col] = s_scaler.transform(test_column)

In [None]:
# Independent copies of the feature matrices (extra columns are added to
# them later) and the target vector.
X, X_test = train[features].copy(), test[features].copy()
y = train["target"]

# **Model training**

## LinearSVC

In [None]:
# Fold splitting parameters
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Two zero-filled arrays for out-of-fold and test predictions
linear_oof_preds = np.zeros((X.shape[0],))
linear_test_preds = np.zeros((X_test.shape[0],))
total_mean_auc = 0

# Train one LinearSVC per fold and collect its predictions
for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # skf.split yields positional indices, so rows are selected with .iloc;
    # .loc only gave the same rows because the index is a default RangeIndex.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    linear_model = LinearSVC(tol=1e-7, penalty='l2', dual=False, max_iter=2000, random_state=42)
    linear_model.fit(X_train, y_train)

    # Out-of-fold predictions: each fold model scores only the rows it has not
    # seen, so after the last fold the whole array holds unseen-data predictions.
    # LinearSVC has no predict_proba, so raw decision-function values are used;
    # ROC AUC only depends on their ranking.
    linear_oof_preds[valid_idx] = linear_model.decision_function(X_valid)

    # Accumulate the mean test prediction (each fold contributes 1/splits)
    linear_test_preds += linear_model.decision_function(X_test) / splits

    # Score of this fold's model
    fold_auc = roc_auc_score(y_valid, linear_oof_preds[valid_idx])
    print(f"Fold {num} ROC AUC: {fold_auc}")

    # Accumulate the mean score over all fold models
    total_mean_auc += fold_auc / splits

print(f"\nOverall ROC AUC: {total_mean_auc}")

## Neural network

In [None]:
# Separate copies for the neural network so X / X_test stay untouched.
X_nn, X_test_nn = X.copy(), X_test.copy()
# X_nn["linear_preds"] = linear_oof_preds
# X_test_nn["linear_preds"] = linear_test_preds

# Rescale every column: the scaler is re-fitted on each train column and the
# same min/max are then applied to the matching test column.
mm_scaler = MinMaxScaler()
for col in X_nn.columns:
    train_column = X_nn[col].to_numpy().reshape(-1, 1)
    test_column = X_test_nn[col].to_numpy().reshape(-1, 1)
    X_nn[col] = mm_scaler.fit_transform(train_column)
    X_test_nn[col] = mm_scaler.transform(test_column)

# The test matrix is converted to a float tensor once, up front; the train
# matrix stays a DataFrame because it is split into folds later.
test_matrix = X_test_nn.to_numpy()
X_test_nn = torch.tensor(test_matrix).float()

In [None]:
def prepare_datasets(X_nn, X_valid_nn, y_nn, y_valid_nn, batch_size=2048, shuffle_train=False):
    """Wrap train/validation data into batched PyTorch DataLoaders.

    Args:
        X_nn: train features; any object with ``to_numpy()`` (DataFrame).
        X_valid_nn: validation features, same column layout as ``X_nn``.
        y_nn: train labels; any object with ``to_numpy()`` (Series).
        y_valid_nn: validation labels.
        batch_size: number of rows per batch for both loaders. Defaults to
            2048, the value that used to be hard-coded.
        shuffle_train: whether the train loader reshuffles rows every epoch.
            Defaults to False, which preserves the previous behaviour.

    Returns:
        ``(train_loader, valid_loader)``: DataLoaders yielding
        ``(float features, int labels)`` batches.

    Dataset sizes and the shape of the first batch of each loader are printed
    as a sanity check.
    """
    def _to_dataset(features, labels):
        # Features become float tensors and labels int tensors.
        return TensorDataset(torch.tensor(features.to_numpy()).float(),
                             torch.tensor(labels.to_numpy()).int())

    train_ds = _to_dataset(X_nn, y_nn)
    valid_ds = _to_dataset(X_valid_nn, y_valid_nn)

    print("Using these datasets:")
    print(f"Train_ds elements: {len(train_ds)}")
    print(f"Valid_ds elements: {len(valid_ds)}")

    # Batched loaders; the validation loader is never shuffled.
    train_ds = DataLoader(train_ds, batch_size=batch_size, shuffle=shuffle_train)
    valid_ds = DataLoader(valid_ds, batch_size=batch_size)

    # Show the shape of the first batch of each loader (skipped when empty).
    for name, loader in (("Train_ds", train_ds), ("Valid_ds", valid_ds)):
        first_batch = next(iter(loader), None)
        if first_batch is not None:
            data, label = first_batch
            print(f"{name} batch: {data.shape}, {label.shape}")

    return train_ds, valid_ds

In [None]:
# # Splitting data into train and valid
# X_nn, X_valid_nn, y_nn, y_valid_nn = train_test_split(X_nn, y, test_size=0.2, random_state=42, stratify=y)
# X_nn.shape, X_valid_nn.shape, y_nn.shape, y_valid_nn.shape, X_test_nn.shape

In [None]:
# Computing "balanced" class weights to be used in training.
# compute_class_weight returns the weights in the order of `classes`, so each
# weight is paired with its label positionally. Indexing the weight array by
# the label value itself is only correct when labels are exactly 0..k-1.
classes = np.unique(y)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
class_weights_dict = dict(zip(classes, class_weights))
class_weights_dict

In [None]:
def initialize_weights(m):
    """Apply Glorot (Xavier) normal initialisation to a linear layer's weights.

    Meant to be passed to ``module.apply``: modules that are not ``nn.Linear``
    are left untouched, and biases are never modified.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.xavier_normal_(m.weight.data)

In [None]:
# Defining model parameters
class Model(pl.LightningModule):
    """Feed-forward binary classifier implemented as a LightningModule.

    Architecture: ``input_shape -> 128 -> 64 -> 32 -> 1`` linear layers. Each
    of the first three layers is followed by a hardswish activation and
    dropout (p=0.25); the output layer is followed by a sigmoid, so
    ``forward`` returns probabilities with a trailing dimension of size 1.

    Training and validation both use binary cross-entropy on those
    probabilities and track ROC AUC with separate torchmetrics objects.
    """
    def __init__(self, input_shape):
        """Create layers, activations and metrics.

        Args:
            input_shape: number of input features (in_features of the first layer).
        """
        super().__init__()
        
        # Input layer
        self.input = nn.Linear(input_shape,128)
        # Hidden layers
        self.hidden1 = nn.Linear(128,64)
        self.hidden2 = nn.Linear(64,32)
#         self.hidden3 = nn.Linear(128,128)
        # Output layer
        self.output = nn.Linear(32,1)
        
        # Dropout rate
        self.dr = 0.25
        # Activation functions
        # NOTE: despite the attribute name, this is hardswish, not swish/SiLU.
        self.swish = F.hardswish
        self.sigmoid = torch.sigmoid
        # Metrics (separate objects so train and validation state never mix)
        self.train_roc_auc_metric = torchmetrics.AUROC(pos_label=1)
        self.val_roc_auc_metric = torchmetrics.AUROC(pos_label=1)
    
    def forward(self, x):
        """Return sigmoid probabilities for a batch of feature rows."""
        # Dropout is applied functionally and is only active when
        # self.training is True.
        x = self.swish(self.input(x))
        x = F.dropout(x,p=self.dr,training=self.training)
        x = self.swish(self.hidden1(x))
        x = F.dropout(x,p=self.dr,training=self.training)
        x = self.swish(self.hidden2(x))
        x = F.dropout(x,p=self.dr,training=self.training)
#         x = self.swish(self.hidden3(x))
#         x = F.dropout(x,p=self.dr,training=self.training)
        x = self.sigmoid(self.output(x))
        return x
    
    # Training loop with loss and metric computing for each train data batch
    def training_step(self, batch, batch_idx):
        """Compute the BCE loss for one train batch and update the train AUROC."""
        X, y = batch
        # Drop the trailing size-1 dimension so predictions line up with y.
        y_hat = self(X).squeeze(1)
        # forward() already applies the sigmoid, hence plain BCE (not the
        # with-logits variant); targets are cast to float as BCE requires.
        loss = F.binary_cross_entropy(y_hat, y.float())
        self.train_roc_auc_metric(y_hat, y)
        self.log('loss', loss, prog_bar=True, on_epoch=True, logger=True)
        # NOTE(review): the metric object is logged here while
        # training_epoch_end also calls compute()/reset() manually; confirm
        # the two do not interfere with the installed Lightning/torchmetrics.
        self.log('rocauc', self.train_roc_auc_metric, prog_bar=True, on_epoch=False, logger=True)
        return {'loss': loss,}

    # Uses batch loss and metric values to compute and print
    # overall train data loss and metric 
    def training_epoch_end(self, outputs):
        """Log the train ROC AUC accumulated over the epoch, then reset the metric."""
        train_roc_auc = self.train_roc_auc_metric.compute()
        self.log('train_roc_auc', train_roc_auc, prog_bar=True, on_epoch=True, on_step=False, logger=True)
#         print(f"Epoch {self.current_epoch} train_roc_auc: {train_roc_auc:.4f}")
        self.train_roc_auc_metric.reset()


    # Computes loss and metric score for each valid data batch
    def validation_step(self, batch, batch_idx):
        """Compute the BCE loss for one validation batch and update the validation AUROC."""
        X, y = batch
        y_hat = self(X).squeeze(1)
        self.val_roc_auc_metric(y_hat, y)
        val_loss = F.binary_cross_entropy(y_hat, y.float())
        # 'val_loss' is the quantity monitored by the LR scheduler configured below.
        self.log('val_loss',val_loss, prog_bar=True, on_epoch=True, logger=True)
        return {'val_loss': val_loss}

    # Uses batch loss and metric values to compute and print
    # overall valid data loss and metric
    def validation_epoch_end(self, outputs):
        """Log the validation ROC AUC accumulated over the epoch, then reset the metric."""
        val_roc_auc = self.val_roc_auc_metric.compute()
        self.log('val_roc_auc', val_roc_auc, prog_bar=True, on_epoch=True, on_step=False, logger=True)
#         print(f"Epoch {self.current_epoch} valid_roc_auc: {val_roc_auc:.4f}")
        self.val_roc_auc_metric.reset()
        return {'val_roc_auc': val_roc_auc}


#     def validation_epoch_end(self, outputs):
#         val_roc_auc = self.roc_auc_metric.aggregate()
#         self.roc_auc_metric.reset()
#         self.log('val_roc_auc', val_roc_auc, logger=True, prog_bar=True)
#         return {'val_roc_auc': val_roc_auc}
        
    def predict_step(self, X, batch_idx, dataloader_idx = None):
        """Return probabilities for a prediction batch.

        Only the first element of the batch is used, so ``X`` is expected to
        be an indexable batch whose item 0 holds the features.
        """
        return self(X[0])    
    
    # Setting optimizer and learning rate scheduler parameters if any
    def configure_optimizers(self):
        """Return AdamW plus a ReduceLROnPlateau scheduler monitored on 'val_loss'."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3, eps=1e-8, weight_decay=1e-2, amsgrad=False)
#         optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        #lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9, last_epoch=-1, verbose=False)
        # Multiply the learning rate by 0.3 after 12 scheduler steps without
        # improvement of the monitored value, never going below 1e-5.
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3,
                                                                  patience=12, min_lr=1e-05, eps=1e-08,
                                                                  verbose=False)
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler, "monitor": "val_loss"}

In [None]:
# A custom callback to log required parameters like metrics and learning rate
# since TensorBoard is blocked on Kaggle.
class ParamsTracker(pl.callbacks.Callback):
    """Collect per-epoch learning rate, loss and ROC AUC values in plain lists.

    Attributes (one entry appended per corresponding epoch event):
        train_loss, train_roc_auc: read from ``trainer.logged_metrics`` at the
            end of each train epoch (keys "loss_epoch" and "train_roc_auc").
        val_loss, val_roc_auc: read from ``trainer.logged_metrics`` at the end
            of each validation epoch (keys "val_loss" and "val_roc_auc").
        lr_epoch_start: learning rate of the first optimizer's first
            parameter group at the start of each train epoch.

    Args:
        verbose: when truthy, a one-line summary is printed after every
            train epoch.
    """

    def __init__(self, verbose=True):
        self.verbose = verbose
        # Defining empty lists for metric values
        self.train_loss = []
        self.train_roc_auc = []
        self.val_loss = []
        self.val_roc_auc = []
        self.lr_epoch_start = []

    @staticmethod
    def _latest(values):
        """Return the last recorded value, or NaN when nothing was recorded yet."""
        return values[-1] if values else float("nan")

    # Getting learning rate from an optimizer params at epoch start
    def on_train_epoch_start(self, trainer, module):
        current_learning_rate = trainer.optimizers[0].state_dict()["param_groups"][0]["lr"]
        self.lr_epoch_start.append(current_learning_rate)

    # Getting the validation metrics logged by the module
    def on_validation_epoch_end(self, trainer, module):
        metrics_logs = trainer.logged_metrics
        self.val_loss.append(metrics_logs["val_loss"].item())
        self.val_roc_auc.append(metrics_logs["val_roc_auc"].item())

    # Getting last saved metrics from a trainer object and appending
    # the required values to the corresponding lists
    def on_train_epoch_end(self, trainer, module):
        metrics_logs = trainer.logged_metrics
        self.train_loss.append(metrics_logs["loss_epoch"].item())
        self.train_roc_auc.append(metrics_logs["train_roc_auc"].item())

        # Print all metrics at the end of the current epoch if verbose is set.
        # It is done here because the on_train_epoch_end event happens after
        # the on_validation_epoch_end event. Values are read through _latest()
        # so that an epoch without a preceding validation run (empty
        # validation lists) prints "nan" instead of raising IndexError.
        if self.verbose:
            print(f"Epoch {module.current_epoch} start learning rate: {self._latest(self.lr_epoch_start):.6f}, "
                  f"train_loss: {self._latest(self.train_loss):.4f}, "
                  f"train_roc_auc: {self._latest(self.train_roc_auc):.4f}, "
                  f"val_loss: {self._latest(self.val_loss):.4f}, "
                  f"val_roc_auc: {self._latest(self.val_roc_auc):.4f}")


In [None]:
def train_ann(train_ds, valid_ds, Model=Model, input_shape=X_nn.shape[1]):
    """Train a fresh network and return where its best checkpoint was saved.

    Args:
        train_ds: DataLoader with the training batches.
        valid_ds: DataLoader with the validation batches.
        Model: LightningModule class to instantiate as ``Model(input_shape)``.
        input_shape: number of input features. The default is evaluated once,
            when this function is defined.

    Returns:
        ``(best_model_path, params_tracker_callback)``: the path of the
        checkpoint with the lowest ``val_loss`` and the ``ParamsTracker``
        holding the per-epoch metric history.
    """
    model = Model(input_shape)
    # Weights initialization using custom function
    model.apply(initialize_weights)

    # A callback to save the best model (lowest validation loss, weights only)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath="models",
        filename='model_{val_loss:.4}',
        monitor='val_loss',
        mode='min',
        save_weights_only=True)

    # A callback to stop training when val_loss has not improved for 50 epochs
    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=0.00,
        patience=50,
        verbose=False,
        mode='min'
    )

    # Metrics and learning rate are tracked by the custom callback; the
    # LearningRateMonitor instance that used to be created here was never
    # passed to the trainer and has been removed.
    params_tracker_callback = ParamsTracker(verbose=True)

    # Setting training parameters
    trainer = pl.Trainer(
        fast_dev_run=False,
        max_epochs=200,
    #         gpus=1,
        precision=32,
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        num_sanity_val_steps=0,
        check_val_every_n_epoch=1,
        val_check_interval=1.0,
        callbacks=[checkpoint_callback, early_stop_callback, params_tracker_callback],
     )

    # Training
    trainer.fit(model, train_ds, valid_ds)

    # The trained model object itself is not returned: callers reload the
    # best weights from the checkpoint path.
    best_model_path = checkpoint_callback.best_model_path

    return best_model_path, params_tracker_callback

In [None]:
%%time
# Fold splitting parameters
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Two zero-filled arrays for out-of-fold and test predictions
nn_oof_preds = np.zeros((X_nn.shape[0],))
nn_test_preds = np.zeros((X_test_nn.shape[0],))
total_mean_auc = 0

# Train one network per fold and collect its predictions
for num, (train_idx, valid_idx) in enumerate(skf.split(X_nn, y)):
    print(f"\n\n===Training with fold {num}")
    # skf.split yields positional indices, so rows are selected with .iloc
    X_train, X_valid = X_nn.iloc[train_idx], X_nn.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Preparing datasets
    train_ds, valid_ds = prepare_datasets(X_train, X_valid, y_train, y_valid)

    # Training model
    best_model_path, tracked_values = train_ann(train_ds, valid_ds, Model, X_nn.shape[1])

    # Loading weights of the best (lowest val_loss) checkpoint into a fresh model
    model = Model(X_nn.shape[1])
    model.load_state_dict(torch.load(best_model_path)['state_dict'])
    model.eval()

    # Inference only: no_grad avoids building an autograd graph for the
    # validation and (much larger) test forward passes.
    with torch.no_grad():
        preds = model(torch.tensor(X_valid.to_numpy()).float()).numpy().flatten()
        test_preds = model(X_test_nn).numpy().flatten()

    # Calculating and printing this fold's model ROC AUC score
    fold_score = roc_auc_score(y_valid, preds)
    print(f"\n===Fold {num} valid data ROC AUC score is {fold_score}")

    # Saving preds in corresponding arrays
    nn_oof_preds[valid_idx] = preds
    # Accumulate the mean test prediction over folds. This previously used
    # "=", which discarded all earlier folds and left only the last fold's
    # predictions divided by `splits`.
    nn_test_preds += test_preds / splits

    total_mean_auc += fold_score / splits

print(f"Average ROC AUC score of all models is {total_mean_auc}")

In [None]:
# del model, train_ds, valid_ds, X_nn, X_test_nn
# gc.collect()

## LightGBM

### LightGBM hyperparameters optimization

The hyperparameters used in this notebook were optimized using Optuna. The code used for that is shown below. It is commented out in order to save runtime, as the optimization has already been done.

In [None]:
# def train_model_optuna(trial, X_train, X_valid, y_train, y_valid):
#     """
#     A function to train a model using different hyperparamerters combinations provided by Optuna. 
#     Loss of validation data predictions is returned to estimate hyperparameters effectiveness.
#     """
    
        
#     #A set of hyperparameters to optimize by optuna
#     lgbm_params = {
#                     "objective": trial.suggest_categorical("objective", ['binary']),
#                     "boosting_type": trial.suggest_categorical("boosting_type", ['gbdt']),
#                     "num_leaves": trial.suggest_int("num_leaves", 2, 256),
#                     "max_depth": trial.suggest_int("max_depth", 1, 6),
# #                     "max_depth": trial.suggest_categorical("max_depth", [8]),
#                     "learning_rate": trial.suggest_float("learning_rate", 0.15, 1, step=0.01),
#                     "n_estimators": trial.suggest_categorical("n_estimators", [40000]),        
#                     "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 100.0, step=0.1),
#                     "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 100.0, step=0.1),
#                     "random_state": trial.suggest_categorical("random_state", [42]),
#                     "bagging_seed": trial.suggest_categorical("bagging_seed", [42]),
#                     "feature_fraction_seed": trial.suggest_categorical("feature_fraction_seed", [42]), 
#                     "n_jobs": trial.suggest_categorical("n_jobs", [4]), 
#                     "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
#                     "subsample_freq": trial.suggest_int("subsample_freq", 1, 7),
#                     "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
# #                     "colsample_bytree": trial.suggest_categorical("colsample_bytree", [1]),
# #                     "device_type": trial.suggest_categorical("device_type", ["GPU"]),
#                     'min_child_samples': trial.suggest_int('min_child_samples', 5, 300),
#                     'min_child_weight': trial.suggest_int('min_child_weight', 256, 512),
        
#                     }



#     # Model loading and training
#     model = LGBMClassifier(**lgbm_params)
#     model.fit(X_train, y_train,
#               eval_set=[(X_valid, y_valid)],
#               eval_metric="auc",
#               early_stopping_rounds=100,
#               verbose=False)
    
#     print(f"Number of boosting rounds: {model.best_iteration_}")
#     oof = model.predict_proba(X_valid)[:, 1]
    
#     return roc_auc_score(y_valid, oof)

In [None]:
# %%time
# # Splitting data into train and valid folds using target bins for stratification
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Setting optuna verbosity to show only warning messages
# # If the line is uncommeted each iteration results will be shown
# # optuna.logging.set_verbosity(optuna.logging.WARNING)

# time_limit = 3600 * 4

# study = optuna.create_study(direction='maximize')
# study.optimize(lambda trial: train_model_optuna(trial, X_train, X_valid,
#                                                     y_train, y_valid),
# #                n_trials = 2
#                timeout=time_limit
#               )

# # Showing optimization results
# print('Number of finished trials:', len(study.trials))
# print('Best trial parameters:', study.best_trial.params)
# print('Best score:', study.best_value)

In [None]:
# Stacking: the LinearSVC and NN predictions become two extra LightGBM
# features. Train rows get out-of-fold predictions, test rows get the
# fold-averaged test predictions.
for stacked_name, stacked_oof, stacked_test in (
    ("linear_preds", linear_oof_preds, linear_test_preds),
    ("nn_preds", nn_oof_preds, nn_test_preds),
):
    X[stacked_name] = stacked_oof
    X_test[stacked_name] = stacked_test

In [None]:
# LightGBM hyperparameters (values taken from the Optuna search above).
lgbm_params = dict(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=41,
    max_depth=1,
    learning_rate=0.15,
    n_estimators=40000,
    reg_alpha=17.6,
    reg_lambda=74.7,
    random_state=42,
    bagging_seed=42,
    feature_fraction_seed=42,
    n_jobs=4,
    subsample=0.86,
    subsample_freq=4,
    colsample_bytree=0.16,
    min_child_samples=151,
    min_child_weight=361,
)

In [None]:
%%time
# Setting up fold parameters
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Creating an array of zeros for storing "out of fold" predictions
oof_preds = np.zeros((X.shape[0],))
preds = 0
model_fi = 0
total_mean_auc = 0

# Train one LightGBM model per fold and collect its predictions
for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # skf.split yields positional indices, so rows are selected with .iloc;
    # .loc only gave the same rows because the index is a default RangeIndex.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMClassifier(**lgbm_params)
    model.fit(X_train, y_train,
              verbose=False,
              # These three parameters will stop training before a model starts overfitting
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              early_stopping_rounds=300,
              )

    # Accumulate the mean test prediction (each fold contributes 1/splits)
    preds += model.predict_proba(X_test)[:, 1] / splits

    # Accumulate the mean feature importances over folds
    model_fi += model.feature_importances_ / splits

    # Out-of-fold predictions: each fold model scores only the rows it has not
    # seen, so after the last fold the whole array holds unseen-data predictions.
    # They are used to evaluate hyperparameter performance only.
    oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]

    # Score of this fold's model
    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    print(f"Fold {num} ROC AUC: {fold_auc}")

    # Accumulate the mean score over all fold models
    total_mean_auc += fold_auc / splits

print(f"\nOverall ROC AUC: {total_mean_auc}")

# **LightGBM feature importances**

In [None]:
# Plotting frame: one row per LightGBM input column with its fold-averaged
# importance normalised to sum to 1, ordered from most to least important.
df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model_fi / model_fi.sum(),
}).sort_values("Importance", axis=0, ascending=False)

In [None]:
# Horizontal bar chart of the normalised feature importances.
fig, ax = plt.subplots(figsize=(13, 35))
feature_names, importances = df["Feature"], df["Importance"]
bars = ax.barh(feature_names, importances, height=0.4,
               color="mediumorchid", edgecolor="black")

# Titles and axis labels
ax.set_title("Feature importances", fontsize=30, pad=15)
ax.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax.set_ylabel("Feature name", fontsize=20, labelpad=15)

# Ticks and grid
ax.set_yticks(feature_names)
ax.set_yticklabels(feature_names, fontsize=13)
ax.tick_params(axis="x", labelsize=15)
ax.grid(axis="x")

# Mirror the x axis at the top so values stay readable on a tall figure
ax2 = ax.secondary_xaxis('top')
ax2.set_xlabel("Feature importance", fontsize=20, labelpad=13)
ax2.tick_params(axis="x", labelsize=15)
ax.margins(0.05, 0.01)

# Flip the y axis so the most important feature is at the top
ax.invert_yaxis()

# **Predictions submission**

In [None]:
# Submission file: test ids paired with the fold-averaged LightGBM probabilities.
predictions = pd.DataFrame({"id": test["id"], "target": preds})

predictions.to_csv('submission.csv', index=False, header=True)
predictions.head()