# Version

- V7:
    - Reload the best model (the last saved checkpoint) before predicting
    - Learning Rate Scheduler (CosineAnnealingLR)
- V6:
    - es_patience=20
    - lr_patience=7
    - epochs=125
- V5:
    - kaiming_uniform_ weights initializer
    - One extra Dense layer
- V4: 
    - Add Early Stopping: patience=5
    - Learning Rate Scheduler (ReduceLROnPlateau): patience=3
    - Adam optimizer: learning_rate=0.01
- V3: Swish activation
- V2: 
    - Epochs=30
    - n_folds=10
- V1: 
    - SGD optimizer
    - ReLU activation
    - Epochs=5
    - n_folds=5
    - Model: 3 Linear layers (128-64-1)

# References

- https://www.kaggle.com/mmellinger66/tps-nov-21-keras-tuner
- https://www.kaggle.com/lucamassaron/feature-selection-by-boruta-shap
- https://www.kaggle.com/hiro5299834/tps-nov-2021-pytorch-lightning
- https://www.kaggle.com/yasufuminakama/ventilator-pressure-lstm-starter

In [None]:
# ! pip install torchsummary

In [None]:
import os
import time
import math

import pandas as pd
import numpy as np
from pathlib import Path

import random
import feather

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
import torch.optim as optim
from torch.utils import data
import torch.nn.functional as F
# from torchsummary import summary

In [None]:
class Config:
    """Notebook-wide settings shared by the data, model and training cells."""

    # Run identity and reproducibility
    competition = "TPS_202111"
    seed = 42

    # Hardware: use the GPU when torch can see one, otherwise the CPU
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Cross-validation and training-loop sizes
    n_folds = 10
    epochs = 125
    batch_size = 1024

    # Optimisation: initial learning rate and the two patience values
    lr = 0.01
    lr_patience = 7
    es_patience = 20

In [None]:
# Mirrors PyTorch Lightning's seed_everything --> https://pytorch-lightning.readthedocs.io/en/latest/_modules/pytorch_lightning/utilities/seed.html#seed_everything
def seed_everything(seed=Config.seed):
    """Seed Python, NumPy and torch (CPU and all CUDA devices) with ``seed``.

    The seed is also exported through the ``PL_GLOBAL_SEED`` environment
    variable, as a string.

    Args:
        seed: Integer seed; defaults to ``Config.seed``.
    """
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
# Directory holding the competition's input files (sample_submission.csv is read from here).
data_dir = Path('../input/tabular-playground-series-nov-2021')

In [None]:
%%time
# train_df = pd.read_csv(data_dir / "train.csv", 
# #                        nrows=10000
#                       )
# test_df = pd.read_csv(data_dir / "test.csv",
# #                      nrows=1000
#                      )

# Load the train/test tables from the feather files in ../input/tpsnov21;
# the sample submission is read from the competition CSV under data_dir.
train_df = feather.read_dataframe('../input/tpsnov21/train.feather')
test_df = feather.read_dataframe('../input/tpsnov21/test.feather')
sample_submission = pd.read_csv(data_dir / "sample_submission.csv")

# Quick sanity check of the loaded shapes.
print(f"train data: Rows={train_df.shape[0]}, Columns={train_df.shape[1]}")
print(f"test data : Rows={test_df.shape[0]}, Columns={test_df.shape[1]}")

In [None]:
# Alternative that keeps every column except the identifier and the target:
# features = [col for col in train_df.columns if col not in ('id', 'target')]
# Hand-picked subset of feature columns used for scaling and as model inputs.
# NOTE(review): presumably the output of the Boruta-SHAP feature selection listed
# under References — confirm where this list came from.
features = ['f1', 'f10', 'f11', 'f14', 'f15', 'f16', 'f17', 'f2', 'f20', 'f21', 'f22', 'f24', 'f25', 'f26', 'f27', 'f28', 'f3', 'f30', 'f31', 'f32', 'f33', 'f34', 'f36', 'f37', 'f4', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f5', 'f50', 'f51', 'f53', 'f54', 'f55', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f64', 'f66', 'f67', 'f70', 'f71', 'f76', 'f77', 'f8', 'f80', 'f81', 'f82', 'f83', 'f87', 'f89', 'f9', 'f90', 'f91', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98']

In [None]:
class TPSDataset(data.Dataset):
    """Map-style dataset over a tabular feature matrix and optional targets.

    Each item is a dict with an ``'X'`` float tensor holding one feature row
    and, when targets were supplied, a ``'y'`` float tensor holding that
    row's target.

    Args:
        X: Feature rows; a pandas DataFrame or anything ``np.asarray`` accepts.
        y: Optional targets aligned with ``X`` (pandas Series or array-like).
            When omitted, items carry only the ``'X'`` key.
    """

    def __init__(self, X, y=None):
        # Zero-argument super(): ``super(TPSDataset)`` creates an unbound super
        # object, so the parent initialiser was never actually invoked.
        super().__init__()
        self.X = X
        self.y = y
        # Convert to ndarray once here instead of calling ``.values`` on every
        # ``__getitem__``; this also lets plain arrays/lists be passed in.
        self._X_values = np.asarray(X)
        self._y_values = None if y is None else np.asarray(y)

    def __len__(self):
        """Return the number of rows in ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the item at positional index ``idx`` as a dict of float tensors."""
        item = {'X': torch.tensor(self._X_values[idx], dtype=torch.float)}
        if self._y_values is not None:
            item['y'] = torch.tensor(self._y_values[idx], dtype=torch.float)
        return item


In [None]:
# Standardise the selected feature columns: the scaler is fitted on the training
# frame and the same fitted transform is then applied to the test frame.
# NOTE(review): the fit uses all training rows before the cross-validation split,
# so validation folds contribute to the scaling statistics — confirm this is acceptable.
scaler = StandardScaler()

train_df[features] = scaler.fit_transform(train_df[features])
test_df[features] = scaler.transform(test_df[features])

In [None]:
# Training target column.
y_train = train_df.target

# Feature frames: drop the identifier column (and, for train, the target column).
X_test = test_df.drop(columns=["id"], axis=1)
X_train = train_df.drop(columns=["id", "target"], axis=1)

In [None]:
# Keep only the columns named in `features` (in that order), dropping the rest.
X_train = X_train[features]
X_test = X_test[features]

In [None]:
# Wrap the frames as torch datasets; the test dataset has no targets, so its
# items only carry the 'X' key.
train_dataset = TPSDataset(X_train, y_train)
test_dataset = TPSDataset(X_test)

In [None]:
# Test loader keeps the dataset order (no shuffling) so predictions line up with
# the submission rows; the batch size comes from Config instead of a second
# hard-coded copy of the same value.
test_loader = data.DataLoader(test_dataset, batch_size=Config.batch_size)

## PyTorch Model

In [None]:
def initialize_weights(model):
    """Kaiming-uniform init for Linear/Conv2d weights, with biases set to zero.

    Written for ``nn.Module.apply``, which calls it on every submodule; modules
    that are neither ``nn.Linear`` nor ``nn.Conv2d`` are left untouched.

    Args:
        model: The (sub)module to initialise in place.
    """
    if not isinstance(model, (nn.Linear, nn.Conv2d)):
        return
    # NOTE: the gain is computed for ReLU whatever activation the network uses.
    nn.init.kaiming_uniform_(model.weight.data, nonlinearity="relu")
    # Layers built with ``bias=False`` have ``bias is None``; guard both layer
    # types the same way so such a Linear layer no longer raises.
    if model.bias is not None:
        nn.init.constant_(model.bias.data, 0)

In [None]:
class Model(nn.Module):
    """Feed-forward binary classifier for tabular rows.

    Three ``Linear -> activation -> BatchNorm1d`` stages (128, 64 and 32 units)
    feed a single-unit linear layer whose output goes through a sigmoid.

    Args:
        in_features: Number of input columns per row.
        activation: Callable applied after each hidden linear layer.
    """

    def __init__(self, in_features, activation=F.relu):
        super().__init__()
        # Create fc1, bn1, fc2, bn2, fc3, bn3 in that order so module
        # registration and random initialisation happen in a fixed sequence.
        widths = (in_features, 128, 64, 32)
        for stage, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{stage}", nn.Linear(n_in, n_out))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(n_out))
        self.fc4 = nn.Linear(32, 1)
        self.flatten = nn.Flatten()
        self.activation = activation

    def forward(self, x):
        """Return one probability per input row as a 1-D tensor."""
        hidden = self.flatten(x)
        stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in stages:
            hidden = norm(self.activation(linear(hidden)))
        prob = torch.sigmoid(self.fc4(hidden))
        # Drop the trailing unit dimension: (batch, 1) -> (batch,)
        return prob.squeeze(dim=1)

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the validation loss and the model.
    On every improvement the model's ``state_dict`` is saved to ``path``; after
    ``patience`` consecutive calls without improvement ``early_stop`` is set.
    """

    def __init__(
        self, patience=7, verbose=False, delta=0, path="checkpoint.pt", trace_func=print
    ):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): File the best model's ``state_dict`` is saved to.
                            Default: "checkpoint.pt"
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # ``np.inf`` rather than the ``np.Inf`` alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss; checkpoint or count towards stopping."""
        # Work with a score where higher is better.
        score = -val_loss

        if self.best_score is None:
            # First call: always treated as an improvement.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by at least ``delta``.
            self.counter += 1
            self.trace_func(
                f"EarlyStopping counter: {self.counter}/{self.patience}"
            )
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0
            self.save_checkpoint(val_loss, model)

    def save_checkpoint(self, val_loss, model):
        """Saves model when validation loss decrease."""
        if self.verbose:
            self.trace_func(
                f"Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ..."
            )
        # Only the weights are written; load them with ``model.load_state_dict``.
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
# Build one instance just to report the model size; each cross-validation fold
# below constructs and initialises its own fresh model.
model = Model(in_features=len(features), activation=F.hardswish).to(Config.device)
model.apply(initialize_weights)

# Count trainable parameters only (those with requires_grad set).
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {num_params}")

# summary(model, (len(features)), batch_size=-1, device="cpu")

## Training with Cross Validation

In [None]:
%%time
# Stratified K-fold training: for each fold, train a fresh model with early
# stopping, reload its best checkpoint, then predict the validation rows
# (out-of-fold) and the full test set.
seed_everything()

final_test_predictions = []   # one list of test probabilities per fold
final_valid_predictions = {}  # training-row position -> out-of-fold probability
scores = []                   # one validation AUC per fold
histories = []

kf = StratifiedKFold(n_splits=Config.n_folds, random_state=Config.seed, shuffle=True)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X = X_train, y = y_train)):
    print(10*"=", f"Fold={fold+1}/{Config.n_folds}", 10*"=")
    start_time = time.time()
    checkpoint_path = f"./model_checkpoint_{fold}.pt"

    train_subset = data.Subset(train_dataset, train_idx)
    valid_subset = data.Subset(train_dataset, valid_idx)
    train_loader = data.DataLoader(train_subset, batch_size = Config.batch_size, shuffle=True)
    # Not shuffled, so validation predictions come out in valid_idx order.
    valid_loader = data.DataLoader(valid_subset, batch_size = Config.batch_size)

    model = Model(in_features=len(features), activation=F.hardswish).to(Config.device)
    model.apply(initialize_weights)
    criterion = nn.BCELoss()
#     optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    optimizer = optim.Adam(model.parameters(), Config.lr)

    # initialize the early_stopping object
    early_stopping = EarlyStopping(
        patience=Config.es_patience, verbose=True, path=checkpoint_path
    )

    # initialize the learning rate scheduler
#     lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#         optimizer, patience=Config.lr_patience, verbose=True
#     )
    # The scheduler is stepped once per epoch, so T_max must be expressed in
    # epochs (not batches per epoch) for the cosine to anneal over the run.
    # The deprecated ``verbose`` flag is replaced by an explicit print below.
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=Config.epochs
    )

    for epoch in range(Config.epochs):
        # Validation switches the model to eval mode at the end of every epoch;
        # switch back so BatchNorm trains with batch statistics again.
        model.train()
        epoch_loss = 0.0
        for batch in train_loader:
            X, y = batch["X"].to(Config.device), batch["y"].to(Config.device)

            # zero the parameter gradients
            optimizer.zero_grad()

            pred = model(X)
            loss = criterion(pred, y)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # print statistics (sum of the per-batch losses)
        print('Epoch %d/%d -> loss: %.4f' % (epoch + 1, Config.epochs, epoch_loss))

        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for batch in valid_loader:
                X, y = batch["X"].to(Config.device), batch["y"].to(Config.device)
                val_pred = model(X)
                loss = criterion(val_pred, y)
                running_val_loss += loss.item()

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(running_val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping...")
            break

#         lr_scheduler.step(running_val_loss) # use with ReduceLROnPlateau scheduler
        lr_scheduler.step() # use with CosineAnnealingLR scheduler
        print(f"Adjusting learning rate to {lr_scheduler.get_last_lr()[0]:.4e}.")

    # load the last checkpoint with the best model
    model.load_state_dict(torch.load(checkpoint_path, map_location=Config.device))

    # Predictions for OOF
    print("--- Predicting OOF ---")
    valid_preds = []
    valid_targets = []
    model.eval()
    with torch.no_grad():
        for batch in valid_loader:
            X, y = batch["X"].to(Config.device), batch["y"]
            pred = model(X)
            valid_preds.extend(pred.cpu().numpy())
            valid_targets.extend(y.numpy())

    final_valid_predictions.update(dict(zip(valid_idx, valid_preds)))

    # One AUC per fold over the whole validation split, accumulated across
    # folds (not a per-batch AUC in a list that is reset every fold).
    auc = roc_auc_score(valid_targets, valid_preds)
    scores.append(auc)

    # Predictions for Test Data
    print("--- Predicting Test Data ---")
    test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            X = batch["X"].to(Config.device)
            pred = model(X)
            test_preds.extend(pred.cpu().numpy())

    final_test_predictions.append(test_preds)

    run_time = time.time() - start_time
    print(f"Fold={fold+1}, auc: {auc:.8f}, Run Time: {run_time:.2f}")

In [None]:
# Summarise the collected AUC scores: mean, standard deviation, and a
# pessimistic "corrected" figure (mean minus one standard deviation).
print(f"Scores -> corrected: {np.mean(scores)-np.std(scores):.8f}, mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}")

In [None]:
# Ensemble the folds: stack each fold's test predictions as a column and average
# across folds to get one probability per test row.
sample_submission['target'] = np.mean(np.column_stack(final_test_predictions), axis=1)
# ``index`` is a boolean option; pass False explicitly instead of relying on
# None being falsy. The same frame is written under both file names.
sample_submission.to_csv("test_pred_2.csv", index=False)
sample_submission.to_csv("submission.csv", index=False)
sample_submission