# Update
* Added uniform weight initialization.
* Added per-fold training/validation loss plots.
* Added early stopping.

# Imports

In [None]:
import gc
import numpy as np
import pandas as pd
import math
import torch
import copy
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
from datetime import datetime
import torch.nn.functional as F

In [None]:
# Load the competition's training set, test set and sample submission, in that order.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-nov-2021/train.csv',
        '/kaggle/input/tabular-playground-series-nov-2021/test.csv',
        '/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Mini-batch size for the training/validation loaders and the epoch budget per fold.
BATCH_SIZE, EPOCHS = 512, 50

In [None]:
# Model inputs are every training column except the row id and the label.
FEATURES = [col for col in train.columns if col != 'id' and col != 'target']
# Label column, kept separately from the feature frames.
y = train['target']
# Feature-only views of the two splits (the test file has no 'target' column to drop).
train_df = train.drop(['id', 'target'], axis=1)
test_df = test.drop('id', axis=1)

# Target Distribution

In [None]:
def target_dist():
    """Draw a count plot of the global target series ``y``."""
    # Named ``ax`` rather than ``plt`` so the pyplot alias is not shadowed locally.
    ax = sns.countplot(x=y, palette='Set2')
    ax.set_title('Target distribution')
    sns.despine()


target_dist()

In [None]:
# training data
# Report numeric vs. non-numeric column counts: training data first, then testing data.
for split_df in (train_df, test_df):
    numeric_count = len(split_df._get_numeric_data().columns)
    total_count = len(split_df.columns)
    print(f'Numerical attributes: {numeric_count}')
    print(f'Categorical attributes: {abs(total_count - numeric_count)}')

# Preprocessing

In [None]:
# Standardise every feature using statistics learned from the training frame only,
# then apply the same transform to both frames.
# NOTE(review): the scaler is fit on all training rows, including those that later
# serve as validation folds.
scaler_standard = StandardScaler()
scaler_standard.fit(train_df[FEATURES])
train_df[FEATURES] = scaler_standard.transform(train_df[FEATURES])
test_df[FEATURES] = scaler_standard.transform(test_df[FEATURES])

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over tabular features with optional targets.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature rows; ``len(X)`` is the number of samples.
    y : pandas.Series or array-like, optional
        One target per row of ``X``. When omitted, items are feature tensors
        only (used for unlabeled test data).

    Notes
    -----
    The underlying NumPy arrays are taken once at construction instead of
    calling ``.values`` on every item access. Rows are addressed by position,
    not by pandas index label. Reassigning ``self.X`` / ``self.y`` afterwards
    does not change what the dataset serves.
    """

    def __init__(self, X, y=None):
        # Kept for backward compatibility with code that reads these attributes.
        self.X = X
        self.y = y
        # Materialise the arrays once; __getitem__ runs once per sample per epoch.
        self._features = np.asarray(X)
        self._targets = None if y is None else np.asarray(y)

    def __len__(self):
        return len(self._features)

    def __getitem__(self, idx):
        """Return ``features`` or ``(features, target)`` as float32 tensors."""
        features = torch.tensor(self._features[idx], dtype=torch.float)
        if self._targets is None:
            return features
        return features, torch.tensor(self._targets[idx], dtype=torch.float)

# Weights Init

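* A uniform distribution assigns equal probability to every value in a given range.
* A common rule of thumb is to initialize weights close to zero without making them too small. Here each `Linear` layer's weights are drawn from U(-1/sqrt(n), 1/sqrt(n)), where n is the number of inputs to the layer, and its biases are set to zero.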

In [None]:
# takes in a module and applies the specified weight initialization
def weights_init_uniform_rule(m):
    """Initialise a linear layer with the uniform 1/sqrt(n) rule.

    Intended for ``nn.Module.apply``: it is called on every sub-module and
    only acts on ``nn.Linear`` instances (including subclasses). Weights are
    drawn from ``U(-1/sqrt(n), 1/sqrt(n))`` where ``n`` is ``in_features``;
    the bias, when the layer has one, is set to zero.

    Parameters
    ----------
    m : torch.nn.Module
        Module to (possibly) re-initialise in place.
    """
    # isinstance instead of matching 'Linear' in the class name: the substring
    # test also matched e.g. nn.Bilinear, which has no ``in_features``.
    if not isinstance(m, nn.Linear):
        return
    bound = float(1.0 / np.sqrt(m.in_features))
    nn.init.uniform_(m.weight, -bound, bound)
    # Layers built with bias=False have ``bias is None``.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

# Model

In [None]:
def fc_block(in_f, out_f, dropout=0.3):
    """Build one hidden block: Linear -> SiLU -> Dropout.

    Parameters
    ----------
    in_f : int
        Number of input features of the linear layer.
    out_f : int
        Number of output features of the linear layer.
    dropout : float, optional
        Dropout probability applied after the activation (default 0.3).

    Returns
    -------
    torch.nn.Sequential
        The three layers in order, so the linear layer is at index 0.
    """
    return nn.Sequential(
        nn.Linear(in_f, out_f),
        nn.SiLU(),
        nn.Dropout(dropout),
    )


class Net(nn.Module):
    """Feed-forward binary classifier.

    The input is flattened, passed through three ``fc_block`` stages of
    192, 96 and 48 units, and mapped by a final linear layer plus sigmoid
    to one value in (0, 1) per sample.

    Parameters
    ----------
    n : int
        Number of input features per sample.
    """

    def __init__(self, n):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = fc_block(n, 192)
        self.fc2 = fc_block(192, 96)
        self.fc3 = fc_block(96, 48)
        self.out = nn.Sequential(nn.Linear(48, 1), nn.Sigmoid())

    def forward(self, x):
        """Return the sigmoid output of shape (batch, 1) for input ``x``."""
        # Stages are applied strictly in registration order.
        for stage in (self.flatten, self.fc1, self.fc2, self.fc3, self.out):
            x = stage(x)
        return x

# Build the network with one input per feature column and place it on the chosen device.
net_model = Net(n=len(FEATURES))
net_model.to(device)  # nn.Module.to moves the parameters in place
# Re-initialise every linear layer after the move; apply() visits all sub-modules.
net_model.apply(weights_init_uniform_rule)

# Training

In [None]:
def _safe_roc_auc(targets, scores):
    """Return the ROC AUC, or NaN when ``targets`` holds fewer than two classes."""
    if np.unique(targets).size < 2:
        return float('nan')
    return roc_auc_score(targets, scores)


def _run_epoch(model, loader, training):
    """Run one pass over ``loader`` and return ``(mean batch loss, ROC AUC)``.

    Uses the module-level ``device``, ``criterion`` and (when ``training``)
    ``optimizer``. The AUC is computed once over all samples of the pass.
    """
    model.train(training)
    batch_losses, seen_targets, seen_outputs = [], [], []
    # Autograd is only needed while training; evaluation skips graph construction.
    with torch.set_grad_enabled(training):
        for inputs, targets in loader:
            inputs = inputs.to(device)
            # The model emits (batch, 1); give the targets the same shape for the loss.
            targets = targets.to(device).unsqueeze(1)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            if training:
                loss.backward()
                optimizer.step()
            batch_losses.append(loss.item())
            seen_targets.append(targets.detach().cpu().numpy())
            seen_outputs.append(outputs.detach().cpu().numpy())
    if not batch_losses:
        return float('nan'), float('nan')
    auc = _safe_roc_auc(np.concatenate(seen_targets).ravel(),
                        np.concatenate(seen_outputs).ravel())
    return np.mean(batch_losses), auc


def batch_gd(model, train_loader, test_loader, epochs, val_score_best, patience=10):
    """Train ``model`` with mini-batch gradient descent and early stopping.

    Relies on the module-level ``device``, ``criterion``, ``optimizer`` and
    ``lr_scheduler``. After every epoch the scheduler is stepped with the
    validation loss, and the weights are written to ``best_weights.pth``
    whenever that loss drops below ``val_score_best``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; must already be on ``device``.
    train_loader, test_loader : iterable
        Yield ``(inputs, targets)`` batches for training and validation.
    epochs : int
        Maximum number of epochs.
    val_score_best : float
        Validation loss to beat before the first checkpoint is written
        (pass ``math.inf`` to always save after the first epoch).
    patience : int, optional
        Stop after this many consecutive epochs without a new best
        validation loss (default 10). Expected to be positive.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_losses, test_losses)`` with one entry per epoch actually
        run; the arrays are shorter than ``epochs`` when training stops early.
    """
    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    epochs_no_improve = 0
    epochs_run = 0
    for it in range(epochs):
        t0 = datetime.now()
        train_loss, train_roc_auc = _run_epoch(model, train_loader, training=True)
        test_loss, test_roc_auc = _run_epoch(model, test_loader, training=False)
        epochs_run = it + 1

        lr_scheduler.step(test_loss)
        print('learning_rate: {}'.format(optimizer.state_dict()['param_groups'][0]['lr']))

        # Save losses
        train_losses[it] = train_loss
        test_losses[it] = test_loss

        # saving best weights
        if test_loss < val_score_best:
            epochs_no_improve = 0
            val_score_best = test_loss
            print('--- saving best weights ---')
            torch.save(model.state_dict(), 'best_weights.pth')
        else:
            epochs_no_improve += 1

        dt = datetime.now() - t0
        print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Train ROC: {train_roc_auc:.4f}, '
              f'Test Loss: {test_loss:.4f}, Test ROC: {test_roc_auc:.4f}, '
              f'Improvement: {epochs_no_improve}, Duration: {dt}')

        if epochs_no_improve >= patience:
            print('Early Stopping..\n')
            break
    # Drop the untouched zero slots so averages and plots only see real epochs.
    return train_losses[:epochs_run], test_losses[:epochs_run]

In [None]:
# garbage collection
gc.collect()

# The unlabeled test loader is built once and scored by every fold's model.
test_dataset = CustomDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Per-fold loss histories, keyed by fold index.
folds_train_losses = {}
folds_test_losses = {}
# One 1-D array of test-set predictions per fold.
test_predictions = []

skf = StratifiedKFold(n_splits=5, random_state=47, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, y)):
    X_train, y_train = train_df.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = train_df.iloc[val_idx], y.iloc[val_idx]

    train_dataset = CustomDataset(X=X_train, y=y_train)
    val_dataset = CustomDataset(X=X_val, y=y_val)
    # Shuffle training batches every epoch; validation order does not matter.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Start every fold from a freshly initialised network. Reusing one model
    # would carry weights already trained on this fold's validation rows.
    net_model = Net(len(FEATURES)).to(device)
    net_model.apply(weights_init_uniform_rule)

    # batch_gd reads criterion / optimizer / lr_scheduler as module-level names.
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(net_model.parameters(), lr=0.001)
    # `verbose` is deprecated in recent PyTorch; batch_gd prints the learning rate each epoch.
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.1, patience=2, min_lr=1e-7)

    # training and validation
    val_score_best = math.inf
    train_losses, test_losses = batch_gd(net_model, train_loader, val_loader, EPOCHS, val_score_best)
    folds_train_losses[fold] = train_losses
    folds_test_losses[fold] = test_losses

    # Predict with the checkpoint batch_gd saved at the lowest validation loss,
    # not with the weights from the last epoch before early stopping.
    print('--- loading best weights ---')
    net_model.load_state_dict(torch.load('best_weights.pth', map_location=device))

    # prediction on test data
    fold_preds = []
    net_model.eval()
    with torch.no_grad():
        for batch_tensor in test_loader:
            preds = net_model(batch_tensor.to(device))
            # Flatten (batch, 1) -> (batch,) so each fold yields one 1-D vector.
            fold_preds.append(preds.cpu().numpy().ravel())
    test_predictions.append(np.concatenate(fold_preds))

# Losses and Plots

In [None]:
def average_loss_per_fold(train_losses=None, test_losses=None):
    """Print the mean training and validation loss of every fold.

    Parameters
    ----------
    train_losses, test_losses : dict, optional
        Map fold id -> 1-D sequence of per-epoch losses. They default to the
        module-level ``folds_train_losses`` and ``folds_test_losses``.
        ``test_losses`` is looked up by the fold ids of ``train_losses``.

    Notes
    -----
    Trailing zeros are dropped before averaging. NOTE(review): this assumes
    they are unfilled slots from an early-stopped fold, not real zero losses.
    """
    if train_losses is None:
        train_losses = folds_train_losses
    if test_losses is None:
        test_losses = folds_test_losses
    for fold_id, fold_train in train_losses.items():
        # Pair the two histories by key rather than by dict insertion order.
        fold_test = test_losses[fold_id]
        mean_train = np.mean(np.trim_zeros(np.asarray(fold_train), 'b'))
        mean_test = np.mean(np.trim_zeros(np.asarray(fold_test), 'b'))
        print(f'Fold: {fold_id} \t Average Train Loss: {mean_train} \t Average Test Loss: {mean_test}')
    
# Print the per-fold loss averages collected in folds_train_losses / folds_test_losses.
average_loss_per_fold()

In [None]:
# One figure per fold with the training and validation loss curves.
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
for fold in sorted(folds_train_losses):
    # Trailing zeros are dropped so a curve does not fall to 0 after its last epoch.
    # NOTE(review): assumes such zeros are unfilled slots from early stopping.
    train_curve = np.trim_zeros(np.asarray(folds_train_losses[fold]), 'b')
    val_curve = np.trim_zeros(np.asarray(folds_test_losses[fold]), 'b')

    fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=(8, 4))
    ax.set_title(f'Fold: {fold+1}')
    # Plot each curve separately so every line gets its own legend entry.
    ax.plot(train_curve, label='train_loss')
    ax.plot(val_curve, label='val_loss')
    # Same tick range on every figure (0..EPOCHS) keeps the folds comparable.
    ax.set_xticks(np.arange(0, EPOCHS + 1, 5))
    ax.legend(fontsize=13)

# Submission

In [None]:
# Side-by-side view of the per-fold test predictions: one row per test sample,
# one numeric column per fold. column_stack avoids a Python loop over every row.
tp_df = pd.DataFrame(
    data=np.column_stack(test_predictions),
    columns=[f'fold_{k}' for k in range(1, len(test_predictions) + 1)],
)
tp_df.head(4)

In [None]:
# Ensemble by averaging the folds' predictions for each test row.
# column_stack builds a new (n_samples, n_folds) array, so no defensive copy is needed.
sub['target'] = np.column_stack(test_predictions).mean(axis=1)
# index=False keeps the DataFrame index out of the submission file.
sub.to_csv("submission.csv", index=False)
sub.head(5)