## TPS-06 Pytorch RESIDUAL connection for tabular data

This notebook implements the concept of residual (skip) connections in pure PyTorch.

This notebook is still under development: the network architecture requires further investigation. I am still trying to answer one question - why does the same (?) architecture developed in Keras (R/Python) perform better (in terms of LN)? If you can point out where the differences are, I will be more than happy.

<div class="alert alert-info">
  <strong>My other Pytorch notebooks:</strong>
    <ul>
        <li><a href ="https://www.kaggle.com/remekkinas/pytorch-skorch-residual-hyperparameter">Pytorch (skorch) - RESIDUAL + hyperparameter</a></li>
        <li><a href = "https://www.kaggle.com/remekkinas/skorch-tutorial-simple-pytorch-nn-with-scikit">SKORCH tutorial - simple PyTorch NN with scikit</a></li>
        <li><a href ="https://www.kaggle.com/remekkinas/tps-5-pytorch-nn-for-tabular-step-by-step">[TPS-5] Pytorch NN for tabular - step by step</a></li>
    </ul>
</div>

### IDEAS BEHIND NOTEBOOK
1. Implementation of skip connections in PyTorch - this is still under development (looking for a better architecture)
2. Training 100 NN networks (the best model from each iteration is saved and then used for prediction), varied by:
    - different Random state 
    - different Architecture (I took TOP5 network architectures from Skorch GridSearchCV hyperparameter optimization - https://www.kaggle.com/remekkinas/pytorch-skorch-residual-hyperparameter)
    - folds variability

In [None]:
# Install torchviz quietly; the notebook imports make_dot from it further down.
!pip install torchviz -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import time


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold


import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils import weight_norm
from torch.optim.lr_scheduler import ReduceLROnPlateau

from torchviz import make_dot, make_dot_from_trace
from tqdm.notebook import tqdm

import warnings
# Silence every warning category for the rest of the notebook.
warnings.filterwarnings(action="ignore")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fix PyTorch's global RNG seed up front.
torch.manual_seed(0)

In [None]:
# Base seed: handed to seed_everything() and offset per run to seed the K-fold splits.
RANDOM_STATE = 42
# Name of the label column in train.csv (not referenced elsewhere in the visible notebook).
TARGET = 'target'

In [None]:
# Read the competition files from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')


# Feature matrix: every training column except the label and the row id.
X = train.drop(columns = ['target', 'id']).to_numpy()

# Map the class labels to consecutive int64 codes.
lencoder = LabelEncoder()
lencoder.fit(train['target'])
y = lencoder.transform(train['target']).astype('int64')

In [None]:
def seed_everything(seed=43):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # PYTHONHASHSEED only influences interpreters started after this point
    # (e.g. worker subprocesses); the running process has already fixed its
    # own hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm per run and may pick
    # a different one each time, so it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False
    
seed_everything(RANDOM_STATE)

In [None]:
# Mini-batch size used by the training and validation DataLoaders.
BATCH_SIZE = 256
# Number of input columns, taken from the feature matrix.
NUM_FEATURES = X.shape[1]
# Number of output classes passed to the network.
NUM_CLASSES = 9
# Upper bound on training epochs per fold.
NUM_EPOCHS = 50
# Row count of the shared embedding table.
FEATURE_DICT_SIZE = 360

# Folds per StratifiedKFold split.
N_SPLITS = 5
# Number of consecutive seeds (starting at RANDOM_STATE) to train with.
N_RANDOM_STATES = 3
# Number of leading entries of net_params to train.
N_MODELS = 3

# Initial learning rate handed to AdamW.
LEARNING_RATE = 0.03
# When True, print the metrics of every epoch.
EPOCH_VERBOSE = False
# Patience threshold of the early-stopping check in the training loop.
MAX_PATIENCE = 5

In [None]:
def residual_block(in_features, out_features, batch_norm, p_drop, non_linear = nn.ReLU(), *args, **kwargs):
    """Build one dropout -> linear -> activation block.

    Args:
        in_features: Width of the block input.
        out_features: Width of the block output.
        batch_norm: If truthy, the block starts with ``BatchNorm1d`` and uses a
            plain ``nn.Linear``; otherwise there is no batch normalisation and
            the linear layer is wrapped in ``weight_norm``.
        p_drop: Dropout probability applied in front of the linear layer.
        non_linear: Activation module appended after the linear layer.
        *args: Accepted for call compatibility and ignored.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        An ``nn.Sequential`` holding the layers described above.
    """
    # Build only the variant that is returned, so no throw-away layers are
    # allocated (and no extra random numbers drawn) for the batch-norm case.
    if batch_norm:
        return nn.Sequential(nn.BatchNorm1d(in_features),
                             nn.Dropout(p = p_drop),
                             nn.Linear(in_features, out_features),
                             non_linear)

    return nn.Sequential(nn.Dropout(p = p_drop),
                         weight_norm(nn.Linear(in_features, out_features)),
                         non_linear)



class TPSResidual(nn.Module):
    """Embedding-based dense network with concatenating skip connections.

    Every input value is looked up in one shared embedding table. The
    flattened embeddings feed a first dense layer, and each following block
    receives the concatenation of the embeddings and the outputs of all
    earlier layers. The output of the last block is mapped to class logits.

    Args:
        num_features: Number of input columns.
        num_class: Number of output logits.
        feature_dictionary_size: Number of rows in the embedding table; input
            values are used as indices into it.
        batch_norm: Forwarded to ``residual_block`` to select its variant.
        dropout: Dropout probability used in front of every linear layer
            except the output layer.
        linear_nodes: Width of the first dense layer and of every block but
            the last.
        linear_out: Width of the last block (input width of the output layer).
        emb_output: Embedding dimension per input value.
        num_block: Number of blocks stacked after the first dense layer.

    Raises:
        ValueError: If ``num_block`` is smaller than 1 (the output layer reads
            the result of the last block, so at least one is required).
    """

    def __init__(self,
                 num_features = 75,
                 num_class = 9,
                 feature_dictionary_size = 360,
                 batch_norm = False,
                 dropout = 0.3,
                 linear_nodes = 32,
                 linear_out = 16,
                 emb_output = 4,
                 num_block = 3):
        super().__init__()
        if num_block < 1:
            raise ValueError(f'num_block must be >= 1, got {num_block}')

        self.num_block = num_block
        self.final_module_list = nn.ModuleList()

        self.embedding = nn.Embedding(feature_dictionary_size, emb_output)
        self.flatten = nn.Flatten()

        emb_width = emb_output * num_features

        # Initialise BEFORE wrapping in weight_norm: once wrapped, `weight` is
        # recomputed from weight_g / weight_v ahead of every forward pass, so an
        # init applied to `.weight` afterwards would be silently discarded.
        first_linear = nn.Linear(emb_width, linear_nodes)
        nn.init.xavier_uniform_(first_linear.weight)
        self.linear = weight_norm(first_linear)

        for res_num in range(num_block):
            # Alternate ReLU / ELU from block to block.
            non_linear = nn.ELU() if res_num % 2 else nn.ReLU()
            # Only the last block narrows to `linear_out`.
            block_out = linear_out if res_num == (num_block - 1) else linear_nodes
            # Input = embeddings + first dense output + outputs of earlier blocks.
            block_in = emb_width + (res_num + 1) * linear_nodes
            self.final_module_list.append(
                residual_block(block_in, block_out, batch_norm, dropout, non_linear))

        self.out = nn.Linear(linear_out, num_class)

        # Activation applied after the first dense layer.
        self.selu = nn.SELU()

        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x):
        """Return class logits for a batch of integer-coded feature rows.

        Args:
            x: Batch of feature values, one row per sample; cast to int64 and
                used as embedding indices.

        Returns:
            Tensor of raw (un-normalised) class scores, one row per sample.
        """
        # as_tensor reuses an existing tensor instead of copying it (and avoids
        # the UserWarning torch.tensor() raises when handed a tensor).
        x = torch.as_tensor(x).to(torch.int64)

        # Embedding
        e = self.flatten(self.embedding(x))

        h1 = self.selu(self.linear(self.dropout(e)))

        # Running concatenation of everything computed so far.
        ri = torch.cat((e, h1), 1)

        for block in self.final_module_list:
            rx = block(ri)
            ri = torch.cat((ri, rx), 1)

        return self.out(rx)

In [None]:

# Best parameter sets for this approach, found in over 270 runs of this notebook:
# https://www.kaggle.com/remekkinas/pytorch-skorch-residual-hyperparameter
# The keys keep their skorch-style names. get_estimator() reads only the
# 'net__module__*' entries; 'net__optimizer' is not read by it.

net_params = [{'net__module__dropout': 0.3, 'net__module__emb_output': 2, 'net__module__linear_nodes': 16, 
               'net__module__linear_out': 16, 'net__module__num_block': 2, 'net__optimizer': optim.Adam},
             {'net__module__dropout': 0.3, 'net__module__emb_output': 2, 'net__module__linear_nodes': 32,
              'net__module__linear_out': 16, 'net__module__num_block': 2, 'net__optimizer': optim.Adam},
             {'net__module__dropout': 0.2, 'net__module__emb_output': 2, 'net__module__linear_nodes': 16, 
              'net__module__linear_out': 16, 'net__module__num_block': 3, 'net__optimizer': optim.Adam},
             {'net__module__dropout': 0.2, 'net__module__emb_output': 8, 'net__module__linear_nodes': 64, 
              'net__module__linear_out': 32, 'net__module__num_block': 3, 'net__optimizer': optim.Adam},
             {'net__module__dropout': 0.2, 'net__module__emb_output': 6, 'net__module__linear_nodes': 32, 
              'net__module__linear_out': 16, 'net__module__num_block': 3, 'net__optimizer': optim.Adam}]

# Single configuration used to build the example model that is printed and drawn below.
net_params_exp = [{'net__module__dropout': 0.2, 'net__module__emb_output': 2, 'net__module__linear_nodes': 16, 
               'net__module__linear_out': 16, 'net__module__num_block': 2, 'net__optimizer': optim.Adam}]

def get_estimator(net_params):
    """Instantiate a ``TPSResidual`` network from one hyper-parameter dict.

    Args:
        net_params: Mapping with the skorch-style keys
            ``net__module__dropout``, ``net__module__emb_output``,
            ``net__module__linear_nodes``, ``net__module__linear_out`` and
            ``net__module__num_block``. Other keys are ignored.

    Returns:
        A new ``TPSResidual`` sized by ``NUM_FEATURES``, ``NUM_CLASSES`` and
        ``FEATURE_DICT_SIZE``, with batch normalisation switched off.
    """
    # Constructor argument -> key under which its value is stored.
    key_for = {
        'dropout': 'net__module__dropout',
        'emb_output': 'net__module__emb_output',
        'linear_nodes': 'net__module__linear_nodes',
        'linear_out': 'net__module__linear_out',
        'num_block': 'net__module__num_block',
    }
    module_kwargs = {arg: net_params[key] for arg, key in key_for.items()}

    return TPSResidual(NUM_FEATURES,
                       NUM_CLASSES,
                       FEATURE_DICT_SIZE,
                       batch_norm = False,
                       **module_kwargs)

In [None]:
# Build one example network on the selected device and switch it to
# evaluation mode before printing its layer structure.
model_graph = get_estimator(net_params_exp[0]).to(device)
model_graph.eval()

print(model_graph)

In [None]:
# One dummy row of ones, created directly on the target device, is pushed
# through the example model.
x_graph = torch.ones(1, NUM_FEATURES, device = device)
y_graph = model_graph(x_graph)

# Draw the autograd graph of a scalar reduction of the output.
make_dot(y_graph.mean(), params = dict(model_graph.named_parameters()))

In [None]:
def acc_calc(y_pred, y_test):
    """Return the batch accuracy as a whole-number percentage.

    Args:
        y_pred: Tensor of per-class scores, one row per sample (classes on
            dim 1).
        y_test: Tensor of true class indices, one entry per sample.

    Returns:
        0-dim float tensor: the share of correct predictions times 100,
        rounded to the nearest integer.
    """
    # log-softmax is strictly increasing in the scores, so the arg-max of the
    # raw scores is already the predicted class; no normalisation is needed.
    y_pred_tags = y_pred.argmax(dim = 1)

    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)

    return torch.round(acc * 100)

In [None]:
# Cross-entropy loss on raw class scores, read as a global by the training
# and validation helpers.
criterion = nn.CrossEntropyLoss()


def model_train(data_loader, model, optimizer):
    """Run one optimisation pass over ``data_loader``.

    Args:
        data_loader: Iterable yielding ``(features, targets)`` batches.
        model: Network to train; it is switched to training mode.
        optimizer: Optimizer stepping the model's parameters.

    Returns:
        Tuple ``(loss_sum, acc_sum)``: the per-batch loss and the per-batch
        ``acc_calc`` value, each summed over all batches (not averaged).
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch_x, batch_y in data_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()

        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        batch_acc = acc_calc(logits, batch_y)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += batch_acc.item()

    return running_loss, running_acc


In [None]:
def model_validate(data_loader, model):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        data_loader: Iterable yielding ``(features, targets)`` batches.
        model: Network to evaluate; it is switched to evaluation mode.

    Returns:
        Tuple ``(loss_sum, acc_sum)``: the per-batch loss and the per-batch
        ``acc_calc`` value, each summed over all batches (not averaged).
    """
    model.eval()

    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            logits = model(batch_x)

            total_loss += criterion(logits, batch_y).item()
            total_acc += acc_calc(logits, batch_y).item()

    return total_loss, total_acc

In [None]:
 
    # Train one network per (random state, architecture, fold) and keep, for
    # each of them, the checkpoint with the lowest validation loss.
    for r_state in tqdm(range(N_RANDOM_STATES)):

        for model_idx in range(N_MODELS):

            fold_random_state = (RANDOM_STATE + r_state)
            seed_everything(fold_random_state)

            kfold = StratifiedKFold(n_splits = N_SPLITS, shuffle = True, random_state = fold_random_state)

            for fold, (train_idx, valid_idx) in enumerate(kfold.split(X, y)):
                e_start_time = time.time()

                best_loss = float('inf')
                # Consecutive epochs that did not improve on best_loss.
                patience = 0

                accuracy_stat = {'train': [],"validation": []}
                loss_stat = {'train': [], "validation": [] }

                X_train, X_valid = X[train_idx], X[valid_idx]
                y_train, y_valid = y[train_idx], y[valid_idx]

                train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
                valid_dataset = TensorDataset(torch.from_numpy(X_valid).float(), torch.from_numpy(y_valid).long())

                # DataLoader keeps the dataset order unless told otherwise; the
                # training rows are reshuffled every epoch so the mini-batches
                # differ from epoch to epoch. Validation order is irrelevant.
                train_loader = DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE, shuffle = True)
                valid_loader = DataLoader(dataset = valid_dataset, batch_size = BATCH_SIZE)

                model = get_estimator(net_params[model_idx])
                model.to(device)

                optimizer = optim.AdamW(model.parameters(), lr = LEARNING_RATE) 
                # Lower the learning rate when the validation loss plateaus.
                scheduler = ReduceLROnPlateau(optimizer, 'min', patience = 3)

                for progress in range(1, NUM_EPOCHS+1):

                    train_epoch_loss, train_epoch_acc = model_train(train_loader, model, optimizer)

                    val_epoch_loss, val_epoch_acc = model_validate(valid_loader, model)

                    # Metrics: the helpers return sums over batches, so divide
                    # by the number of batches to get per-batch averages.
                    e_train_loss = train_epoch_loss / len(train_loader)
                    e_train_acc = train_epoch_acc / len(train_loader)
                    e_valid_loss = val_epoch_loss / len(valid_loader)
                    e_valid_acc = val_epoch_acc / len(valid_loader)

                    loss_stat['train'].append(e_train_loss)
                    loss_stat['validation'].append(e_valid_loss)
                    accuracy_stat['train'].append(e_train_acc)
                    accuracy_stat['validation'].append(e_valid_acc) 

                    if e_valid_loss < best_loss:
                        # New best epoch: checkpoint it and restart the counter.
                        torch.save(model.state_dict(),f'model-{RANDOM_STATE + r_state}-m{model_idx}-{fold}.bin')
                        best_loss = e_valid_loss
                        patience = 0
                    else:
                        patience += 1

                    # Read the learning rate before the scheduler may change it.
                    clr = optimizer.param_groups[0]['lr']        
                    scheduler.step(e_valid_loss)

                    if EPOCH_VERBOSE:
                        print(f'Epoch { progress + 0:03}: Loss: [Train: {e_train_loss:.7f} | Validation: {e_valid_loss:.7f} ] Accuracy: [Train: {e_train_acc:.3f} | Validation: {e_valid_acc:.3f}] LR: {clr}')

                    # Early stopping after MAX_PATIENCE epochs without improvement.
                    if patience >= MAX_PATIENCE:
                        if EPOCH_VERBOSE:
                            print(f'No vlaidation loss improvement - best: {best_loss}')
                        break

                print(f'RANDOM: {RANDOM_STATE + r_state} - MODEL_IDX: {model_idx+1} - FOLD: {fold} - BEST MODEL: {best_loss:.5f} - TIME: {(time.time()-e_start_time):.2f}s)')
            print("\n")

In [None]:
# Average the class probabilities predicted by every saved checkpoint
# (random state x architecture x fold) over the test set.
test_df = TensorDataset(torch.Tensor(np.array(test.drop('id', axis = 1))))
test_loader = DataLoader(test_df, batch_size = 100000, shuffle = False)
test_preds = np.zeros((test.shape[0], 9))

for r_state in tqdm(range(N_RANDOM_STATES)):
    for model_idx in range(N_MODELS):

        for fold in range(N_SPLITS):
            model = get_estimator(net_params[model_idx])
            # map_location lets checkpoints written on a GPU load in a
            # CPU-only session as well.
            model.load_state_dict(torch.load(f'./model-{RANDOM_STATE + r_state}-m{model_idx}-{fold}.bin',
                                             map_location = device))
            model.to(device)
            # A freshly built module is in training mode; without eval() the
            # dropout layers would stay active and randomly perturb predictions.
            model.eval()

            preds = []

            with torch.no_grad():
                for batch in test_loader:
                    features = batch[0].to(device, dtype=torch.float)
                    pred_percentage = torch.softmax(model(features), dim = 1)
                    preds.append(pred_percentage.cpu().numpy())

            # Stack the per-batch probability rows back into test-set order.
            test_preds += np.concatenate(preds)

test_preds = test_preds / (N_SPLITS * N_RANDOM_STATES * N_MODELS)


In [None]:
sub = pd.read_csv("../input/tabular-playground-series-jun-2021/sample_submission.csv")

# One probability column per class, followed by the ids of the sample submission.
class_columns = ["Class_1", "Class_2", "Class_3", "Class_4", "Class_5", "Class_6", "Class_7", "Class_8", "Class_9" ]
predictions_df = pd.DataFrame(test_preds, columns = class_columns)
predictions_df['id'] = sub['id']

In [None]:
# Write the submission file without the DataFrame index column.
predictions_df.to_csv("TPS06-Pytorch_residual_submission.csv", index = False)