# PyTorch (+skorch) RESIDUAL network (skip connections) with CROSS VALIDATION and HYPERPARAMETER SEARCH (GridSearchCV)


<div class="alert alert-success">
  <strong>Content of this notebook</strong>
    <ul>
        <li>Notebook contribution</li>
        <li>Model definition</li>
        <li>Callback definition</li>
        <li>Pipeline for data preprocessing and model run</li>
        <li>Hyperparameter search</li>
        <ul>
            <li>number of residual modules in NN</li>
            <li>number of embedding dimensions</li>
<li>best network optimizer</li>
            <li>optimal linear nodes number</li>
<li>dropout </li>
<li>and many more (I just provided framework for searching the best params for NN)</li></ul>
        <li>Model ranking + optimal parameters</li>
        <li>Prediction smoothing - using many models</li>
        <li>Submission</li>
    </ul>
</div>
<br>
<hr style="background-color: #fff; border-top: 2px dotted #8c8b8b;">
<br>
Scope of the search, defined as a standard GridSearchCV parameter grid:
<code>
    grid_params = {
    'net__module__dropout': [0.2, 0.3], 
    'net__optimizer': [optim.Adam], 
    'net__module__linear_nodes' : [64, 32, 16],
    'net__module__emb_output' : [2, 4, 6],
    'net__module__linear_out' : [16],
    'net__module__num_block' : [2, 3, 4]
} 
 </code>


<div>
   <strong>Contributions</strong>
    <ul>
        <li><a href = "https://www.kaggle.com/oxzplvifi/tabular-residual-network">Tabular Residual Network by @oxzplvifi</a></li>
        <li><a href = "https://www.kaggle.com/oxzplvifi/tabular-residual-network">Python keras NN (residual) by @alexryzhkov</a></li>
    </ul>
</div>

<div>
<strong>List of experiments which did not provide better results</strong>
<ul>
    <li>I have tried different normalization methods - batch norm, weight norm and layer norm - the best so far is weight norm</li>
    <li>Mixing several activation functions (ReLU, ELU, SELU) works fine. Using only ReLU did not improve the score.</li>
</ul>
</div>

In [None]:
!pip install -U skorch -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
sns.set(font_scale= 1.0)


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV


import torch
from torch import nn
from torch.nn.utils import weight_norm
import torch.optim as optim
import torch.nn.functional as F

from skorch import NeuralNetClassifier, NeuralNet
from skorch.callbacks import EpochScoring
from skorch.callbacks import LRScheduler, EarlyStopping
from torch.optim.lr_scheduler import ReduceLROnPlateau

import warnings
warnings.filterwarnings("ignore")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(0)

In [None]:
# Load the competition data; the 'id' column becomes the DataFrame index.
train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv', index_col = 'id')
test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv', index_col = 'id')
# Test features as a float32 array; used later for the final predictions.
test_pred = test.values.astype('float32')


# Training features: every column except 'target', as a float32 array.
X = train.drop('target', axis = 1).values.astype('float32')

# Encode the 'target' labels as integer class ids (int64).
lencoder = LabelEncoder()
y = lencoder.fit_transform(train['target']).astype('int64')

## MODEL DEFINITION 

This model is prepared for maximum tuning with scikit-learn's GridSearchCV. You can certainly add more parameters to optimize. 

In [None]:
# Number of rows in the embedding table (num_embeddings of nn.Embedding in TPSResidual).
# NOTE(review): presumably an upper bound on the integer feature values — confirm against the data.
feature_dictionary_size = 360
# Number of input feature columns; the flattened embedding holds emb_output * num_features values.
num_features = 75

def residual_block(in_features, out_features, p_drop, non_linear = None, *args, **kwargs):
    """Build one dropout -> weight-normalised linear -> activation stage.

    Args:
        in_features: Size of each input sample of the linear layer.
        out_features: Size of each output sample of the linear layer.
        p_drop: Dropout probability applied to the block input.
        non_linear: Activation module placed after the linear layer.
            When omitted (or None) a fresh ``nn.ReLU()`` is created.
        *args, **kwargs: Accepted for call compatibility and ignored.

    Returns:
        nn.Sequential: ``Dropout`` -> ``weight_norm(Linear)`` -> activation.
    """
    # Create the default activation per call: a module instance used as a
    # default argument is built once at definition time and would be shared
    # by every block that relies on the default.
    if non_linear is None:
        non_linear = nn.ReLU()
    return nn.Sequential(
        nn.Dropout(p = p_drop),
        weight_norm(nn.Linear(in_features, out_features)),
        non_linear)


class TPSResidual(nn.Module):
    """Embedding-based dense network with concatenating skip connections.

    Every input value is looked up in one embedding table, the embeddings are
    flattened and passed through a first linear layer, and then through
    ``num_block`` residual blocks. Each block receives the concatenation of
    the flattened embedding and the outputs of all previous layers.

    Args:
        num_class: Number of output classes.
        dropout: Dropout probability used before the first linear layer and
            inside every residual block.
        linear_nodes: Output width of the first linear layer and of every
            residual block except the last one.
        linear_out: Output width of the last residual block.
        emb_output: Embedding dimension per feature.
        num_block: Number of residual blocks; must be at least 1.

    Raises:
        ValueError: If ``num_block`` is smaller than 1.
    """

    def __init__(self, num_class = 9, dropout = 0.3, linear_nodes = 32, linear_out = 16, emb_output = 4, num_block = 3):
        super().__init__()

        # Without at least one block forward() has nothing to feed the output
        # layer, so fail at construction time instead of at the first batch.
        if num_block < 1:
            raise ValueError(f"num_block must be >= 1, got {num_block}")

        self.num_block = num_block
        self.final_module_list = nn.ModuleList()

        self.embedding = nn.Embedding(feature_dictionary_size, emb_output)
        self.flatten = nn.Flatten()

        # Width of the flattened embedding of one input row.
        emb_width = emb_output * num_features
        self.linear = weight_norm(nn.Linear(emb_width, linear_nodes))

        for res_num in range(num_block):
            # Alternate activations: ReLU for even blocks, ELU for odd blocks.
            activation = nn.ELU() if res_num % 2 else nn.ReLU()
            is_last = res_num == num_block - 1
            block_out = linear_out if is_last else linear_nodes
            # Input = embedding + first linear output + outputs of all previous blocks.
            block_in = emb_width + (res_num + 1) * linear_nodes
            self.final_module_list.append(residual_block(block_in, block_out, dropout, activation))

        self.out = nn.Linear(linear_out, num_class)

        # Activation applied after the first linear layer.
        self.selu = nn.SELU()

        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x):
        """Return class probabilities for a batch of feature rows.

        Args:
            x: Batch of feature values (tensor or array-like); the values are
                cast to int64 and used as embedding indices.

        Returns:
            Tensor of softmax probabilities over the last dimension.
        """
        # as_tensor avoids the extra copy (and the UserWarning) that
        # torch.tensor() causes when x already is a tensor.
        x = torch.as_tensor(x).to(torch.int64)

        # Embedding
        e = self.flatten(self.embedding(x))

        h1 = self.selu(self.linear(self.dropout(e)))

        ri = torch.cat((e, h1), 1)

        last = len(self.final_module_list) - 1
        for idx, block in enumerate(self.final_module_list):
            rx = block(ri)
            # Only the following block consumes the grown concatenation, so
            # skip the concat after the final block.
            if idx != last:
                ri = torch.cat((ri, rx), 1)

        return F.softmax(self.out(rx), dim = -1)

## CALLBACKS 
I decided to define two callbacks: (1) Learning Rate Scheduler and (2) Early Stopping. You could search for their settings with GridSearch as well, but I think this dynamic approach works much better.

In [None]:
# Multiply the learning rate by 0.1 when 'valid_loss' has not improved for 3 epochs.
lr_scheduler = LRScheduler(policy = ReduceLROnPlateau, monitor = 'valid_loss', mode = 'min', patience = 3, factor = 0.1, verbose = True)
# Stop training when 'valid_loss' has not improved by a relative 1e-4 for 10 epochs.
early_stopping = EarlyStopping(monitor='valid_loss', patience = 10, threshold = 0.0001, threshold_mode='rel', lower_is_better=True)

In [None]:
# Base skorch classifier around TPSResidual; it becomes the 'net' step of the grid-search pipeline.
net = NeuralNetClassifier(TPSResidual, device = device, lr = 0.001, max_epochs = 50, callbacks = [lr_scheduler, early_stopping])

## HYPERPARAMETER SEARCH PARAMETERS 

In [None]:
# Set OPTIM = True to run the hyperparameter search in this notebook.
# I ran many experiments (270 runs) on my local machine. The results are provided below.

OPTIM = False

In [None]:
# Each parameter list can be defined as you like. Because of the limited Kaggle notebook uptime
# I decided to compute the search on my local machine and use the results here.

# This is a demo only, so it covers a subset of the parameters (the full list, about 270 runs, is trained locally).
grid_params = {
    'net__module__dropout': [0.2, 0.3],
    'net__optimizer': [optim.AdamW, optim.Adam, optim.RMSprop],
    'net__module__linear_nodes' : [64, 32, 16],
    'net__module__emb_output' : [2, 4, 6],
    'net__module__linear_out' : [16],
    'net__module__num_block' : [2, 3, 4]
}



# Here you can define additional Pipeline steps (e.g. data preprocessing). None are needed in this example.
steps = [('net', net)]
pipeline = Pipeline(steps)


# Rank candidates by negative log loss. Without `scoring`, GridSearchCV falls
# back to the estimator's own score (accuracy for a classifier), which cannot
# yield the negative mean validation scores reported for the local runs.
grid_net = GridSearchCV(pipeline, grid_params, cv = 5, scoring = 'neg_log_loss', refit = True, verbose = 1)

#### TOP3 - AFTER 270 RUNS THE BEST PARAMETERS SO FAR:
**Model with rank: 1**
Mean validation score: -1.744 (std: 0.004)
Parameters: {'net__module__dropout': 0.3, 'net__module__emb_output': 2, 'net__module__linear_nodes': 16, 'net__module__linear_out': 16, 'net__module__num_block': 2, 'net__optimizer': <class 'torch.optim.adam.Adam'>}

**Model with rank: 2**
Mean validation score: -1.745 (std: 0.003)
Parameters: {'net__module__dropout': 0.3, 'net__module__emb_output': 2, 'net__module__linear_nodes': 32, 'net__module__linear_out': 16, 'net__module__num_block': 2, 'net__optimizer': <class 'torch.optim.adam.Adam'>}

**Model with rank: 3**
Mean validation score: -1.745 (std: 0.004)
Parameters: {'net__module__dropout': 0.2, 'net__module__emb_output': 2, 'net__module__linear_nodes': 16, 'net__module__linear_out': 16, 'net__module__num_block': 3, 'net__optimizer': <class 'torch.optim.adam.Adam'>}

In [None]:
# Run the grid search only when it has been explicitly enabled.
if OPTIM:
    result = grid_net.fit(X,y)

# You can look into training history below

## TOP5 MODELS RANKING 

In [None]:
def report(results, n_top=5):
    """Print score and parameters of the best-ranked search candidates.

    Args:
        results: Mapping with 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params' entries, e.g. ``cv_results_``.
        n_top: Highest rank to print; ranks 1..n_top are reported in order.
            Candidates sharing a rank are all printed.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate that holds this rank.
        matching = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in matching:
            print(f"Model with rank: {rank}")
            mean = results['mean_test_score'][idx]
            std = results['std_test_score'][idx]
            print(f"Mean validation score: {mean:.3f} (std: {std:.3f})")
            print(f"Parameters: {results['params'][idx]}")
            print()

# Print the five best-ranked candidates of the finished grid search.
if OPTIM:
    report(grid_net.cv_results_,5)

## BEST ESTIMATOR 

In [None]:
if OPTIM:
    # A bare expression inside an `if` body is not echoed by the notebook,
    # so print the refitted best pipeline explicitly.
    print(grid_net.best_estimator_)

## LEARNING HISTORY 

In [None]:
if OPTIM:
    # Training history of the refitted best network (the pipeline's 'net' step).
    history = grid_net.best_estimator_.named_steps['net'].history
    epochs = list(range(len(history)))
    train_loss = history[:, 'train_loss']
    valid_loss = history[:, 'valid_loss']

    plt.plot(epochs, train_loss, 'g-')
    plt.plot(epochs, valid_loss, 'r-')
    plt.title('Training Loss Curves')
    plt.xlabel('Epochs')
    # The curves show the classifier's training criterion, not a mean squared error.
    plt.ylabel('Loss')
    plt.legend(['Train', 'Validation'])

## ESTIMATOR LEARNING AND BLENDING

In [None]:
# These parameters were taken from the top GridSearchCV estimators (computed on my local machine).
# NOTE(review): the list holds five configurations although the comment originally said TOP3 — confirm where the last two come from.

net_params = [{'net__module__dropout': 0.3, 'net__module__emb_output': 2, 
               'net__module__linear_nodes': 16, 'net__module__linear_out': 16, 
               'net__module__num_block': 2, 'net__optimizer': optim.Adam},
             
              {'net__module__dropout': 0.3, 'net__module__emb_output': 2, 
              'net__module__linear_nodes': 32, 'net__module__linear_out': 16, 
              'net__module__num_block': 2, 'net__optimizer': optim.Adam},
              
              {'net__module__dropout': 0.2, 'net__module__emb_output': 2, 
              'net__module__linear_nodes': 16, 'net__module__linear_out': 16, 
              'net__module__num_block': 3, 'net__optimizer': optim.Adam},
             
              {'net__module__dropout': 0.3, 'net__module__emb_output': 2, 
               'net__module__linear_nodes': 16, 'net__module__linear_out': 16, 
               'net__module__num_block': 3, 'net__optimizer': optim.Adam},
              
              {'net__module__dropout': 0.3, 'net__module__emb_output': 2, 
               'net__module__linear_nodes': 16, 'net__module__linear_out': 16, 
               'net__module__num_block': 4, 'net__optimizer': optim.Adam}]

def get_estimator(net_params):
    """Build an untrained skorch classifier for one hyperparameter configuration.

    Args:
        net_params: Dict using the grid-search key names. Required keys:
            'net__module__dropout', 'net__module__emb_output',
            'net__module__linear_nodes', 'net__module__linear_out' and
            'net__module__num_block'. Optional key: 'net__optimizer'
            (optimizer class; ``optim.AdamW`` is used when it is absent).

    Returns:
        NeuralNetClassifier wrapping TPSResidual, configured from net_params.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    return NeuralNetClassifier(TPSResidual,
                               device = device,
                               lr = 3e-2,
                               max_epochs = 50,
                               # Honour the optimizer stored in the configuration instead of
                               # silently replacing it; AdamW remains the fallback.
                               optimizer = net_params.get('net__optimizer', optim.AdamW),
                               callbacks = [lr_scheduler, early_stopping],
                               module__dropout = net_params['net__module__dropout'],
                               module__emb_output = net_params['net__module__emb_output'],
                               module__linear_nodes = net_params['net__module__linear_nodes'],
                               module__linear_out = net_params['net__module__linear_out'],
                               module__num_block = net_params['net__module__num_block'],
                               iterator_train__shuffle = True
                             )

In [None]:
if not OPTIM:
    # Blend: average the class probabilities predicted by every configured model.
    NUM_MODELS = len(net_params)
    # Size the accumulator from the data instead of hard-coding (100000, 9).
    y_pred = np.zeros((test_pred.shape[0], len(lencoder.classes_)))

    for net_config in net_params:
        print(f'MODEL PARMAETERS {net_config}\n')
        net = get_estimator(net_config)
        net.fit(X,y)
        y_pred += net.predict_proba(test_pred) / NUM_MODELS
        print("\n")
else:
    # No blend is trained when the grid search ran; predict with the refitted
    # best pipeline so that y_pred still exists for the submission cells.
    y_pred = grid_net.predict_proba(test_pred)

## SUBMISSION

In [None]:
# The sample submission supplies the 'id' values for the output file.
sub = pd.read_csv("../input/tabular-playground-series-jun-2021/sample_submission.csv")

# One probability column per class, followed by the ids copied from the sample submission.
# NOTE(review): assumes sample_submission.csv lists the ids in the same row order as test.csv — confirm.
predictions_df = pd.DataFrame(y_pred, columns = ["Class_1", "Class_2", "Class_3", "Class_4", "Class_5", "Class_6", "Class_7", "Class_8", "Class_9" ])
predictions_df['id'] = sub['id']

In [None]:
# Preview the first five rows of the submission frame.
predictions_df.head(5)

In [None]:
# Endless iterator over seaborn's current colour palette: one colour per subplot.
palette = itertools.cycle(sns.color_palette())

plt.figure(figsize=(16, 8))
# 3x3 grid with one histogram of predicted probabilities per class column.
for i in range(9):
    plt.subplot(3, 3, i+1)
    c = next(palette)
    sns.histplot(predictions_df, x = f'Class_{i+1}', color=c)
plt.suptitle("CLASS PREDICTION - DISTRIBUTION")

In [None]:
# Summary statistics per class column (ids excluded), styled with a bar for the
# mean and colour gradients for the std and the median.
predictions_df.drop("id", axis=1).describe().T.style.bar(subset=['mean'], color='#205ff2')\
                            .background_gradient(subset=['std'], cmap='Reds')\
                            .background_gradient(subset=['50%'], cmap='coolwarm')

In [None]:
# Write the submission file without the DataFrame index.
predictions_df.to_csv("TPS06-pytorch_residual_submission.csv", index = False)

#### I really appreciate any feedback and support. Thank you very much!