# Demonstration of Inference/Blending using pretrained models

## This notebook demonstrates how you can use pretrained models output by kernels and perform inference as well as blending.

## Models taken from my public kernels:

### [Label Smoothing](https://www.kaggle.com/rahulsd91/moa-label-smoothing) : V10, LB 0.01865
### [ResNet Model](https://www.kaggle.com/rahulsd91/moa-multi-input-resnet-model) : V5, LB 0.01854
### [Autoencoder Model](https://www.kaggle.com/rahulsd91/moa-autoencoder-features-only-lb-0-01884?scriptVersionId=44521379) : V2, LB 0.01884

Currently, there seems to be no way to include the outputs of previous versions of a kernel directly, only those of the latest version. So while the ResNet models can be imported directly, the outputs of the other two notebooks were uploaded as datasets.

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,Model,losses
import sys
import json
import gc
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from scipy.optimize import dual_annealing, minimize

## Things to keep in mind:

* Make sure you initialize the seeds/create the folds the same way as the training notebooks.
* The preprocessing must also be done in an identical manner. One common mistake I have observed people (including myself in a previous version of this notebook!) make is to use the test_features.csv for determining transformations - since the submission run includes public+private dataset, you end up creating different features from what your model was trained on. To bypass this, I've uploaded the public test to a [dataset](https://www.kaggle.com/rahulsd91/moapublictest) and included that here.

In [None]:
# Import train data, drop sig_id, cp_type

train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
non_ctl_idx = train_features.loc[train_features['cp_type']!='ctl_vehicle'].index.to_list()
train_features = train_features.drop(['sig_id','cp_type','cp_dose','cp_time'],axis=1)
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_targets_scored = train_targets_scored.drop('sig_id',axis=1)
labels_train = train_targets_scored.values

# Drop training data with ctl vehicle

train_features = train_features.iloc[non_ctl_idx]
labels_train = labels_train[non_ctl_idx]

# Import test data

test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
test_features = test_features.drop(['sig_id','cp_dose','cp_time'],axis=1)

# Define Preprocessing Functions for each notebook

In [None]:
# Preprocessing for Label Smoothing kernel

def preprocessor_labelsmooth():
    # Import train data, drop sig_id, cp_type

    train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
    train_features = train_features.drop(['sig_id'],axis=1)
    non_ctl_idx = train_features.loc[train_features['cp_type']!='ctl_vehicle'].index.to_list()
    train_features = train_features.drop(['cp_type'],axis=1)
    train_features = train_features.iloc[non_ctl_idx]

    # Import public test data

    public_test_features = pd.read_csv('/kaggle/input/moapublictest/test_features.csv')
    public_test_features = public_test_features.drop('sig_id',axis=1)

    # Import test data

    test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
    test_features = test_features.drop('sig_id',axis=1)

 
    # Label Encoder for categorical cp_dose
    cat = 'cp_dose'
    le = preprocessing.LabelEncoder()
    le.fit(train_features[cat])
    train_features[cat] = le.transform(train_features[cat])

    # Transform categorical
    
    public_test_features[cat] = le.transform(public_test_features[cat])
    test_features[cat] = le.transform(test_features[cat])
    
    # Min Max Scaler for numerical values

    # Fit scaler to joint train and test data
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(train_features.append(public_test_features.drop('cp_type',axis=1)))

    # Scale train data
    data_train = scaler.transform(train_features)

    # Scale test data
    data_test = scaler.transform(test_features.drop('cp_type',axis=1))
    
    cs = train_features.columns.str.contains('c-')
    gs = train_features.columns.str.contains('g-')
    
    return data_train, data_test, cs, gs

# Preprocessing for AutoEncoder Kernel
## Takes preprocessed data from label smooth kernel and transforms to autoencoder features

def preprocessor_autoencoder(data_test,cs,gs):
    """Swap the raw cell/gene columns of `data_test` for autoencoder codes.

    The columns selected by `cs` and `gs` are passed through the `encoder`
    of the saved CellsAE / GenesAE models. The result is the remaining
    columns followed by the encoded genes and then the encoded cells.
    """
    # Load the two saved autoencoders
    cells_autoencoder = tf.keras.models.load_model('../input/moaaemodels/CellsAE')
    genes_autoencoder = tf.keras.models.load_model('../input/moaaemodels/GenesAE')

    # Encode each feature group with its own encoder
    encoded_cells = cells_autoencoder.encoder(data_test[:,cs]).numpy()
    encoded_genes = genes_autoencoder.encoder(data_test[:,gs]).numpy()

    # Columns that are neither cell nor gene features are carried over as-is
    untouched = data_test[:,~(cs+gs)]

    return np.concatenate((untouched,encoded_genes,encoded_cells),axis=1)

# ResNet has two steps of preprocessing : first with the entire dataset and second per fold
# This model does not need the public test, since all transformations are determined by the train alone.

def preprocessor_resnet():
    """First (fold-independent) preprocessing step for the ResNet model.

    Returns:
        train_features: training features without sig_id, cp_type, cp_dose
            and cp_time, restricted to the non-'ctl_vehicle' rows.
        test_features: test features without sig_id, cp_dose and cp_time;
            the cp_type column is kept.
        cs: boolean mask over the training columns whose name contains 'c-'.
        gs: boolean mask over the training columns whose name contains 'g-'.
        predictors: the 'start_predictors' entry of the predictors JSON file.
    """
    # Import train data; note the non-control rows before cp_type is dropped

    train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
    non_ctl_idx = train_features.loc[train_features['cp_type']!='ctl_vehicle'].index.to_list()
    train_features = train_features.drop(['sig_id','cp_type','cp_dose','cp_time'],axis=1)

    # Drop training data with ctl vehicle
    # (the targets are not needed here, so they are no longer read)

    train_features = train_features.iloc[non_ctl_idx]

    # Import test data

    test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
    test_features = test_features.drop(['sig_id','cp_dose','cp_time'],axis=1)

    # Import predictors from public kernel

    json_file_path = '../input/t-test-pca-rfe-logistic-regression/main_predictors.json'

    with open(json_file_path, 'r') as j:
        predictors = json.load(j)['start_predictors']

    # Column masks for the cell ('c-') and gene ('g-') features

    cs = train_features.columns.str.contains('c-')
    gs = train_features.columns.str.contains('g-')

    return train_features,test_features,cs,gs,predictors

def preprocessor_resnet_fold(train,test,cs,gs):
    """Per-fold ResNet preprocessing: add PCA and mean features, then standardise.

    Both PCAs and the scaler are fitted on `train` only and then applied to
    `test`. Each output is the input columns followed by the gene PCA
    components, the cell PCA components, the row-wise cell mean and the
    row-wise gene mean, all standard-scaled.

    Returns:
        (train, test) as transformed arrays.
    """
    n_gs = 2 # No of gene PCA comps to include
    n_cs = 100 # No of cell PCA comps to include

    pca_cs = PCA(n_components = n_cs)
    pca_gs = PCA(n_components = n_gs)

    # Fit on the training rows (genes first, then cells), then project the test rows
    train_parts = [train, pca_gs.fit_transform(train[:,gs]), pca_cs.fit_transform(train[:,cs])]
    test_parts = [test, pca_gs.transform(test[:,gs]), pca_cs.transform(test[:,cs])]

    # Row-wise c-mean and g-mean, kept as single columns
    for parts, raw in ((train_parts, train), (test_parts, test)):
        parts.append(raw[:,cs].mean(axis=1, keepdims=True))
        parts.append(raw[:,gs].mean(axis=1, keepdims=True))

    train = np.concatenate(train_parts, axis=1)
    test = np.concatenate(test_parts, axis=1)

    # Standardise with statistics taken from the training rows only
    scaler = preprocessing.StandardScaler().fit(train)

    return scaler.transform(train), scaler.transform(test)

In [None]:
# Path prefixes of the saved models, in the order used for the first axis
# of y_val / y_pred below
model_paths = [
    '../input/moalabelsmoothmodels/LabelSmoothed',
    '../input/moaaemodels/AutoEncoded',
    '../input/moa-multi-input-resnet-model/TwoHeads',
]

n_models = len(model_paths)
n_train, n_test = len(train_features), len(test_features)
n_labels = labels_train.shape[1]

# One slab per model: out-of-fold predictions on train (y_val) and
# predictions on the test set (y_pred)

y_val = np.zeros((n_models, n_train, n_labels))
y_pred = np.zeros((n_models, n_test, n_labels))

# Label Smoothing and Autoencoder

These kernels used the same number of folds and the same seeds, so inference for both can be run inside the same loop.

## Custom Evaluation Metrics used by pretrained models 

### If any of your pretrained models in keras used a custom metric, it needs to be passed to the load_model function, otherwise you will get an error


In [None]:
# Clipping Thresholds (read by logloss at call time)

p_min, p_max = 0.001, 0.999

# Custom Metrics

def logloss(y_true, y_pred):
    """Mean binary cross-entropy, with predictions clipped to [p_min, p_max]."""
    clipped = tf.clip_by_value(y_pred,p_min,p_max)
    positive_term = y_true*backend.log(clipped)
    negative_term = (1-y_true)*backend.log(1-clipped)
    return -backend.mean(positive_term + negative_term)

## Inference Loop, Seed Blend

## You want to be careful to use the same seeds for training and inference. Hard coding the seed into your saved model names the way I've done here is one way to ensure that an error in that doesn't go unnoticed.

Note: I run the backend.clear_session() and garbage collector calls once per seed because the RAM gets fully used otherwise

In [None]:
# The Label Smoothing and Autoencoder kernels both used 6 seeds x 5 folds

n_seeds = 6
n_folds = 5

np.random.seed(1)
seeds = np.random.randint(0,100,size=n_seeds)

# Preprocess once; the autoencoder features are derived from the scaled data

data_train, data_test, cs, gs = preprocessor_labelsmooth()
data_train_ae = preprocessor_autoencoder(data_train,cs,gs)
data_test_ae = preprocessor_autoencoder(data_test,cs,gs)

# (train inputs, test inputs) per model, indexed like model_paths
model_inputs = ((data_train, data_test), (data_train_ae, data_test_ae))

for seed in seeds:

    mskf = MultilabelStratifiedKFold(n_splits=n_folds,shuffle=True,random_state=seed)

    for fold, (train, test) in enumerate(mskf.split(data_train,labels_train)):
        y_test = labels_train[test]

        for i, (train_inputs, test_inputs) in enumerate(model_inputs):

            # Seed and fold are part of the saved model's name
            model_path = f'{model_paths[i]}_seed_{seed!s}_fold_{fold!s}'
            model = tf.keras.models.load_model(model_path,custom_objects={'logloss':logloss})

            X_test = train_inputs[test]

            # Entire test set: averaged over every fold of every seed
            y_pred[i] += model.predict(test_inputs)/(n_folds*n_seeds)

            # Held-out rows: each row is predicted once per seed
            y_val[i,test] += model.predict(X_test)/n_seeds

    # Free memory once per seed
    tf.keras.backend.clear_session()
    del model, X_test, y_test
    gc.collect()

# ResNet Model

The ResNet model has 10 folds and 7 seeds (note that the cell below sets `n_seeds = 5`). It also has different clipping thresholds.

In [None]:
# Seeds and folds for the ResNet inference

n_seeds = 5
n_folds = 10

np.random.seed(1)
seeds = np.random.randint(0,100,size=n_seeds)

# Preprocessing Step 1 (fold-independent)
train_features,test_features,cs,gs,predictors = preprocessor_resnet()

# Inference loop: rebuild each fold's transforms from its training rows,
# then predict the held-out rows and the full test set
for seed in seeds:
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=seed)

    for fold, (train, test) in enumerate(kf.split(train_features)):
        fold_train = train_features.iloc[train].values

        # First input: fold-wise PCA / mean features, standard-scaled
        _, X_test = preprocessor_resnet_fold(
            fold_train, train_features.iloc[test].values, cs, gs)
        _, data_test = preprocessor_resnet_fold(
            fold_train, test_features.drop('cp_type',axis=1).values, cs, gs)

        # Second input: the selected predictor columns, untransformed
        X_test_2 = train_features.iloc[test][predictors].values
        data_test_2 = test_features[predictors].values

        # Load Model
        model_path = f'{model_paths[2]}_seed_{seed!s}_fold_{fold!s}'
        model = tf.keras.models.load_model(model_path,custom_objects={'logloss':logloss})

        # OOF predictions, averaged over seeds
        y_val[2,test] += model.predict([X_test,X_test_2])/n_seeds

        # Test predictions, averaged over folds and seeds
        y_pred[2] += model.predict([data_test,data_test_2])/(n_folds*n_seeds)

    # Free memory once per seed
    tf.keras.backend.clear_session()
    del model, X_test,X_test_2,data_test,data_test_2
    gc.collect()

# Calculate Blending Weights

### Optimize weights based on the OOF score. I use scipy's `dual_annealing` optimizer here. Check out [this post](https://www.kaggle.com/c/lish-moa/discussion/186539) for a discussion on scipy optimizers for blending. Note that the optimizer here actually doesn't do a great job. I might update it in a later version.

In [None]:
# Define Blending Functions

def blended_preds(y_pred,ws):
    """Return the weighted sum of `y_pred` over its leading (model) axis.

    Args:
        y_pred: 3-D array whose first axis indexes the models.
        ws: one weight per model; any 1-D array-like (ndarray, list, tuple).

    Returns:
        Array with the model axis summed out.
    """
    # Coerce so that plain lists/tuples of weights can be indexed with newaxis
    ws = np.asarray(ws)
    return (y_pred*ws[:,np.newaxis,np.newaxis]).sum(axis=0)


def blended_oof(ws):
    """Objective for the weight search.

    Log loss of the blended out-of-fold predictions (global `y_val`) against
    `labels_train`, plus the squared deviation of the weight sum from 1.
    """
    blend = tf.constant(blended_preds(y_val,ws),dtype=tf.float32)
    truth = tf.constant(labels_train,dtype=tf.float32)
    sum_penalty = (1-np.sum(ws))**2
    return logloss(truth,blend).numpy() + sum_penalty

In [None]:
# Search for the blend weights (each in [0, 1]) that minimise the
# penalised OOF loss

opt = dual_annealing(blended_oof,bounds=((0,1),)*3)
ws = opt.x

print(f'The weights are {ws!s}')

# The sum is only pushed towards one by the penalty term, so report it

print(f'Sum of weights is {ws.sum()!s}')

# Value of the objective at the optimum

print(f'OOF score is {opt.fun!s}')

In [None]:
# Collapse the model axis with the optimised weights. After this cell y_val
# and y_pred hold the blended predictions, so blended_oof (which reads the
# per-model y_val) should not be called again.
y_val = blended_preds(y_val,ws)
y_pred = blended_preds(y_pred,ws)

In [None]:
# Clipping Thresholds

p_min = 0.001
p_max = 0.999

# Generate submission file, Clip Predictions

sub = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')
sub.iloc[:,1:] = np.clip(y_pred,p_min,p_max)

# Set ctl_vehicle to 0.
# .iloc takes a boolean array, not an index-carrying Series, so the mask is
# converted first. Assumes test_features rows are in the same order as the
# sample submission - TODO confirm.
ctl_mask = (test_features['cp_type'] == 'ctl_vehicle').to_numpy()
sub.iloc[ctl_mask,1:] = 0

sub.to_csv('submission.csv', index=False)