This notebook demonstrates the importance of sampling. It uses the fast-convergence trick I described [here](https://www.kaggle.com/c/lish-moa/discussion/185161), which makes it possible to quickly train 30 models on 30 different sub-samples. This is in contrast to the common practice of using a small K in K-fold cross-validation.

CV (OOF) is available in V1, submission is made in V2.

**If you find the notebook useful, please don't forget to upvote.**

### Import Libraries

In [None]:
# Install a package from the local ../input/iter-strat directory; presumably this
# provides the `iterstrat` module imported in the next cell — TODO confirm.
!pip install ../input/iter-strat

In [None]:
import gc
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.layers as L
from tensorflow.keras.models import load_model

from tensorflow.keras.callbacks import (
    ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [None]:
def seed_everything(seed):
    """Seed every random number source this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow's global
    generator with the same ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

# Single seed shared by the global RNGs and by the seeded estimators used
# later in the notebook (PCA, fold splitters).
SEED = 1984
seed_everything(SEED)

### Read Data

In [None]:
# Competition tables: sample submission, test features, training targets and training features.
sub = pd.read_csv('../input/lish-moa/sample_submission.csv')
x_test = pd.read_csv('../input/lish-moa/test_features.csv')
y_develop = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
x_develop = pd.read_csv('../input/lish-moa/train_features.csv')

# Feature-name groups, selected by column-name prefix.
g_cols = pd.Index([name for name in x_develop.columns if name.startswith('g-')])
c_cols = pd.Index([name for name in x_develop.columns if name.startswith('c-')])
cont_cols = [*g_cols, *c_cols]

# Every column after the leading sig_id column is a target.
target_cols = y_develop.columns[1:]
N_TARGETS = len(target_cols)

### Preprocessing

#### Encode Categorical Features

In [None]:
def preprocess_df(df):
    """Return an encoded copy of a feature frame; the input is left untouched.

    Steps:
      * if a ``sig_id`` column is present it becomes the index;
      * ``cp_type`` becomes 1 for ``'trt_cp'`` and 0 otherwise;
      * ``cp_dose`` becomes 1 for ``'D2'`` and 0 otherwise;
      * ``cp_time`` is replaced by one indicator column per distinct value
        (``cp_time_<value>``), appended after the existing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``cp_type``, ``cp_dose`` and ``cp_time`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame.
    """
    # Work on a new object so the caller's frame is never mutated: the
    # previous in-place version changed `df` as a side effect while also
    # returning a different frame.
    out = df.set_index('sig_id') if 'sig_id' in df.columns else df.copy()
    out['cp_type'] = (out['cp_type'] == 'trt_cp').astype(int)
    out['cp_dose'] = (out['cp_dose'] == 'D2').astype(int)

    # Pin the indicator dtype so the result is numeric regardless of the
    # pandas version (newer releases default to bool).
    time_dummies = pd.get_dummies(out['cp_time'], prefix='cp_time', dtype=np.uint8)
    return out.drop(columns='cp_time').join(time_dummies)

In [None]:
# Encode the categorical columns of both feature tables and index all three
# tables by sig_id.
# NOTE: this cell is meant to run once per kernel; on a second run
# y_develop no longer has a 'sig_id' column to set as index.
x_develop = preprocess_df(x_develop)
y_develop = y_develop.set_index('sig_id')
x_test = preprocess_df(x_test)

#### PCA Decomposition

In [None]:
def _append_pca_features(train_df, test_df, source_cols, n_comp, prefix):
    """Fit PCA on ``train_df[source_cols]`` and append the components to both frames.

    The decomposition is fitted on the training frame only and then applied
    to the test frame.

    Returns
    -------
    (train_df, test_df, pca, names)
        New frames with the component columns appended, the fitted PCA
        estimator, and the list of new column names
        (``'<prefix>-0'`` ... ``'<prefix>-<n_comp - 1>'``).
    """
    names = [f'{prefix}-{i}' for i in range(n_comp)]
    pca = PCA(n_components=n_comp, random_state=SEED)
    train_pca = pd.DataFrame(pca.fit_transform(train_df[source_cols]),
                             columns=names, index=train_df.index)
    test_pca = pd.DataFrame(pca.transform(test_df[source_cols]),
                            columns=names, index=test_df.index)
    return (pd.concat((train_df, train_pca), axis=1),
            pd.concat((test_df, test_pca), axis=1),
            pca,
            names)


# Number of principal components kept for each feature group.
N_COMP_G = 28
N_COMP_C = 5

# g-features
x_develop, x_test, pca_g, pca_names = _append_pca_features(x_develop, x_test, g_cols, N_COMP_G, 'pca_g')
cont_cols += pca_names

# c-features
x_develop, x_test, pca_c, pca_names = _append_pca_features(x_develop, x_test, c_cols, N_COMP_C, 'pca_c')
cont_cols += pca_names

#### Variance Threshold

In [None]:
class VarianceThreshold:
    """Drop continuous columns whose variance does not exceed a threshold.

    NOTE: this class reuses the name of
    ``sklearn.feature_selection.VarianceThreshold``, which the notebook
    imports earlier; once this cell has run the name refers to this class.

    Attributes set by ``fit``:
      cont_cols  -- the candidate column names passed to ``fit``
      var        -- per-column variance of the fitted frame (pandas Series)
      index      -- names of the columns whose variance exceeds the threshold
      validcols  -- kept columns, in the order given in ``cont_cols``
      dropcols   -- rejected columns, in the order given in ``cont_cols``
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def fit(self, df, cont_cols):
        """Compute the variance of ``df[cont_cols]`` and decide which columns to keep."""
        self.cont_cols = cont_cols
        # Use the frame that was passed in; the previous version read the
        # global x_develop here and silently ignored `df`.
        self.var = df[cont_cols].var()
        good_cols = self.var[self.var > self.threshold]
        self.index = good_cols.index.to_list()
        kept = set(self.index)  # O(1) membership instead of rebuilding a list per column
        self.dropcols = [col for col in cont_cols if col not in kept]
        self.validcols = [col for col in cont_cols if col in kept]

    def transform(self, df):
        """Return ``df`` without the columns rejected by ``fit``."""
        return df.drop(self.dropcols, axis=1)

    def fit_transform(self, df, cont_cols):
        """Fit on ``df`` and return ``(transformed_df, kept_column_names)``."""
        self.fit(df, cont_cols)
        return self.transform(df), self.validcols

In [None]:
# Fixed variance cut-off. The commented-out expression is an alternative that
# derives the cut-off from the 1% quantile of the column variances.
threshold = 0.6 #x_develop[cont_cols].var().sort_values().quantile(0.01)
print('Variance Threshold:', threshold)
VarThres = VarianceThreshold(threshold)
# cont_cols is replaced by the list of continuous columns that were kept;
# the same rejected columns are then dropped from the test set.
x_develop, cont_cols = VarThres.fit_transform(x_develop, cont_cols)
x_test = VarThres.transform(x_test)

#### Transform Numerical Features

In [None]:
# Map every remaining continuous feature to a normal distribution. The
# transformer is fitted on the development set and then applied to the test set.
# NOTE(review): no random_state is passed here, unlike the PCA estimators —
# consider passing SEED so reproducibility does not depend on the global RNG state.
qt = QuantileTransformer(output_distribution='normal')
x_develop[cont_cols] = qt.fit_transform(x_develop[cont_cols])
x_test[cont_cols] = qt.transform(x_test[cont_cols])

### Group Data Into Folds

In [None]:
def create_folds(df, fold_no, fold_type='mls_kfold', save=False):
    """Assign every row of the target frame to a cross-validation fold, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Target dataframe (one column per label). A ``Fold`` column holding
        each row's validation-fold number is added to it, or overwritten if
        it already exists. The index is left unchanged.
    fold_no : int
        Number of folds.
    fold_type : {'mls_kfold', 'kfold'}
        ``'mls_kfold'`` uses ``MultilabelStratifiedKFold``, ``'kfold'`` uses
        plain ``KFold``. Both are shuffled and seeded with ``SEED``.
    save : bool
        When true, the updated frame is written to ``y_develop.csv``.

    Raises
    ------
    ValueError
        If ``fold_type`` is not one of the supported values.
    """
    if fold_type == 'kfold':
        kf = KFold(n_splits=fold_no, shuffle=True, random_state=SEED)
    elif fold_type == 'mls_kfold':
        # random_state only makes sense together with shuffle=True; recent
        # scikit-learn K-fold splitters raise when it is set without shuffling.
        kf = MultilabelStratifiedKFold(n_splits=fold_no, shuffle=True, random_state=SEED)
    else:
        raise ValueError(f"Unknown fold_type {fold_type!r}; expected 'kfold' or 'mls_kfold'")

    # Stratify on the label columns only. The previous version also fed the
    # sig_id strings and the placeholder Fold column to the splitter as labels.
    labels = df.drop(columns='Fold', errors='ignore').to_numpy()
    fold_ids = np.full(len(df), -1, dtype=np.int64)
    for fold, (_, val_idx) in enumerate(kf.split(labels, labels)):
        fold_ids[val_idx] = fold

    df['Fold'] = fold_ids
    if save:
        df.to_csv('y_develop.csv')

In [None]:
# Number of cross-validation folds; the training loop fits one model per fold.
N_FOLDS = 30
fold_type = 'mls_kfold'
# Adds a 'Fold' column to y_develop in place and, with save=True, writes the
# frame to y_develop.csv.
create_folds(y_develop, fold_no=N_FOLDS, fold_type=fold_type, save=True)

### Define Model Architecture

In [None]:
class Model():
    """Factory for the dense network trained on each fold.

    Parameters
    ----------
    input_shape : int or tuple
        Forwarded to ``tf.keras.Input(shape=...)``.
    output_bias : array-like or None
        Optional initial bias of the output layer (one value per target).
    """

    def __init__(self, input_shape, output_bias=None):
        self.input_shape = input_shape
        self.output_bias = output_bias

    def create_model1(self):
        """Build and compile a new, untrained network.

        Layers: BatchNorm -> WeightNorm(Dense 800, swish) -> BatchNorm ->
        Dropout(0.4) -> WeightNorm(Dense 400, swish) -> BatchNorm ->
        Dropout(0.4) -> WeightNorm(Dense N_TARGETS, sigmoid).

        The model is compiled with Lookahead(AdamW) and a label-smoothed
        binary cross-entropy loss; the un-smoothed binary cross-entropy is
        reported under the name ``mean_loss``.
        """
        # Build the initializer from this instance's bias (the previous
        # version read a same-named global) and keep self.output_bias
        # untouched so the method can be called once per fold.
        bias_initializer = None
        if self.output_bias is not None:
            bias_initializer = tf.keras.initializers.Constant(self.output_bias)

        inputs = tf.keras.Input(shape=self.input_shape)
        x = L.BatchNormalization()(inputs)
        x = tfa.layers.WeightNormalization(L.Dense(800, activation='swish'))(x)
        x = L.BatchNormalization()(x)
        x = L.Dropout(0.4)(x)
        x = tfa.layers.WeightNormalization(L.Dense(400, activation='swish'))(x)
        x = L.BatchNormalization()(x)
        x = L.Dropout(0.4)(x)
        # NOTE(review): WeightNormalization is used with its default
        # data_init setting, which may re-initialise the wrapped layer's bias
        # on the first call — confirm the constant bias actually survives.
        outputs = tfa.layers.WeightNormalization(
            L.Dense(N_TARGETS,
                    activation='sigmoid',
                    bias_initializer=bias_initializer)
        )(x)
        model = tf.keras.Model(inputs=inputs, outputs=outputs)

        # Un-smoothed loss tracked as a metric; callbacks monitor it as 'val_mean_loss'.
        metrics = [tf.keras.losses.BinaryCrossentropy(name='mean_loss')]

        optimizer = tfa.optimizers.Lookahead(
            tfa.optimizers.AdamW(weight_decay=1e-5),
            sync_period=5)
        model.compile(optimizer=optimizer,
                      loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.0008),
                      metrics=metrics)

        return model

### Cross Validation

In [None]:
def calc_pred(x_df, models):
    """Average the fold models' predictions for ``x_df``.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Feature frame; must contain a ``cp_type`` column.
    models : dict or sequence
        Fold models. Each entry is either an object with a ``predict``
        method or, for the constant baseline, a 1-D ``np.ndarray`` of
        per-target probabilities. Dict keys are not interpreted.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``x_df`` and one column per entry of
        ``target_cols``; rows with ``cp_type == 0`` are set to 0.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    # Iterate over the values so the result no longer depends on key 0
    # existing and being visited first.
    fold_models = list(models.values()) if isinstance(models, dict) else list(models)
    if not fold_models:
        raise ValueError('calc_pred needs at least one model')

    if isinstance(fold_models[0], np.ndarray):
        # Constant baseline: average the per-fold vectors, then repeat per row.
        mean_vector = np.mean(fold_models, axis=0)
        pred_test = np.repeat([mean_vector], len(x_df), axis=0)
    else:
        pred_test = sum(m.predict(x_df) for m in fold_models) / len(fold_models)

    pred_test = pd.DataFrame(index=x_df.index, columns=target_cols, data=pred_test)
    # Post-processing: rows with cp_type == 0 get all-zero predictions.
    pred_test.loc[x_df['cp_type'] == 0] = 0
    return pred_test


def oof_score(oof: dict):
    """Return ``(mean, standard deviation)`` of the per-fold scores stored in ``oof``."""
    fold_scores = np.asarray(list(oof.values()))
    return fold_scores.mean(), fold_scores.std()


def combine_pred(pred):
    """Stack the per-fold prediction arrays row-wise.

    Parameters
    ----------
    pred : dict
        Maps a fold identifier to a 2-D array of predictions.

    Returns
    -------
    numpy.ndarray
        All arrays concatenated along axis 0, in the dict's iteration order.
    """
    # One concatenate call replaces repeated np.append (which re-copied the
    # growing result for every fold) and no longer requires a key equal to 0.
    return np.concatenate(list(pred.values()), axis=0)


def run_dump_baseline_cv(xtrain=x_develop, ytrain=y_develop, N_FOLDS=N_FOLDS, summary=True, debug=False):
    """Cross-validate a constant-prediction baseline.

    For every fold the "model" is the vector of per-target means of that
    fold's training rows; it is predicted unchanged for every validation row.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Feature frame indexed like ``ytrain``; only ``cp_type`` is read.
    ytrain : pandas.DataFrame
        Target frame with a ``Fold`` column.
    N_FOLDS : int
        Unused: the folds are taken from ``ytrain['Fold']``. Kept so
        existing calls keep working.
    summary : bool
        Print the mean and standard deviation of the fold scores.
    debug : bool
        Unused.

    Returns
    -------
    (models, oof_ap, pred_val_fold)
        Per-fold mean vectors, per-fold scores after post-processing, and
        the validation predictions of all folds stacked row-wise in fold order.
    """
    models, oof_bp, oof_ap, pred_val_fold = {}, {}, {}, {}
    # Use the frames passed in; the previous version ignored the arguments
    # and always read the globals x_develop / y_develop.
    y_all = ytrain.drop('Fold', axis=1)

    for foldno in np.sort(ytrain['Fold'].unique()):
        foldno = int(foldno)
        is_val = ytrain['Fold'] == foldno
        y_train_fold = y_all[~is_val]
        x_val_fold = xtrain[is_val]
        y_val_fold = y_all[is_val]

        print(" ")
        print(f"Fold-{foldno}")
        print("Train sample size:", len(y_train_fold), ", Validation sample size:", len(y_val_fold))

        # Predict validation probabilities from the training rows only, so
        # the validation targets do not leak into the baseline.
        models[foldno] = y_train_fold.mean(axis=0).to_numpy()
        pred_val_fold[foldno] = np.repeat([models[foldno]], len(y_val_fold), axis=0)

        # Calculate OOF (validation) results
        y_val = y_val_fold.to_numpy()
        oof_bp[foldno] = tf.keras.losses.binary_crossentropy(y_val, pred_val_fold[foldno]).numpy().mean()
        print('Out-of-Fold Score: ', oof_bp[foldno])

        # Post-processing: zero the predictions of rows with cp_type == 0.
        # (This used to be `== 0`, a comparison whose result was discarded.)
        pred_val_fold[foldno][(x_val_fold['cp_type'] == 0).to_numpy()] = 0
        oof_ap[foldno] = tf.keras.losses.binary_crossentropy(y_val, pred_val_fold[foldno]).numpy().mean()
        print('Out-of-Fold Score with post processing: ', oof_ap[foldno])

    pred_val_fold = combine_pred(pred_val_fold)

    print('\n')
    if summary:
        print('Summary')
        # Mean out-of-fold score before post-processing
        print('Mean OOF score: %f +/- %f' % (oof_score(oof_bp)))

        # Mean out-of-fold score after post-processing
        print('Mean OOF score after postprocessing: %f +/- %f' % (oof_score(oof_ap)))

    return models, oof_ap, pred_val_fold


def run_cv(xtrain=x_develop, ytrain=y_develop, model=None, N_FOLDS=N_FOLDS, summary=True, debug=False):
    """Train one network per fold and score it on the held-out fold.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Feature frame indexed like ``ytrain``; must contain ``cp_type``.
    ytrain : pandas.DataFrame
        Target frame with a ``Fold`` column.
    model : object
        Factory exposing ``create_model1()``, which must return a compiled
        Keras model reporting a ``mean_loss`` metric.
    N_FOLDS : int
        Unused: the folds are taken from ``ytrain['Fold']``. Kept so
        existing calls keep working.
    summary : bool
        Print the mean and standard deviation of the fold scores.
    debug : bool
        Unused.

    Reads the globals ``EPOCHS`` and ``SAVE_MODEL``; when ``SAVE_MODEL`` is
    true each fold's model is saved to ``weights-fold<N>.h5``.

    Returns
    -------
    (models, histories, oof_ap, pred_val_fold)
        Per-fold models, per-fold training histories, per-fold scores after
        post-processing, and the validation predictions of all folds stacked
        row-wise in fold order.

    Raises
    ------
    ValueError
        If ``model`` is None.
    """
    if model is None:
        raise ValueError('run_cv requires a model factory exposing create_model1()')

    histories, models, oof_bp, oof_ap, pred_val_fold = {}, {}, {}, {}, {}
    # Use the frames passed in; the previous version ignored the arguments
    # and always read the globals x_develop / y_develop.
    y_all = ytrain.drop('Fold', axis=1)

    for foldno in np.sort(ytrain['Fold'].unique()):
        foldno = int(foldno)
        is_val = ytrain['Fold'] == foldno
        x_train_fold, y_train_fold = xtrain[~is_val], y_all[~is_val]
        x_val_fold, y_val_fold = xtrain[is_val], y_all[is_val]

        train_sample_size = len(y_train_fold)
        val_sample_size = len(y_val_fold)
        print(" ")
        print(f"Fold-{foldno}")
        print("Train sample size:", train_sample_size, ", Validation sample size:", val_sample_size)

        # Train data pipeline
        train_ds = tf.data.Dataset.from_tensor_slices((x_train_fold, y_train_fold))
        train_ds = train_ds.shuffle(1024).batch(56)

        # Validation data pipeline: the whole fold as a single batch
        val_ds = tf.data.Dataset.from_tensor_slices((x_val_fold, y_val_fold))
        val_ds = val_ds.batch(val_sample_size)

        # A fresh model for every fold
        models[foldno] = model.create_model1()

        # Train. Both callbacks watch the un-smoothed validation loss and
        # minimise it; `min_delta` replaces the deprecated `epsilon` keyword.
        cb_es = EarlyStopping(monitor='val_mean_loss', mode='min', patience=5, restore_best_weights=True)
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_mean_loss', factor=0.1, patience=3,
                                           verbose=1, min_delta=1e-4, mode='min')
        histories[foldno] = models[foldno].fit(train_ds, validation_data=val_ds, epochs=EPOCHS,
                                               callbacks=[cb_es, reduce_lr_loss], verbose=1)

        # Predict validation probabilities
        pred_val_fold[foldno] = models[foldno].predict(x_val_fold)

        # Calculate OOF (validation) results
        y_val = y_val_fold.to_numpy()
        oof_bp[foldno] = tf.keras.losses.binary_crossentropy(y_val, pred_val_fold[foldno]).numpy().mean()
        print('Out-of-Fold Score: ', oof_bp[foldno])

        # Post-processing: zero the predictions of rows with cp_type == 0.
        # (This used to be `== 0`, a comparison whose result was discarded.)
        pred_val_fold[foldno][(x_val_fold['cp_type'] == 0).to_numpy()] = 0
        oof_ap[foldno] = tf.keras.losses.binary_crossentropy(y_val, pred_val_fold[foldno]).numpy().mean()
        print('Out-of-Fold Score with post processing: ', oof_ap[foldno])

        # Save model
        if SAVE_MODEL:
            models[foldno].save(f'weights-fold{foldno}.h5')

    pred_val_fold = combine_pred(pred_val_fold)

    print('\n')
    if summary:
        print('Summary')
        # Mean out-of-fold score before post-processing
        print('Mean OOF score: %f +/- %f' % (oof_score(oof_bp)))

        # Mean out-of-fold score after post-processing
        print('Mean OOF score after postprocessing: %f +/- %f' % (oof_score(oof_ap)))

    return models, histories, oof_ap, pred_val_fold


def submit(res):
    """Write ``res`` to ``submission.csv`` with its index turned back into a regular column."""
    res.reset_index().to_csv('submission.csv', index=False)

In [None]:
# Training configuration; both values are read as globals by run_cv.
EPOCHS = 45  # epochs passed to fit() for each fold (early stopping may end training sooner)
SAVE_MODEL = True  # when true, each fold's model is saved to weights-fold<N>.h5

In [None]:
# Row count at which training is skipped and the sample submission is written
# unchanged. NOTE(review): presumably the size of the public test set, so the
# expensive training only runs on the hidden test set — confirm.
PUBLIC_TEST_ROWS = 3982

if sub.shape[0] != PUBLIC_TEST_ROWS:
    # For a constant-prediction baseline use instead:
    #   models, oof_ap, pred_val_fold = run_dump_baseline_cv(x_develop, y_develop, N_FOLDS=N_FOLDS)

    # Initial output bias: the logit of each target's base rate, so that
    # sigmoid(bias) equals that rate. The previous expression, -log(rate), is
    # large and positive for rare targets and starts predictions near 1.
    # The rate is clipped so an all-zero or all-one target stays finite.
    base_rate = np.clip(y_develop[target_cols].mean(axis=0).to_numpy(), 1e-7, 1 - 1e-7)
    output_bias = np.log(base_rate / (1 - base_rate))
    models, histories, oof_ap, pred_val_fold = run_cv(x_develop, y_develop,
                                                      model=Model(x_develop.shape[1], output_bias),
                                                      N_FOLDS=N_FOLDS,
                                                      debug=False)
    pred_test = calc_pred(x_test, models)  # average of the fold models
    submit(pred_test)
    np.save('LBS.npy', pred_val_fold)
else:
    sub.to_csv('submission.csv', index=False)

In [None]:
# Distribution of the per-fold out-of-fold scores. oof_ap only exists when the
# training branch ran, so skip the plot instead of raising NameError otherwise.
if 'oof_ap' in globals():
    ax = pd.Series(oof_ap, name='oof_score').plot.hist()
    ax.set(title='Per-fold OOF score (after post-processing)',
           xlabel='Binary cross-entropy',
           ylabel='Number of folds')
    plt.show()