In [1]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, BatchNormalization
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [3]:
# Scored training targets; the sig_id identifier column is dropped so only
# the target columns remain.
train_targets = (
    pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    .drop(columns='sig_id')
)

# Training features, likewise without the sig_id identifier column.
train_features = (
    pd.read_csv('../input/lish-moa/train_features.csv')
    .drop(columns='sig_id')
)

# Test features are kept as loaded (sig_id included).
test_features = pd.read_csv('../input/lish-moa/test_features.csv')

In [4]:
# Validation-set predictions saved by each base model. They are later stacked
# column-wise and used as the blender's training inputs.
# NOTE(review): presumably these are out-of-fold predictions whose rows are in
# the same order as train_targets, and contain only target columns -- confirm.
keras_val_submission = pd.read_csv('../input/keras-neural-net/val-submission.csv')
# The xgboost predictions are stored as a NumPy array rather than a CSV.
xboost_val_submission = np.load('../input/xgboost-baseline-saved-model-kadri/xgboos-oof.npy')
# Logistic-regression base model is currently disabled.
#log_reg_val_submission = np.load('../input/nb-log-reg-3-multilabelskf-ver2-saved-pyinference/log-reg-oof.npy', allow_pickle=True)
marge_keras_val_submission = pd.read_csv('../input/marge-keras-v2-load-model/val-submission.csv')


#val_submissions = [keras_val_submission, xboost_val_submission, log_reg_val_submission, marge_keras_val_submission]

In [5]:
if len(test_features) == 3982: # if public test set, we can use existing submissions
    keras_submission = pd.read_csv('../input/keras-neural-net/submission.csv')
    xboost_submission = pd.read_csv('../input/xgboost-baseline-saved-model-kadri/submission.csv')
    #logreg_submission = pd.read_csv('../input/nb-log-reg-3-multilabelskf-ver2-saved-pyinference/submission.csv')
    marge_keras_submission = pd.read_csv('../input/marge-keras-v2-load-model/submission.csv')
else: # if private test set, we have to rerun inference  
    print("Reruning inference for keras nn")
    !python ../input/keras-neural-net/inference.py
    keras_submission = pd.read_csv('./submission.csv')
    
    print("Reruning inference for xgboost")
    ! python ../input/xgboost-baseline-saved-model-kadri/inference.py
    xboost_submission = pd.read_csv('./submission.csv')
    
    print("Reruning inference for logistic regression")
    #! python ../input/nb-log-reg-3-multilabelskf-ver2-saved-pyinference/inference.py
    #logreg_submission = pd.read_csv('./submission.csv')
    
    print("Reruning inference for keras nn2")
    ! python ../input/marge-keras-v2-load-model/inference.py
    marge_keras_submission = pd.read_csv('./submission.csv')
    
del keras_submission['sig_id']
del xboost_submission['sig_id']
#del logreg_submission['sig_id']
del marge_keras_submission['sig_id']

In [6]:
def create_model(input_size, n_targets=206):
    """Build and compile the blending network.

    The network is a single hidden ReLU layer as wide as its input, followed
    by a sigmoid output layer with one unit per target.

    Args:
        input_size: Number of input features (width of the stacked base-model
            predictions).
        n_targets: Number of output units. Defaults to 206, the value that was
            previously hard-coded.

    Returns:
        A compiled ``keras.Sequential`` model using AdamW and binary
        cross-entropy.
    """
    model = keras.Sequential([
        # Explicit 1-D shape tuple instead of a bare int.
        Input(shape=(input_size,)),
        Dense(input_size, activation="relu"),
        Dense(n_targets, activation="sigmoid")
    ])

    # `learning_rate` is the documented argument name; `lr` is a deprecated alias.
    optimizer = tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5, clipvalue=756)
    model.compile(loss=BinaryCrossentropy(label_smoothing=1e-15), optimizer=optimizer)
    return model

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor="val_loss", min_delta=0, patience=5, verbose=1, mode="auto", baseline=None, restore_best_weights=True)
# Multiply the learning rate by 0.1 after 3 epochs without a val_loss improvement
# of at least 1e-4. `min_delta` replaces the deprecated `epsilon` keyword.
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')

In [7]:
# Zero-filled holder for the blender's validation predictions, with the same
# index and columns as train_targets.
target_cols = train_targets.columns
blended_val_pred = pd.DataFrame(0, index=train_targets.index, columns=target_cols)

# Test-set holder starts from the sample submission (so sig_id is kept) with
# every target column reset to zero.
blended_test_pred = pd.read_csv('../input/lish-moa/sample_submission.csv')
blended_test_pred.loc[:, target_cols] = 0

In [8]:
MAX_EPOCHS = 50
BATCH_SIZE = 64
FOLDS = 10
SEED = 42

# Seed TensorFlow's global generator so weight initialisation and batch
# shuffling are repeatable across runs (as far as the backend allows).
tf.random.set_seed(SEED)

# Blender inputs: the base models' predictions placed side by side.
val_submissions = np.hstack((keras_val_submission, xboost_val_submission, marge_keras_val_submission)).astype('float32')
test_submissions = np.hstack((keras_submission, xboost_submission, marge_keras_submission)).astype('float32')

# Accumulate in local buffers rather than adding into the global frames, so
# re-running this cell cannot double-count predictions from an earlier run.
target_cols = list(train_targets.columns)
oof_pred = np.zeros((len(train_targets), len(target_cols)), dtype='float64')
test_pred = np.zeros((len(test_submissions), len(target_cols)), dtype='float64')

# NOTE(review): `reduce_lr_loss` is defined alongside `early_stopping` but is
# not passed to `fit` here -- confirm whether it was meant to be used.
mskf = MultilabelStratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=True)
for n, (train_idx, val_idx) in enumerate(mskf.split(train_targets, train_targets)):
    X_train, X_val = val_submissions[train_idx, :], val_submissions[val_idx, :]
    y_train, y_val = train_targets.iloc[train_idx, :], train_targets.iloc[val_idx, :]

    model = create_model(val_submissions.shape[1])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, callbacks=[early_stopping])

    # Each row is in exactly one validation fold, so plain positional
    # assignment is enough for the out-of-fold predictions.
    oof_pred[val_idx] = model.predict(X_val)
    # Test predictions are the mean over the fold models.
    test_pred += model.predict(test_submissions) / FOLDS

# Replace the target columns wholesale; this also gives them a float dtype
# instead of writing floats into the zero-initialised integer columns.
blended_val_pred[target_cols] = oof_pred
blended_test_pred[target_cols] = test_pred



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 00048: early stopping
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 

In [9]:
def log_loss_metric(y_true, y_pred):
    """Return the mean of the per-target binary log losses.

    For every column of the module-level ``train_targets`` frame, the log loss
    between ``y_true[column]`` and ``y_pred[column]`` (cast to float) is
    computed with labels fixed to ``[0, 1]``; the results are then averaged.

    Args:
        y_true: DataFrame of true labels containing the target columns.
        y_pred: DataFrame of predicted probabilities with the same columns.

    Returns:
        The mean log loss across all target columns.
    """
    per_target_losses = [
        log_loss(y_true.loc[:, column], y_pred.loc[:, column].astype(float), labels=[0, 1])
        for column in train_targets.columns
    ]
    return np.mean(per_target_losses)

In [10]:
# Score the blend as produced by the network.
oof_score_before = log_loss_metric(train_targets, blended_val_pred)
print(f'NN OOF before postprocessing: {oof_score_before:.6f}')

# Force every prediction to zero on rows whose cp_type is 'ctl_vehicle',
# in both the validation and the test predictions.
train_is_control = train_features['cp_type'] == 'ctl_vehicle'
test_is_control = test_features['cp_type'] == 'ctl_vehicle'
blended_val_pred.loc[train_is_control, train_targets.columns] = 0
blended_test_pred.loc[test_is_control, train_targets.columns] = 0

# Re-score to show the effect of the override.
oof_score_after = log_loss_metric(train_targets, blended_val_pred)
print(f'NN OOF after postprocessing: {oof_score_after:.6f}')

NN OOF before postprocessing: 0.014261
NN OOF after postprocessing: 0.014257


In [11]:
# Write the blended test predictions to submission.csv in the working
# directory, without the DataFrame index column.
blended_test_pred.to_csv('submission.csv', index=False)