### Import Libraries

In [None]:
!pip install ../input/moaiterative/iterative_stratification-0.1.6-py3-none-any.whl

In [None]:
import gc
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import (KNeighborsClassifier, NeighborhoodComponentsAnalysis)
from joblib import dump, load

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.layers as L
from tensorflow.keras.models import load_model
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,losses

from tensorflow.keras.callbacks import (
    ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [None]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random`` module, NumPy's
    global generator and TensorFlow's global generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

SEED = 1984
seed_everything(SEED)

### Read Data

In [None]:
# Competition tables: training features and scored targets, test features,
# and the submission template.
x_develop = pd.read_csv('../input/lish-moa/train_features.csv')
y_develop = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
x_test = pd.read_csv('../input/lish-moa/test_features.csv')
sub = pd.read_csv('../input/lish-moa/sample_submission.csv')

# Feature groups are recognised by their column-name prefix.
c_cols = x_develop.columns[[name.startswith('c-') for name in x_develop.columns]]
g_cols = x_develop.columns[[name.startswith('g-') for name in x_develop.columns]]
# Every column after the first one (sig_id) is a target.
target_cols = y_develop.columns[1:]
N_TARGETS = target_cols.size
N_TARGETS

### Preprocessing

#### Encode Categorical Features

In [None]:
def preprocess_df(df):
    """Encode the categorical columns of a feature table.

    The caller's frame is left untouched; a new frame is returned.

    Args:
        df: frame with ``cp_type``, ``cp_dose`` and ``cp_time`` columns and,
            optionally, a ``sig_id`` column.

    Returns:
        A frame indexed by ``sig_id`` (when that column was present) in which
        ``cp_type`` is 1 for ``'trt_cp'`` and 0 otherwise, ``cp_dose`` is 1 for
        ``'D2'`` and 0 otherwise, and ``cp_time`` is replaced by one
        ``cp_time_<value>`` indicator column per distinct value.
    """
    # set_index returns a new frame; copy otherwise, so the in-place column
    # assignments below never leak into the frame the caller passed in.
    out = df.set_index('sig_id') if 'sig_id' in df.columns else df.copy()
    out['cp_type'] = (out['cp_type'] == 'trt_cp').astype(int)
    out['cp_dose'] = (out['cp_dose'] == 'D2').astype(int)

    time_dummies = pd.get_dummies(out['cp_time'], drop_first=False, prefix='cp_time')
    return out.join(time_dummies).drop('cp_time', axis=1)

In [None]:
# Encode the categorical columns of both feature tables; all three tables end
# up indexed by sig_id.
x_develop = preprocess_df(x_develop)
y_develop = y_develop.set_index('sig_id')
x_test = preprocess_df(x_test)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


from sklearn import preprocessing
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_data(train_path, cols):
    """Read one or several CSV files, keep ``cols`` and standardise the values.

    Args:
        train_path: path of a CSV file, or a list of such paths whose rows are
            stacked in the given order.
        cols: labels of the columns to keep.

    Returns:
        ``(x, standardizer)``: the 2-D array of standardised values (one
        column per entry of ``cols``) and the ``StandardScaler`` fitted on it.
    """
    paths = train_path if isinstance(train_path, list) else [train_path]
    # pd.concat returns the stacked frame. The earlier DataFrame.append call
    # discarded its return value, so every file after the first was ignored.
    df_base = pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)[cols]
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # Standardise values column by column.
    standardizer = preprocessing.StandardScaler()
    x = standardizer.fit_transform(x)
    return x, standardizer

def numpyToTensor(x):
    """Wrap a NumPy array as a torch tensor placed on the module-level ``device``."""
    return torch.from_numpy(x).to(device)

from torch.utils.data import Dataset, DataLoader

class DataBuilder(Dataset):
    """Dataset over the standardised ``cols`` of the CSV file(s) at ``path``.

    ``x`` holds the standardised values as a tensor on ``device``,
    ``standardizer`` the scaler that produced them and ``len`` the row count.
    """

    def __init__(self, path, cols):
        features, self.standardizer = load_data(path, cols)
        self.x = numpyToTensor(features)
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index]

    def __len__(self):
        return self.len

In [None]:
class Autoencoder(nn.Module):
    """Variational autoencoder made of fully connected layers with batch norm.

    Layer widths: ``D_in -> H -> H2 -> H2 -> latent_dim`` on the way in and
    ``latent_dim -> latent_dim -> H2 -> H2 -> H -> D_in`` on the way out.
    Attribute names and creation order are part of the saved state dicts, so
    they must not be changed.
    """

    def __init__(self, D_in, H, H2, latent_dim):
        super(Autoencoder, self).__init__()

        # Encoder
        self.linear1 = nn.Linear(D_in, H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2 = nn.Linear(H, H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3 = nn.Linear(H2, H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

        # Latent heads: fc21 yields mu, fc22 yields logvar
        self.fc1 = nn.Linear(H2, latent_dim)
        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

        # Layers applied to the sampled latent vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
        self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
        self.fc_bn4 = nn.BatchNorm1d(H2)

        # Decoder
        self.linear4 = nn.Linear(H2, H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5 = nn.Linear(H2, H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6 = nn.Linear(H, D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.leaky_relu = nn.LeakyReLU()

    def encode(self, x):
        """Return the two latent heads (used as ``mu`` and ``logvar``) for a batch."""
        hidden = x
        for dense, norm in ((self.linear1, self.lin_bn1),
                            (self.linear2, self.lin_bn2),
                            (self.linear3, self.lin_bn3)):
            hidden = self.leaky_relu(norm(dense(hidden)))

        bottleneck = F.leaky_relu(self.bn1(self.fc1(hidden)))
        return self.fc21(bottleneck), self.fc22(bottleneck)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * exp(logvar / 2)`` while training; return ``mu`` otherwise."""
        if not self.training:
            return mu
        sigma = logvar.mul(0.5).exp_()
        noise = Variable(sigma.data.new(sigma.size()).normal_())
        return noise.mul(sigma).add_(mu)

    def decode(self, z):
        """Map a latent batch back to input space; the last layer has no activation."""
        hidden = z
        for dense, norm in ((self.fc3, self.fc_bn3),
                            (self.fc4, self.fc_bn4),
                            (self.linear4, self.lin_bn4),
                            (self.linear5, self.lin_bn5)):
            hidden = self.leaky_relu(norm(dense(hidden)))
        return self.lin_bn6(self.linear6(hidden))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for a batch."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        # The decoded sample is what the training loop later calls recon_batch.
        return self.decode(latent), mu, logvar
    
class customLoss(nn.Module):
    """VAE objective: summed squared reconstruction error plus the KLD term."""

    def __init__(self):
        super(customLoss, self).__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")

    def forward(self, x_recon, x, mu, logvar):
        # x_recon is the reconstruction returned by the model's forward pass,
        # x the original batch, mu and logvar the model's latent outputs.
        reconstruction_error = self.mse_loss(x_recon, x)
        kld_terms = 1 + logvar - mu.pow(2) - logvar.exp()
        kld = -0.5 * torch.sum(kld_terms)
        return reconstruction_error + kld
    
# Initialiser for use with ``model.apply``: only modules whose class name
# contains 'Linear' are touched.
def weights_init_uniform_rule(m):
    """Draw weights from U(-1/sqrt(n), 1/sqrt(n)), n = ``in_features``, and zero the bias."""
    if 'Linear' not in m.__class__.__name__:
        return
    bound = 1.0 / np.sqrt(m.in_features)
    m.weight.data.uniform_(-bound, bound)
    m.bias.data.fill_(0)
        
def train(epoch, dataloader, model, optimizer):
    """Run one optimisation pass over ``dataloader``.

    The model is put in training mode and updated once per batch. On every
    100th epoch the summed loss divided by the dataset size is printed.
    """
    criterion = customLoss()
    model.train()
    running_loss = 0
    for batch in dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstruction, mu, logvar = model(batch)
        batch_loss = criterion(reconstruction, batch, mu, logvar)
        batch_loss.backward()
        running_loss += batch_loss.item()
        optimizer.step()

    if epoch % 100 != 0:
        return
    print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, running_loss / len(dataloader.dataset)))
        
        
def encode_features(data_loader, model, optimizer):
    """Encode every batch of ``data_loader`` and return the stacked ``mu`` outputs.

    Args:
        data_loader: iterable of input batches (tensors).
        model: module whose call returns ``(reconstruction, mu, logvar)``.
        optimizer: unused; kept so existing callers keep working.

    Returns:
        Tensor holding the ``mu`` output of all batches concatenated along
        dimension 0, in loader order.

    Raises:
        ValueError: if ``data_loader`` yields no batch.
    """
    was_training = model.training
    # Feature extraction has to run in eval mode: in training mode the
    # BatchNorm layers normalise with per-batch statistics and keep updating
    # their running averages, even inside torch.no_grad().
    model.eval()
    mu_batches = []
    try:
        with torch.no_grad():
            for data in data_loader:
                _, mu, _ = model(data.to(device))
                mu_batches.append(mu)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    if not mu_batches:
        raise ValueError('encode_features received an empty data_loader')
    # Concatenate once instead of rebuilding the result after every batch.
    return torch.cat(mu_batches, dim=0)


def get_encoder_features(TRAIN_DATA_PATH, TEST_DATA_PATH, cols, latent_dim, name, train_flag=True):
    """Train or load an ``Autoencoder`` on ``cols`` and encode train and test rows.

    Args:
        TRAIN_DATA_PATH: CSV file with the training rows.
        TEST_DATA_PATH: CSV file with the test rows.
        cols: feature columns fed to the autoencoder.
        latent_dim: size of the latent representation.
        name: file the model state dict is saved to (training) or loaded from.
        train_flag: train a fresh model on train+test rows when True,
            otherwise load the weights stored at ``name``.

    Returns:
        ``(train_encoder, test_encoder)``: latent ``mu`` tensors for the rows
        of the train and the test file, in file order.
    """
    print('Reading train data...')
    data_set_train = DataBuilder(TRAIN_DATA_PATH, cols)
    trainloader = DataLoader(dataset=data_set_train, batch_size=512)

    print('Reading test data...')
    data_set_test = DataBuilder(TEST_DATA_PATH, cols)
    testloader = DataLoader(dataset=data_set_test, batch_size=512)

    print('Prepare params...')
    D_in = data_set_train.x.shape[1]
    # Hidden widths shrink from D_in towards latent_dim in quarter steps.
    H = D_in - (D_in - latent_dim) // 4
    H2 = D_in - (D_in - latent_dim) // 2
    model = Autoencoder(D_in, H, H2, latent_dim).to(device)
    model.apply(weights_init_uniform_rule)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    epochs = 1000

    print("train or load model...")
    if train_flag:
        # The combined train+test set is only needed for fitting, so it is
        # read here rather than on every call.
        print('Reading data...')
        data_set = DataBuilder([TRAIN_DATA_PATH, TEST_DATA_PATH], cols)
        loader = DataLoader(dataset=data_set, batch_size=512)

        print("train model...")
        for epoch in range(1, epochs + 1):
            train(epoch, loader, model, optimizer)
        torch.save(model.state_dict(), name)
    else:
        print("load model...")
        # map_location lets a checkpoint written on a GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(name, map_location=device))

    # Switch to eval mode in both branches: a freshly trained model would
    # otherwise encode with batch-dependent BatchNorm statistics.
    model.eval()

    print("extract features...")
    train_encoder = encode_features(trainloader, model, optimizer)
    test_encoder = encode_features(testloader, model, optimizer)

    return train_encoder, test_encoder

In [None]:
TRAIN_DATA_PATH = '../input/lish-moa/train_features.csv'
TEST_DATA_PATH = '../input/lish-moa/test_features.csv'

# Latent sizes of the gene and cell autoencoders.
gene_encode_dim = 50
cell_encode_dim = 30

GENES = g_cols.to_list()
CELLS = c_cols.to_list()

genes_model_name = '../input/update-mx10-model/autoencode_genes_50_30_default'
cells_model_name = '../input/update-mx10-model/autoencode_cells_50_30_default'

# train_flag=False: the stored weights are loaded instead of training.
train_gene_encoder, test_gene_encoder = get_encoder_features(
    TRAIN_DATA_PATH, TEST_DATA_PATH, GENES, gene_encode_dim, genes_model_name,
    train_flag=False)
train_cell_encoder, test_cell_encoder = get_encoder_features(
    TRAIN_DATA_PATH, TEST_DATA_PATH, CELLS, cell_encode_dim, cells_model_name,
    train_flag=False)

vae_gene_cols = [f'VAE_G_{i}' for i in range(gene_encode_dim)]
vae_cell_cols = [f'VAE_C_{i}' for i in range(cell_encode_dim)]

train_gene_encoder_pd = pd.DataFrame(train_gene_encoder.cpu().numpy(), columns=vae_gene_cols)
test_gene_encoder_pd = pd.DataFrame(test_gene_encoder.cpu().numpy(), columns=vae_gene_cols)

train_cell_encoder_pd = pd.DataFrame(train_cell_encoder.cpu().numpy(), columns=vae_cell_cols)
test_cell_encoder_pd = pd.DataFrame(test_cell_encoder.cpu().numpy(), columns=vae_cell_cols)


In [None]:
# The encoder frames carry no ids yet. Their rows are labelled positionally
# with the sig_ids of the feature tables, which assumes both were produced
# from the same CSV rows in the same order.
train_gene_encoder_pd['sig_id'] = x_develop.index.to_list()
train_cell_encoder_pd['sig_id'] = x_develop.index.to_list()
test_gene_encoder_pd['sig_id'] = x_test.index.to_list()
test_cell_encoder_pd['sig_id'] = x_test.index.to_list()

x_develop = (
    x_develop
    .merge(train_gene_encoder_pd, on='sig_id')
    .merge(train_cell_encoder_pd, on='sig_id')
)

x_test = (
    x_test
    .merge(test_gene_encoder_pd, on='sig_id')
    .merge(test_cell_encoder_pd, on='sig_id')
)

#### Decomposition Features (PCA, FastICA, Factor Analysis, Feature Agglomeration)

In [None]:
from sklearn.decomposition import FactorAnalysis, FastICA
from sklearn.cluster import FeatureAgglomeration


def _add_decomposition(train_df, test_df, feature_cols, transformer_path, prefix, n_comp):
    """Append the output of a persisted transformer to both feature tables.

    Args:
        train_df, test_df: feature tables; both must contain ``feature_cols``.
        feature_cols: columns handed to the transformer.
        transformer_path: joblib file holding the fitted transformer.
        prefix: new columns are named ``f'{prefix}-{i}'``.
        n_comp: number of columns the transformer outputs.

    Returns:
        ``[train_df, test_df]`` with the new columns appended on the right.
    """
    transformer = load(transformer_path)
    augmented = []
    for frame in (train_df, test_df):
        components = pd.DataFrame(
            transformer.transform(frame[feature_cols]),
            columns=[f'{prefix}-{i}' for i in range(n_comp)],
            index=frame.index,
        )
        augmented.append(pd.concat((frame, components), axis=1))
    return augmented


# One persisted transformer per (method, feature family). The nesting order
# fixes the order in which the new columns are appended: for every method
# first the g-features (80 components), then the c-features (40 components).
for _method in ('pca', 'FastICA', 'FactorAnalysis', 'FeatureAgglomeration'):
    for _family, _feature_cols, _n_comp in (('g', g_cols, 80), ('c', c_cols, 40)):
        x_develop, x_test = _add_decomposition(
            x_develop, x_test, _feature_cols,
            f'../input/update-mx10-model/mx10_{_method}_{_family}.joblib',
            f'{_method}_{_family}', _n_comp)



In [None]:
def stat_feat(df, features_g, features_c):
    """Add row-wise summary statistics of the gene and cell features to ``df``.

    For each of the column groups ``g`` (``features_g``), ``c``
    (``features_c``) and ``gc`` (both lists concatenated) the columns
    ``<group>_sum``, ``_mean``, ``_std``, ``_kurt`` and ``_skew`` are written
    into ``df`` in place.

    Returns:
        ``(df, names)`` where ``names`` lists the 15 new columns in creation order.
    """
    reducers = (
        ('sum', lambda block: block.sum(axis=1)),
        ('mean', lambda block: block.mean(axis=1)),
        ('std', lambda block: block.std(axis=1)),
        ('kurt', lambda block: block.kurtosis(axis=1)),
        ('skew', lambda block: block.skew(axis=1)),
    )
    groups = (
        ('g', features_g),
        ('c', features_c),
        ('gc', features_g + features_c),
    )

    created = []
    for prefix, columns in groups:
        for suffix, reduce_rows in reducers:
            column_name = f'{prefix}_{suffix}'
            df[column_name] = reduce_rows(df[columns])
            created.append(column_name)
    return df, created

# Append the row-wise statistics to both feature tables.
x_develop, new_feat = stat_feat(x_develop, g_cols.to_list(), c_cols.to_list())
x_test, new_feat = stat_feat(x_test, g_cols.to_list(), c_cols.to_list())

# The former `cont_cols += new_feat` was dropped: cont_cols does not exist yet
# at this point (NameError on a fresh kernel). It is built further down from
# x_develop's columns, which by then already contain these features.

#### Variance Threshold

In [None]:
class VarianceThreshold:
    """Drop continuous columns whose variance does not exceed a threshold.

    NOTE: this class shadows the ``VarianceThreshold`` imported from
    ``sklearn.feature_selection`` at the top of the notebook.
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def fit(self, df, cont_cols):
        """Compute the variance of ``cont_cols`` in ``df`` and decide what to keep.

        Sets ``var`` (per-column variance), ``index`` (columns above the
        threshold, in variance-table order), ``validcols`` (kept columns in
        ``cont_cols`` order) and ``dropcols`` (the remaining ``cont_cols``).
        """
        self.cont_cols = cont_cols
        self.var = df[cont_cols].var()
        good_cols = self.var[self.var > self.threshold]
        self.index = good_cols.index.to_list()
        # Evaluate the mask once and test membership against a set; the
        # selection used to be recomputed for every single column.
        kept = set(self.index)
        self.dropcols = [x for x in cont_cols if x not in kept]
        self.validcols = [x for x in cont_cols if x in kept]

    def transform(self, df):
        """Return ``df`` without the columns rejected by ``fit``."""
        return df.drop(self.dropcols, axis=1)

    def fit_transform(self, df, cont_cols):
        """Fit on ``df`` and return ``(transformed df, kept columns)``."""
        self.fit(df, cont_cols)
        return self.transform(df), self.validcols

In [None]:
# Every column except the listed id/categorical names counts as continuous.
# Only exact names are excluded, so the cp_time_<value> indicator columns
# created by preprocess_df remain in cont_cols.
cont_cols = [col for col in x_develop.columns.to_list() if col not in ['sig_id','cp_type','cp_time','cp_dose']]

In [None]:
# Fixed cut-off; the commented expression is an alternative, data-driven choice.
threshold = 0.2 #x_develop[cont_cols].var().sort_values().quantile(0.01)
print('Variance Threshold:', threshold)
VarThres = VarianceThreshold(threshold)
# The columns to drop are decided on the training table only and then removed
# from the test table as well; cont_cols is narrowed to the surviving columns.
x_develop, cont_cols = VarThres.fit_transform(x_develop, cont_cols)
x_test = VarThres.transform(x_test)

#### Transform Numerical Features

In [None]:
# Apply the persisted transformer as stored, like the other persisted
# transformers in this notebook. Calling fit_transform here re-fitted it on
# x_develop and threw away whatever state had been saved with the models.
# NOTE(review): assumes mx10_qt.joblib holds an already fitted transformer - confirm.
qt = load('../input/update-mx10-model/mx10_qt.joblib')
x_develop[cont_cols] = qt.transform(x_develop[cont_cols])
x_test[cont_cols] = qt.transform(x_test[cont_cols])

### Group Data Into Folds

In [None]:
def create_folds(df, fold_no, fold_type='mls_kfold', save=False):
    """
    Assign every row of ``df`` to a cross-validation fold, in place.

    df: target dataframe indexed by ``sig_id``; a ``Fold`` column with the
        fold number of each row is added to it.
    fold_no: number of folds.
    fold_type: ``'kfold'`` (shuffled KFold) or ``'mls_kfold'``
        (MultilabelStratifiedKFold), both seeded with ``SEED``.
    save: unused.

    Raises ValueError for any other ``fold_type``; ``df`` is not modified in
    that case.
    """
    if fold_type == 'kfold':
        kf = KFold(n_splits=fold_no, shuffle=True, random_state=SEED)
    elif fold_type == 'mls_kfold':
        kf = MultilabelStratifiedKFold(n_splits=fold_no, random_state=SEED)
    else:
        # Used to fall through and fail later on an unbound `kf`, after df
        # had already been modified.
        raise ValueError(f"fold_type must be 'kfold' or 'mls_kfold', got {fold_type!r}")

    df['Fold'] = -1
    # Positional row labels are needed for df.loc[v, ...] below. The splitter
    # receives the whole frame (including sig_id and Fold) as both X and y;
    # this is kept as is so the fold assignment stays unchanged.
    df.reset_index(inplace=True)
    for fold, (t, v) in enumerate(kf.split(df, df)):
        df.loc[v, 'Fold'] = fold
    df.set_index('sig_id', inplace=True)

In [None]:
# Number of folds and splitter type; create_folds writes the assignment into
# y_develop['Fold'] in place.
N_FOLDS = 30
fold_type = 'mls_kfold'
create_folds(y_develop, fold_no=N_FOLDS, fold_type=fold_type, save=True)

### Define Model Architecture

In [None]:
class Model():
    """Factory for the scored-target classifier built on top of a pretrained network."""

    def __init__(self, input_shape, output_bias=None):
        """Store the input width and the optional initial bias of the output layer."""
        self.input_shape = input_shape
        self.output_bias = output_bias

    def create_model1(self, trans_model):
        """Build and compile a model that reuses ``trans_model`` up to layer 7.

        A weight-normalised sigmoid Dense layer with ``N_TARGETS`` units is
        attached to ``trans_model.layers[7].output``.

        Args:
            trans_model: Keras model providing the input and hidden layers.

        Returns:
            The compiled ``tf.keras.Model``.
        """
        # Wrap the instance's own bias (this used to read a global named
        # output_bias) and keep self.output_bias untouched so the method can
        # be called more than once.
        bias_initializer = self.output_bias
        if bias_initializer is not None:
            bias_initializer = tf.keras.initializers.Constant(self.output_bias)
#         trans_model.trainable = False

        x = trans_model.layers[7].output

        outputs = tfa.layers.WeightNormalization(L.Dense(N_TARGETS,
                                                         activation='sigmoid',
                                                         bias_initializer=bias_initializer,
                                                         name="scored_dense"
                                                        ),
                                                 name='scored_weight'
                                                 )(x)
        model = tf.keras.Model(inputs=trans_model.input, outputs=outputs)

        # Plain binary cross-entropy (no label smoothing), reported as 'mean_loss'.
        metric_fns = [tf.keras.losses.BinaryCrossentropy(name='mean_loss')]

        OPTIMIZER = tfa.optimizers.Lookahead(
            tfa.optimizers.AdamW(weight_decay=1e-5),
            sync_period=5)
        model.compile(optimizer=OPTIMIZER, loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001), metrics=metric_fns)

        return model

In [None]:
# Move sig_id from a column back into the index of both feature tables.
x_develop = x_develop.set_index('sig_id')
x_test = x_test.set_index('sig_id')

### Cross Validation

In [None]:
def calc_pred(x_df, models):
    """Average the predictions of ``models`` on ``x_df`` and zero non-treated rows.

    Args:
        x_df: feature table; must contain the encoded ``cp_type`` column.
        models: dict or sequence whose entries are either objects with a
            ``predict`` method or, for a constant baseline, a NumPy vector
            (only the first entry is used in that case).

    Returns:
        DataFrame indexed like ``x_df`` with one column per entry of
        ``target_cols``; rows with ``cp_type == 0`` are set to 0.

    Raises:
        ValueError: if ``models`` is empty.
    """
    # Work on the entries themselves, so neither the keys nor their order
    # have to start at 0.
    members = list(models.values()) if isinstance(models, dict) else list(models)
    if not members:
        raise ValueError('calc_pred needs at least one model')

    if isinstance(members[0], np.ndarray):
        pred_test = np.repeat([members[0]], len(x_df), axis=0)
    else:
        pred_test = sum(member.predict(x_df) for member in members) / len(members)

    pred_test = pd.DataFrame(index=x_df.index, columns=target_cols, data=pred_test)
    pred_test.loc[x_df['cp_type'] == 0] = 0
    return pred_test


def oof_score(oof: dict):
    """Return ``(mean, standard deviation)`` of the per-fold scores in ``oof``."""
    scores = list(oof.values())
    return np.mean(scores), np.std(scores)


def combine_pred(pred):
    """Stack the per-fold arrays in ``pred`` along axis 0, in insertion order.

    Args:
        pred: dict mapping a fold id to an array; all arrays must agree in
            every dimension but the first.

    Returns:
        One array holding the rows of all folds.
    """
    # A single concatenate replaces the repeated np.append calls and no
    # longer requires a key 0 to come first.
    return np.concatenate(list(pred.values()), axis=0)



def run_cv(xtrain=x_develop, ytrain=y_develop, model=None, N_FOLDS=N_FOLDS, summary=True, debug=False):
    """Score the stored per-fold models on their held-out folds.

    For every fold listed in ``ytrain['Fold']`` the weights saved for that
    fold are loaded, the validation rows are predicted and the mean binary
    cross-entropy is computed before and after post-processing (predictions
    of rows with ``cp_type == 0`` set to 0).

    Args:
        xtrain: feature table indexed by ``sig_id``.
        ytrain: target table with the same index and a ``Fold`` column.
        model: unused; the fold models are loaded from disk, not trained here.
        N_FOLDS: number of folds, used to pre-size the result dictionaries.
        summary: print mean and standard deviation of the fold scores.
        debug: unused.

    Returns:
        ``(models, histories, oof_ap, pred_val_fold, pred_val_fold,
        target_val_fold, oof_pd)``: loaded models per fold, the (unfilled)
        history placeholders, post-processed fold scores, the stacked
        post-processed validation predictions (returned twice, as callers
        unpack seven values), the stacked validation targets, and a frame
        with the predictions plus a ``sig_id`` column.
    """
    histories = {x: '' for x in range(N_FOLDS)}
    models = {x: '' for x in range(N_FOLDS)}
    oof_bp = {x: [] for x in range(N_FOLDS)}
    oof_ap = {x: [] for x in range(N_FOLDS)}
    pred_val_fold = {x: [] for x in range(N_FOLDS)}
    target_val_fold = {x: [] for x in range(N_FOLDS)}
    oof_frames = []

    # Use the tables that were passed in; the body used to read the globals
    # x_develop / y_develop regardless of the arguments.
    for foldno in np.sort(ytrain['Fold'].unique()):
        in_val = ytrain['Fold'] == foldno
        x_val_fold = xtrain[in_val]
        y_val_fold = ytrain[in_val].drop('Fold', axis=1)

        print(" ")
        print(f"Fold-%d" % (foldno))
        print("Train sample size:", int((~in_val).sum()), ", Validation sample size:", len(y_val_fold))

        models[foldno] = load_model(f'../input/update-mx10-model/weights-fold{foldno}.h5', compile=False)

        # Predict Validation Probabilities
        pred_val_fold[foldno] = models[foldno].predict(x_val_fold)

        # Calculate OOF (Validation) Results
        oof_bp[foldno] = tf.keras.losses.binary_crossentropy(y_val_fold, pred_val_fold[foldno]).numpy().mean()
        print('Out-of-Fold Score: ', oof_bp[foldno])

        # Post-processing: zero the predictions of rows with cp_type == 0.
        # This line used to be a comparison (`== 0`) and therefore did nothing.
        pred_val_fold[foldno][(x_val_fold['cp_type'] == 0).to_numpy()] = 0
        oof_ap[foldno] = tf.keras.losses.binary_crossentropy(y_val_fold, pred_val_fold[foldno]).numpy().mean()
        print('Out-of-Fold Score with post processing: ', oof_ap[foldno])

        target_val_fold[foldno] = y_val_fold.to_numpy()

        # Columns are named '0', '1', ... after the target position.
        n_outputs = pred_val_fold[foldno].shape[1]
        cur_oof_pd = pd.DataFrame(pred_val_fold[foldno], columns=[f'{i}' for i in range(n_outputs)])
        cur_oof_pd['sig_id'] = y_val_fold.index
        oof_frames.append(cur_oof_pd)

    # Concatenate once; DataFrame.append inside the loop was quadratic and no
    # longer exists in pandas 2.
    oof_pd = pd.concat(oof_frames) if oof_frames else pd.DataFrame()

    pred_val_fold = combine_pred(pred_val_fold)
    target_val_fold = combine_pred(target_val_fold)

    print('\n')
    if summary:
        print('Summary')
        # Mean out of score before postprocessing
        print('Mean OOF score: %f +/- %f' % (oof_score(oof_bp)))

        # Mean out of score after postprocessing
        print('Mean OOF score after postprocessing: %f +/- %f' % (oof_score(oof_ap)))

    return models, histories, oof_ap, pred_val_fold, pred_val_fold, target_val_fold, oof_pd


def submit(res):
    """Write ``res``, with its index turned back into a column, to ``submission.csv``."""
    res.reset_index().to_csv('submission.csv', index=False)

In [None]:
# RUN THE TRAINING
EPOCHS = 45
SAVE_MODEL = True

In [None]:
# Initial bias for the output layer, derived from the mean of every target
# column (columns[:-1] leaves out the trailing 'Fold' column).
# NOTE(review): -log(mean) is positive for rates below 1, which would start a
# sigmoid output close to 1 - confirm whether log(p / (1 - p)) was intended.
output_bias = -np.log(y_develop[y_develop.columns[:-1]].mean(axis=0).to_numpy())
# run_cv returns its stacked validation predictions twice, hence the repeated name.
models, histories, oof_ap, pred_val_fold, pred_val_fold, target_val_fold, oof_pd = run_cv(x_develop, y_develop,
                                                                                          model=Model(x_develop.shape[1], output_bias),
                                                                                          N_FOLDS=N_FOLDS,
                                                                                          debug=False)
pred_test = calc_pred(x_test, models) # This is for single model submission
submit(pred_test)

In [None]:
def log_loss_score(actual, predicted,  eps=1e-15):
    """Mean binary cross-entropy of ``predicted`` against ``actual``.

    ``eps`` is added inside both logarithms to keep them finite.
    """
    positive_term = actual * np.log(predicted + eps)
    negative_term = (1 - actual) * np.log(1 - predicted + eps)
    return -(negative_term + positive_term).mean()

def log_loss_multi(y_true, y_pred):
    """Average of ``log_loss_score`` over the columns of two 2-D arrays."""
    column_losses = np.array(
        [log_loss_score(y_true[:, j], y_pred[:, j]) for j in range(y_true.shape[1])],
        dtype=float,
    )
    return column_losses.mean()
# Bring the out-of-fold predictions into the row order of y_develop and keep
# only the 206 prediction columns, which are named '0' ... '205'.
oof_final = y_develop.merge(oof_pd, how='left', on='sig_id').loc[:, [str(col) for col in range(206)]]

In [None]:
# Column-averaged log loss of the out-of-fold predictions against the
# targets as stored on disk.
y_true = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
log_loss_multi(y_true.drop(columns=['sig_id']).to_numpy(), oof_final.to_numpy())

In [None]:
# Persist the out-of-fold prediction matrix as a NumPy file.
np.save('LBS.npy', oof_final.to_numpy())

In [None]:
# Histogram of the per-fold out-of-fold scores.
pd.Series(oof_ap).to_frame().hist()
plt.show()