In [None]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import numpy as np
import pandas as pd
import os
import random
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
import tensorflow_addons as tfa

# read datasets

In [None]:
# Load the competition CSV files.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
train_drug = pd.read_csv("../input/lish-moa/train_drug.csv")

# Train rows stacked on top of test rows, original row labels kept.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with default arguments builds the same frame.
data = pd.concat([train_features, test_features])

# Submission template; its target columns are overwritten with predictions later.
ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

# main parameters
The parameter values below are taken from https://www.kaggle.com/vbmokin/moa-pytorch-rankgauss-pca-nn-upgrade-3d-visual

In [None]:
n_comp_GENES = 463  # n_components of the PCA fitted on the 'g-' columns
n_comp_CELLS = 60  # n_components of the PCA fitted on the 'c-' columns
VarianceThreshold_for_FS = 0.9  # threshold handed to VarianceThreshold for feature selection
NFOLDS = 5  # n_splits of each MultilabelStratifiedKFold
NSEEDS = 5  # number of K-fold repeats, each with a different split random_state

# set seeds

In [None]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and TensorFlow random number generators.

    The same value is also written to the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)


seed_everything(seed=42)

# transform columns to normal dist with rankgauss (QuantileTransformer)

In [None]:
# Feature names grouped by prefix: 'g-' columns and 'c-' columns.
GENES = [name for name in train_features.columns if name.startswith('g-')]
CELLS = [name for name in train_features.columns if name.startswith('c-')]

In [None]:
# All gene and cell feature names, genes first.
col_names = GENES + CELLS
# One example column, identified by position; the name is derived from the
# index so the two can never disagree.
col_example_index = 300
col_example_name = col_names[col_example_index]

In [None]:
# Map every gene / cell column to a normal distribution, one column at a time.
# Each transformer is fitted on the training column only and then applied to
# both the training and the test column.
for col in GENES + CELLS:
    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")

    # scikit-learn expects 2-D input, so each column becomes an (n_rows, 1) array
    train_column = train_features[col].to_numpy().reshape(-1, 1)
    test_column = test_features[col].to_numpy().reshape(-1, 1)

    transformer.fit(train_column)

    # flatten the (n_rows, 1) results back to 1-D before storing them
    train_features[col] = transformer.transform(train_column).ravel()
    test_features[col] = transformer.transform(test_column).ravel()

# dimensionality reduction with pca

In [None]:
# Number of 'g-' feature columns.
len(GENES)

In [None]:
# GENES

# PCA is fitted on the gene columns of the train and test rows together.
n_train_rows, n_test_rows = train_features.shape[0], test_features.shape[0]
gene_values = pd.concat([train_features[GENES], test_features[GENES]])
gene_pca = PCA(n_components=n_comp_GENES, random_state=42)
gene_components = gene_pca.fit_transform(gene_values[GENES])

# Name the components pca_G-0, pca_G-1, ... and split them back into the
# leading train rows and the trailing test rows.
gene_pca_names = [f'pca_G-{i}' for i in range(n_comp_GENES)]
train_gene_pca = pd.DataFrame(gene_components[:n_train_rows], columns=gene_pca_names)
test_gene_pca = pd.DataFrame(gene_components[-n_test_rows:], columns=gene_pca_names)

# Place the component columns next to the existing feature columns.
train_features = pd.concat((train_features, train_gene_pca), axis=1)
test_features = pd.concat((test_features, test_gene_pca), axis=1)

In [None]:
# Number of 'c-' feature columns.
len(CELLS)

In [None]:
# CELLS

# PCA is fitted on the cell columns of the train and test rows together.
n_train_rows, n_test_rows = train_features.shape[0], test_features.shape[0]
cell_values = pd.concat([train_features[CELLS], test_features[CELLS]])
cell_pca = PCA(n_components=n_comp_CELLS, random_state=42)
cell_components = cell_pca.fit_transform(cell_values[CELLS])

# Name the components pca_C-0, pca_C-1, ... and split them back into the
# leading train rows and the trailing test rows.
cell_pca_names = [f'pca_C-{i}' for i in range(n_comp_CELLS)]
train_cell_pca = pd.DataFrame(cell_components[:n_train_rows], columns=cell_pca_names)
test_cell_pca = pd.DataFrame(cell_components[-n_test_rows:], columns=cell_pca_names)

# Place the component columns next to the existing feature columns.
train_features = pd.concat((train_features, train_cell_pca), axis=1)
test_features = pd.concat((test_features, test_cell_pca), axis=1)

In [None]:
# Column count of train_features at this point in the notebook.
train_features.shape[1]

In [None]:
# Preview the first five training rows.
train_features.head(5)

# feature elimination with variance threshold

In [None]:
# Stack train rows on top of test rows (original row labels kept) so the
# feature selection that follows sees both sets.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
data = pd.concat([train_features, test_features])
data

In [None]:
# The four metadata columns are assumed to be the leading columns of `data`;
# every column after them goes through the variance filter.
meta_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
candidate_cols = data.columns[len(meta_cols):]

var_thresh = VarianceThreshold(VarianceThreshold_for_FS)
data_transformed = var_thresh.fit_transform(data.iloc[:, len(meta_cols):])

# Keep the names of the surviving features. Without them the new columns get
# integer labels, which mix with the string metadata labels; scikit-learn >= 1.2
# raises a TypeError on frames with mixed int/str column names.
kept_cols = candidate_cols[var_thresh.get_support()]

# `data` holds the train rows first and the test rows last.
train_features_transformed = data_transformed[ : train_features.shape[0]]
test_features_transformed = data_transformed[-test_features.shape[0] : ]

# Rebuild each frame as metadata columns followed by the selected features.
# Going through .values gives the metadata a fresh 0..n-1 index that lines up
# with the positional index of the selected-feature frame.
train_features = pd.concat(
    [pd.DataFrame(train_features[meta_cols].values, columns=meta_cols),
     pd.DataFrame(train_features_transformed, columns=kept_cols)],
    axis=1)

test_features = pd.concat(
    [pd.DataFrame(test_features[meta_cols].values, columns=meta_cols),
     pd.DataFrame(test_features_transformed, columns=kept_cols)],
    axis=1)

train_features.shape

In [None]:
# Preview the first five training rows after feature selection.
train_features.head(5)

# create datasets

In [None]:
# join the scored targets onto the features by sig_id
merged = train_features.merge(train_targets_scored, on='sig_id')

# discard rows whose cp_type is 'ctl_vehicle', in train and in test
keep_train = merged['cp_type'] != 'ctl_vehicle'
keep_test = test_features['cp_type'] != 'ctl_vehicle'
merged = merged.loc[keep_train].reset_index(drop=True)
X_test = test_features.loc[keep_test].reset_index(drop=True)

# split the merged frame back into its feature part and its target part
X_train = merged[train_features.columns]
y_train = merged[train_targets_scored.columns]

In [None]:
# remove the cp_type column from both feature frames
X_train, X_test = (frame.drop(columns='cp_type') for frame in (X_train, X_test))

In [None]:
# remove the sig_id identifier column from the features and from the targets
X_train = X_train.drop(columns=['sig_id'])
X_test = X_test.drop(columns=['sig_id'])
y_train = y_train.drop(columns=['sig_id'])

In [None]:
# Preview the first five rows of the training feature matrix.
X_train.head(5)

# CountEncoder and StandardScaler

In [None]:
# Encode the cp_dose and cp_time columns with category_encoders' CountEncoder.
# The encoder is fitted on the training rows only and then applied to the test rows.
ce = CountEncoder(cols=["cp_dose","cp_time"])
X_train_encoded = ce.fit_transform(X_train)
X_test_encoded = ce.transform(X_test)

In [None]:
# Fit the scaler on the training rows only, then apply it to train and test.
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [None]:
# wrap the scaled arrays in DataFrames again, reusing the pre-scaling column labels
X_train = pd.DataFrame(data=X_train_scaled, columns=list(X_train.columns))
X_test = pd.DataFrame(data=X_test_scaled, columns=list(X_test.columns))

# Neural Network

In [None]:
# Number of input features; passed to create_model when each fold's network is built.
num_columns = X_train.shape[1]

In [None]:
def create_model(num_columns, num_targets=206):
    """Build and compile the fully connected multi-label classifier.

    Layer stack: input -> BatchNorm -> Dropout(0.2) -> Dense(2048, relu)
    -> BatchNorm -> Dropout(0.5) -> Dense(1048, relu) -> BatchNorm
    -> Dropout(0.5) -> Dense(num_targets, sigmoid). Every Dense layer is
    wrapped in ``tfa.layers.WeightNormalization``.

    Parameters
    ----------
    num_columns : int
        Number of input features per sample.
    num_targets : int, default 206
        Number of sigmoid output units.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with a Lookahead-wrapped Adam optimizer
        (``sync_period=10``) and binary cross-entropy loss.
    """
    model = tf.keras.Sequential([
        # Keras documents `shape` as a tuple excluding the batch dimension;
        # pass it as such rather than relying on a bare int being coerced.
        tf.keras.layers.Input(shape=(num_columns,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(2048, activation="relu")),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        # NOTE(review): 1048 units may have been meant as 1024 - left unchanged, confirm.
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(1048, activation="relu")),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(num_targets, activation="sigmoid")),
    ])
    model.compile(
        optimizer=tfa.optimizers.Lookahead(tf.optimizers.Adam(), sync_period=10),
        loss='binary_crossentropy',
    )
    return model

In [None]:
# Out-of-fold prediction accumulator with the same rows and columns as y_train.
# It is created as float zeros: float predictions are later added with `+=`,
# and writing floats into the integer columns of a y_train copy triggers the
# incompatible-dtype setitem warning in pandas >= 2.1 (slated to become an error).
res = pd.DataFrame(0.0, index=y_train.index, columns=y_train.columns)

# Reset the target columns of the submission template before accumulating.
ss.loc[:, y_train.columns] = 0.0

# Despite its name, this mask is True for the test rows that are NOT
# 'ctl_vehicle', i.e. the rows that receive model predictions.
control_mask = test_features['cp_type']!='ctl_vehicle'

In [None]:
# Repeat the K-fold training NSEEDS times; each repeat shuffles the folds with
# a different random_state. Test predictions are summed over every model,
# out-of-fold predictions are summed once per repeat.
for seed in range(NSEEDS):
    mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, random_state=seed, shuffle=True)
    for n, (tr, te) in enumerate(mskf.split(X_train, y_train)):
        print(f'Fold {n}')

        X_trn, X_val = X_train.iloc[tr,:], X_train.iloc[te,:]
        y_trn, y_val = y_train.iloc[tr,:], y_train.iloc[te,:]

        model = create_model(num_columns)
        checkpoint_path = f'repeat:{seed}_Fold:{n}.hdf5'
        # `min_delta` is the current name of the deprecated `epsilon` keyword.
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')
        # Only the weights of the epoch with the lowest validation loss are kept.
        cb_checkpt = ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 0, save_best_only = True,
                                     save_weights_only = True, mode = 'min')
        model.fit(X_trn.values,
                  y_trn.values,
                  validation_data=(X_val.values, y_val.values),
                  epochs=35, batch_size=128,
                  callbacks=[reduce_lr_loss, cb_checkpt], verbose=2
                 )

        # Predict with the best checkpoint rather than the last-epoch weights.
        model.load_weights(checkpoint_path)
        test_predict = model.predict(X_test.values)
        val_predict = model.predict(X_val.values)

        # Assumes the rows of `ss` are in the same order as the rows of test_features.
        ss.loc[control_mask, y_train.columns] += test_predict
        # `te` holds positions; y_train / res carry a 0..n-1 index, so .loc matches them.
        res.loc[te, y_train.columns] += val_predict
        print('')

# Every model predicted the test rows; each training row was out-of-fold once per repeat.
ss.loc[:, y_train.columns] /= (NFOLDS * NSEEDS)
res.loc[:, y_train.columns] /= NSEEDS

In [None]:
def metric(y_true, y_pred):
    """Return the mean per-column log loss between targets and predictions.

    Parameters
    ----------
    y_true : pandas.DataFrame
        Ground-truth labels, one column per target.
    y_pred : pandas.DataFrame
        Predicted probabilities; must contain every column of ``y_true``.

    Returns
    -------
    float
        Average of the log losses computed column by column, with the label
        set fixed to ``[0, 1]`` so columns containing a single class still work.
    """
    # Iterate over the columns of the argument itself instead of the notebook
    # global `y_train`, so the function does not depend on hidden kernel state.
    per_target = [
        log_loss(y_true.loc[:, target], y_pred.loc[:, target].astype(float), labels=[0, 1])
        for target in y_true.columns
    ]
    return np.mean(per_target)

In [None]:
# Score the averaged out-of-fold predictions against the training targets.
print(f'OOF Metric: {metric(y_train, res)}')

In [None]:
# Write the averaged test predictions to the submission file (row index omitted).
ss.to_csv('submission.csv', index=False)

In [None]:
# Render the network architecture to a PNG file.
# `model` is the variable left over from the last iteration of the training loop.
dot_img_file = 'model_1.png'
tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)