## What is working:

Version 2:
- PCA
- PCA fitted on concatenated train and test (better than PCA fitted on separate sets)
- Swish activation (instead of tanh)
- Adam (rather than sgd)

Version 8:
- Lookahead RAdam
- Do not use "cp_type" column
- Relu (better than Swish)
- Weight Normalization
- `ctl_vehicle` data is all zeros
- Label smoothing
- Batch Normalization

Version 9:
- Increase model's nodes
- np.clip(y_pred, 0.001, 0.999)

## What is not working:
- Quantile transformation (5% and 95%)
- Tanh activation

## Current best version: 8

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

- train_features.csv - Features for the training set. Features `g-` signify gene expression data, and `c-` signify cell viability data. `cp_type` indicates samples treated with a compound (`cp_vehicle`) or with a control perturbation (`ctl_vehicle`); control perturbations have no MoAs; `cp_time` and `cp_dose` indicate treatment duration (24, 48, 72 hours) and dose (high or low).
- train_targets_scored.csv - The binary MoA targets that are scored.
- train_targets_nonscored.csv - Additional (optional) binary MoA responses for the training data. These are not predicted nor scored.
- test_features.csv - Features for the test data. You must predict the probability of each scored MoA for each row in the test data.
- sample_submission.csv - A submission file in the correct format.

In [None]:
# Load the competition tables: training features and their scored targets,
# the test features, and the sample submission used as the output template.
train = pd.read_csv("/kaggle/input/lish-moa/train_features.csv")
train_target = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
test = pd.read_csv("../input/lish-moa/test_features.csv")
sub = pd.read_csv("../input/lish-moa/sample_submission.csv")
train

In [None]:
# Total number of missing cells in train, tallied once per column and once
# per row (both totals count the same cells).
missing_mask = train.isnull()
missing_mask.sum(axis=0).sum(), missing_mask.sum(axis=1).sum()

In [None]:
# Number of distinct values per column (NaN counted as a value), ascending.
distinct_counts = train.nunique(dropna=False)
distinct_counts.sort_values()

In [None]:
# This code checks for duplicate columns (if any). It takes a long time to run and the result is empty (no duplicates found), so avoid running this cell.

# train_factorized = pd.DataFrame(index=train.index)
# for col in tqdm_notebook(train.columns):
#     train_factorized[col] = train[col].map(train[col].value_counts())


# dup_cols = {}

# for i, c1 in enumerate(tqdm_notebook(train_factorized.columns)):
#     for c2 in train_factorized.columns[i + 1:]:
#         if c2 not in dup_cols and np.all(train_factorized[c1] == train_factorized[c2]):
#             dup_cols[c2] = c1
            
# dup_cols

In [None]:
# Peek at the label balance: print the value counts of the target columns
# met while scanning the first 15 columns of train_target ("sig_id" is
# counted towards the 15 but not printed).
limit = 0
for limit, col in enumerate(tqdm_notebook(train_target.columns), start=1):
    is_target_column = col != "sig_id"
    if is_target_column:
        value_counts = train_target[col].value_counts()
        print(value_counts)
    if limit >= 15:
        break

In [None]:
# Boolean mask that is True for rows which are NOT control-vehicle samples
# (despite the variable name); only those rows are kept for training.
ctlVehicle_idx = train["cp_type"].ne("ctl_vehicle")

# Apply the same mask to features and targets so the rows stay aligned, and
# drop the cp_type column from the features once it has been used.
train = train[ctlVehicle_idx].drop(columns="cp_type").reset_index(drop=True)
train_target = train_target[ctlVehicle_idx].reset_index(drop=True)

In [None]:
# Display the training features after the control-vehicle rows and the
# cp_type column were removed.
train

In [None]:
# Display the targets after applying the same row mask as the features.
train_target

In [None]:
# Positional column ranges of the training frame.
# NOTE(review): presumably columns 3..774 are the g-* features and 775..874
# the c-* features; this depends on the CSV column order - confirm.
train_column_index = train.columns
features_g = train_column_index[3:775].tolist()
features_c = train_column_index[775:875].tolist()

In [None]:
# Append row-wise summary statistics to both train and test, computed over
# the features_g columns ("g-"), the features_c columns ("c-") and both
# lists together ("gc-").
stat_sources = [
    ("g", features_g),
    ("c", features_c),
    ("gc", features_g + features_c),
]
# (column suffix, DataFrame method name) in the order the columns are added.
stat_methods = [
    ("sum", "sum"),
    ("mean", "mean"),
    ("std", "std"),
    ("kurt", "kurtosis"),
    ("skew", "skew"),
]
for df in [train, test]:
    for prefix, source_cols in stat_sources:
        for suffix, method_name in stat_methods:
            # Select by the fixed column lists so that statistic columns
            # added earlier never feed into later statistics.
            row_stat = getattr(df[source_cols], method_name)
            df[prefix + "-" + suffix] = row_stat(axis=1)

In [None]:
# Display train with the row-wise statistic columns appended.
train

In [None]:
# Display test with the row-wise statistic columns appended.
test

## Robust Scaler and PCA

In [None]:
# Group the training columns by the substring their name contains.
# NOTE(review): matching is by substring, not prefix, so gcols also picks up
# the "g-sum"/"g-mean"/... statistic columns, and ccols picks up both the
# "c-*" statistics and every "gc-*" column (since "gc-" contains "c-").
# Confirm this overlap with gccols is intended.
column_names = list(train.columns)
gcols = [name for name in column_names if "g-" in name]
ccols = [name for name in column_names if "c-" in name]
cpcols = [name for name in column_names if "cp_" in name]
gccols = [name for name in column_names if "gc-" in name]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Remember which test rows are control-vehicle samples before the column is
# dropped; the mask is used later to zero out their predictions.
ctlVehicle_test = test["cp_type"] == "ctl_vehicle"
test = test.drop("cp_type", axis=1)

# Encode each cp_* column with ONE encoder fitted on the train and test values
# together. Fitting separate encoders per frame (as before) assigns codes by
# each frame's own sorted categories, so a category missing from one frame
# would silently shift the integer codes and make train/test inconsistent.
# When both frames contain the same categories the result is unchanged.
for col in cpcols:
    enc = LabelEncoder()
    enc.fit(pd.concat([train[col], test[col]], axis=0))
    train[col] = enc.transform(train[col])
    test[col] = enc.transform(test[col])

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

# Scale every column from position 3 onwards with a RobustScaler fitted on
# train and test stacked together, then overwrite the columns in place.
rs = RobustScaler(quantile_range=(5, 95))
features = train.columns[3:]
rs.fit(pd.concat([train[features], test[features]], axis=0))
for frame in (train, test):
    frame[features] = rs.transform(frame[features])


# One PCA per column group, each fitted on train and test stacked together.
g_pca = PCA(n_components=0.95, random_state=42)
c_pca = PCA(n_components=0.95, random_state=42)

train_test_g_concat = pd.concat([train[gcols], test[gcols]], axis=0)
train_test_c_concat = pd.concat([train[ccols], test[ccols]], axis=0)
g_pca.fit(train_test_g_concat)
c_pca.fit(train_test_c_concat)


def _project(pca, frame, cols, prefix):
    """Return frame[cols] projected by the fitted pca as a DataFrame.

    Output columns are named prefix + component number and the result keeps
    the index of ``frame``.
    """
    component_names = [prefix + str(i) for i in range(pca.n_components_)]
    projected = pca.transform(frame[cols])
    return pd.DataFrame(projected, columns=component_names, index=frame.index)


train_gtrans = _project(g_pca, train, gcols, "g_PCA")
test_gtrans = _project(g_pca, test, gcols, "g_PCA")

train_ctrans = _project(c_pca, train, ccols, "c_PCA")
test_ctrans = _project(c_pca, test, ccols, "c_PCA")

# Number of components each PCA ended up keeping.
g_pca.n_components_, c_pca.n_components_

In [None]:
# Build the final model inputs column-wise: g PCA components, c PCA
# components, the cp_* columns and the gc-* statistic columns.
train_parts = [train_gtrans, train_ctrans, train[cpcols], train[gccols]]
test_parts = [test_gtrans, test_ctrans, test[cpcols], test[gccols]]
train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)
train

In [None]:
# Display the final test frame (PCA components, cp_* columns, gc-* statistics).
test

In [None]:
# Remove the identifier column so only the target columns remain.
train_target = train_target.drop(columns="sig_id")
train_target

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow_addons as tfa


def create_model():
    """Build and compile the feed-forward multi-label classifier.

    The input width is read from the global ``train`` frame and the output
    width from the global ``train_target`` frame, so both must already be in
    their final, model-ready shape when this function is called.

    Returns:
        A compiled ``tf.keras.Sequential`` model: a weight-normalized dense
        stack with batch normalization and dropout, sigmoid outputs, binary
        cross-entropy loss and a Lookahead-wrapped RectifiedAdam optimizer.
    """
    n_features = train.shape[1]
    n_targets = train_target.shape[1]

    model = tf.keras.Sequential([
        # First layer keeps the input width and has no activation.
        tfa.layers.WeightNormalization(L.Dense(n_features, input_shape=(n_features,))),
        L.BatchNormalization(),
        tfa.layers.WeightNormalization(L.Dense(1024, activation="relu")),
        L.BatchNormalization(),
        L.Dropout(0.3),
        tfa.layers.WeightNormalization(L.Dense(1024, activation="relu")),
        L.BatchNormalization(),
        L.Dropout(0.2),
        # One sigmoid unit per target column.
        tfa.layers.WeightNormalization(L.Dense(n_targets, activation="sigmoid"))
    ])

    # Only this optimizer is used; the SGD / AdamW / Adam instances that were
    # previously constructed here were never passed to compile() and have
    # been removed.
    lookahead_radam = tfa.optimizers.Lookahead(tfa.optimizers.RectifiedAdam())

    # NOTE(review): label_smoothing=1e-15 is so small that it is presumably
    # equivalent to no smoothing - confirm the intended value.
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=1e-15),
        optimizer=lookahead_radam,
        metrics=["binary_crossentropy"],
    )
    return model

In [None]:
from sklearn.model_selection import KFold

# Train one model per cross-validation fold and collect each fold's
# predictions on the test frame.
predictions = []
kf = KFold(shuffle=True, random_state=42)
for fold_id, (train_idx, valid_idx) in enumerate(kf.split(train)):
    weights_path = "model_fold" + str(fold_id) + ".h5"
    x_fit, y_fit = train.iloc[train_idx], train_target.iloc[train_idx]
    x_valid, y_valid = train.iloc[valid_idx], train_target.iloc[valid_idx]

    model = create_model()
    fold_callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau(),
        tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
        # Keep only the best weights seen during training on disk.
        tf.keras.callbacks.ModelCheckpoint(weights_path, save_best_only=True, save_weights_only=True),
    ]
    history = model.fit(
        x_fit,
        y_fit,
        validation_data=(x_valid, y_valid),
        epochs=50,
        verbose=2,
        callbacks=fold_callbacks,
    )

    # Lowest training and validation loss seen over the epochs (the two
    # minima are taken independently of each other).
    lowest_train_loss = min(history.history["loss"])
    lowest_valid_loss = min(history.history["val_loss"])
    print("Fold ID: {}, train loss: {}, valid loss: {}".format(fold_id, lowest_train_loss, lowest_valid_loss))

    # Reload the checkpointed weights before predicting on the test frame.
    model.load_weights(weights_path)
    predictions.append(model.predict(test))

In [None]:
# Average the per-fold predictions element-wise, then clip the result into
# [0.001, 0.999].
fold_average = np.average(predictions, axis=0)
pred = fold_average.clip(0.001, 0.999)
pred.shape

In [None]:
# Start from the sample submission so the ids and column layout are reused.
sub = pd.read_csv("../input/lish-moa/sample_submission.csv")

# Fill every column after the first one by POSITION. The previous
# `sub.loc[:, 1:]` used an integer slice with the label-based indexer on
# string column names; pandas deprecated that and newer versions raise
# TypeError, so `.iloc` is the correct indexer here.
sub.iloc[:, 1:] = pred

# Control-vehicle rows get 0 for every column except the id.
sub.loc[ctlVehicle_test, sub.columns != "sig_id"] = 0

# sub.iloc[:, 1:] = tf.keras.utils.normalize(pred)
sub

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
sub.to_csv("submission.csv", index=False)