In [None]:
# Mount Google Drive at /content/drive; the data-loading cell reads its CSV files from under this path.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import time
import sys
import os

In [3]:
# Single place to change when the competition files live somewhere else
DATA_DIR = '/content/drive/MyDrive/lish-moa'

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train_features.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test_features.csv'))
# NOTE(review): targetns is loaded but not referenced by any other visible cell
targetns = pd.read_csv(os.path.join(DATA_DIR, 'train_targets_nonscored.csv'))
train_target_df = pd.read_csv(os.path.join(DATA_DIR, 'train_targets_scored.csv'))
sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [6]:
# Every column of the scored-targets table after the first one is a prediction target
target_cols = train_target_df.columns[1:]
N_TARGETS = target_cols.size
print(train_df.shape)

(23814, 876)


In [7]:
# Feature column names, split by their prefix: 'c-' columns and 'g-' columns
cells = list(filter(lambda name: name.startswith('c-'), train_df.columns))
genes = list(filter(lambda name: name.startswith('g-'), train_df.columns))

In [8]:
# For g- features
# n_comp drives both the PCA size and the generated column names so they cannot drift apart.
n_comp = 50
gene_pca_names = [f'pca_G-{i}' for i in range(n_comp)]

# The PCA is fitted on train and test rows stacked together (train rows first).
gene_values = pd.concat([train_df[genes], test_df[genes]])
gene_pca = PCA(n_components = n_comp, random_state = 100).fit_transform(gene_values)

# Split the projected rows back into their train / test parts by position.
train_gene_pca = pd.DataFrame(gene_pca[:train_df.shape[0]], columns = gene_pca_names)
test_gene_pca = pd.DataFrame(gene_pca[-test_df.shape[0]:], columns = gene_pca_names)

# NOTE: re-running this cell appends the pca_G-* columns a second time.
train_df = pd.concat((train_df, train_gene_pca), axis = 1)
test_df = pd.concat((test_df, test_gene_pca), axis = 1)

In [None]:
# For c- features
# One constant drives both the PCA size and the generated column names.
n_comp_cells = 15
cell_pca_names = [f'pca_C-{i}' for i in range(n_comp_cells)]

# The PCA is fitted on train and test rows stacked together (train rows first).
cell_values = pd.concat([train_df[cells], test_df[cells]])
cell_pca = PCA(n_components = n_comp_cells, random_state = 100).fit_transform(cell_values)

# Split the projected rows back into their train / test parts by position.
train_cell_pca = pd.DataFrame(cell_pca[:train_df.shape[0]], columns = cell_pca_names)
test_cell_pca = pd.DataFrame(cell_pca[-test_df.shape[0]:], columns = cell_pca_names)

# NOTE: re-running this cell appends the pca_C-* columns a second time.
train_df = pd.concat((train_df, train_cell_pca), axis = 1)
test_df = pd.concat((test_df, test_cell_pca), axis = 1)
train_df.head()

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Handle on the pre-selection frame; a later cell reads its column names.
# This is an alias, not a copy - safe only because train_df is rebound, never mutated in place.
train_copy = train_df
var_thresh = VarianceThreshold(0.8)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
data = pd.concat([train_df, test_df])
# Columns from position 4 onward are passed to the selector; the first four
# (sig_id, cp_type, cp_time, cp_dose) are re-attached unchanged in the next cell.
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])
data_transformed.shape

In [None]:
META_COLS = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']

def _attach_selected_features(frame, selected):
  """Return the META_COLS of `frame` side by side with the selected feature matrix.

  The metadata columns keep their dtypes and get a fresh 0..n-1 index so they
  line up row-for-row with `selected`, whose columns are labelled 0..k-1.
  """
  meta = frame[META_COLS].reset_index(drop = True)
  return pd.concat([meta, pd.DataFrame(selected)], axis = 1)

# data_transformed holds the train rows first and the test rows last.
train_df_trans = data_transformed[ : train_df.shape[0]]
test_df_trans = data_transformed[-test_df.shape[0] : ]

train_df = _attach_selected_features(train_df, train_df_trans)
test_df = _attach_selected_features(test_df, test_df_trans)
train_df.head()

In [12]:
# Give the selected feature columns (currently labelled 0..k-1) their original names back.
# The fitted selector knows exactly which input columns it kept, so no value matching is needed.
feature_names = train_copy.columns[4:]
kept_names = feature_names[var_thresh.get_support()]
assert len(kept_names) == data_transformed.shape[1], "selector mask does not match the transformed matrix"

# position in the transformed matrix -> original column name
col_rela = dict(enumerate(kept_names))
train_df = train_df.rename(columns = col_rela)
test_df = test_df.rename(columns = col_rela)


In [13]:
# Optimisation settings passed to model.fit / the Adam optimizer
EPOCHS = 25
BATCH_SIZE = 128
LR = 5e-4

# Cross-validation layout: FOLDS splits per run, REPEATS runs per seed
FOLDS = 5
REPEATS = 5

# NOTE(review): SEED is only referenced from a commented-out line in the training cell
SEED = 1925

# Width of the model's output layer: one unit per target column
N_TARGETS = len(target_cols)

In [14]:
def seed_everything(seed):
  """Seed the random number generators this notebook can draw from.

  Args:
    seed: Integer seed applied to Python's `random`, NumPy's global generator
      and TensorFlow's global generator.
  """
  import random  # local import: the notebook's import cell does not bring in `random`

  random.seed(seed)
  np.random.seed(seed)
  # Only affects interpreters started after this point; the running kernel's
  # hash seed was fixed at start-up.
  os.environ['PYTHONHASHSEED'] = str(seed)
  tf.random.set_seed(seed)

In [18]:
def multi_log_loss(y_true, y_pred):
  """Average the binary log loss over all target columns.

  Args:
    y_true: DataFrame of ground-truth labels, one column per target.
      Assumed to contain only 0/1 values - TODO confirm for other callers.
    y_pred: DataFrame of predicted probabilities with the same column labels.

  Returns:
    The unweighted mean of the per-column log losses.
  """
  # labels=[0, 1] keeps log_loss from raising when a column of y_true happens
  # to contain a single class (e.g. a target with no positives in this subset).
  losses = [
    log_loss(y_true.loc[:, col], y_pred.loc[:, col], labels = [0, 1])
    for col in y_true.columns
  ]
  return np.mean(losses)

In [17]:
def preprocess_df(data):
  """Drop `cp_type` and integer-encode `cp_dose` and `cp_time`.

  Args:
    data: DataFrame containing at least the columns 'cp_type', 'cp_dose'
      ('D1'/'D2') and 'cp_time' (24/48/72).

  Returns:
    A new DataFrame without 'cp_type', with 'cp_dose' mapped to 0/1 and
    'cp_time' mapped to 0/1/2. The input frame is left unmodified. Values
    outside the mappings become NaN.
  """
  # drop() without inplace returns a new frame, so the caller's data is untouched.
  encoded = data.drop(columns = ['cp_type'])
  # Plain column assignment replaces the column, so the integer dtype of the
  # mapped values is kept even when the source column was object-typed
  # (an in-place `.loc[:, col] = ...` write can leave it as object).
  encoded['cp_dose'] = encoded['cp_dose'].map({'D1':0, 'D2':1})
  encoded['cp_time'] = encoded['cp_time'].map({24:0, 48:1, 72:2})
  return encoded

In [20]:
# Model inputs and targets: 'sig_id' is removed from every table before use
x_train = preprocess_df(train_df.drop("sig_id", axis = 1))
x_test = preprocess_df(test_df.drop("sig_id", axis = 1))
y_train = train_target_df.drop("sig_id", axis = 1)
N_FEATURES = len(x_train.columns)

In [None]:
# Both encoded columns can come out of preprocess_df with object dtype; a single
# object column would turn x_train.values into an object array. Cast both, not just cp_time.
x_train = x_train.astype({'cp_time':int, 'cp_dose':int})
x_test = x_test.astype({'cp_time':int, 'cp_dose':int})
x_train.head()

In [22]:
def create_model():
  """Build and compile the fully connected network.

  Layout: input of N_FEATURES values -> BatchNorm -> Dropout(0.2), then two
  identical stages of weight-normalised Dense(2048, relu) -> BatchNorm ->
  Dropout(0.5), and finally a weight-normalised sigmoid Dense with N_TARGETS
  units. Compiled with Adam at learning rate LR and binary cross-entropy.

  Returns:
    The compiled tf.keras.Sequential model.
  """
  layers = tf.keras.layers
  net = tf.keras.Sequential()

  # Input stage
  net.add(layers.Input(N_FEATURES))
  net.add(layers.BatchNormalization())
  net.add(layers.Dropout(0.2))

  # Two identical hidden stages
  for _ in range(2):
    net.add(tfa.layers.WeightNormalization(layers.Dense(2048, activation = "relu")))
    net.add(layers.BatchNormalization())
    net.add(layers.Dropout(0.5))

  # One sigmoid output per target
  net.add(tfa.layers.WeightNormalization(layers.Dense(N_TARGETS, activation = "sigmoid")))

  net.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
    loss = 'binary_crossentropy',
    metrics = ["accuracy"],
  )
  return net

In [23]:
def build_train(resume_models = None, repeat_number = 0, folds = 5, skip_folds = 0):
  """Train one model per cross-validation fold and collect out-of-fold predictions.

  Args:
    resume_models: Not used; kept so existing calls keep working.
    repeat_number: Index of the current repeat; only used in the checkpoint file name.
    folds: Number of KFold splits.
    skip_folds: Not used; kept so existing calls keep working.

  Returns:
    A tuple (models, oof_preds). `models` holds one fitted model per fold, each
    reloaded with the weights of its lowest-val_loss epoch. `oof_preds` is a
    float DataFrame shaped like y_train in which every row holds the prediction
    of the model whose validation fold contained that row.
  """
  models = []
  # Float copy: predictions are probabilities, and writing floats into the
  # integer label columns relies on implicit upcasting.
  oof_preds = y_train.astype('float64')
  features = x_train.values
  targets = y_train.values
  # No random_state: the shuffle draws from NumPy's global generator, so it
  # follows whatever seed_everything() set before this call.
  kfold = KFold(n_splits = folds, shuffle = True)
  for fold, (train_ind, val_ind) in enumerate(kfold.split(x_train), start = 1):
    print(f'Training fold {fold}')
    cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', 
                                                          factor = 0.4, patience = 2, 
                                                          verbose = 1, min_delta = 0.0001, 
                                                          mode = 'auto')
    checkpoint_path = f'repeat:{repeat_number}_Fold:{fold}.hdf5'
    cb_checkpt = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor = 'val_loss', 
                                                    verbose = 0,
                                                    save_best_only = True, save_weights_only = True, 
                                                    mode = 'min')
    model = create_model()
    model.fit(features[train_ind], targets[train_ind], 
              validation_data = (features[val_ind], targets[val_ind]),
              callbacks = [cb_lr_schedule, cb_checkpt], epochs = EPOCHS, batch_size = BATCH_SIZE, verbose = 2)
    # Go back to the best epoch instead of keeping the last one.
    model.load_weights(checkpoint_path)
    # kfold.split yields positions, so write by position rather than by label.
    oof_preds.iloc[val_ind, :] = model.predict(features[val_ind])
    models.append(model)
    print('train:')
    print(list(zip(model.metrics_names, model.evaluate(features[train_ind], targets[train_ind], verbose = 0, batch_size = 32))))
    print('val:')
    # Evaluate on the held-out rows (the train indices were used here by mistake).
    print(list(zip(model.metrics_names, model.evaluate(features[val_ind], targets[val_ind], verbose = 0, batch_size = 32))))
  # Return only after every fold has been trained; returning inside the loop
  # stopped after fold 1 and left the other out-of-fold rows unpredicted.
  return models, oof_preds
  

In [None]:
# Instantiate one model only to print its layer and parameter summary.
model = create_model()
model.summary()

In [None]:
# Train REPEATS cross-validation runs for every seed, pooling all fitted
# models and keeping each run's out-of-fold predictions.
models = []
oof_preds = []
SEED_ARRAY = [0, 1, 2, 3, 4]
for seed in SEED_ARRAY:
  print("seed: ", seed)
  # Seeded once per seed value; the repeats below continue from that RNG state.
  seed_everything(seed)
  for repeat in range(REPEATS):
    run_models, run_oof = build_train(repeat_number = repeat, folds = FOLDS)
    models.extend(run_models)
    oof_preds.append(run_oof)

In [None]:
# Spot-check: predictions of the second trained model on the test features.
models[1].predict(x_test)

In [27]:
# Average the predictions of all trained models and write the submission file.
test_preds = sub.copy()

# Accumulate in a float array; adding float predictions onto integer-initialised
# DataFrame columns relies on implicit dtype upcasting.
pred_sum = np.zeros((len(x_test), len(target_cols)))
for model in models:
  pred_sum += model.predict(x_test)
test_preds[target_cols] = pred_sum / len(models)

# Rows that are not 'trt_cp' get all-zero predictions. The mask must come from
# test_df: preprocess_df removed 'cp_type' from x_test, so x_test['cp_type']
# raises KeyError. Assumes sub and test_df list the samples in the same order
# - TODO confirm.
not_treated = (test_df['cp_type'] != 'trt_cp').values
test_preds.loc[not_treated, target_cols] = 0
test_preds.to_csv('submission.csv', index = False)