In [5]:
import numpy as np
import pandas as pd
import copy

import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import (Dense, DenseFeatures, Dropout, 
                                     BatchNormalization, Embedding, Input, Concatenate, Average,
                                     InputLayer, Lambda)
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras import backend as K, Sequential, Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop

import tensorflow_addons as tfa
from tensorflow_addons.layers import WeightNormalization
from keras.wrappers.scikit_learn import KerasRegressor
import keras

from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
from math import log2

import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Record the library versions this notebook was executed with.
for _lib in (pd, tf):
    print(_lib.__version__)

1.1.3
2.3.1


In [3]:
# Load the raw competition tables (test features, train features, scored targets).

folder_path = '../input/lish-moa/'
raw_test, raw_train, raw_targets = (
    pd.read_csv(folder_path + file_name)
    for file_name in ('test_features.csv', 'train_features.csv', 'train_targets_scored.csv')
)

# Group the columns by role.
cols_id = ['sig_id']
cols_to_remove = ['cp_type']
# Feature columns: everything in the train table except the id and the removed columns.
cols_fts = [name for name in raw_train.columns if name not in (cols_id + cols_to_remove)]
# Columns whose names start with "g-" / "c-".
cols_gene = raw_train.columns[raw_train.columns.str.startswith("g-")].tolist()
cols_cell = raw_train.columns[raw_train.columns.str.startswith("c-")].tolist()
# Remaining feature columns that are neither "g-" nor "c-" columns.
cols_experiment = [name for name in cols_fts if name not in (cols_gene + cols_cell)]
# Target columns: every column of the targets table except the id.
cols_target = [name for name in raw_targets.columns if name not in cols_id]
num_fts = len(cols_fts)
num_labels = len(cols_target)

# Encode the categorical column and rescale the time column.
def transform_data(input_data):
    '''Return an encoded copy of a feature table.

        * input_data: table with ``cp_dose`` and ``cp_time`` columns

    ``cp_dose`` is mapped 'D1' -> 0 and 'D2' -> 1 (any other value becomes NaN),
    ``cp_time`` is divided by 72. The input table itself is left untouched.'''
    dose_codes = {'D1': 0, 'D2': 1}
    return input_data.assign(
        cp_dose=input_data['cp_dose'].map(dose_codes),
        cp_time=input_data['cp_time'] / 72,
    )

# Training rows: every train row whose cp_type is not 'ctl_vehicle', encoded by transform_data.
to_train = transform_data(raw_train[raw_train['cp_type'] != 'ctl_vehicle'])
# to_train.index holds row *labels* inherited from raw_train, so the matching target rows
# are selected by label (.loc). Using .iloc here only works while labels equal positions.
to_train_targets = raw_targets.loc[to_train.index]
# Prediction rows: the full encoded test table, plus the subset without 'ctl_vehicle' rows.
to_pred  = transform_data(raw_test)
to_pred_non_ctl = to_pred[to_pred['cp_type'] != 'ctl_vehicle']

In [38]:
# preprocessing pipeline
def pipe_line_builder(quantiles_num, pca_dims):
    '''Build the preprocessing pipeline for one group of columns.

    The result is a FeatureUnion of two branches fed with the same input:
      * 'norm':     quantile-normalised features
      * 'norm_pca': quantile-normalise -> PCA -> quantile-normalise

    :quantiles_num: int: number of quantiles used by each QuantileTransformer
    :pca_dims: int: number of PCA components
    :return: an unfitted FeatureUnion
    '''
    def make_norm():
        # Every step gets its OWN transformer. Pipeline fits its steps in place, so a single
        # shared instance would be refit on the PCA output and overwrite the fit on raw features.
        return QuantileTransformer(n_quantiles=quantiles_num, random_state=0, output_distribution="normal")

    p_var_norm = Pipeline([
        ('norm', make_norm()) ])
    p_var_norm_pca = Pipeline([
        ('norm1', make_norm()),
        # Seeded like the quantile transformers so repeated fits give the same components.
        ('pca', PCA(n_components = pca_dims, random_state=0)),
        ('norm2', make_norm())
    ])
    return FeatureUnion([
        ('norm', p_var_norm)
        , ('norm_pca', p_var_norm_pca)
        ])


# Column-wise stage: gene and cell columns each go through their own normalise(+PCA) union,
# experiment columns are passed through unchanged.
column_stage = ColumnTransformer(transformers=[
    ('gene', pipe_line_builder(quantiles_num=100, pca_dims=600), cols_gene),
    ('cell', pipe_line_builder(quantiles_num=100, pca_dims=50), cols_cell),
    ('experiment', 'passthrough', cols_experiment),
])

# Full preprocessing: column-wise stage followed by a variance filter with threshold 0.
pipe = Pipeline(steps=[
    ('norm_and_pca', column_stage),
    ('variance', VarianceThreshold(0.)),
])

In [40]:
# Dense classifier: three weight-normalised ELU layers of 1500 units, each preceded by
# batch normalisation (and dropout from the second one on), then a sigmoid output per label.
stack = [
    BatchNormalization(),
    WeightNormalization(Dense(1500, activation='elu', kernel_initializer='he_normal')),
]
for _ in range(2):
    stack.extend([
        BatchNormalization(),
        Dropout(0.2619422201258426),
        WeightNormalization(Dense(1500, activation='elu', kernel_initializer='he_normal')),
    ])
stack.append(Dense(num_labels, activation='sigmoid', kernel_initializer='he_normal'))
model = Sequential(stack)

# Piecewise-constant multiplier (1.0 / 0.1 / 0.01, switching at steps 10000 and 15000)
# scaling both the learning rate and the weight decay of an AdamW optimizer.
step = tf.Variable(0, trainable=False)
schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
    [10000, 15000],
    [1e-0, 1e-1, 1e-2],
)
lr = 1e-1 * schedule(step)
wd = lambda: 1e-3 * schedule(step)
opt = tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)

# NOTE(review): `opt` is built above but compile() receives the string 'adam',
# so the AdamW instance is not what this call uses - confirm this is intended.
model.compile(
    loss=BinaryCrossentropy(label_smoothing=0.01),
    optimizer='adam',
)
# tf.keras.utils.plot_model(model,show_shapes=True)

In [None]:
NFOLDS = 7

def df_by_index(df, indexes, cols = None):
    '''Select rows of ``df`` by integer position, optionally restricted to some columns.

    :df: DataFrame to select from
    :indexes: array-like of integer row positions (e.g. what ``KFold.split`` yields)
    :cols: optional list of column names to keep; all columns when None
    :return: DataFrame with the selected rows, in the order given by ``indexes``

    Rows are taken with ``.iloc``. Matching positions against index labels with ``isin``
    picks the wrong rows as soon as the frame's index is not 0..n-1 (e.g. after filtering).
    '''
    rows = df.iloc[indexes]
    return rows if cols is None else rows[cols]

# Seeded so that the fold assignment is the same on every run.
kf = KFold(n_splits= NFOLDS, shuffle = True, random_state = 0)

# Running sum of the per-fold predictions for the non-control test rows.
ss= np.zeros([to_pred_non_ctl.shape[0], num_labels])

for from_fols, (train_index, val_index) in enumerate(kf.split(to_train)):
    print('Training at fold: ', from_fols)
    tf.keras.backend.clear_session()

    fold_X_train = df_by_index(to_train, train_index)
    fold_y_train = df_by_index(to_train_targets, train_index, cols_target)

    fold_X_val = df_by_index(to_train, val_index)
    fold_y_val = df_by_index(to_train_targets, val_index, cols_target)

    # Fit the preprocessing on this fold's training rows only, then reuse it for
    # the validation rows and for the rows to predict.
    fold_train_arr = pipe.fit_transform(fold_X_train)
    fold_val_arr = pipe.transform(fold_X_val)

    # Start every fold from freshly initialised weights: clone the architecture of `model`
    # and compile it with the same loss. Fitting `model` itself in each iteration would
    # carry weights trained on earlier folds (including this fold's validation rows) over.
    fold_model = tf.keras.models.clone_model(model)
    fold_model.compile(loss=model.loss, optimizer='adam')

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=5, mode='min', min_lr=1E-5, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=1E-5, patience=15, mode='min',restore_best_weights=True, verbose=1)

    fold_model.fit(
        fold_train_arr,
        fold_y_train,
        validation_data = (fold_val_arr, fold_y_val),
        batch_size=64,
        epochs=150,
        callbacks=[reduce_lr, early_stopping]
        )

    # Predict on the test rows transformed by this fold's fitted pipeline, i.e. the same
    # representation the fold model was trained on.
    ss += fold_model.predict(pipe.transform(to_pred_non_ctl))

# Average over folds.
ss = ss/NFOLDS

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150

In [None]:
# Final data
# NOTE: this rebinds the name `pipe` to a plain QuantileTransformer.
pipe = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")
# Fit on the train and to-predict feature rows stacked together.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
pipe.fit(pd.concat([to_train[cols_fts], to_pred[cols_fts]]))

X_train = pipe.transform(to_train[cols_fts])
X_pred =  pipe.transform(to_pred_non_ctl[cols_fts])
y_train = to_train_targets[cols_target]
X_train.shape

**Thử nghiệm ResFMnet**

In [None]:
# Recommender-style framing: cell -> chemical | cell/gene features act as the "user",
# the chemical (target label) acts as the "item".
n_components = 256

u_fts_num = len(to_pred.columns)  # number of columns in to_pred (num_fts was the alternative)
i_fts_num = num_labels

# Width of each input group: gene, cell and experiment columns.
g_ft_num, c_ft_num, e_ft_num = (
    len(group) for group in (cols_gene, cols_cell, cols_experiment)
)

In [None]:
#1. User embedding
def _wn_dense_block(tensor, units):
    '''Apply WeightNormalization(Dense(units, elu, he_normal)) -> Dropout -> BatchNormalization.'''
    tensor = WeightNormalization(Dense(units, activation='elu', kernel_initializer='he_normal'))(tensor)
    tensor = Dropout(0.2619422201258426)(tensor)
    return BatchNormalization()(tensor)

#1.1. Gene features: two blocks, 512 then 320 units
input_g = Input(shape=(g_ft_num,))
layer_g = _wn_dense_block(input_g, 512)
layer_g = _wn_dense_block(layer_g, 320)

#1.2. Cell features: one block of 80 units
input_c = Input(shape=(c_ft_num,))
layer_c = _wn_dense_block(input_c, 80)

#1.3. Experiment features: used as-is
layer_e = Input(shape=(e_ft_num,))

#1.4. Full user features: the raw gene/cell inputs are concatenated next to their
#     encoded versions, then reduced to n_components through two more blocks
layer_u = Concatenate()([layer_g, input_g, layer_c, input_c, layer_e])
for width in (n_components*2, n_components):
    layer_u = _wn_dense_block(layer_u, width)



#2. Item embedding
#2.1. Additional information per item: one 0/1 flag per name fragment, telling whether
#     the target column name contains that fragment. Shape after transpose: (targets, fragments).
category_fragments = ['_inhibitor', '_agonist', '_agent', '_antagonist', '_blocker', '_activator']
chemical_category = tf.transpose(
    tf.constant(
        [[int(fragment in target) for target in cols_target]
         for fragment in category_fragments]
    )
)

#2.2. Full item features: category flags + one-hot identity of each label
identity_onehot = tf.eye(i_fts_num, dtype=tf.int32)
item_ft = tf.concat([chemical_category, identity_onehot], axis=1)

# NOTE(review): item_ft is built from tf.constant / tf.eye rather than a Keras Input -
# confirm the weights of this Dense layer are actually tracked and trained by the Model.
item_dense = Dense(n_components, activation='relu', kernel_initializer='he_normal', name='layer_u1')
layer_i = item_dense(item_ft)


#3. Dot product user - item
def dot_2layer(x):
    '''Return x[0] matrix-multiplied by the transpose of x[1].'''
    user_side, item_side = x[0], x[1]
    return K.dot(user_side, K.transpose(item_side))

# User/item scores, refined by a 512-unit hidden layer and mapped to one sigmoid per label.
dot_ui = Lambda(dot_2layer, name='lambda_dot')([layer_u, layer_i])

head_hidden = WeightNormalization(Dense(512, activation="relu", kernel_initializer='he_normal'))
dot_ui = head_hidden(dot_ui)
dot_ui = BatchNormalization()(dot_ui)

head_output = WeightNormalization(
    Dense(i_fts_num, activation='sigmoid', kernel_initializer='he_normal', name='labels')
)
dot_ui = head_output(dot_ui)

# Compile model
# Inputs are declared in the order (experiment, gene, cell); arrays passed to
# fit/predict must follow the same order.
model = Model(inputs=[layer_e, input_g, input_c, ], outputs= [dot_ui])

# Piecewise-constant multiplier (1.0 / 0.1 / 0.01, switching at steps 10000 and 15000)
# scaling both the learning rate and the weight decay of an AdamW optimizer.
step = tf.Variable(0, trainable=False)
schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
    [10000, 15000], [1e-0, 1e-1, 1e-2])
lr = 1e-1 * schedule(step)
wd = lambda: 1e-3 * schedule(step)
opt = tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)

# NOTE(review): `opt` is built above but compile() receives the string 'adam',
# so the AdamW instance is not what this call uses - confirm this is intended.
model.compile(loss= BinaryCrossentropy(label_smoothing=0.0005), optimizer='adam')
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

tf.keras.utils.plot_model(model,show_shapes=True)

In [None]:
def get_train_set(X_train):
    '''Split a 2-D array column-wise into three consecutive blocks.

    The first ``e_ft_num`` columns form the experiment block, the next ``g_ft_num``
    columns the gene block and the following ``c_ft_num`` columns the cell block.

    :X_train: 2-D array indexable as ``X_train[:, a:b]``
    :return: list ``[experiment_block, gene_block, cell_block]``
    '''
    gene_start = e_ft_num
    cell_start = gene_start + g_ft_num
    cell_stop = cell_start + c_ft_num
    return [
        X_train[:, :gene_start],
        X_train[:, gene_start:cell_start],
        X_train[:, cell_start:cell_stop],
    ]

# Split the transformed train and to-predict matrices into the per-input arrays
# returned by get_train_set.
train_list_arr, pred_list_arr = (
    get_train_set(matrix) for matrix in (X_train, X_pred)
)

In [None]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=5, mode='min', min_lr=1E-5, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=1E-5, patience=15, mode='min',restore_best_weights=True, verbose=1)

# Running sum of the predictions made after each training run.
ss= np.zeros([to_pred_non_ctl.shape[0], num_labels])

N_STARTS = 3 # <-- change it
# NOTE(review): the same `model` object is fit in every iteration, so each run continues
# from the weights of the previous one; `seed` only changes the batch size - confirm intended.
for seed in range(N_STARTS):
    print('Trainging at seed: ', seed)
    history = model.fit(
                    train_list_arr, 
                    y_train, 
                    batch_size=64*(seed+1), 
                    epochs=150,
                    validation_split = 0.3,
                    callbacks=[reduce_lr, early_stopping])
    ss += model.predict(pred_list_arr)
    
    K.clear_session()

# ss holds the SUM of N_STARTS prediction matrices; divide to get the mean so the
# values stay in the sigmoid's [0, 1] range.
ss = ss / N_STARTS
#     del model, history

In [None]:
# Training vs. validation loss per epoch for the `history` object of the most recent fit.
# Labels and a legend are set so the two curves can be told apart.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='loss')
ax.plot(history.history['val_loss'], label='val_loss')
ax.set(xlabel='epoch', ylabel='loss', title='Training vs. validation loss')
ax.legend();

In [None]:
# Predictions for the non-control rows, indexed like to_pred_non_ctl so they align by row label.
df_preds_non_ctl =  pd.DataFrame(ss, columns= cols_target, index = to_pred_non_ctl.index)

# Attach them to the ids of ALL rows to predict; rows absent from df_preds_non_ctl
# (the 'ctl_vehicle' ones) come out as NaN and are filled with 0.
df_preds = pd.concat([ to_pred[cols_id], df_preds_non_ctl], axis = 1).fillna(0)

# Write the submission without the index column (index=False is the documented value).
df_preds.to_csv("submission.csv", index = False)