In [63]:
import numpy as np
import pandas as pd
import copy

import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import (Dense, DenseFeatures, Dropout, 
                                     BatchNormalization, Embedding, Input, Concatenate, Average,
                                     InputLayer, Lambda)
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras import backend as K, Sequential, Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam

from tensorflow_addons.layers import WeightNormalization
from keras.wrappers.scikit_learn import KerasRegressor
import keras

from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from math import log2

# Record the library versions this notebook was executed with (one per line).
print(pd.__version__, tf.__version__, sep='\n')

1.0.5
2.2.0


In [13]:
# Load the raw feature/target tables

folder_path = './'
raw_train = pd.read_csv(f'{folder_path}train_features.csv')
raw_test = pd.read_csv(f'{folder_path}test_features.csv')
raw_targets = pd.read_csv(f'{folder_path}train_targets_scored.csv')

# Group the columns by role
cols_id = ['sig_id']
cols_to_remove = ['cp_type']
# Model features: every training column that is neither an id nor a removed column.
cols_fts = [name for name in raw_train.columns
            if not (name in cols_id or name in cols_to_remove)]
# Columns selected by their name prefix ("g-" and "c-").
cols_gene = raw_train.columns[raw_train.columns.str.startswith("g-")].tolist()
cols_cell = raw_train.columns[raw_train.columns.str.startswith("c-")].tolist()
# Whatever feature is left over once the "g-" and "c-" columns are taken out.
cols_experiment = [name for name in cols_fts
                   if not (name in cols_gene or name in cols_cell)]
# Every column of the targets table except the id.
cols_target = list(filter(lambda name: name not in cols_id, raw_targets.columns))
num_fts = len(cols_fts)
num_labels = len(cols_target)

# Encode the categorical / experiment columns
def transform_data(input_data):
    '''Return an encoded copy of a feature table.

        * input_data: table (DataFrame) containing `cp_dose` and `cp_time` columns

    `cp_dose` labels are mapped to integers ('D1' -> 0, 'D2' -> 1; any other
    value becomes NaN) and `cp_time` is divided by 72. The input table itself
    is not modified.
    '''
    dose_codes = {'D1':0, 'D2':1}
    # assign() works on a copy and replaces the two columns in place,
    # so the column order of the input is preserved.
    return input_data.assign(
        cp_dose=input_data['cp_dose'].map(dose_codes),
        cp_time=input_data['cp_time'] / 72,
    )

# Training rows: drop the control-vehicle samples, then encode.
to_train = transform_data(raw_train[raw_train['cp_type'] != 'ctl_vehicle'])
# Pick the matching target rows by index *label*. to_train.index holds the
# labels carried over from raw_train, so label-based .loc is the right accessor;
# positional .iloc only returns the same rows while both frames keep a default
# 0..n-1 index.
# Assumes raw_targets rows line up with raw_train rows — TODO confirm (e.g. compare sig_id).
to_train_targets = raw_targets.loc[to_train.index]
# Prediction rows: encode everything, and keep a view without control-vehicle samples.
to_pred  = transform_data(raw_test)
to_pred_non_ctl = to_pred[to_pred['cp_type'] != 'ctl_vehicle']

In [15]:
# Preview the feature columns of the training table.
to_train.loc[:, cols_fts]

Unnamed: 0,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,0.333333,0,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,-1.0220,-0.0326,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,1.000000,0,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,0.2341,0.3372,...,-0.4265,0.7543,0.4708,0.0230,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,0.666667,0,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,0.1715,0.2155,...,-0.7250,-0.6297,0.6103,0.0223,-1.3240,-0.3174,-0.6417,-0.2187,-1.4080,0.6931
3,0.666667,0,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,-1.9590,0.1792,...,-2.0990,-0.6441,-5.6300,-1.3780,-0.8632,-1.2880,-1.6210,-0.8784,-0.3876,-0.8154
4,1.000000,1,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,-0.2800,-0.1498,...,0.0042,0.0048,0.6670,1.0690,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23808,1.000000,0,0.1608,-1.0500,0.2551,-0.2239,-0.2431,0.4256,-0.1166,-0.1777,...,0.0789,0.3538,0.0558,0.3377,-0.4753,-0.2504,-0.7415,0.8413,-0.4259,0.2434
23809,0.333333,1,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,0.5773,0.3055,...,0.1969,0.0262,-0.8121,0.3434,0.5372,-0.3246,0.0631,0.9171,0.5258,0.4680
23810,0.333333,1,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,-0.2252,-0.5565,...,0.4286,0.4426,0.0423,-0.3195,-0.8086,-0.9798,-0.2084,-0.1224,-0.2715,0.3689
23812,0.333333,0,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,0.0323,0.0463,...,-0.1105,0.4258,-0.2012,0.1506,1.5230,0.7101,0.1732,0.7015,-0.6290,0.0740


In [142]:
# Additional information for item_info: for every target label, one 0/1 flag per
# keyword telling whether that keyword occurs in the label name.
# Built as (keywords x labels) and transposed to (labels x keywords).
chemical_category = tf.transpose(
    tf.constant(
        [[1 if keyword in target else 0 for target in cols_target]
         for keyword in ('_inhibitor', '_agonist', '_agent',
                         '_antagonist', '_blocker', '_activator')],
        dtype=tf.int32,
    )
)

# Full item features: keyword flags + one-hot identity of each label.
# The identity size comes straight from cols_target so this cell does not rely on
# `i_fts_num`, which is only assigned in a later cell (NameError on a fresh
# Restart & Run All).
item_ft = tf.concat(
    [chemical_category,
     tf.eye(len(cols_target), dtype=tf.int32)],  # 0/1 identity, one row per label
    axis=1,
)

In [165]:
# Recommender-style approach: cell -> chemical.
# The cell/gene features of a sample play the "user" role, the chemical labels the "item" role.
n_components = 256  # width of the user and item embeddings

u_fts_num = num_fts     # number of user (sample) input features
i_fts_num = num_labels  # number of items (target labels)

# User embedding
input_u = Input(shape=(u_fts_num,), name='input_u1')
# layer_u= WeightNormalization(Dense(700, activation="relu", kernel_initializer='he_normal')) (input_u)
layer_u = Dense(n_components, activation='relu', kernel_initializer='he_normal', name='layer_u1')(input_u)

# Item embedding
# NOTE(review): `item_ft` is a constant tensor rather than a Keras Input, so this
# Dense layer does not become part of `model` (model.summary() lists only one Dense
# embedding, layer_u1). Its randomly initialised weights are therefore not updated
# by model.fit — confirm that a fixed random item projection is intended.
# The layer gets its own name ('layer_i1'); it previously reused 'layer_u1'.
# item_ft is int32, so it is cast to the Keras float type explicitly before the Dense layer.
layer_i = Dense(n_components, activation='relu', kernel_initializer='he_normal',
                name='layer_i1')(tf.cast(item_ft, K.floatx()))

# Dot product user - item
def dot_2layer(x):
    """Return x[0] @ transpose(x[1]): one score per (row of x[0], row of x[1]) pair."""
    return K.dot(x[0], K.transpose(x[1]))
dot_ui = Lambda(dot_2layer, name='lambda_dot')([layer_u, layer_i])
# dot_ui= BatchNormalization() (dot_ui)
dot_ui = WeightNormalization(Dense(512, activation="relu", kernel_initializer='he_normal'))(dot_ui)
# dot_ui= BatchNormalization() (dot_ui)
dot_ui = WeightNormalization(Dense(256, activation="relu", kernel_initializer='he_normal'))(dot_ui)
# dot_ui= BatchNormalization() (dot_ui)
# One sigmoid output per target label.
dot_ui = Dense(i_fts_num, activation='sigmoid', kernel_initializer='he_normal', name='labels')(dot_ui)

# Compile model
model = Model(inputs=[input_u], outputs=[dot_ui])
# `learning_rate` is the documented argument name; `lr` is a deprecated alias.
opt = Adam(learning_rate=0.0005)
model.compile(loss=BinaryCrossentropy(label_smoothing=0.0005), optimizer=opt)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# tf.keras.utils.plot_model(model,show_shapes=True)

Model: "model_33"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_u1 (InputLayer)        [(None, 874)]             0         
_________________________________________________________________
layer_u1 (Dense)             (None, 256)               224000    
_________________________________________________________________
tf_op_layer_MatMul_33 (Tenso [(None, 206)]             0         
_________________________________________________________________
weight_normalization_56 (Wei (None, 512)               212481    
_________________________________________________________________
weight_normalization_57 (Wei (None, 256)               262913    
_________________________________________________________________
labels (Dense)               (None, 206)               52942     
Total params: 752,336
Trainable params: 515,022
Non-trainable params: 237,314
______________________________________________

In [166]:
# Shrink the learning rate when val_loss stops improving, and stop training
# (restoring the best weights) when it stalls for longer.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', mode='min',
    factor=0.3, patience=5, min_lr=1e-5,
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss', mode='min',
    min_delta=1e-5, patience=15,
    restore_best_weights=True,
    verbose=1,
)

# 30% of the training rows are held out for validation.
model.fit(
    x=to_train[cols_fts],
    y=to_train_targets[cols_target],
    batch_size=64,
    epochs=150,
    validation_split=0.3,
    callbacks=[reduce_lr, early_stopping],
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0001500000071246177.
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 00015: ReduceLROnPlateau reducing learning rate to 4.500000213738531e-05.
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 00020: ReduceLROnPlateau reducing learning rate to 1.3500000204658135e-05.
Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping


<tensorflow.python.keras.callbacks.History at 0x166b8556148>

In [None]:
# Experiment notes — English translation of the string below (string kept verbatim):
#   - Trained with batch size 64, then continued with 128: loss went down.
#   - Raising batch_size to 256: val_loss went up.
#   - Training with batch size 128 only (presumably from scratch — TODO confirm
#     wording with the author): loss stayed high.
'''
Dùng batch 64 -> dùng tiếp 128 thì thấy loss giảm
Tăng batch_size lên 256 thì thấy val_loss tăng
Training không với batch 128 thì thấy loss vẫn cao
'''