COLAB

In [2]:
folder_path = '/content/drive/MyDrive/Data/colabs_data/MOA_kaggle/'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!cp '/content/drive/MyDrive/Data/colabs_data/MOA_kaggle/quanvh8_funcs.py' .

KAGGLE

In [None]:
# folder_path = '../input/lish-moa/'

In [None]:
# !cp '../input/coded-file/quanvh8_funcs.py' .

In [8]:
'''ENSEMBLE NETS
Inspire by https://www.kaggle.com/c/otto-group-product-classification-challenge/discussion/14335'''

import numpy as np, pandas as pd, copy, tensorflow as tf, matplotlib.pyplot as plt, sklearn

from tensorflow import feature_column as fc
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import (Dense, DenseFeatures, Dropout, 
                                     BatchNormalization, Embedding, Input, Concatenate, Average,
                                     InputLayer, Lambda)
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras import backend as K, Sequential, Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam, RMSprop

import tensorflow_addons as tfa
from tensorflow_addons.layers import WeightNormalization

from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans

from quanvh8_funcs import (DerivedFeatures, kfolds_bagging_training, voting_predict,
                           kolds_stacked_ensemble_training, stacked_ensemble_predict )

import sys

def log_loss_metric(y_true, y_pred):
    bce = tf.keras.losses.BinaryCrossentropy()
    return bce(y_true, y_pred).numpy()

print(pd.__version__)
print(tf.__version__)

1.1.4
2.3.0


In [9]:
# Loading data and encoding


raw_test = pd.read_csv(folder_path + 'test_features.csv')
raw_train = pd.read_csv(folder_path + 'train_features.csv')
raw_targets = pd.read_csv(folder_path + 'train_targets_scored.csv')

# Phân loại dữ liệu
cols_id = ['sig_id']
cols_to_remove = ['cp_type']
cols_fts = [i for i in raw_train.columns if i not in cols_id +cols_to_remove]
cols_gene = [col for col in raw_train.columns if col.startswith("g-")]
cols_cell = [col for col in raw_train.columns if col.startswith("c-")]
cols_experiment = [col for col in cols_fts if col not in cols_gene+cols_cell]
cols_target = [i for i in raw_targets.columns if i not in cols_id]
num_fts, num_labels = len(cols_fts), len(cols_target)

# xử lý categorical
def transform_data(input_data):
    '''Clean data and encoding
        * input_data: table '''
    out = input_data.copy()
    out['cp_dose'] = out['cp_dose'].map({'D1':0, 'D2':1})
    out['cp_time'] = out['cp_time']/72
    
    return out

to_train = transform_data(raw_train[raw_train['cp_type'] != 'ctl_vehicle'])
to_train_targets = raw_targets.iloc[to_train.index]
full_pred  = transform_data(raw_test)
to_pred = full_pred[full_pred['cp_type'] != 'ctl_vehicle']

In [10]:
# preprocessing pipeline
def pipe_line_builder(quantiles_num, pca_dims, kmean_clusters):
    '''Dựng pipe line cho từng nhóm columns
    :quantiles_num: int: số quantile khi normalise
    :pca_dims: int: số chiều pca'''
    norm = QuantileTransformer(n_quantiles=quantiles_num,random_state=0, output_distribution="normal")
    pca = PCA(n_components = pca_dims)
    derived_ft = DerivedFeatures(n_clusters = kmean_clusters)
    tsne = sklearn.manifold.TSNE(n_components = int(pca_dims/2))
    isomap = sklearn.manifold.Isomap(n_neighbors = 128, n_components = int(pca_dims/2) )

    p_derived_ft = Pipeline([
        ('norm', norm), 
        ('derived', derived_ft)])
    
    p_isomap = Pipeline([
        ('norm', norm), 
        ('isomap', isomap)])

    p_norm_pca = Pipeline([ 
        ('norm', norm),
        ('pca', pca) ])
    return FeatureUnion([
        ('norm', norm), 
        ('norm_pca', p_norm_pca),
        ('derived', p_derived_ft),
#         ('isomap', p_isomap)
    ])

# Dựng pipe transform data

pipe = Pipeline([
    ('norm_pca', ColumnTransformer([
                     ('gene', pipe_line_builder(quantiles_num = 200, pca_dims = 600, kmean_clusters = 5), cols_gene),
                     ('cell', pipe_line_builder(quantiles_num = 200, pca_dims = 50, kmean_clusters = 5), cols_cell),
                    ]) 
    ), 
    ('var', VarianceThreshold(0.01)) 
])

pipe = ColumnTransformer([
    ('gene_cell', pipe, cols_gene+ cols_cell),
    ('experiment', 'passthrough', cols_experiment)
])

In [11]:
# Transform data
pipe.fit(to_train[cols_fts].append(to_pred[cols_fts]))
X_train = pipe.transform(to_train[cols_fts])
X_pred = pipe.transform(to_pred[cols_fts])
y_train = to_train_targets[cols_target].values

In [65]:

def get_list_contains_ohe(keywords, cols_list):
    ouput = []
    for keyword in keywords:
        ouput.append( [1 if keyword in i else 0 for i in cols_list] )
    return ouput

# Tiếp cận theo hướng recommend - cell -> chemical | cell/gene: user, chemial: item
n_components = 350

u_fts_num = X_train.shape[1]#num_fts
i_fts_num = num_labels

initializer = 'he_normal'

#User embedding
input_u = Input(shape = (u_fts_num,) , name ='input_u1' )
layer_u = BatchNormalization( ) (input_u)
layer_u = Dropout(0.25 ) (layer_u)
layer_u = WeightNormalization(Dense(1024, activation="selu", kernel_initializer= initializer, kernel_regularizer= tf.keras.regularizers.l2(0.0001) )) (layer_u)
layer_u = BatchNormalization( ) (layer_u)
layer_u = Dropout(0.25 ) (layer_u)
layer_u = WeightNormalization(Dense(1024, activation="selu", kernel_initializer=initializer, kernel_regularizer= tf.keras.regularizers.l2(0.0001) )) (layer_u)
layer_u = BatchNormalization( ) (layer_u)
layer_u = Dropout(0.25 ) (layer_u)
layer_u = WeightNormalization(Dense(n_components, activation = 'selu', kernel_initializer= initializer, kernel_regularizer= tf.keras.regularizers.l2(0.0001) )) (layer_u)
layer_u = BatchNormalization() (layer_u)

#Item embedding
  # Addition information for item_info
list_chem_gr = ['_inhibitor', '_agonist', '_agent', '_antagonist', '_blocker', '_activator']
chemical_category = tf.transpose( tf.constant( get_list_contains_ohe( list_chem_gr, cols_target  ) ))
  # Full item fts: addition + onehot
item_ft = tf.concat(
    [chemical_category ,
     tf.eye(i_fts_num, dtype = tf.int32) # Create tensor 0-1 coresponse with chemical labels
    ], axis = 1
)
layer_i = WeightNormalization(Dense(n_components, activation = 'selu', kernel_initializer= initializer, kernel_regularizer= tf.keras.regularizers.l2(0.0001) )) (item_ft)
layer_i = BatchNormalization() (layer_i)

# Dot product user - item
def dot_2layer(x):
    return K.dot( x[0], K.transpose(x[1]))
dot_ui = Lambda( dot_2layer, name = 'lambda_dot' ) ([layer_u,layer_i])
dot_ui= BatchNormalization() (dot_ui)
dot_ui= WeightNormalization(Dense(512, activation="selu" , kernel_initializer= initializer, kernel_regularizer= tf.keras.regularizers.l2(0.0001))) (dot_ui)
dot_ui= BatchNormalization() (dot_ui)
dot_ui= WeightNormalization(Dense(256, activation="selu", kernel_initializer= initializer, kernel_regularizer= tf.keras.regularizers.l2(0.0001) )) (dot_ui)
dot_ui= BatchNormalization() (dot_ui)
dot_ui = Dense(i_fts_num, activation = 'sigmoid' )(dot_ui)

# Compile model
model = Model(inputs=[input_u, ], outputs= [dot_ui])

opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

bce = tf.keras.losses.BinaryCrossentropy()
model.compile(loss= BinaryCrossentropy(label_smoothing=0.001), optimizer=opt 
              , metrics= [bce])
# print( model.summary() )

# tf.keras.utils.plot_model(model,show_shapes=True)

In [66]:
# help(tfa.optimizers.CyclicalLearningRate)

In [67]:
reduce_lr = ReduceLROnPlateau(monitor='val_binary_crossentropy', factor=0.1, patience=5, mode='min', min_lr=1E-5, verbose= 0)
early_stopping = EarlyStopping(monitor='val_binary_crossentropy', min_delta=1E-5, patience=15, mode='min',restore_best_weights=True, verbose= 0)
    
model.fit(
        X_train, y_train, validation_split = 0.25, 
        callbacks=[reduce_lr, early_stopping], epochs=150, verbose =1,
        batch_size=32 )

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150


<tensorflow.python.keras.callbacks.History at 0x7f7b96162cf8>

In [14]:
prediction = model.predict(X_pred)
df_preds_non_ctl =  pd.DataFrame(prediction, columns= cols_target, index = to_pred.index)

# concat with all to pred values
df_preds = pd.concat([ full_pred[cols_id], df_preds_non_ctl], axis = 1).fillna(0)

df_preds.iloc[:,[34,82]] = 0
# to csv
df_preds.to_csv("submission.csv", index = None)

In [15]:
df_preds

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,adrenergic_receptor_antagonist,akt_inhibitor,aldehyde_dehydrogenase_inhibitor,alk_inhibitor,ampk_activator,analgesic,androgen_receptor_agonist,androgen_receptor_antagonist,anesthetic_-_local,angiogenesis_inhibitor,angiotensin_receptor_antagonist,anti-inflammatory,antiarrhythmic,antibiotic,anticonvulsant,antifungal,antihistamine,antimalarial,antioxidant,antiprotozoal,antiviral,apoptosis_stimulant,aromatase_inhibitor,atm_kinase_inhibitor,atp-sensitive_potassium_channel_antagonist,atp_synthase_inhibitor,atpase_inhibitor,atr_kinase_inhibitor,aurora_kinase_inhibitor,...,protein_synthesis_inhibitor,protein_tyrosine_kinase_inhibitor,radiopaque_medium,raf_inhibitor,ras_gtpase_inhibitor,retinoid_receptor_agonist,retinoid_receptor_antagonist,rho_associated_kinase_inhibitor,ribonucleoside_reductase_inhibitor,rna_polymerase_inhibitor,serotonin_receptor_agonist,serotonin_receptor_antagonist,serotonin_reuptake_inhibitor,sigma_receptor_agonist,sigma_receptor_antagonist,smoothened_receptor_antagonist,sodium_channel_inhibitor,sphingosine_receptor_agonist,src_inhibitor,steroid,syk_inhibitor,tachykinin_antagonist,tgf-beta_receptor_inhibitor,thrombin_inhibitor,thymidylate_synthase_inhibitor,tlr_agonist,tlr_antagonist,tnf_inhibitor,topoisomerase_inhibitor,transient_receptor_potential_channel_antagonist,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.005583,0.001314,0.001375,0.012943,0.038262,0.005347,0.004701,0.004893,0.000287,0.011884,0.019809,0.000697,0.000481,0.000977,0.001719,0.000897,0.004366,0.008201,0.008962,0.002465,0.002560,0.006375,0.000394,0.002246,0.000680,0.000542,0.001457,0.001167,0.010180,0.003589,0.002711,0.003168,0.005202,0,0.000672,0.000414,0.002110,0.000349,0.000867,...,0.003634,0.001439,0.008521,0.000562,0.000717,0.005229,0.000731,0.001123,0.001188,0.001752,0.013692,0.015655,0.003518,0.006351,0.002917,0.001300,0.029287,0.002469,0.000672,0.000559,0.000298,0.003552,0.000776,0.001336,0.001713,0.004114,0.001161,0.001678,0.002045,0.001615,0.000935,0.000554,0.003373,0.001401,0.000506,0.000892,0.001066,0.001678,0.001374,0.001015
1,id_001897cda,0.000153,0.000285,0.000130,0.000326,0.000113,0.000206,0.000898,0.006111,0.026226,0.003814,0.002276,0.001464,0.000102,0.004161,0.000324,0.000481,0.000167,0.001675,0.000608,0.000335,0.001512,0.000610,0.000814,0.000488,0.000559,0.001593,0.000165,0.000060,0.000229,0.000647,0.000571,0.000906,0.000417,0,0.000387,0.000109,0.003098,0.001348,0.002504,...,0.000191,0.000461,0.000070,0.000590,0.001441,0.002988,0.000317,0.012005,0.000143,0.004533,0.003265,0.002142,0.000332,0.000335,0.001604,0.000362,0.001501,0.000125,0.037203,0.000179,0.011030,0.001294,0.003802,0.000623,0.000398,0.000197,0.002493,0.000642,0.005461,0.001560,0.000421,0.000457,0.001082,0.000038,0.009953,0.000258,0.007813,0.000701,0.002730,0.002760
2,id_002429b5b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_00276f245,0.000339,0.001508,0.004201,0.024830,0.027415,0.003785,0.004537,0.003072,0.000276,0.010542,0.020311,0.002696,0.000810,0.002649,0.001252,0.003805,0.005163,0.005585,0.003117,0.001961,0.002397,0.002115,0.002040,0.003319,0.001848,0.003168,0.001286,0.004291,0.004296,0.000896,0.001365,0.003787,0.001230,0,0.000838,0.000428,0.004539,0.001240,0.001080,...,0.008176,0.001016,0.002726,0.000381,0.001376,0.000193,0.001682,0.001058,0.001607,0.000474,0.013486,0.061702,0.001826,0.001736,0.002394,0.001013,0.005602,0.001114,0.005080,0.001129,0.001260,0.005377,0.000486,0.001015,0.002644,0.001268,0.000461,0.000767,0.000400,0.001116,0.000586,0.002631,0.006095,0.090069,0.009991,0.001240,0.002370,0.003181,0.000486,0.002984
4,id_0027f1083,0.002219,0.001370,0.001572,0.027905,0.028901,0.005148,0.004459,0.002862,0.000530,0.013277,0.027453,0.000986,0.000345,0.000489,0.001252,0.002040,0.002826,0.006799,0.005734,0.001926,0.003484,0.004632,0.000560,0.002293,0.000861,0.000454,0.001037,0.001120,0.007600,0.003658,0.001751,0.001908,0.003670,0,0.000624,0.000296,0.001807,0.000338,0.000835,...,0.004405,0.001010,0.006172,0.001003,0.000745,0.001143,0.000757,0.000901,0.001689,0.001923,0.014402,0.015854,0.003836,0.003672,0.002404,0.001787,0.018058,0.002871,0.000395,0.001222,0.000445,0.003985,0.000225,0.001228,0.002016,0.002803,0.001454,0.001770,0.001558,0.000772,0.000809,0.000543,0.003003,0.001384,0.000894,0.000846,0.001032,0.002667,0.000518,0.002411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.002043,0.000896,0.002036,0.001796,0.002569,0.002002,0.001937,0.016592,0.000395,0.002073,0.009302,0.000984,0.001366,0.064894,0.001582,0.000911,0.002354,0.005767,0.007967,0.003669,0.001629,0.005205,0.001331,0.003668,0.001320,0.010669,0.000819,0.002493,0.002502,0.001016,0.001388,0.007436,0.000578,0,0.000678,0.001000,0.011747,0.005090,0.018431,...,0.006263,0.003121,0.000847,0.000067,0.001445,0.013301,0.002009,0.004706,0.000437,0.001644,0.010887,0.007552,0.001953,0.001487,0.007364,0.000290,0.004812,0.000371,0.053012,0.000635,0.001610,0.003242,0.011000,0.001929,0.000435,0.003177,0.000527,0.002659,0.002053,0.007888,0.000619,0.006065,0.002772,0.008554,0.010210,0.001724,0.005813,0.001956,0.013887,0.001542
3978,id_ff925dd0d,0.001824,0.001039,0.001424,0.013572,0.035148,0.006389,0.006325,0.003257,0.000700,0.023029,0.027248,0.001301,0.000354,0.000598,0.000811,0.000597,0.003997,0.007639,0.005331,0.002295,0.001311,0.003449,0.000549,0.002375,0.001250,0.000569,0.001356,0.000823,0.003880,0.004327,0.001595,0.001318,0.006148,0,0.000421,0.000270,0.002156,0.000433,0.001544,...,0.003718,0.001208,0.007023,0.001011,0.000789,0.000679,0.000594,0.001646,0.001264,0.002783,0.015866,0.029523,0.002812,0.003039,0.002030,0.002022,0.029872,0.000854,0.000770,0.000368,0.000865,0.004369,0.000451,0.001638,0.001169,0.002253,0.000757,0.001644,0.001671,0.000720,0.001177,0.000762,0.003477,0.001658,0.001032,0.000691,0.003784,0.001897,0.000146,0.001413
3979,id_ffb710450,0.004286,0.001245,0.001146,0.012885,0.042563,0.006226,0.004335,0.003914,0.000277,0.012340,0.023879,0.000739,0.000394,0.000526,0.001341,0.000815,0.003701,0.010694,0.007800,0.001816,0.002799,0.006547,0.000519,0.002232,0.000866,0.000500,0.001441,0.001237,0.008829,0.003538,0.002007,0.002584,0.006183,0,0.000549,0.000341,0.001689,0.000337,0.000788,...,0.003562,0.001197,0.008294,0.000422,0.000818,0.002400,0.000636,0.000805,0.001234,0.002015,0.013319,0.024542,0.003698,0.006147,0.003478,0.001538,0.024966,0.002138,0.000491,0.000602,0.000293,0.005471,0.000410,0.001735,0.001828,0.003462,0.001234,0.001536,0.001706,0.001074,0.000937,0.000546,0.003513,0.001212,0.000457,0.000755,0.000890,0.002110,0.000788,0.001246
3980,id_ffbb869f2,0.002017,0.000637,0.000932,0.010708,0.019955,0.006708,0.003201,0.003336,0.000736,0.021404,0.033875,0.000916,0.000299,0.000342,0.000870,0.000678,0.002881,0.006156,0.004142,0.001493,0.002578,0.002299,0.000596,0.001303,0.001068,0.000321,0.001050,0.000967,0.008729,0.003294,0.001480,0.001671,0.005976,0,0.000302,0.000429,0.001941,0.000228,0.000317,...,0.002834,0.000779,0.003743,0.000536,0.000737,0.002642,0.000386,0.000634,0.001495,0.001997,0.014127,0.023513,0.002996,0.003581,0.001584,0.002494,0.019857,0.001789,0.000369,0.000309,0.000356,0.005165,0.000342,0.001217,0.002109,0.002304,0.000795,0.001303,0.001831,0.001075,0.000506,0.000666,0.003080,0.000450,0.000767,0.000438,0.001073,0.002346,0.000643,0.002790
