## **Using sklearn to preprocessing data and tensorflow for modeling**
***Preprocessing:***
* Focus only in feature scaling (normalise) and feature combine (PCA)
* Building pipeline

***Modeling:***
* MLP
inspired by https://www.kaggle.com/riadalmadani/pytorch-cv-0-0145-lb-0-01839#Single-fold-training

In [2]:
import numpy as np
import pandas as pd
import copy

import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import Dense, DenseFeatures, Dropout, BatchNormalization, Embedding, Input, Concatenate, Average
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras import backend as K, Sequential, Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from tensorflow_addons.layers import WeightNormalization
from keras.wrappers.scikit_learn import KerasRegressor

from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from math import log2

print(pd.__version__)
print(tf.__version__)

1.1.4
2.3.0


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data loading

In [41]:
# Loading data and encoding

folder_path = '/content/drive/My Drive/Data/colabs_data/MOA_kaggle/'
raw_test = pd.read_csv(folder_path + 'test_features.csv')
raw_train = pd.read_csv(folder_path + 'train_features.csv')
raw_targets = pd.read_csv(folder_path + 'train_targets_scored.csv')

# Phân loại dữ liệu
cols_id = ['sig_id']
cols_to_remove = ['cp_type']
cols_fts = [i for i in raw_train.columns if i not in cols_id +cols_to_remove]
cols_gene = [col for col in raw_train.columns if col.startswith("g-")]
cols_cell = [col for col in raw_train.columns if col.startswith("c-")]
cols_experiment = [col for col in cols_fts if col not in cols_gene+cols_cell]
cols_target = [i for i in raw_targets.columns if i not in cols_id]
num_fts, num_labels = len(cols_fts), len(cols_target)

# xử lý categorical
def transform_data(input_data):
    '''Clean data and encoding
        * input_data: table '''
    out = input_data.copy()
    out['cp_dose'] = out['cp_dose'].map({'D1':0, 'D2':1})
    out['cp_time'] = out['cp_time']/72
    
    return out

to_train = transform_data(raw_train ) #[raw_train['cp_type'] != 'ctl_vehicle'])
to_train_targets = raw_targets.iloc[to_train.index]
to_pred  = transform_data(raw_test)
to_pred_non_ctl = to_pred[to_pred['cp_type'] != 'ctl_vehicle']

## Feature engineering

In [48]:
# preprocessing pipeline
def pipe_line_builder( quantiles_num):
    '''Dựng pipe line cho từng nhóm columns
    :quantiles_num: int: số quantile khi normalise
    :pca_dims: int: số chiều pca'''
#     variance = VarianceThreshold(variance_threshold)
    # norm = StandardScaler()
    norm = QuantileTransformer(n_quantiles=quantiles_num,random_state=0, output_distribution="normal")
    # pca = PCA(n_components = pca_dims)
    p_var_norm = Pipeline([ 
#         ('var', variance),
        ('norm', norm) ])
#     p_var_norm_pca = Pipeline([ 
# #         ('var', variance),
#         ('pca', pca),
#         ('norm', norm)
#     ])
    return FeatureUnion([
        ('norm', p_var_norm)
        , ('norm_pca', p_var_norm_pca) 
        ])

pipe = ColumnTransformer([
     ('gene', pipe_line_builder(quantiles_num = 100), cols_gene),
     ('cell', pipe_line_builder(quantiles_num = 100), cols_cell),
     ('experiment', pipe_line_builder(quantiles_num = 100), cols_experiment)
#      ('all', Pipeline([ 
#                 ('norm', QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")), 
#                 ('pca', PCA(n_components = int(len(cols_fts)*0.8)) ) ]), cols_fts)
    ])

In [49]:
# Final data
# Trick
pipe.fit(to_train[cols_cell+cols_gene])
X_train = pipe.transform(to_train[cols_cell+cols_gene])
y_train = to_train_targets[cols_target]
X_train

array([[ 1.13484916,  0.90768744, -0.41638451, ...,  0.31798879,
         0.54566219,  0.6413394 ],
       [ 0.11928153,  0.68173822,  0.27239921, ...,  0.17968379,
         0.91916084,  1.16583255],
       [ 0.77997254,  0.94646298,  1.42534985, ..., -0.27763452,
        -1.12308772,  1.08923459],
       ...,
       [ 0.52514873,  0.63122535,  0.28817292, ...,  1.16710363,
         1.02059289,  0.5843928 ],
       [ 0.8164071 ,  0.4176183 ,  0.43163123, ...,  1.0779754 ,
        -0.70199778,  0.13396695],
       [-1.24309611,  1.56773029, -0.26957347, ..., -0.58170296,
        -1.29840708, -1.84722499]])

In [44]:
to_train[col]

0        1.0620
1        0.0743
2        0.6280
3       -0.5138
4       -0.3254
          ...  
23809    0.1394
23810   -1.3260
23811    0.3942
23812    0.6660
23813   -0.8598
Name: g-0, Length: 23814, dtype: float64

In [45]:
col = 'g-0'
transformer = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")
vec_len = len(to_train[col].values)
raw_vec = to_train[col].values.reshape(vec_len, 1)
transformer.fit(raw_vec)

transformer.transform(raw_vec).reshape(1, vec_len)[0]

array([ 1.13484916,  0.11928153,  0.77997254, ...,  0.52514873,
        0.8164071 , -1.24309611])

In [35]:
norm = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")
norm.fit_transform(to_train['g-0'].values.reshape(21948, 1))

array([[ 1.11180063],
       [ 0.10566707],
       [ 0.76703558],
       ...,
       [-1.94607703],
       [ 0.80391134],
       [-1.27070334]])

In [None]:
len(cols_gene+cols_cell+cols_experiment)+640+72+int(len(cols_experiment)*0.8)

1587

In [39]:
import sklearn
print(sklearn.__version__)

0.22.2.post1


## Modeling

In [None]:
# Define model
model = Sequential([
    BatchNormalization(),
    WeightNormalization(Dense(1024, activation="relu")),
    BatchNormalization(),
    Dropout(0.2),
    WeightNormalization(Dense(512, activation="relu")),
    BatchNormalization(),
    Dropout(0.2),
    WeightNormalization(Dense(256, activation="relu")),
    BatchNormalization(),
    Dropout(0.2),
    WeightNormalization(Dense(num_labels, activation="sigmoid"))
])

p_min = 0.001
p_max = 0.999
def logloss(y_true, y_pred):
    y_pred = tf.clip_by_value(y_pred,p_min,p_max)
    return -K.mean(y_true*K.log(y_pred) + (1-y_true)*K.log(1-y_pred))

model.compile(optimizer='adam', loss=BinaryCrossentropy(label_smoothing=0.0001), metrics=logloss)

In [None]:
# Training and evaluate
reduce_lr = ReduceLROnPlateau(monitor='val_logloss', factor=0.3, patience=5, mode='min', min_lr=1E-5, verbose=1)
early_stopping = EarlyStopping(monitor='val_logloss', min_delta=1E-5, patience=15, mode='min',restore_best_weights=True, verbose=1)
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=3,epsilon = 1e-4, mode = 'min',verbose=1)
# early_stopping = EarlyStopping(monitor='val_loss',min_delta=0,patience=10,mode='auto',verbose=1,baseline=None,restore_best_weights=True)

hist = model.fit(X_train,y_train, batch_size=64, epochs=150,validation_split = 0.2
                 ,callbacks=[reduce_lr, early_stopping]
                )

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 00018: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 00023: ReduceLROnPlateau reducing learning rate to 2.700000040931627e-05.
Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping


In [None]:
# Test thử resnet
p_min = 0.001
p_max = 0.999
def logloss(y_true, y_pred):
    y_pred = tf.clip_by_value(y_pred,p_min,p_max)
    return -K.mean(y_true*K.log(y_pred) + (1-y_true)*K.log(1-y_pred))

def build_model(n_features, n_features_2, n_labels, label_smoothing = 0.0005):    
    input_1 = Input(shape = (n_features,), name = 'Input1')
    input_2 = Input(shape = (n_features_2,), name = 'Input2')

    head_1 = Sequential([
        BatchNormalization(),
        Dropout(0.2),
        Dense(512, activation="elu"), 
        BatchNormalization(),
        Dense(256, activation = "elu")
        ],name='Head1') 

    input_3 = head_1(input_1)
    input_3_concat = Concatenate()([input_2, input_3])

    head_2 = Sequential([
        BatchNormalization(),
        Dropout(0.3),
        Dense(512, "relu"),
        BatchNormalization(),
        Dense(512, "elu"),
        BatchNormalization(),
        Dense(256, "relu"),
        BatchNormalization(),
        Dense(256, "elu")
        ],name='Head2')

    input_4 = head_2(input_3_concat)
    input_4_avg = Average()([input_3, input_4]) 

    head_3 = Sequential([
        BatchNormalization(),
        Dense(256, kernel_initializer='lecun_normal', activation='selu'),
        BatchNormalization(),
        Dense(n_labels, kernel_initializer='lecun_normal', activation='selu'),
        BatchNormalization(),
        Dense(n_labels, activation="sigmoid")
        ],name='Head3')

    output = head_3(input_4_avg)


    model = Model(inputs = [input_1, input_2], outputs = output)
    model.compile(optimizer='adam', loss= BinaryCrossentropy(label_smoothing=label_smoothing), metrics=logloss)
    
    return model

num_all_ft = X_train.shape[1]
num_ft_1 = len(cols_gene)+640
num_ft_2 = num_all_ft - num_ft_1

model = build_model(num_ft_1, num_ft_2, len(cols_target))

In [None]:
num_ft_2

175

In [None]:
# Training and evaluate
reduce_lr = ReduceLROnPlateau(monitor='val_logloss', factor=0.3, patience=5, mode='min', min_lr=1E-5, verbose=1)
early_stopping = EarlyStopping(monitor='val_logloss', min_delta=1E-5, patience=15, mode='min',restore_best_weights=True, verbose=1)
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=3,epsilon = 1e-4, mode = 'min',verbose=1)
# early_stopping = EarlyStopping(monitor='val_loss',min_delta=0,patience=10,mode='auto',verbose=1,baseline=None,restore_best_weights=True)

hist = model.fit([X_train[:,:num_ft_1], X_train[:,num_ft_1:]],y_train, batch_size=64, epochs=150,validation_split = 0.2
                 ,callbacks=[reduce_lr, early_stopping]
                )

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 00016: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 00021: ReduceLROnPlateau reducing learning rate to 2.700000040931627e-05.
Restoring model weights from the end of the best epoch.
Epoch 00021: early stopping


## **Hoàn thành dev model với tên biến model => submit**

In [None]:
# predict non ctl vehicle
X_pred = pipe.transform( to_pred_non_ctl[cols_fts] )
arr_preds_non_ctl = model.predict( [X_pred[:,:num_ft_1], X_pred[:,num_ft_1:]] )
#     model.predict(pipe.transform(to_pred_non_ctl[cols_fts]))


df_preds_non_ctl =  pd.DataFrame(arr_preds_non_ctl, columns= cols_target, index = to_pred_non_ctl.index)

# concat with all to pred values
df_preds = pd.concat([ to_pred[cols_id], df_preds_non_ctl], axis = 1).fillna(0)
# to csv
df_preds.to_csv("submission.csv", index = None)