# Exploratory Data Analysis

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# # load module 

In [None]:
# Standard-library setup: silence warnings and expose the attached splitter package.
import sys
import warnings

# Ignore every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# iterative-stratification is attached as a Kaggle dataset ("Add data" in the right
# menu, search for iterative-stratification); its folder is appended to sys.path.
ITERSTRAT_DIR = '../input/iterative-stratification/iterative-stratification-master'
sys.path.append(ITERSTRAT_DIR)
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


from time import time
import datetime
import gc

import numpy as np
import pandas as pd 

# ML tools 
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
import tensorflow as tf 
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from sklearn.metrics import log_loss
from tensorflow_addons.layers import WeightNormalization
# Seed NumPy and TensorFlow so random draws are repeatable between runs.
np.random.seed(42)
tf.random.set_seed(42)

# Visualization tools
import seaborn as sns
import matplotlib.pyplot as plt
# sns.set() re-applies seaborn's default "darkgrid" style, so calling it after
# sns.set_style('white') silently discarded the white style. Passing both settings
# in one call makes the white style actually take effect.
sns.set(style='white', font_scale=1.2)

# Step 1 : Data Check

In [None]:
def _load_and_preview(csv_name, label):
    """Read one competition CSV, show its first three rows and print its shape.

    csv_name: file name inside /kaggle/input/lish-moa/.
    label: text printed in front of the frame's shape.
    Returns the loaded DataFrame.
    """
    frame = pd.read_csv('/kaggle/input/lish-moa/' + csv_name)
    display(frame.head(3))
    print(label, frame.shape)
    return frame


df_train = _load_and_preview('train_features.csv', 'train data size')
df_target_ns = _load_and_preview('train_targets_nonscored.csv', 'train target nonscored size')
df_target_s = _load_and_preview('train_targets_scored.csv', 'train target scored size')
df_test = _load_and_preview('test_features.csv', 'test data size')
df_sample = _load_and_preview('sample_submission.csv', 'sample submission size')

print(type(df_target_s))

# # train_features : df_train

In [None]:
# Work on a copy of the training features and attach one scored target for inspection.
TARGET_POS = 72  # positional column index inside df_target_s
train_copy = df_train.copy()
print(type(train_copy))
train_copy['target_71'] = df_target_s.iloc[:, TARGET_POS]
print(train_copy['target_71'])
print(df_target_s.columns[TARGET_POS])


# # train_targets_nonscored : df_target_ns

In [None]:
# Independent copy of the non-scored targets; report its type and shape.
df_target_ns_copy = df_target_ns.copy()
for summary in (type(df_target_ns_copy), df_target_ns_copy.shape):
    print(summary)


In [None]:
# Rebuild the label-only frame from the untouched df_target_ns instead of dropping
# in place: an in-place drop raises KeyError as soon as this cell is run a second time.
df_target_ns_copy = df_target_ns.drop(columns='sig_id')

n_row = 30
n_col = 4
fig = plt.figure(figsize=(20,50))
plt.subplots_adjust(left=-0.3, right=1.3,bottom=-0.3,top=1.3)

# Sample without replacement so the same target column is never drawn twice.
sampled_cols = np.random.choice(df_target_ns_copy.shape[1], n_row, replace=False)
for n_sub, i in enumerate(sampled_cols, start=1):
    plt.subplot(n_row, n_col, n_sub)
    # No plt.legend() here: the plot has no labelled artists, so the legend was
    # always empty and only produced a "no handles with labels" message.
    sns.countplot(y=df_target_ns_copy.iloc[:, i],palette='nipy_spectral',orient='h')
plt.show()

In [None]:
# Per-column totals of the non-scored target frame.
df_target_ns_copy.sum(axis=0)


In [None]:
# Horizontal bar chart of the 20 largest column sums of the non-scored targets.
top_counts = df_target_ns_copy.sum().sort_values().tail(20)
plt.figure(figsize=(10,10))
top_counts.plot(kind='barh',color='mediumseagreen')
plt.show()

# # train_targets_scored : df_target_s

In [None]:
# Independent copy of the scored targets; report its type and shape.
target_s_copy = df_target_s.copy()
for summary in (type(target_s_copy), target_s_copy.shape):
    print(summary)


In [None]:
# Rebuild the label-only frame from the untouched df_target_s instead of dropping
# in place: an in-place drop raises KeyError as soon as this cell is run a second time.
target_s_copy = df_target_s.drop(columns='sig_id')

n_row = 30
n_col = 4
fig = plt.figure(figsize=(20,50))
plt.subplots_adjust(left=-0.3, right=1.3,bottom=-0.3,top=1.3)

# Sample without replacement so the same target column is never drawn twice.
sampled_cols = np.random.choice(target_s_copy.shape[1], n_row, replace=False)
for n_sub, i in enumerate(sampled_cols, start=1):
    plt.subplot(n_row, n_col, n_sub)
    # No plt.legend() here: the plot has no labelled artists, so the legend was
    # always empty and only produced a "no handles with labels" message.
    sns.countplot(y=target_s_copy.iloc[:, i],palette='nipy_spectral',orient='h')
plt.show()

In [None]:
# Per-column totals of the scored target frame.
target_s_copy.sum(axis=0)


In [None]:
# Horizontal bar chart of the 20 largest column sums of the scored targets.
top_counts = target_s_copy.sum().sort_values().tail(20)
plt.figure(figsize=(10,10))
top_counts.plot(kind='barh',color='mediumseagreen')
plt.show()

# Step 2 : Data Preprocessing

# 2-1 : Preprocessing for 772 gene expression features

In [None]:
# Index labels of the rows whose cp_type equals 'ctl_vehicle', for train and test.
is_ctl_train = df_train['cp_type'] == 'ctl_vehicle'
is_ctl_test = df_test['cp_type'] == 'ctl_vehicle'
ind_tr = df_train.loc[is_ctl_train].index
ind_te = df_test.loc[is_ctl_test].index
for idx in (ind_tr, ind_te):
    print(idx[:5])

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
# Shared quantile transformer used by preprocess(): 100 quantiles, normal output.
transformer = QuantileTransformer(
    output_distribution="normal",
    n_quantiles=100,
    random_state=42,
)

def preprocess(df):
    """Encode the cp_* columns as integers and quantile-transform every g-/c- column.

    The frame is modified in place and the same object is returned.

    cp_time: 24/48/72 -> 1/2/3; cp_dose: 'D1'/'D2' -> 0/1;
    cp_type: 'trt_cp'/'ctl_vehicle' -> 0/1. Values absent from these mappings
    become NaN, so calling this twice on the same frame wipes those columns.
    Each 'g-' / 'c-' column is fitted on its own with the module-level
    `transformer` and replaced by its transformed values.
    """
    encodings = {
        'cp_time': {24:1, 48:2, 72:3},
        'cp_dose': {'D1':0, 'D2':1},
        'cp_type': {'trt_cp':0, 'ctl_vehicle':1},
    }
    for name, mapping in encodings.items():
        df[name] = df[name].map(mapping)

    g_features = [c for c in df.columns if c.startswith('g-')]
    c_features = [c for c in df.columns if c.startswith('c-')]
    for col in g_features + c_features:
        # The transformer expects a 2-D (n_rows, 1) array; flatten the result back.
        column_2d = df[col].values.reshape(-1, 1)
        transformer.fit(column_2d)
        df[col] = transformer.transform(column_2d).ravel()
    return df

# preprocess() returns the frame it was given, so X_train / X_test refer to the
# same objects as df_train / df_test after these calls.
X_train = preprocess(df_train)
X_test = preprocess(df_test)

display(X_train.head(5))
print('Train data size', X_train.shape)
display(X_test.head(3))
print('Test data size', X_test.shape)

# Label matrices: every target column except the sig_id key.
y = df_target_s.drop(columns='sig_id')
display(y.head(3))
print('target size', y.shape)
y0 = df_target_ns.drop(columns='sig_id')

In [None]:
# Please see reference 3 for this part
n_comp = 0.95
g_features = [name for name in X_train.columns if name.startswith('g-')]
print(g_features[:20])
print(len(g_features))

# Stack the train rows on top of the test rows (gene-expression columns only).
g_data = pd.concat([X_train[g_features], X_test[g_features]])
print(g_data.shape)

# # PCA transformation 

In [None]:
# Fit PCA on the stacked gene data with n_components=0.95 and project it.
g_pca = PCA(n_components=0.95, random_state=42)
g_data2 = g_pca.fit_transform(g_data)
g_data2.shape

In [None]:
# g_data2 holds the train rows first and the test rows last; split it back apart.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
g_train2 = g_data2[:n_train_rows]
print(g_train2.shape)
print(type(g_data2))
g_test2 = g_data2[-n_test_rows:]
print(g_test2.shape)

In [None]:
# Wrap the PCA arrays in DataFrames with columns pca_g-0, pca_g-1, ...
g_train2 = pd.DataFrame(
    g_train2, columns=['pca_g-' + str(i) for i in range(g_data2.shape[1])]
)
g_test2 = pd.DataFrame(
    g_test2, columns=['pca_g-' + str(i) for i in range(g_test2.shape[1])]
)

for label, frame in (('train data PCA', g_train2), ('test data PCA', g_test2)):
    display(frame.head(3))
    print(label, frame.shape)

# # merge original data and PCA transformed data for gene expression data

In [None]:
# Append the PCA components to the original gene-expression columns.
# The train frame must be built from X_train: using X_test here paired test-set
# gene values with the train-set components and, because concat(axis=1) aligns on
# the index, left NaN wherever the two indexes do not overlap.
g_X_train = pd.concat((X_train[g_features], g_train2), axis=1)
g_X_test = pd.concat((X_test[g_features], g_test2), axis=1)

In [None]:
# Preview the merged gene-expression frames and report their shapes.
for label, frame in (
    ('train data PCA + train original data', g_X_train),
    ('test data PCA + test original data', g_X_test),
):
    display(frame.head(3))
    print(label, frame.shape)

# 2-2 : Preprocessing for 100 cell viability features

In [None]:
# Same PCA procedure as for the gene columns, applied to the 'c-' columns.
c_features = [name for name in X_train.columns if name.startswith('c-')]
n_comp = 0.95
print(len(c_features))

# Stack train on top of test, fit PCA on the stack, then split the rows back.
c_data = pd.concat([X_train[c_features], X_test[c_features]])
c_pca = PCA(n_components=0.95, random_state=42)
c_data2 = c_pca.fit_transform(c_data[c_features])
c_train2 = c_data2[:X_train.shape[0]]
c_test2 = c_data2[-X_test.shape[0]:]

pca_c_names = ['pca_c-' + str(i) for i in range(c_data2.shape[1])]
c_train2 = pd.DataFrame(c_train2, columns=pca_c_names)
c_test2 = pd.DataFrame(c_test2, columns=pca_c_names)

display(c_data.head(3))
print('cell viability data',c_data.shape)

display(c_data2)
print('PCA cell viability data',c_data2.shape)

for label, frame in (('train data PCA', c_train2), ('test data PCA', c_test2)):
    display(frame.head(3))
    print(label, frame.shape)


In [None]:
# Append the PCA components to the original cell-viability columns, side by side.
c_X_train = pd.concat([X_train[c_features], c_train2], axis=1)
c_X_test = pd.concat([X_test[c_features], c_test2], axis=1)

for label, frame in (
    ('train data PCA + train original data of cell viability', c_X_train),
    ('test data PCA + test original data of cell viability', c_X_test),
):
    display(frame.head(3))
    print(label, frame.shape)



# # integration of gene expression features and cell viability features 

In [None]:
# Put the gene-expression block and the cell-viability block side by side.
X_train = pd.concat([g_X_train, c_X_train], axis=1)
X_test = pd.concat([g_X_test, c_X_test], axis=1)

for label, frame in (
    ('train data PCA + train original data of gene expression & cell viability', X_train),
    ('test data PCA + test original data of gene expression & cell viability', X_test),
):
    display(frame.head(3))
    print(label, frame.shape)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop every column whose variance over the stacked train+test rows is <= 0.8.
var_thresh = VarianceThreshold(0.8)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the train rows on top of the test rows in the same way.
data = pd.concat([X_train, X_test])
data_transformed = var_thresh.fit_transform(data)

# Split the filtered array back into its train and test rows.
train_features = data_transformed[:X_train.shape[0]]
test_features = data_transformed[-X_test.shape[0] : ]

display(train_features)
print('train data PCA + train original data of gene expression & cell viability',train_features.shape)

display(test_features)
print('test data PCA + test original data of gene expression & cell viability',test_features.shape)

print(type(data))
print(data.shape)
print(type(train_features))

In [None]:
# Wrap the filtered arrays in DataFrames (feature_0, feature_1, ...) and attach
# the sig_id key of the corresponding source frame.
train_feature_names = ['feature_' + str(i) for i in range(train_features.shape[1])]
df_train_features = pd.DataFrame(train_features, columns=train_feature_names)
df_train_features['sig_id'] = df_train['sig_id']

test_feature_names = ['feature_' + str(i) for i in range(test_features.shape[1])]
df_test_features = pd.DataFrame(test_features, columns=test_feature_names)
df_test_features['sig_id'] = df_test['sig_id']

for label, frame in (
    ('train features data', df_train_features),
    ('test features data', df_test_features),
):
    display(frame.head(5))
    print(label, frame.shape)



# # cp_dose, cp_time, cp_type information

In [None]:
# Extract the key plus the three cp_* columns into fresh frames built from the
# raw values (so the new frames get a default index).
dose_cols = ['sig_id','cp_type', 'cp_time','cp_dose']
p_train = pd.DataFrame(df_train[dose_cols].values, columns=dose_cols)
p_test = pd.DataFrame(df_test[dose_cols].values, columns=dose_cols)

for label, frame in (
    ('train data dosing information', p_train),
    ('test data dosing information', p_test),
):
    display(frame.head(3))
    print(label, frame.shape)

In [None]:
# Join the feature frames with the dosing frames on the sig_id key.
X_fp_train = df_train_features.merge(p_train, on='sig_id')
X_fp_test = df_test_features.merge(p_test, on='sig_id')

for label, frame in (
    ('train data dosing information + feature data', X_fp_train),
    ('test data dosing information + feature data', X_fp_test),
):
    display(frame.head(3))
    print(label, frame.shape)

# Label Data

In [None]:
# Point y / y0 at the full scored / non-scored target tables.
y, y0 = df_target_s, df_target_ns

for label, frame in (
    ('target scored data label', y),
    ('target non-scored data label', y0),
):
    display(frame.head(3))
    print(label, frame.shape)

# Save Data

In [None]:
import pickle 
import os

print(os.getcwd())

# Pickle each artefact into the current working directory under its variable name.
artefacts = {
    'X_fp_train': X_fp_train,
    'X_fp_test': X_fp_test,
    'y': y,
    'y0': y0,
}
for file_name, obj in artefacts.items():
    with open(file_name,'wb') as web:
        pickle.dump(obj,web)

In [None]:
import os
# Print the full path of every file under the Kaggle working directory.
for root, _subdirs, files in os.walk('/kaggle/working'):
    for name in files:
        print(os.path.join(root, name))

# MLP Model Define

https://machinelearningmastery.com/multi-label-classification-with-deep-learning/

In [None]:
def _unpickle(path):
    """Load and return the single object pickled at `path`."""
    with open(path,'rb') as web:
        return pickle.load(web)


# Reload the artefacts written by the "Save Data" cell.
X_fp_test = _unpickle('X_fp_test')
X_fp_train = _unpickle('X_fp_train')
y = _unpickle('y')
y0 = _unpickle('y0')

# sig_id is only a row key; remove it from the features and from the labels.
X_fp_train = X_fp_train.drop(columns='sig_id')
X_fp_test = X_fp_test.drop(columns='sig_id')
y = y.drop(columns='sig_id')
y0 = y0.drop(columns='sig_id')

for frame in (X_fp_train, X_fp_test, y, y0):
    display(frame.head(3))

In [None]:
# One position per feature column; inp_size is the resulting count as an int.
feats = np.arange(X_fp_train.shape[1])
inp_size = int(np.ceil(len(feats) * 1))
print(inp_size)

In [None]:
# Only the last expression of a cell is displayed, so the feature count on the
# first line was silently discarded. Show both widths together as a tuple:
# (number of feature columns, number of label columns).
X_fp_train.shape[1], y.shape[1]

In [None]:
# Clipping bounds that keep predictions away from 0 and 1 inside logloss().
p_min = 0.001
p_max = 0.999
from tensorflow.keras import regularizers
from tensorflow.keras import layers,models

def logloss(y_true, y_pred):
    """Mean binary cross-entropy with predictions clipped to [p_min, p_max].

    y_true: tensor of target values.
    y_pred: tensor of predicted probabilities; clipped before the logs are taken.
    Returns the negated mean of y*log(p) + (1-y)*log(1-p) over all elements.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * K.log(clipped)
    negative_term = (1 - y_true) * K.log(1 - clipped)
    return -K.mean(positive_term + negative_term)

def create_model1(num_cols, hid_layers, dropout_rate, num_cols_y):
    """Build and compile a dense network with sigmoid outputs.

    num_cols: number of input features.
    hid_layers: iterable of hidden-layer widths, one Dense layer per entry
        (e.g. [800, 600]); a bare integer is rejected with a clear message.
    dropout_rate: dropout rate applied after every hidden Dense layer.
    num_cols_y: number of output units (one sigmoid per label).

    Returns the compiled tf.keras Model (Adam, label-smoothed binary
    cross-entropy loss, `logloss` as metric).
    Raises TypeError if hid_layers is an int.
    """
    if isinstance(hid_layers, int):
        # enumerate() below would fail with an opaque "'int' object is not iterable".
        raise TypeError(
            "hid_layers must be an iterable of layer widths, e.g. [800, 600]; "
            f"got the integer {hid_layers}"
        )

    inp1 = tf.keras.layers.Input(shape = (num_cols, ))
    x1 = tf.keras.layers.BatchNormalization()(inp1)
    activations=['selu','relu','swish']
    for i, units in enumerate(hid_layers):
        # Cycle through the activation list so that more than three hidden
        # layers no longer raise IndexError.
        activation = activations[i % len(activations)]
        x1 = tf.keras.layers.Dense(units, activation=activation)(x1)
        x1 = tf.keras.layers.Dropout(dropout_rate)(x1)
        x1 = tf.keras.layers.BatchNormalization()(x1)
    
    x1 = tf.keras.layers.Dense(num_cols_y,activation='sigmoid')(x1)
    model = tf.keras.models.Model(inputs= inp1, outputs= x1)
    
    # metrics is documented as a list; a bare callable is not accepted by every
    # Keras version.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.000656575),
                 loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001), metrics=[logloss])
    
    return model 

def create_model2(input_dim=None, n_outputs=None):
    """Build and compile a 3-hidden-layer MLP (800-600-400, ReLU, dropout 0.5).

    input_dim: number of input features; defaults to the width of the global
        X_fp_train at call time (the width was previously hard-coded to 1042).
    n_outputs: number of sigmoid output units; defaults to the width of the
        global y at call time, as before.

    Returns the compiled Sequential model.
    """
    if input_dim is None:
        input_dim = X_fp_train.shape[1]
    if n_outputs is None:
        n_outputs = y.shape[1]

    model=models.Sequential()

    # First hidden layer also declares the input width.
    model.add(layers.Dense(800, input_shape=(input_dim,), activation='relu'))
    model.add(layers.Dropout(0.5))

    # Second and third hidden layers.
    for units in (600, 400):
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.Dropout(0.5))

    # Output layer: one independent sigmoid per label.
    model.add(layers.Dense(n_outputs, activation='sigmoid'))

    # Independent sigmoid outputs form a multi-label problem, which needs a
    # per-label binary cross-entropy; categorical_crossentropy assumes exactly
    # one active class per row and is the wrong loss for these targets.
    model.compile(optimizer ='adam',
                 loss='binary_crossentropy', metrics=['accuracy'])
    
    return model 

model=create_model2()

model.summary()

# hid_layers must be a sequence of layer widths: create_model1 iterates over it,
# so the previous bare integer 2 raised "TypeError: 'int' object is not iterable".
# NOTE(review): [800, 600] mirrors the first two layers of create_model2 - adjust
# if different widths were intended.
model2=create_model1(1042,[800, 600],0.5,206)
model2.summary()

In [None]:
# hid_layers must be a sequence of layer widths: create_model1 iterates over it,
# so the previous bare integer 2 raised "TypeError: 'int' object is not iterable".
# NOTE(review): [800, 600] mirrors the first two layers of create_model2 - adjust
# if different widths were intended.
model2=create_model1(X_fp_train.shape[1],[800, 600],0.5,y.shape[1])
model2.summary()

In [None]:
!pip install iterative-stratification

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

np.random.seed(seed=42)
n_split = 5

# Preview the fold assignment: print the train / held-out row positions of each fold.
preview_cv = MultilabelStratifiedKFold(n_splits=n_split, shuffle=True, random_state=29)
for n, (tr, te) in enumerate(preview_cv.split(X_fp_train, y)):
    print(tr,te)

# tr / te still hold the positions of the last fold at this point.
print(tr)
print(len(tr))
print(len(te))

In [None]:
results=[]
for k, (train, test) in enumerate(MultilabelStratifiedKFold(n_splits = 5, random_state = 30, shuffle = True).split(X_fp_train, y)):
    
    x_tr = X_fp_train.astype('float64').values[train]
    x_val = X_fp_train.astype('float64').values[test]
    
    y_tr, y_val = y.astype(float).values[train], y.astype(float).values[test]
    
    model = create_model2()
    
    result=model.fit(x_tr,y_tr,validation_data=(x_val, y_val), epochs = 100, batch_size = 128,verbose = 1)
    
    results.append(result)

        
        