In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, VarianceThreshold

In [None]:
import sys
sys.path.append('../input/iterativestratificationmaster/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Global random seed; it is passed to np.random.seed before the per-run CV seeds are drawn.
SEED = 30

# 1. Import Data

In [None]:
# Directory holding the competition CSV files; the trailing slash matters because
# file names are appended with plain string concatenation.
FILE_DIR = '../input/lish-moa/'

In [None]:
# Load the training features, the test features and the scored training targets.
train_df, test_df, target_df = (
    pd.read_csv(FILE_DIR + csv_name)
    for csv_name in ('train_features.csv', 'test_features.csv', 'train_targets_scored.csv')
)

In [None]:
# Report (rows, columns) for each loaded frame.
for frame_name, frame in (('train_df', train_df), ('test_df', test_df), ('target_df', target_df)):
    print('shape of {}:{}'.format(frame_name, frame.shape))

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
target_df.head()

In [None]:
target_df.info()

In [None]:
def _cols_with_prefix(frame, prefix):
    """Return the column names of ``frame`` that start with ``prefix``, in column order."""
    return [name for name in frame.columns if name.startswith(prefix)]


# Feature groups are identified purely by column-name prefix.
GENE = _cols_with_prefix(train_df, 'g-')
CELL = _cols_with_prefix(train_df, 'c-')
CAT = _cols_with_prefix(train_df, 'cp_')
# Every column of target_df except the first one.
TARGET = target_df.columns.tolist()[1:]

for group_name, group in (('GENE', GENE), ('CELL', CELL), ('CAT', CAT), ('TARGET', TARGET)):
    print('Length of {}: {}'.format(group_name, len(group)))

In [None]:
# Count the columns of train_df that contain at least one missing value (0 means no NaNs at all).
train_df.isna().any().sum()

In [None]:
# Count the columns of target_df that contain at least one missing value (0 means no NaNs at all).
target_df.isna().any().sum()

# 2. EDA

## 2.1 training data

In [None]:
# One count plot per categorical column, laid out side by side on a 1x3 grid.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(14, 4))
plt.subplots_adjust(left=-0.1, right=1.1, bottom=-0.1, top=1.1)
for idx in range(len(CAT)):
    sns.countplot(x=CAT[idx], data=train_df, ax=axs[idx])

In [None]:
# Draw a pair plot for N randomly chosen gene columns.
N = 6
# random.sample draws WITHOUT replacement. random.choices could pick the same
# column twice, which gives plot_df duplicate column labels.
col = random.sample(GENE,k=N)
plot_df = train_df.loc[:,col]
sns.pairplot(plot_df,diag_kind='hist')
plt.show()

In [None]:
# Draw a pair plot for N randomly chosen cell columns.
N = 6
# random.sample draws WITHOUT replacement. random.choices could pick the same
# column twice, which gives plot_df duplicate column labels.
col = random.sample(CELL,k=N)
plot_df = train_df.loc[:,col]
sns.pairplot(plot_df,diag_kind='hist')
plt.show()

## 2.2 Target data 

In [None]:
# Count plots (value 0 vs 1) for N randomly chosen target columns on a 2x3 grid.
N=6

# random.sample draws WITHOUT replacement, so the six panels always show six
# different targets (random.choices could repeat one).
col = random.sample(TARGET,k=N)

fig =plt.figure(figsize=(14,8))

for i,v in enumerate(col):
    plt.subplot(2,3,i+1)
    sns.countplot(x=v,data=target_df)


In [None]:
# Number of positive labels per target column (first column of target_df excluded),
# sorted ascending so that tail(20) holds the 20 most frequent targets.
positives_per_target = target_df.iloc[:, 1:].sum(axis=0)
plt_df = pd.DataFrame(positives_per_target.sort_values())
plt_df.tail(20).plot(kind='barh')

In [None]:
# Average number of positive labels per target column. Dividing by len(TARGET)
# instead of a hard-coded 206 keeps the figure right if the target list changes.
target_df[TARGET].sum(axis=0).sum() / len(TARGET)

## 2.3 Merge Data

In [None]:
# Remove the rows whose cp_type is 'ctl_vehicle' and renumber the remaining rows from 0.
is_control = train_df['cp_type'] == 'ctl_vehicle'
drop_inx = train_df.index[is_control].tolist()
train_df = train_df.drop(index=drop_inx).reset_index(drop=True)

In [None]:
train_df.info()

In [None]:
# drop ctl_vehicle
# Keep only the target rows whose sig_id is still present in train_df. Matching on
# sig_id does not depend on train_df and target_df sharing the same row order,
# which dropping by train_df's positional index silently assumed.
target_df = target_df[target_df['sig_id'].isin(train_df['sig_id'])].reset_index(drop=True)

In [None]:
#test_df = test_df.loc[test_df['cp_type']!='ctl_vehicle',:]

### 2.3.1 Start Merging

In [None]:
# Attach the target columns to the feature rows by matching on sig_id (inner join).
targets_by_id = target_df.set_index('sig_id')
data_df = train_df.join(targets_by_id, on='sig_id', how='inner')
data_df.head()

In [None]:
# Drop the two columns that are not model inputs.
data_df = data_df.drop(columns=['sig_id', 'cp_type'])
data_df.head()

In [None]:
# Index labels of the test rows whose cp_type is 'ctl_vehicle'; the submission cell
# sets the predictions of these rows to zero.
no_moa_index = test_df.index[test_df['cp_type'] == 'ctl_vehicle']

## 2.4 PCA 

In [None]:
# Quantile-transform the inputs to a normal distribution, then keep the first N
# principal components.
# NOTE(review): this module-level pipeline appears unused in the visible cells
# (get_comp_num builds its own local pipeline) - confirm before relying on it.
N =500
pca_pipe = Pipeline([
    ('qt',QuantileTransformer(output_distribution='normal')),
    ('pca',PCA(n_components=N))
])

In [None]:
def get_comp_num(data,n,threshold):
    '''Return how many leading PCA components are needed to exceed ``threshold``.

    ``data`` is quantile-transformed to a normal distribution and a PCA with
    ``n`` components is fitted on the result.

    Parameters
    ----------
    data : array-like or DataFrame
        Feature matrix to analyse.
    n : int
        Number of principal components to fit.
    threshold : float
        Cumulative explained-variance ratio that must be strictly exceeded.

    Returns
    -------
    int
        Smallest component count whose cumulative explained-variance ratio is
        greater than ``threshold``.

    Raises
    ------
    ValueError
        If even all ``n`` components do not exceed ``threshold``.
    '''

    pca_pipe = Pipeline([
        ('qt',QuantileTransformer(output_distribution='normal')),
        ('pca',PCA(n_components=n))
    ])

    # Only the fitted PCA statistics are needed, not the projected data.
    pca_pipe.fit(data)
    ratio = pca_pipe.named_steps['pca'].explained_variance_ratio_.cumsum()

    # Positions where the cumulative ratio is already above the threshold.
    hits = np.flatnonzero(ratio > threshold)
    if hits.size == 0:
        # Previously this path hit `return d` with `d` never assigned.
        raise ValueError(
            'cumulative explained variance of {} components never exceeds {}'.format(n, threshold)
        )
    return int(hits[0]) + 1

In [None]:
# Number of leading principal components (out of 700 fitted) the gene columns need
# to exceed 85% cumulative explained variance.
d_GENE = get_comp_num(data_df.loc[:,GENE],n=700,threshold=0.85)
d_GENE

In [None]:
# Number of leading principal components (out of 90 fitted) the cell columns need
# to exceed 85% cumulative explained variance.
d_CELL = get_comp_num(data_df.loc[:,CELL],n=90,threshold=0.85)

d_CELL

# 3. Preprocess

In [None]:
def _rank_gauss():
    """Return a new QuantileTransformer that maps features to a normal distribution."""
    return QuantileTransformer(output_distribution='normal')


def _pca_union_variance(n_components):
    """Build the three chained transformers used for one numeric feature group.

    Returns ``(pca_branch, union, filtered)`` where ``pca_branch`` is
    quantile-transform -> PCA, ``union`` concatenates the PCA components with
    the quantile-transformed features themselves, and ``filtered`` applies
    ``VarianceThreshold(0.8)`` to that union.
    """
    pca_branch = Pipeline([
        ('qt', _rank_gauss()),
        ('pca', PCA(n_components=n_components)),
    ])
    union = FeatureUnion([
        ('pca', pca_branch),
        ('qt', _rank_gauss()),
    ])
    filtered = Pipeline([
        ('union', union),
        ('var', VarianceThreshold(0.8)),
    ])
    return pca_branch, union, filtered


# Gene and cell columns get the same treatment; only the PCA width differs.
gene_pipe, gene_union, gene_variance = _pca_union_variance(d_GENE)
cell_pipe, cell_union, cell_variance = _pca_union_variance(d_CELL)

# Categorical columns are one-hot encoded.
cat_pipe = Pipeline([
    ('ohc', OneHotEncoder()),
])

# CAT is redefined here without cp_type, which was dropped from data_df.
CAT = ['cp_time','cp_dose']
pre_pipe = ColumnTransformer([
    ('GENE', gene_variance, GENE),
    ('CELL', cell_variance, CELL),
    ('CAT', cat_pipe, CAT),
])

In [None]:
# Fit the preprocessing once on all training rows to inspect the resulting feature
# matrix; the training loop refits pre_pipe on each fold's training rows.
ff = pre_pipe.fit_transform(data_df.loc[:,CAT+GENE+CELL])

In [None]:
ff.shape

## 4.2 Custom metric: clipped log loss

In [None]:
from tensorflow.keras import backend 
# Clipping bounds for predictions: the values that labels 0 and 1 take after
# label smoothing with rate `label_smooth`.
label_smooth = 0.001
p_min = 0*(1-label_smooth)+label_smooth/2
p_max = 1*(1-label_smooth)+label_smooth/2
# p_min =0.001
# p_max = 0.999

def logloss(y_true, y_pred):
    """Mean binary cross-entropy with predictions clipped to [p_min, p_max].

    Clipping keeps both logarithms finite when a prediction is exactly 0 or 1.
    The mean is taken over every element of the inputs.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

### 4.3.1 initial bias 

In [None]:
# Per-target log-odds of a positive label, log(pos / neg), used to initialise the
# bias of the output layer.
total = len(data_df)
pos = data_df[TARGET].sum(axis=0).to_numpy()
neg = total - pos

initial_bias = np.log(pos/neg)

init = tf.constant_initializer(initial_bias)

## 4.1 build NN

In [None]:
# All three callbacks watch 'val_logloss', the validation value of the `logloss` metric.

# Multiply the learning rate by 0.1 after 5 epochs without improvement, down to 1e-7.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_logloss',
    factor=0.1,
    patience=5,
    mode='min',
    min_lr=1e-7,
    verbose=0,
)

# Save the model to disk whenever the monitored value improves.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    "my_keras_model.h5",
    monitor='val_logloss',
    save_best_only=True,
)

# Stop after 7 epochs without an improvement of at least 1e-5 and restore the best weights.
early_st = keras.callbacks.EarlyStopping(
    monitor='val_logloss',
    min_delta=1E-5,
    patience=7,
    mode='min',
    baseline=None,
    restore_best_weights=True,
    verbose=0,
)

callbacks = [reduce_lr, checkpoint_cb, early_st]

In [None]:
import tensorflow_addons as tfa

def build_NN(n_hiddens,n_neurons,drop_rate=0.2,inputshape=447,smooth_rate=0.001,lr=3e-4):
    """Build and compile a fully connected multi-label classifier.

    The network is ``n_hiddens`` repetitions of
    BatchNormalization -> Dense(relu, he_normal, L2=0.01) -> Dropout,
    followed by a sigmoid output layer with one unit per entry of ``TARGET``.
    The output bias starts from the module-level initializer ``init``.

    Parameters
    ----------
    n_hiddens : int
        Number of hidden blocks.
    n_neurons : int
        Units in each hidden Dense layer.
    drop_rate : float
        Dropout rate applied after each hidden Dense layer.
    inputshape : int
        Number of input features.
    smooth_rate : float
        Label-smoothing factor of the binary cross-entropy loss.
    lr : float
        Learning rate of the inner Adam optimizer.

    Returns
    -------
    keras.models.Sequential
        The compiled model, reporting the clipped ``logloss`` metric.
    """
    # Lookahead wraps Adam and synchronises its slow weights every 10 steps.
    adam = tfa.optimizers.Lookahead(tf.optimizers.Adam(learning_rate=lr),sync_period = 10)

    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape=(inputshape,)))

    for _ in range(n_hiddens):
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dense(n_neurons,activation="relu",
                                     kernel_initializer="he_normal",
                                     kernel_regularizer=keras.regularizers.l2(0.01)))
        model.add(keras.layers.Dropout(rate = drop_rate))

    # One sigmoid unit per target. The width follows TARGET (instead of a literal
    # 206) so it stays consistent with the labels and with `init`, which holds one
    # bias value per target.
    model.add(keras.layers.Dense(len(TARGET),activation='sigmoid',bias_initializer=init))

    # `metrics` is documented as a list. The metric keeps the function's name, so
    # callbacks that monitor 'val_logloss' are unaffected.
    model.compile(loss = keras.losses.BinaryCrossentropy(label_smoothing=smooth_rate),
                  optimizer=adam,metrics=[logloss])

    return model

In [None]:
data_df.shape

In [None]:
21948/6

In [None]:
# Seed-averaged, multilabel-stratified K-fold training.
n_folds = 6
n_seeds = 2
n_models = n_folds*n_seeds      # total number of models that are trained
features = CAT+GENE+CELL

results = []                    # test predictions of each model, pre-divided by n_models
oof = tf.constant(0.0)          # running mean of the per-fold validation log loss
# Out-of-fold predictions averaged over the seeds. Row i corresponds to row i of
# data_df (its index runs 0..n-1, which the positional fold indices rely on).
pred = np.zeros((data_df.shape[0],len(TARGET)),dtype=np.float32)

np.random.seed(SEED)
seeds = np.random.randint(0,100,size=n_seeds)

# NOTE(review): the same `callbacks` list is passed to every fit below - confirm
# each callback resets its internal state between fits.
for seed in seeds:
    seed = int(seed)
    # Seed TensorFlow too, so weight initialisation and dropout depend on the seed.
    tf.random.set_seed(seed)
    mskf = MultilabelStratifiedKFold(n_splits=n_folds,shuffle=True,random_state=seed)
    # Split with the per-seed splitter. Iterating a splitter built with the fixed
    # SEED would give every seed exactly the same folds.
    for n,(train, test) in enumerate(mskf.split(data_df.loc[:,features],data_df[TARGET])):
        print('fold{}'.format(n))
        # Fit the preprocessing on the training fold only, then apply it to the
        # validation fold and to the test set.
        train_x = pre_pipe.fit_transform(data_df.loc[train,features])
        val_x = pre_pipe.transform(data_df.loc[test,features])
        test_x = pre_pipe.transform(test_df.loc[:,features])
        train_y = data_df.loc[train,TARGET]
        val_y = data_df.loc[test,TARGET]

        model = build_NN(n_hiddens=3,
                         n_neurons=512,
                         lr=0.0003,
                         drop_rate=0.3,
                         inputshape=train_x.shape[1])

        hist = model.fit(train_x,train_y,
                         batch_size=128,
                         epochs=200,
                         validation_data = (val_x,val_y),
                         callbacks =callbacks,
                         verbose=1)

        pred_y = model.predict(val_x)
        oof += logloss(tf.constant(val_y.values,dtype=tf.float32),
                       tf.constant(pred_y,dtype=tf.float32))/n_models
        print(oof)
        # Each row is validated exactly once per seed, so write its prediction to
        # its own row and average over seeds. Adding the validation blocks of
        # different folds element-wise would mix predictions of unrelated rows.
        pred[test] += pred_y/n_seeds
        results.append(model.predict(test_x)/n_models)
        
        

In [None]:
0.018790415

In [None]:
# Every entry of `results` was already divided by the number of models, so their
# sum is the ensemble average.
a = np.zeros_like(results[0])
for partial_pred in results:
    np.add(a, partial_pred, out=a)

#a= tf.clip_by_value(i,p_min,p_max).numpy()

# Put sig_id in front of the prediction columns.
submission_df = pd.concat(
    [test_df[['sig_id']], pd.DataFrame(a, columns=TARGET)],
    axis=1,
)
# Rows listed in no_moa_index get all-zero predictions.
submission_df.iloc[no_moa_index, 1:] = 0.0
submission_df.to_csv('submission.csv', index=False)

In [None]:
submission_df

In [None]:
# pred = pd.DataFrame(pred,columns=TARGET)

In [None]:
# bce = tf.keras.losses.BinaryCrossentropy()
# ll=[]
# for i in range(len(TARGET)):
#     ll.append(bce(val_y.iloc[:,[i]],pred_y.iloc[:,[i]]))

In [None]:
# ll = np.array(ll)

# ll = pd.DataFrame(ll,index=TARGET,columns=['bce'])

# ll['bce'].sort_values(ascending=False).head(30).plot(kind='bar')

In [None]:
# import pickle

In [None]:
# with open('loss_feature.pkl','wb') as f:
#     pickle.dump(ll,f)