In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# Running this cell (by clicking run or pressing Shift+Enter) lists all files under the /kaggle/working output directory

import os
# Walk the working directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/working'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.metrics import log_loss
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.losses import BinaryCrossentropy

from tensorflow.keras.backend import clear_session

In [None]:
# Load the train and test feature tables from the lish-moa competition input folder.
train_x = pd.read_csv('../input/lish-moa/train_features.csv')
test_x = pd.read_csv('../input/lish-moa/test_features.csv')

In [None]:
# Load the scored training targets; every column except sig_id is used as a label later on.
train_y = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

In [None]:
# Stack the train and test feature frames row-wise, then rebuild the index.
# reset_index() keeps the previous index as an 'index' column.
data = pd.concat([train_x, test_x], axis=0).reset_index()

In [None]:
# Show how many rows of the combined train+test frame fall under each cp_type value.
data['cp_type'].value_counts()

In [None]:
# Collect feature columns by name prefix: 'c-' columns go to c_cols, 'g-' columns to g_cols.
c_cols = [name for name in data.columns if name.startswith('c-')]
g_cols = [name for name in data.columns if name.startswith('g-')]

In [None]:
#c_cols

In [None]:
# Bounds used by loss_fn to clip predicted probabilities before computing the log loss.
somthing_rate = 1e-15
# Lower clipping bound.
P_MIN = somthing_rate
# Upper clipping bound, symmetric to P_MIN around 0.5.
P_MAX = 1 - P_MIN

def loss_fn(yt, yp):
    """Return the log loss of predictions ``yp`` against targets ``yt``.

    Predictions are first clipped to the [P_MIN, P_MAX] range, then passed to
    sklearn's ``log_loss`` with the label set fixed to [0, 1].
    """
    clipped = np.clip(yp, P_MIN, P_MAX)
    return log_loss(yt, clipped, labels=[0,1])

In [None]:
# Work on a separate frame without the 'sig_id' and 'index' columns; `data` itself is left untouched.
train = data.drop(columns=['sig_id', 'index'])

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoder reused for the cp_dose and cp_type columns (it is refit on each call to fit_transform).
label_encoder = LabelEncoder()
# Encoder used for the cp_time column.
onehot_encoder = OneHotEncoder()

In [None]:
# Inspect the raw cp_dose column before it is label-encoded.
data['cp_dose']

In [None]:
# Label-encode the two categorical columns into single-column integer frames.
# The same encoder object is refit for each column, dose first and type second.
cp_dose_num = pd.DataFrame({'cp_dose_num': label_encoder.fit_transform(train['cp_dose'])})
cp_type_num = pd.DataFrame({'cp_type_num': label_encoder.fit_transform(train['cp_type'])})

In [None]:
# Display the label-encoded cp_dose frame.
cp_dose_num

In [None]:
# Count how many rows received each encoded cp_type value.
cp_type_num.value_counts()

In [None]:
# Put the encoded cp_type / cp_dose columns in front of the remaining features,
# then discard the original categorical columns they replace.
train = pd.concat([cp_type_num, cp_dose_num, train], axis=1).drop(columns=['cp_type', 'cp_dose'])
train.head()

In [None]:
# One-hot encode cp_time: the encoder expects a 2-D input, hence the single-column reshape.
cp_time_values = train['cp_time'].to_numpy().reshape(-1, 1)
cp_time_dense = onehot_encoder.fit_transform(cp_time_values).toarray()

# Name the resulting columns with the 'cp_time_onehot_' prefix.
cp_time_onehot = pd.DataFrame(cp_time_dense).add_prefix('cp_time_onehot_')

In [None]:
# Display the one-hot encoded cp_time frame.
cp_time_onehot

In [None]:
# Prepend the one-hot cp_time columns and drop the original cp_time column.
train = pd.concat([cp_time_onehot, train], axis=1).drop(columns=['cp_time'])
train.head()

# PCA
* gene columns : 772
* cell columns : 100

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Seed both decompositions so repeated runs give the same components.
# scikit-learn may choose a randomized SVD solver for a low component count
# such as the 2-component fit, and that solver is not deterministic without
# a seed. 224 is the same seed value used for the train/validation split.
pca_g = PCA(n_components=2, random_state=224) # 2 components for the g- columns
pca_c = PCA(n_components=100, random_state=224) # 100 components for the c- columns

In [None]:
# NOTE: `train` still holds both the train and the test rows at this point,
# so each PCA is fit on the combined data.
train_pca_g = pca_g.fit_transform(train[g_cols]) # fit PCA on the g- columns and project them
train_pca_c = pca_c.fit_transform(train[c_cols]) # fit PCA on the c- columns and project them

In [None]:
# Compare the shape of the g- columns before and after the PCA projection.
print(train[g_cols].shape, train_pca_g.shape)

In [None]:
# Compare the shape of the c- columns before and after the PCA projection.
print(train[c_cols].shape, train_pca_c.shape)

In [None]:
# Wrap the PCA projections in frames with 'pca_g_' / 'pca_c_' column prefixes
# and append them to the right of the existing features.
pca_g_frame = pd.DataFrame(train_pca_g).add_prefix('pca_g_')
pca_c_frame = pd.DataFrame(train_pca_c).add_prefix('pca_c_')
train = pd.concat([train, pca_g_frame, pca_c_frame], axis=1)

In [None]:
# Row-wise mean of the g- columns and of the c- columns.
# The keys follow the order of the concatenated series, so the g- mean is
# labelled 'g_mean' and the c- mean 'c_mean' (the two labels were swapped before).
means = pd.concat([train[g_cols].mean(axis=1), train[c_cols].mean(axis=1)],
                  keys=['g_mean', 'c_mean'], axis=1)

# Append both mean features to the right of the feature frame.
train = pd.concat([train, means], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation.
# Only the first len(train_x) rows of `train` are training rows; the rest are test rows.
labelled_features = train.iloc[:len(train_x)]
labelled_targets = train_y.drop(['sig_id'], axis=1)
X_train, X_val, y_train, y_val = train_test_split(
    labelled_features,
    labelled_targets,
    test_size=0.2,
    random_state=224,
)

In [None]:
def build_model(hidden_layers, neurons, dropout_rate, input_dim=None):
    """Build and compile the feed-forward classifier.

    Each hidden block is BatchNormalization -> Dropout -> weight-normalized
    Dense with swish activation. The Dense width starts at ``neurons`` and is
    halved (integer division) for every further block. The output layer is a
    weight-normalized Dense with 206 sigmoid units.

    Args:
        hidden_layers: Number of hidden blocks to stack.
        neurons: Width of the first hidden Dense layer.
        dropout_rate: Dropout rate applied in every hidden block.
        input_dim: Number of input features. When omitted, the column count
            of the notebook-level ``train`` frame is used, which matches the
            previous behaviour.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    if input_dim is None:
        # Backward-compatible default: size the input from the global feature frame.
        input_dim = len(train.columns)

    # Sequential model with an explicit 1-D input shape.
    model = tf.keras.Sequential([tf.keras.layers.Input(shape=(input_dim,))])

    for i in range(hidden_layers):
        model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Dropout(dropout_rate))  # regularize every hidden block
        model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(neurons // 2**i, activation='swish')))

    #============ Final Layer =================
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(206, activation="sigmoid")))

    # `learning_rate` replaces the legacy `lr` alias; the value is unchanged.
    model.compile(optimizer=tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5, clipvalue=756),
                  loss=BinaryCrossentropy(label_smoothing=0.001))

    return model

In [None]:
# Build the network: 2 hidden layers, 734 units in the first one, dropout rate ~0.50.
# NOTE(review): these look like tuned hyperparameters — their origin is not shown here.
best_model = build_model(2, 734, 0.5012546298076606)

In [None]:
# Multiply the learning rate by 0.2 after 4 epochs without val_loss improvement (floor 1e-6).
reduce_lr_loss = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=4,
    min_lr=1e-6,
    mode='auto',
    verbose=1,
)

# Stop after 10 epochs without val_loss improvement and restore the best weights seen.
early = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Save only the weights, and only when val_loss improves.
checkpoint_path = 'model.weights'
cb_checkpt = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Train with the held-out split as validation data.
history = best_model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[early, reduce_lr_loss, cb_checkpt],
)


In [None]:
# Reload the best checkpoint written during training. Reusing checkpoint_path
# keeps this in sync with the ModelCheckpoint callback instead of repeating the literal.
best_model.load_weights(checkpoint_path)

In [None]:
# Predict on the test rows, i.e. everything after the first len(train_x) rows of the combined frame.
pred = best_model.predict(train.iloc[len(train_x):])

In [None]:
# Name the prediction columns after the target columns (all of train_y except its first column),
# then put the test sig_id column in front.
target_names = train_y.columns[1:]
pred_df = pd.DataFrame(pred, columns=target_names)
submmission_df = pd.concat([test_x['sig_id'], pred_df], axis=1)

In [None]:
submmission_df.to_csv('submission.csv', index=False) # export the submission as a CSV file, without the index

In [None]:
# Read the file back to inspect what was written.
pd.read_csv('submission.csv')