In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
# Directory prefix of the competition CSV files; it is concatenated with file
# names below, so the trailing slash is required.
PATH = '/kaggle/input/lish-moa/'

In [None]:
# Load the train/test feature tables, the scored training targets and the
# submission template from the competition directory.
train_df, test_df, target_df, sub_df = (
    pd.read_csv(PATH + csv_name)
    for csv_name in (
        'train_features.csv',
        'test_features.csv',
        'train_targets_scored.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training features.
train_df.head()

**Признаки**
- `sig_id` - уникальный идентификатор образца
- признаки с префиксом `g-` являются признаками экспрессии генов, их 772 (от `g-0` до `g-771`).
- признаки с префиксом `c-` являются характеристиками жизнеспособности клеток, их 100 (от `c-0` до `c-99`).
- `cp_type` - категориальный признак с двумя категориями, который указывает, чем обработан образец: соединением (`trt_cp`) или контрольным возмущением (`ctl_vehicle`)
- `cp_time` - это категориальный признак, который указывает продолжительность лечения (24, 48 или 72 часа)
- `cp_dose` - категориальный признак с двумя категориями, который указывает, что доза низкая или высокая (`D1` или `D2`)

In [None]:
# Remove the sample identifier so it is not passed to the model as a feature.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
train_df = train_df.drop(columns=['sig_id'], errors='ignore')
test_df = test_df.drop(columns=['sig_id'], errors='ignore')

In [None]:
# Preview the first rows of the scored training targets.
target_df.head()

In [None]:
# Keep only the label columns. Rebinding with errors='ignore' makes the cell
# idempotent (re-running it no longer raises KeyError for the missing column).
target_df = target_df.drop(columns=['sig_id'], errors='ignore')

In [None]:
# Row-wise sum of the target columns for 20 randomly drawn samples
# (presumably the number of positive labels per sample - assumes 0/1 targets).
target_df.sum(axis=1).sample(20)

**Предобработка**

In [None]:
# Remember where the training rows end, then stack train and test so both go
# through the same preprocessing; the separate frames are released afterwards.
idx = train_df.shape[0]
data_df = pd.concat((train_df, test_df), axis='index')
del train_df
del test_df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns over the combined train+test frame.
# The same encoder object is refitted for each column in turn.
category_cols = ['cp_dose', 'cp_type']
enc = LabelEncoder()

for col_name in category_cols:
    encoded = enc.fit_transform(data_df[col_name])
    data_df[col_name] = encoded

In [None]:
# Undo the concatenation: the first `idx` rows are the training samples, the
# remainder is the test set. The targets frame is used unchanged as labels.
X_train, X_test = data_df.iloc[:idx], data_df.iloc[idx:]
y_train = target_df

**Построение модели**

In [None]:
 def create_model(n_features=875, n_targets=206, learning_rate=2.75e-5):
    """Build and compile the fully connected multi-label classifier.

    Architecture: batch normalisation on the inputs, two 4096-unit ReLU layers
    (each followed by batch normalisation and 20% dropout) and a sigmoid output
    layer with one unit per target.

    Args:
        n_features: Number of input columns (default 875, the value that was
            previously hard-coded).
        n_targets: Number of output units (default 206, the value that was
            previously hard-coded).
        learning_rate: Step size passed to the Adam optimizer.

    Returns:
        A compiled `tf.keras.Sequential` model trained with binary
        cross-entropy.
    """
    model = tf.keras.Sequential([
        # The shape is given explicitly as a tuple via the documented keyword.
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(4096, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(4096, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(n_targets, activation="sigmoid")
        ])
    # `learning_rate` is the supported keyword; the legacy `lr` alias is
    # deprecated and rejected by recent Keras releases.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=["accuracy"])
    return model

In [None]:
from sklearn.model_selection import KFold

columns = target_df.columns
N_SPLITS = 5  # number of cross-validation folds; also the test-averaging divisor

# Prediction accumulators. They start as floats so that writing probabilities
# into them never has to upcast integer columns.
test_preds = sub_df.copy()
test_preds[columns] = 0.0

val_preds = pd.DataFrame(0.0, index=target_df.index, columns=columns)

kf = KFold(n_splits=N_SPLITS, random_state=42, shuffle=True)

for ix, (train_idx, val_idx) in enumerate(kf.split(X_train)):

    print(f'Fold {ix}')

    # A fresh model per fold so no weights leak between folds.
    model = create_model()
    # `min_delta` is the current name of the threshold formerly called `epsilon`.
    reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')

    X_train_cv, X_val_cv = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model.fit(X_train_cv, y_train_cv,
            validation_data=(X_val_cv, y_val_cv),
            epochs=30, batch_size=128,
            callbacks=[reduce_lr_loss], verbose=2)

    print("Val predict")
    # Out-of-fold predictions: every training row belongs to exactly one
    # validation fold, so each row is written once. The label-based lookup
    # relies on val_preds having a default 0..n-1 index.
    val_preds.loc[val_idx, columns] = model.predict(X_val_cv)

    print("Test predict")
    # Test predictions are summed over the folds and averaged after the loop.
    test_preds.loc[:, columns] += model.predict(X_test)

    print('-'*20)

# Only the test predictions are an N_SPLITS-fold sum. The out-of-fold
# predictions were each written once and must NOT be divided, otherwise every
# OOF probability (and the OOF metric) is shrunk by a factor of N_SPLITS.
test_preds.loc[:, columns] /= N_SPLITS

In [None]:
from sklearn.metrics import log_loss
def metric(y_true, y_pred):
    """Return the log loss averaged over the target columns.

    For every name in the module-level `columns`, the log loss between the
    matching column of `y_true` and the float-cast column of `y_pred` is
    computed with the label set fixed to [0, 1]; the mean of these per-column
    values is returned.
    """
    per_column_losses = [
        log_loss(y_true.loc[:, name], y_pred.loc[:, name].astype(float), labels=[0,1])
        for name in columns
    ]
    return np.mean(per_column_losses)

In [None]:
# Score the collected validation predictions against the true targets.
print(f"OOF Metric: {metric(target_df, val_preds)}")

In [None]:
# `cp_type` inside X_test has been label-encoded to integers, so comparing it
# with the string 'ctl_vehicle' never matches. Read the raw column again and
# build a positional boolean mask from it.
raw_cp_type = pd.read_csv(PATH + 'test_features.csv', usecols=['cp_type'])['cp_type']
mask = (raw_cp_type == 'ctl_vehicle').to_numpy()
assert len(mask) == len(test_preds), "test features and submission rows must line up"

# Zero the predictions of control-vehicle rows. Only the target columns are
# touched, so the `sig_id` column of the submission stays intact.
test_preds.loc[mask, columns] = 0.0

In [None]:
# Write the averaged test predictions to the working directory, without the
# DataFrame index column.
test_preds.to_csv('submission.csv', index=False)