In [1]:
%%writefile inference.py
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    input_paths = [os.path.join(root, name) for name in names]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Writing inference.py


In [2]:
%%writefile -a inference.py


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,losses
from keras.models import Sequential
from tqdm import tqdm
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.metrics import log_loss





from sklearn import preprocessing
from sklearn.decomposition import PCA

# Report the Keras and TensorFlow versions, one per line.
print(keras.__version__, tf.__version__, sep='\n')

Appending to inference.py


In [3]:
%%writefile -a inference.py
# Load the training features, the scored training targets and the test features.
train_features, train_targets, test_features = map(pd.read_csv, (
    '../input/lish-moa/train_features.csv',
    '../input/lish-moa/train_targets_scored.csv',
    '../input/lish-moa/test_features.csv',
))

Appending to inference.py


# Preprocess data

In [4]:
%%writefile -a inference.py
def preprocess_features(df):
    """Return an encoded, pruned copy of a raw feature table.

    The input frame is left untouched. In the returned frame:

    * ``cp_dose`` is mapped 'D1' -> 0, 'D2' -> 1,
    * ``cp_time`` is mapped 24 -> 0, 48 -> 1, 72 -> 2,
    * ``sig_id`` and ``cp_type`` are removed,
    * a fixed list of ten ``g-*`` columns is removed.

    Values absent from a mapping become NaN (pandas ``Series.map`` semantics).
    A ``KeyError`` is raised if any referenced column is missing.
    """
    dose_codes = {'D1': 0, 'D2': 1}
    time_codes = {24: 0, 48: 1, 72: 2}
    excluded_genes = ['g-219', 'g-307', 'g-104', 'g-550', 'g-331', 'g-15', 'g-481', 'g-435', 'g-536', 'g-661']

    # assign() builds a new frame, so the caller's data is never modified.
    encoded = df.assign(
        cp_dose=df['cp_dose'].map(dose_codes),
        cp_time=df['cp_time'].map(time_codes),
    )
    without_ids = encoded.drop(columns=['sig_id', 'cp_type'])
    return without_ids.drop(excluded_genes, axis=1)
    
# Run both feature tables through the shared preprocessing helper.
test_features = preprocess_features(test_features)
train_features = preprocess_features(train_features)
# Remove the sig_id column from the targets table.
train_targets = train_targets.drop(columns=['sig_id'])

Appending to inference.py


In [5]:
%%writefile -a inference.py
def create_training_graphs(training_history):
    """Plot training vs. validation curves for loss and accuracy side by side.

    Args:
        training_history: object whose ``history`` attribute is a mapping
            containing the series 'loss', 'val_loss', 'accuracy' and
            'val_accuracy' (presumably the History returned by ``model.fit``).

    Draws onto a new 16x6 matplotlib figure with two panels; returns nothing.
    """
    # (training series key, validation series key, y-label / title) per panel.
    panels = (
        ('loss', 'val_loss', 'Loss'),
        ('accuracy', 'val_accuracy', 'Accuracy'),
    )

    plt.figure(figsize=(16, 6))
    for position, (train_key, val_key, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(training_history.history[train_key])
        plt.plot(training_history.history[val_key])
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.legend(['Training', 'Validation'])
        plt.title(label)

Appending to inference.py


In [6]:
%%writefile -a inference.py

def create_model(input_size):
    """Build and compile the fully connected multi-label classifier.

    Architecture: Dense(2048, relu) -> Dropout(0.5) -> BatchNorm ->
    Dense(1024, relu) -> Dropout(0.5) -> BatchNorm -> Dense(206, sigmoid).

    Args:
        input_size: number of input features per sample.

    Returns:
        A compiled ``keras.Sequential`` model using an Adam optimizer wrapped
        in Lookahead (sync_period=10), binary cross-entropy with label
        smoothing 0.001, and the 'accuracy' metric.
    """
    network = keras.Sequential()

    # First hidden block; this layer also fixes the expected input width.
    network.add(Dense(2048, input_shape=(input_size,), activation="relu", name="input"))
    network.add(Dropout(rate=0.5))
    network.add(BatchNormalization())

    # Second hidden block.
    network.add(Dense(1024, activation="relu", name="dense1"))
    network.add(Dropout(rate=0.5))
    network.add(BatchNormalization())

    # One independent sigmoid output per target column.
    network.add(Dense(206, activation="sigmoid", name="output"))

    lookahead_adam = tfa.optimizers.Lookahead(tf.optimizers.Adam(), sync_period=10)
    smoothed_bce = losses.BinaryCrossentropy(label_smoothing=0.001)
    network.compile(optimizer=lookahead_adam, loss=smoothed_bce, metrics=['accuracy'])

    return network


#model.compile(loss='binary_crossentropy', optimizer=optimizer, label_smoothing=0.001, metrics=['accuracy'])


Appending to inference.py


# Train

In [7]:
%%writefile -a inference.py

def log_loss_metric(y_true, y_pred):
    """Return the mean column-wise log loss between two DataFrames.

    The columns scored are those of the module-level ``train_targets`` frame;
    both arguments must contain every one of them. Each column is scored with
    sklearn's ``log_loss`` using ``labels=[0, 1]`` so that columns containing
    a single class can still be evaluated.

    Args:
        y_true: DataFrame of ground-truth labels.
        y_pred: DataFrame of predicted probabilities (cast to float per column).

    Returns:
        The average of the per-column log losses.
    """
    per_target_losses = [
        log_loss(y_true.loc[:, column], y_pred.loc[:, column].astype(float), labels=[0, 1])
        for column in train_targets.columns
    ]
    return np.mean(per_target_losses)

Appending to inference.py


In [8]:
%%writefile -a inference.py

#early_stopping = tf.keras.callbacks.EarlyStopping(
    #monitor="val_loss",
    #min_delta=0,
    #patience=10,
    #verbose=1,
    #mode="auto",
    #baseline=None,
    #restore_best_weights=True)


# Stop a fit once val_loss has gone 10 epochs without improving, and restore
# the weights from the best epoch.
early_stopping = EarlyStopping(monitor="val_loss", min_delta=0, patience=10, verbose=1, mode="auto", baseline=None, restore_best_weights=True)
# Multiply the learning rate by 0.1 after 3 epochs in which val_loss did not
# improve by more than 1e-4. `min_delta` is the supported name for this
# threshold; the previous `epsilon` keyword is deprecated in Keras.
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')

Appending to inference.py


In [9]:
%%writefile -a inference.py
#model = create_model(len(train_features.columns))
#history = model.fit(train_features, train_targets, batch_size=64, epochs=500, validation_split=0.1, callbacks=[early_stopping])

# One Keras History per individual fit (SEEDS * SPLITS entries in total).
histories = []

target_columns = train_targets.columns

# Out-of-fold prediction accumulator. It is allocated as float up front:
# starting from an integer copy of the targets and adding float predictions
# relies on implicit dtype upcasting, which pandas has deprecated.
val_predictions = pd.DataFrame(0.0, index=train_targets.index, columns=target_columns)

# Test prediction accumulator; keeps every non-target column of the sample
# submission and starts all target columns at 0.0.
test_predictions = pd.read_csv('../input/lish-moa/sample_submission.csv')
test_predictions.loc[:, target_columns] = 0.0

SEEDS = 3
SPLITS = 5
MAX_EPOCHS = 500
BATCH_SIZE = 64

progress_bar = tqdm(range(SEEDS))
for seed in progress_bar:
    # Seed TensorFlow as well as the splitter so that weight initialisation,
    # dropout and batch shuffling are repeatable for a given seed.
    tf.random.set_seed(seed)
    mskf = MultilabelStratifiedKFold(n_splits=SPLITS, random_state=seed, shuffle=True)
    for fold_idx, (train_idx, val_idx) in enumerate(mskf.split(X=train_features, y=train_targets)):
        # split() yields positional row numbers, so select with .iloc; .loc
        # would only be equivalent while the frames keep a default RangeIndex.
        X_train, X_val = train_features.iloc[train_idx], train_features.iloc[val_idx]
        y_train, y_val = train_targets.iloc[train_idx], train_targets.iloc[val_idx]

        model = create_model(len(train_features.columns))
        history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr_loss])
        histories.append(history)

        # Each row is in the validation fold exactly once per seed, so dividing
        # by SEEDS turns the running sum into an average over seeds.
        val_predictions.iloc[val_idx] += model.predict(X_val) / SEEDS
        # Every one of the SPLITS * SEEDS models scores the full test set.
        test_predictions.loc[:, target_columns] += model.predict(test_features) / (SPLITS * SEEDS)

Appending to inference.py


In [10]:
#%%writefile -a inference.py
#create_training_graphs(history)

# Predictions

In [11]:
#%%writefile -a inference.py
#predictions = model.predict(test_features)

# Create submission

In [12]:
#import joblib
#loaded_model = joblib.load('../input/marge-keras-v2.sav')

In [13]:
%%writefile -a inference.py
#test_features1 = pd.read_csv('../input/lish-moa/test_features.csv')
#submission = pd.DataFrame({'sig_id': test_features1['sig_id'].values})

# Reload the raw feature tables: cp_type was dropped from the preprocessed
# frames but is needed here to locate the control-vehicle rows.
train_features1 = pd.read_csv('../input/lish-moa/train_features.csv')
test_features1 = pd.read_csv('../input/lish-moa/test_features.csv')
is_train_control = train_features1['cp_type'] == 'ctl_vehicle'
is_test_control = test_features1['cp_type'] == 'ctl_vehicle'

print(f'NN OOF before postprocessing: {log_loss_metric(train_targets, val_predictions):.6f}')
# Force every target prediction to 0 on control-vehicle rows, then re-score.
val_predictions.loc[is_train_control, train_targets.columns] = 0
test_predictions.loc[is_test_control, train_targets.columns] = 0
print(f'NN OOF after postprocessing: {log_loss_metric(train_targets, val_predictions):.6f}')

Appending to inference.py


In [14]:
#%%writefile -a inference.py
#for col in train_targets.columns.to_list():
    #submission[col] = 0

#submission.loc[:, train_targets.columns] = predictions

In [15]:
#%%writefile inference.py
# Set control drug prediction to 0
#submission.loc[submission['sig_id'].isin(test_features1.loc[test_features1['cp_type'] == 'ctl_vehicle', 'sig_id']), train_targets.columns] = 0

In [16]:
%%writefile -a inference.py

#submission.to_csv('submission.csv', index=False)

# Persist the out-of-fold and test predictions as CSV files without the index.
for frame, csv_name in (
    (val_predictions, 'val-submission.csv'),
    (test_predictions, 'submission.csv'),
):
    frame.to_csv(csv_name, index=False)

Appending to inference.py


In [17]:
! python inference.py

/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/marge-keras-v2/__results__.html
/kaggle/input/marge-keras-v2/submission.csv
/kaggle/input/marge-keras-v2/custom.css
/kaggle/input/marge-keras-v2/__notebook__.ipynb
/kaggle/input/marge-keras-v2/__output__.json
/kaggle/input/marge-keras-v2/__results___files/__results___14_0.png
/kaggle/input/iterative-stratification/iterative-stratification-master/.gitignore
/kaggle/input/iterative-stratification/iterative-stratification-master/LICENSE
/kaggle/input/iterative-stratification/iterative-stratification-master/setup.py
/kaggle/input/iterative-stratification/iterative-stratification-master/setup.cfg
/kaggle/input/iterative-stratification/iterative-stratification-master/.travis.yml
/kaggle/input/iterativ