In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = (os.path.join(dirname, filename) for filename in filenames)
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
class NumericalFeatureEncoder:
    """
    Class encodes numerical features using tensorflow feature_columns

    Args:
        features: iterable of feature (column) names. ``None`` is treated as
            "no features" so that a caller forwarding an unset default
            (e.g. ``FeatureTransformer``) does not crash in ``encode``.
    """

    def __init__(self, features):
        self.features = features

    def encode(self, X=None):
        """
        Create one Keras input and one numeric feature column per feature.

        Args:
            X: unused by this encoder; accepted so the call signature matches
                ``CategoricalFeatureEncoder.encode``.

        Returns:
            tuple: ``(numerical_inputs, feature_columns)`` where
            ``numerical_inputs`` maps each feature name to a float32
            ``tf.keras.Input`` of shape ``(1,)`` and ``feature_columns`` is the
            list of ``tf.feature_column.numeric_column`` objects, one per
            distinct feature name.
        """
        # Explicit None check (rather than `or`) so array-like feature
        # containers with ambiguous truthiness still work.
        features = () if self.features is None else self.features

        numerical_inputs, feature_encoders = {}, {}

        for feature in features:
            numerical_inputs[feature] = tf.keras.Input(shape=(1,), name=feature, dtype=tf.float32)
            feature_encoders[feature] = tf.feature_column.numeric_column(feature)
        return numerical_inputs, list(feature_encoders.values())


class CategoricalFeatureEncoder:
    """
    Class encodes Categorical features using tensorflow feature_columns

    Args:
        features: iterable of categorical feature (column) names. The default
            ``None`` is treated as "no features".
    """
    def __init__(self, features=None):
        self.features = features

    def encode(self, X=None):
        """
        Set inputs and categorical vocab list

        Args:
            X: column-indexable data (``X[feature]`` must expose
                ``.unique().tolist()``) from which each feature's vocabulary
                is taken. Required whenever at least one feature is
                configured.

        Returns:
            tuple: ``(categorical_inputs, feature_columns)`` where
            ``categorical_inputs`` maps each feature name to a string
            ``tf.keras.Input`` of shape ``(1,)`` and ``feature_columns`` is the
            list of indicator columns built over each feature's vocabulary.

        Raises:
            TypeError: if features are configured but ``X`` is ``None``.
        """
        features = [] if self.features is None else list(self.features)

        # Fail early with a readable message instead of the opaque
        # "'NoneType' object is not subscriptable" raised by X[feature].
        if features and X is None:
            raise TypeError("X must be provided to build the categorical vocabularies")

        feature_vocab_list, categorical_inputs, feature_encoders = {}, {}, {}

        for feature in features:
            categorical_inputs[feature] = tf.keras.Input(shape=(1,), name=feature, dtype=tf.string)
            # The vocabulary is every distinct value observed for this feature in X.
            feature_vocab_list[feature] = tf.feature_column.categorical_column_with_vocabulary_list(
                feature, X[feature].unique().tolist())
            feature_encoders[feature] = tf.feature_column.indicator_column(feature_vocab_list[feature])
        return categorical_inputs, list(feature_encoders.values())

In [None]:
class FeatureTransformer:
    """
    Builds the Keras inputs and feature columns consumed by the Wide and Deep network.

    The three feature-name collections (gene, cell, categorical) are stored
    as given and only used when ``transform`` is called.
    """
    def __init__(self, gene_features=None, cell_features=None, categorical_features=None):
        self.gene_features = gene_features
        self.cell_features = cell_features
        self.categorical_features = categorical_features

    def transform(self, X):
        """
        Encode every feature group and merge their Keras inputs.

        Args:
            X: data forwarded to each encoder's ``encode`` call.

        Returns:
            tuple: ``(feature_layer_inputs, categorical_feature_encoders,
            gene_feature_encoders, cell_feature_encoders)``; the first item is
            a single dict combining the inputs of all three groups.
        """
        gene_encoder = NumericalFeatureEncoder(self.gene_features)
        gene_inputs, gene_columns = gene_encoder.encode(X)

        cell_encoder = NumericalFeatureEncoder(self.cell_features)
        cell_inputs, cell_columns = cell_encoder.encode(X)

        categorical_encoder = CategoricalFeatureEncoder(self.categorical_features)
        categorical_inputs, categorical_columns = categorical_encoder.encode(X)

        # Merge in gene -> cell -> categorical order; later groups win on a name clash.
        merged_inputs = {}
        for group_inputs in (gene_inputs, cell_inputs, categorical_inputs):
            merged_inputs.update(group_inputs)

        return merged_inputs, categorical_columns, gene_columns, cell_columns

In [None]:
def _dense_dropout_stack(tensor, layer_sizes, dropout_rate=0.25):
    """Apply a ReLU Dense layer followed by Dropout for every size in ``layer_sizes``."""
    for num_nodes in layer_sizes:
        tensor = tf.keras.layers.Dense(num_nodes, activation='relu', kernel_initializer='he_normal')(tensor)
        tensor = tf.keras.layers.Dropout(dropout_rate)(tensor)
    return tensor


def wide_and_deep(inputs, linear_feature_columns, gene_feature_columns, cell_feature_columns, total_classes):
    """
    Build and compile a Wide and Deep Keras model.

    The deep part runs the gene and cell feature columns through separate
    BatchNormalization + Dense(256)/Dropout branches, concatenates them and
    applies a Dense(256)/Dense(64) stack. The wide part is the raw output of
    the linear feature columns. Both parts are concatenated and fed into a
    sigmoid output layer.

    Args:
        inputs: dict mapping feature name to its ``tf.keras.Input``.
        linear_feature_columns: feature columns used for the wide part.
        gene_feature_columns: feature columns for the gene deep branch.
        cell_feature_columns: feature columns for the cell deep branch.
        total_classes: number of units in the sigmoid output layer.

    Returns:
        tf.keras.Model: model compiled with binary cross-entropy loss, RMSprop
        (learning rate 0.0005) and binary-accuracy / AUC metrics.
    """
    metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy'),
               tf.keras.metrics.AUC(name='auc')]

    gene_deep = tf.keras.layers.DenseFeatures(gene_feature_columns)(inputs)
    gene_deep = tf.keras.layers.BatchNormalization()(gene_deep)
    cell_deep = tf.keras.layers.DenseFeatures(cell_feature_columns)(inputs)
    cell_deep = tf.keras.layers.BatchNormalization()(cell_deep)

    # Per-branch hidden layers, created in the same order as before
    # (gene first, then cell) so auto-generated layer names are unchanged.
    gene_deep = _dense_dropout_stack(gene_deep, [256])
    cell_deep = _dense_dropout_stack(cell_deep, [256])

    deep = tf.keras.layers.concatenate([gene_deep, cell_deep])
    deep = _dense_dropout_stack(deep, [256, 64])

    wide = tf.keras.layers.DenseFeatures(linear_feature_columns)(inputs)
    both = tf.keras.layers.concatenate([deep, wide])
    output = tf.keras.layers.Dense(total_classes, activation='sigmoid')(both)
    model = tf.keras.Model(inputs=list(inputs.values()), outputs=output)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0005),
        metrics=metrics)
    return model

In [None]:
class TFDataTransformer:
    """Turn pandas features and labels into a shuffled tensorflow dataset.
    """
    def __init__(self):
        pass

    def data_frame_to_dataset(self, data_frame_features=None, data_frame_labels=None):
        """Build a ``tf.data.Dataset`` of ``(features, labels)`` slices.

        Args:
            data_frame_features: features; converted with ``dict(...)`` so each
                key becomes a named feature tensor. Defaults to None.
            data_frame_labels: labels; their ``.values`` array is used as the
                target. Defaults to None.

        Returns:
            The dataset, shuffled with a buffer of 1000 elements.
        """
        feature_dict = dict(data_frame_features)
        label_values = data_frame_labels.values
        slices = tf.data.Dataset.from_tensor_slices((feature_dict, label_values))
        return slices.shuffle(1000)

    def transform(self, data_frame_features=None, data_frame_labels=None):
        """Convert features and labels to a dataset.

        Thin wrapper around ``data_frame_to_dataset``.

        Args:
            data_frame_features: features to convert. Defaults to None.
            data_frame_labels: labels to convert. Defaults to None.

        Returns:
            The shuffled ``tf.data.Dataset`` built by ``data_frame_to_dataset``.
        """
        return self.data_frame_to_dataset(
            data_frame_features=data_frame_features,
            data_frame_labels=data_frame_labels,
        )


In [None]:
BATCH_SIZE = 32
SEED = 42  # fixed seed so the train/validation split is reproducible across runs

train_data_features = pd.read_csv("/kaggle/input/lish-moa/train_features.csv").drop('sig_id', axis=1)
test_data_features = pd.read_csv("/kaggle/input/lish-moa/test_features.csv").drop('sig_id', axis=1)
# cp_time is cast to str so it can be fed to the model as a string categorical feature.
train_data_features['cp_time'] = train_data_features['cp_time'].map(str)
test_data_features['cp_time'] = test_data_features['cp_time'].map(str)

raw_labels = pd.read_csv("/kaggle/input/lish-moa/train_targets_scored.csv").drop('sig_id', axis=1)
# Unscaled copy of the test features, taken before standardisation below.
test_data_features_copy = test_data_features.copy()
data_types = train_data_features.dtypes


CATEGORICAL_FEATURES = ['cp_type', 'cp_dose', 'cp_time']
NUMERICAL_FEATURES = data_types[data_types=='float64'].index.tolist()
# NOTE(review): substring matching assumes no float column name contains both
# 'g' and 'c'; a prefix match would be stricter -- confirm against the column names.
NUMERICAL_FEATURES_GENE = [feature for feature in NUMERICAL_FEATURES if 'g' in feature]
NUMERICAL_FEATURES_CELL = [feature for feature in NUMERICAL_FEATURES if 'c' in feature]

# Standardise each numerical feature group. The mean/std are computed on the
# training features only and reused for the test features, so both sets go
# through exactly the same transformation the model is trained on.
for feature_group in (NUMERICAL_FEATURES_GENE, NUMERICAL_FEATURES_CELL):
    train_mean = train_data_features[feature_group].mean(axis=0)
    train_std = train_data_features[feature_group].std(axis=0)
    train_data_features[feature_group] = (train_data_features[feature_group] - train_mean) / train_std
    test_data_features[feature_group] = (test_data_features[feature_group] - train_mean) / train_std

# X_test / y_test are the held-out validation split of the training data,
# not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(train_data_features, raw_labels, random_state=SEED)

train_dataset = TFDataTransformer().transform(X_train, y_train).batch(BATCH_SIZE)
val_dataset = TFDataTransformer().transform(X_test, y_test).batch(BATCH_SIZE)
# Built without shuffling so predictions stay aligned with the test rows.
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_data_features)).batch(BATCH_SIZE)

feature_transformer = FeatureTransformer(
    gene_features=NUMERICAL_FEATURES_GENE,
    cell_features=NUMERICAL_FEATURES_CELL,
    categorical_features=CATEGORICAL_FEATURES,
)
(feature_layer_inputs,
 categorical_feature_encoders,
 gene_feature_encoders,
 cell_feature_encoders) = feature_transformer.transform(X_train)


In [None]:

# Stop training when the validation loss has not improved for 5 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=5,
)

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='/tmp/best_model',
    monitor='val_loss',
    mode='min',
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
)

model = wide_and_deep(
    feature_layer_inputs,
    categorical_feature_encoders,
    gene_feature_encoders,
    cell_feature_encoders,
    total_classes=raw_labels.shape[-1],
)
model.fit(
    train_dataset,
    epochs=20,
    validation_data=val_dataset,
    callbacks=[early_stopping, model_checkpoint],
)


In [None]:
# Restore the best weights saved by the checkpoint callback during training.
model.load_weights('/tmp/best_model')


def submit_run():
    """Predict on the test dataset and write the result to ``submission.csv``.

    Uses the sample submission as a template: every column after the first
    is overwritten with the predictions of the module-level ``model`` on
    ``test_dataset``.
    """
    submission = pd.read_csv("/kaggle/input/lish-moa/sample_submission.csv")
    predictions = model.predict(test_dataset)
    submission.iloc[:, 1:] = predictions
    submission.to_csv('submission.csv', index=False)


submit_run()

In [None]:
# List the contents of the current working directory.
# NOTE(review): presumably a quick check that submission.csv was written -- confirm.
%ls