In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from datetime import datetime
from tensorflow import keras
from tensorflow import feature_column
from tensorflow.keras import layers, regularizers

# Report the TensorFlow version and how many GPU devices it can see.
print(tf.__version__)
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Enter a GPU device scope once as a quick sanity check.
with tf.device("gpu:0"):
    print("tf.keras code in this scope will run on GPU")

In [None]:
# Load the competition CSVs into pandas: training features, scored training
# targets, and test features.
x_train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
y_train = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
x_test = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

# Column labels and counts for both training files. The counts cover every
# column in each file, including the leading 'sig_id' column.
cols_x, cols_y = x_train.columns, y_train.columns
num_x, num_y = len(cols_x), len(cols_y)

print(f"Features: {num_x}", f"Labels: {num_y}", sep="\n")

In [None]:
# Preview the first rows of the training features.
x_train.head()

In [None]:
# Preview the first rows of the scored training targets.
y_train.head()

In [None]:
# Preview the first rows of the test features.
x_test.head()

In [None]:
# Per-column parse types for CsvDataset. In the features files the first four
# columns are read as strings and every remaining column as float32; in the
# targets file the first column is a string and the rest are float32.
default_records_features = [tf.string] * 4 + [tf.float32] * (num_x - 4)
default_records_targets = [tf.string] + [tf.float32] * (num_y - 1)

# Stream each CSV directly from disk, skipping its header row.
train_features = tf.data.experimental.CsvDataset(
    "/kaggle/input/lish-moa/train_features.csv",
    record_defaults=default_records_features,
    header=True,
)

train_targets = tf.data.experimental.CsvDataset(
    "/kaggle/input/lish-moa/train_targets_scored.csv",
    record_defaults=default_records_targets,
    header=True,
)

test_features = tf.data.experimental.CsvDataset(
    "/kaggle/input/lish-moa/test_features.csv",
    record_defaults=default_records_features,
    header=True,
)

# Pair feature rows with target rows so each element is (features, targets).
# NOTE(review): the test features are zipped with the *training* targets; this
# looks like a placeholder so test elements share the two-part structure that
# _preprocess_test expects (it ignores the targets) -- confirm. Dataset.zip
# stops at the shorter of its inputs.
train_dataset = tf.data.Dataset.zip((train_features, train_targets))
test_dataset = tf.data.Dataset.zip((test_features, train_targets))

In [None]:
# Split the zipped training stream into train (first 70% of rows) and
# validation (the remaining rows). The split is positional: rows are taken in
# file order, without shuffling first.
dataset_size = len(x_train)
train_size = int(0.7 * dataset_size)
val_size = dataset_size - train_size
test_size = len(x_test)

train = train_dataset.take(train_size)
# Skip past the training rows *before* taking the validation rows. Taking
# val_size rows straight from the start of train_dataset would return rows that
# are also in `train`, so val_loss would be measured on training data.
val = train_dataset.skip(train_size).take(val_size)

print("Full dataset size:", dataset_size)
print("Train dataset size:", train_size)
print("Val dataset size:", val_size)
print("Test dataset size:", test_size)

In [None]:
# Number of records per batch in the tf.data input pipelines.
BATCH_SIZE = 1024

def _preprocess_train(features, targets):
    features = dict(zip(cols_x, features))
    features.pop('sig_id')
    targets = tf.stack(targets[1:])
    return features, targets

def _preprocess_test(features, targets):
    features = dict(zip(cols_x, features))
    features.pop('sig_id')
    return features

# Build the batched input pipelines; only the training stream is shuffled.
# The shuffle buffer covers the whole training split (train_size rows):
# tf.data shuffles only among the elements currently in the buffer, so sizing
# it by the number of columns (num_x) kept rows that are far apart in the file
# from ever mixing.
train_ds = train.map(_preprocess_train).shuffle(buffer_size=train_size).batch(BATCH_SIZE)
val_ds = val.map(_preprocess_train).batch(BATCH_SIZE)
test_ds = test_dataset.map(_preprocess_test).batch(BATCH_SIZE)

In [None]:
feature_columns = []
feature_layer_inputs = {}

# Numeric columns: every column after the first four in the features file.
# Each gets a Keras input plus a numeric feature column that is standardized
# with that column's own mean and standard deviation from the training frame.
for num_col in list(cols_x[4:]):
    feature_layer_inputs[num_col] = keras.Input(shape=(1,), name=num_col)
    mean = x_train[num_col].mean()
    std = x_train[num_col].std()
    # Bind mean/std as default arguments. A plain closure looks the names up
    # only when the normalizer is eventually called, which happens after this
    # loop has finished, so every column would be scaled with the statistics of
    # the last column instead of its own.
    feature_columns.append(
        feature_column.numeric_column(
            num_col,
            normalizer_fn=lambda x, mean=mean, std=std: (x - mean) / std,
        )
    )

# Categorical columns (positions 1-3 of the features file): string inputs
# mapped through a vocabulary of the values seen in training and wrapped in an
# indicator column.
for col_name in list(cols_x[1:4]):
    feature_layer_inputs[col_name] = keras.Input(shape=(1,), name=col_name, dtype=tf.string)
    categorical_column = feature_column.categorical_column_with_vocabulary_list(
        col_name, x_train[col_name].unique().astype(str)
    )
    indicator_column = feature_column.indicator_column(categorical_column)
    feature_columns.append(indicator_column)

# Display the mapping of column name -> Keras input as the cell output.
feature_layer_inputs

In [None]:
# One output unit per column of the targets file, excluding its first column.
output_size = num_y - 1

# Turn the dict of Keras inputs into a single dense tensor via the feature columns.
feature_layer = layers.DenseFeatures(feature_columns)
feature_layer_outputs = feature_layer(feature_layer_inputs)

# Two hidden stages, each Dense -> BatchNormalization -> LeakyReLU -> Dropout(0.5).
# Each pair is (units of the Dense layer, argument passed to LeakyReLU).
x = feature_layer_outputs
for units, leak in ((832, 0.2), (448, 0.5)):
    x = layers.Dense(units)(x)
    x = layers.BatchNormalization()(x)
    x = layers.LeakyReLU(leak)(x)
    x = layers.Dropout(0.5)(x)

# Sigmoid head producing one value in (0, 1) per output unit.
out = layers.Dense(output_size, activation='sigmoid')(x)

model = keras.Model(list(feature_layer_inputs.values()), outputs=out)

opt = keras.optimizers.Adam(learning_rate=0.005863958845877649)
model.compile(optimizer=opt, loss='binary_crossentropy')

def set_mode(model, train_mode):
    """Set a ``training`` attribute on every layer of ``model`` and recompile it.

    Args:
        model: Object exposing ``layers`` (a sequence of layer objects) and
            ``compile``.
        train_mode: Value assigned to each layer's ``training`` attribute.

    The model is recompiled with binary cross-entropy loss and the
    module-level optimizer ``opt``.

    NOTE(review): assigning ``layer.training`` does not appear to be a
    documented Keras way to switch between training and inference behaviour
    -- confirm whether this helper has any effect beyond recompiling.
    """
    for layer in model.layers:
        layer.training = train_mode
    model.compile(loss='binary_crossentropy', optimizer=opt)

In [None]:
# Train the model. Early stopping watches val_loss with a patience of 50
# epochs; the History callback records per-epoch losses for later plotting.
early_stopping_callback = keras.callbacks.EarlyStopping(patience=50, monitor='val_loss')
history_callback = keras.callbacks.History()
training_callbacks = [early_stopping_callback, history_callback]

model.fit(train_ds, epochs=100, validation_data=val_ds, callbacks=training_callbacks)

In [None]:
# Plot the recorded training and validation loss curves on one set of axes.
figure, ax = plt.subplots()
for metric in ('loss', 'val_loss'):
    ax.plot(history_callback.history[metric], label=metric)
ax.legend()
ax.grid(True)

plt.show()

In [None]:
# Run the model over the batched test pipeline inside a GPU device scope.
# set_mode is called with train_mode=False before predicting.
with tf.device("gpu:0"):
    set_mode(model, False)
    prediction = model.predict(x=test_ds)

In [None]:
# Assemble the submission frame: 'sig_id' first, then one prediction column per
# target, named after the targets file's columns (its first column excluded).
# sig_id is read by indexing rather than x_test.pop(): pop removes the column
# from x_test in place, so re-running this cell would fail with a KeyError.
sig_id = x_test['sig_id'].values

df = pd.DataFrame(prediction, columns=cols_y.tolist()[1:])
df.insert(0, 'sig_id', sig_id)
df.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
df.head()