In [None]:
# INSTALL NECESSARY MODULES 
#!pip install -q -U keras-tuner

In [None]:
import os
import datetime
import IPython

import numpy as np
import pandas as pd
import tensorflow as tf
import kerastuner as kt
import matplotlib.pyplot as plt


from tensorflow import feature_column
from tensorflow.keras import layers
#from google.colab import drive
from tensorflow.keras import regularizers
from tensorflow import keras

# drive.mount('/content/drive', force_remount=True)
# os.chdir("/content/drive/My Drive/Deep Learning/RENDUS-Groupe/Project")
# !ls

# tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Show every file available under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Read metadata about our data

In [None]:
# GET TEST IDS
# Only the `sig_id` column is needed from the test file (it labels the rows of
# the submission), so read that single column instead of loading the whole
# table into a variable that is about to be overwritten.
ids = pd.read_csv("/kaggle/input/lish-moa/test_features.csv",
                  usecols=["sig_id"])["sig_id"]

In [None]:
# PREVIEW THE METADATA
# Only the column labels are used from these frames, so ten rows are enough.
features = pd.read_csv("/kaggle/input/lish-moa/train_features.csv", nrows=10)
targets = pd.read_csv("/kaggle/input/lish-moa/train_targets_scored.csv", nrows=10)

# Column labels of both tables, exactly as they appear in the CSV headers.
cols_features = features.columns
cols_targets = targets.columns
columns = cols_targets  # same labels; reused later to name the submission columns

# Column counts of each table (every header column is counted).
num_features, num_targets = len(cols_features), len(cols_targets)

print("Number of features:", num_features)
print("Number of targets:", num_targets)

# Reading data using tf.data.experimental.CsvDataset

In [None]:
BATCH_SIZE = 32

# Per-column defaults also fix the dtype each CSV column is parsed as:
# the first four feature columns are strings and the rest floats; the targets
# file has one leading string column followed by floats.
features_types = [str()] * 4 + [float()] * (num_features - 4)
targets_types = [str()] + [float()] * (num_targets - 1)


def _csv_rows(path, column_defaults):
    """Return a CsvDataset over `path`, skipping its header line.

    `column_defaults` supplies one default (and therefore one dtype) per column.
    """
    return tf.data.experimental.CsvDataset(path,
                                           record_defaults=column_defaults,
                                           header=True)


features = _csv_rows("/kaggle/input/lish-moa/train_features.csv", features_types)
targets = _csv_rows("/kaggle/input/lish-moa/train_targets_scored.csv", targets_types)
# The test file is parsed with the same column layout as the training features.
test = _csv_rows("/kaggle/input/lish-moa/test_features.csv", features_types)

# Pair each training feature row with the target row at the same position.
dataset = tf.data.Dataset.zip((features, targets))

In [None]:
# split dataset into train and val
dataset_size = dataset.reduce(0, lambda x, _: x + 1).numpy()

train_size = int(0.7*dataset_size)
val_size = dataset_size - train_size

train = dataset.take(train_size)
val = dataset.skip(train_size)
val = dataset.take(val_size)

#We get the training size dataset and the validation size dataset
train_size = train.reduce(0, lambda x, _: x + 1).numpy()
val_size = val.reduce(0, lambda x, _: x + 1).numpy()

test_size = test.reduce(0, lambda x, _: x + 1).numpy()
test = dataset.take(test_size)

print("Full dataset size:", dataset_size)
print("Train dataset size:", train_size)
print("Val dataset size:", val_size)
print("Test size", test_size)

In [None]:
def _preprocess_line(features, targets):
    # Pack the result into a dictionary
    features = dict(zip(cols_features, features))
    features.pop('sig_id')
    targets = tf.stack(targets[1:])
    
    return features, targets

def _preprocess_line_bis(features, targets):
    # Pack the result into a dictionary but keep sig_id
    features = dict(zip(cols_features, features))
    targets = tf.stack(targets[1:])
    
    return features, targets

train = train.map(_preprocess_line)
train = train.shuffle(train_size,seed = 123)
train = train.batch(BATCH_SIZE)

val = val.map(_preprocess_line)
val = val.shuffle(val_size,seed = 123)
val = val.batch(BATCH_SIZE)

test = test.map(_preprocess_line_bis)
test = test.batch(BATCH_SIZE)

# Feature Engineering

In [None]:
# Every feature column registered through `demo` ends up in this list, which
# is later handed to the model's input layer.
feature_columns = []


def demo(feature_column):
    """Register `feature_column` by appending it to `feature_columns`.

    The previous version also built a throw-away DenseFeatures layer on every
    call; it was never used, so it is no longer created.
    """
    feature_columns.append(feature_column)

In [None]:
# HANDLE CATEGORICAL COLUMNS
categorical_columns=['cp_time','cp_type','cp_dose']

# Each vocabulary must be attached to the column it describes. Looking the keys
# up by position in `categorical_columns` paired the dose vocabulary with
# 'cp_type' and the type vocabulary with 'cp_dose', so the keys are spelled out.
# The cp_time vocabulary is given as strings -- presumably because cp_time is
# one of the leading columns the CSV reader parses as str; verify if the column
# layout changes.
cat_cp_time = feature_column.categorical_column_with_vocabulary_list('cp_time', ['24', '48', '72'])
cat_cp_dose = feature_column.categorical_column_with_vocabulary_list('cp_dose', ['D1', 'D2'])
cat_cp_type = feature_column.categorical_column_with_vocabulary_list('cp_type', ['trt_cp', 'ctl_vehicle'])

# One-hot encode each categorical column so it can feed a dense layer.
cat_one_encod_cp_time = feature_column.indicator_column(cat_cp_time)
cat_one_encod_cp_dose = feature_column.indicator_column(cat_cp_dose)
cat_one_encod_cp_type = feature_column.indicator_column(cat_cp_type)

demo(cat_one_encod_cp_time)
demo(cat_one_encod_cp_dose)
demo(cat_one_encod_cp_type)

In [None]:
#HANDLE NUMERICAL VALUE
# Every feature that is neither the row id nor one of the categorical columns
# is numeric. The names come straight from the CSV header (`cols_features`),
# which yields the same names in the same order as the keys of a preprocessed
# batch, without pulling a batch through the training shuffle buffer.
non_numerical_columns = {'sig_id', 'cp_time', 'cp_type', 'cp_dose'}
numerical_columns = [name for name in cols_features
                     if name not in non_numerical_columns]

for name in numerical_columns:
    demo(feature_column.numeric_column(name))

In [None]:
# CREATE INPUT LAYER OF FEATURES
# Built from every column registered so far in `feature_columns`
# (the categorical indicator columns and the numeric columns).
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Baseline Modeling & Hyperparameter Tuning

In [None]:
# DEFINE THE BASELINE MODEL
# Hyperparameters are fixed here; the commented lines mark what a tuner would vary.
#hp_learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3, 1e-4, 1e-5]) # tune lr
#n_layers = hp.Int('num_layers', min_value = 2, max_value = 10, step =1) # tune nb of layers


def _hidden_block(units, dropout_rate):
    """Return one hidden block: Dense(relu) -> Dropout -> BatchNormalization."""
    return [
        layers.Dense(units=units, activation='relu'),      # tune nb of neurons
        layers.Dropout(dropout_rate),                      # tune dropout
        layers.BatchNormalization(momentum=0.8),           # batch norm after each dense layer
    ]


model = keras.Sequential(
    # Feature layer followed by batch normalization of the raw inputs.
    [feature_layer, layers.BatchNormalization(momentum=0.8)]
    # Hidden layers (their number is a tuning candidate).
    + _hidden_block(800, 0.1)
    + _hidden_block(950, 0.1)
    # Output layer: 206 sigmoid units.
    + [layers.Dense(206, activation=tf.nn.sigmoid)]
)

# COMPILE MODEL
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)


In [None]:
# Train the model defined above for 30 epochs, validating on `val` after each one.
#model = tuner.hypermodel.build(best_hps)
history = model.fit(
    train,
    validation_data=val,
    epochs=30,
)

In [None]:
# TAKE A LOOK AT OUR MODEL
# Prints the layer-by-layer summary of the model.
model.summary()

In [None]:
# Run the model over the test pipeline and report the shape of the result.
predictions_test = model.predict(test)
print(predictions_test.shape)

# Wrap the raw predictions in a DataFrame so they can be labelled and exported.
predictions_test_pd = pd.DataFrame(data=predictions_test)
print(type(predictions_test_pd))
predictions_test_pd.head()

In [None]:
# Inspect the test ids and turn the Series into a one-column DataFrame.
print(ids.shape)
ids_pd = ids.to_frame()
print(type(ids_pd))
ids_pd.head()

In [None]:
# Preview the first rows of the prediction frame.
predictions_test_pd.head()

In [None]:

# Column labels of the targets file, as a plain list, to name the submission columns.
columns_name = columns.tolist()
print(columns_name)
print(len(columns_name))

In [None]:
# `insert` aligns the inserted values on the index, so a row-count mismatch
# would silently produce missing or dropped ids. Fail loudly instead.
if len(ids_pd) != len(predictions_test_pd):
    raise ValueError(
        "Row count mismatch: %d ids vs %d prediction rows"
        % (len(ids_pd), len(predictions_test_pd))
    )

# Guarded so the cell can be re-run: a second `insert` of an existing column
# would raise.
if "sig_id" not in predictions_test_pd.columns:
    predictions_test_pd.insert(0, "sig_id", ids_pd.iloc[:, 0])

# Label every column (the id column first) with the names from the targets file.
predictions_test_pd.columns = columns_name
predictions_test_pd.head()


In [None]:
# Write the labelled predictions to submission.csv, without the DataFrame index.
predictions_test_pd.to_csv("submission.csv", index=False)

In [None]:
# Display the raw prediction vector of the first row.
predictions_test[0]

# Model Variance & Bias Analysis