<a href="https://colab.research.google.com/github/rubenfh/MOA/blob/dev%2Falex/MOA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ML library
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow import feature_column
from tensorboard.plugins.hparams import api as hp
import tensorflow_addons as tfa
#Tensor Flow doc
!pip3 install -q git+https://github.com/tensorflow/docs
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Data manipulation library
import pandas as pd
import numpy as np
# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

#Other library
from  IPython import display
from matplotlib import pyplot as plt
import uuid 

import numpy as np

import shutil
import tempfile

import os
from google.colab import drive

  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone


In [2]:
# Locate the competition CSV files. On Colab they live on the mounted Google
# Drive; anywhere else they are expected in a local "lish-moa" folder.
if 'google.colab' in str(get_ipython()):
    drive.mount('/content/drive', force_remount=True)
    DATA_DIR = "drive/MyDrive/lish-moa"
else:
    DATA_DIR = "lish-moa"

# Every path is derived from DATA_DIR so the file names are written only once.
TRAIN_FEATURES_PATH = f"{DATA_DIR}/train_features.csv"
TRAIN_LABELS_PATH = f"{DATA_DIR}/train_targets_scored.csv"
TEST_FEATURES_PATH = f"{DATA_DIR}/test_features.csv"
BEST_FEATURES_PATH = f"{DATA_DIR}/best_features.csv"


Mounted at /content/drive


In [3]:
# Read only the first 10 rows of each CSV: this cell just needs the column
# names and counts, the full files are streamed later.
features = pd.read_csv(TRAIN_FEATURES_PATH, nrows=10)
targets = pd.read_csv(TRAIN_LABELS_PATH, nrows=10)

cols_features, cols_targets = features.columns, targets.columns

# Both counts are raw column counts of the CSV headers.
num_features, num_targets = len(cols_features), len(cols_targets)
print("Number of features:" , num_features)
print("Number of targets:" , num_targets)

Number of features: 876
Number of targets: 207


In [4]:
# Per-column default values; CsvDataset uses them to decide each field's dtype.
# The first 4 feature columns are parsed as strings, every remaining one as float.
features_types = [""] * 4 + [0.0] * (num_features - 4)
# The first target column is parsed as a string, the remaining ones as floats.
targets_types = [""] + [0.0] * (num_targets - 1)

# Stream both CSV files, skipping their header row.
features = tf.data.experimental.CsvDataset(
    filenames=TRAIN_FEATURES_PATH,
    record_defaults=features_types,
    header=True,
)
targets = tf.data.experimental.CsvDataset(
    filenames=TRAIN_LABELS_PATH,
    record_defaults=targets_types,
    header=True,
)

# Pair row i of the features file with row i of the targets file.
dataset = tf.data.Dataset.zip((features, targets))

In [5]:
# split dataset into train and val
dataset_size = dataset.reduce(0, lambda x, _: x + 1).numpy()

train_size = int(0.7*dataset_size)
val_size = int(0.15*dataset_size)
test_size = int(0.15*dataset_size)

train = dataset.take(train_size)
val = dataset.skip(train_size)
val = dataset.take(val_size)
test = dataset.skip(train_size + val_size)
test = dataset.take(test_size)

train_size = train.reduce(0, lambda x, _: x + 1).numpy()
val_size = val.reduce(0, lambda x, _: x + 1).numpy()
test_size = test.reduce(0, lambda x, _: x + 1).numpy()

print("Full dataset size:", dataset_size)
print("Train dataset size:", train_size)
print("Val dataset size:", val_size)
print("Test dataset size:", test_size)

Full dataset size: 23814
Train dataset size: 16669
Val dataset size: 3572
Test dataset size: 3572


In [6]:
BATCH_SIZE = 32

def _preprocess_line(features, targets):
    """Turn one parsed CSV row pair into a (features dict, label tensor) example.

    Args:
        features: per-column values of one features row, in `cols_features` order.
        targets: per-column values of one targets row; the first entry is dropped.

    Returns:
        A tuple of a dict mapping column name to value (without 'sig_id') and
        a tensor stacking the remaining target values.
    """
    named_features = {column: value for column, value in zip(cols_features, features)}
    # The row identifier is not a model input.
    del named_features['sig_id']
    labels = tf.stack(targets[1:])
    return named_features, labels

# Apply the same pipeline to all three splits: convert each row into a
# (features dict, labels) pair, then group the rows into batches.
train, val, test = (
    split.map(_preprocess_line).batch(BATCH_SIZE)
    for split in (train, val, test)
)

In [7]:
# Sanity check: look at the first training batch.
for feature_batch, label_batch in train.take(1):
    feature_names = list(feature_batch)
    print('First 5 features:', feature_names[:5])
    for caption, column in (('A batch of cp_types:', 'cp_type'),
                            ('A batch of cp_times:', 'cp_time')):
        print(caption, feature_batch[column].numpy())
    print('A batch of targets:', label_batch.numpy())

First 5 features: ['cp_type', 'cp_time', 'cp_dose', 'g-0', 'g-1']
A batch of cp_types: [b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'ctl_vehicle' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp']
A batch of cp_times: [b'24' b'72' b'48' b'48' b'72' b'24' b'24' b'48' b'48' b'48' b'72' b'48'
 b'48' b'48' b'72' b'48' b'48' b'24' b'72' b'48' b'48' b'48' b'72' b'72'
 b'72' b'48' b'72' b'48' b'48' b'72' b'72' b'48']
A batch of targets: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [8]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build an encoder for one categorical feature, fitted on `dataset`.

  Args:
    name: key of the feature inside the feature dict yielded by `dataset`.
    dataset: dataset of (features dict, labels) pairs used to adapt the layers.
    dtype: 'string' selects a StringLookup layer, anything else an IntegerLookup.
    max_tokens: optional size limit forwarded to the lookup layer.

  Returns:
    A callable that sends a feature through the adapted lookup layer and then
    through the adapted CategoryEncoding layer.
  """
  # Lookup layer mapping raw values to integer indices.
  lookup = (preprocessing.StringLookup(max_tokens=max_tokens)
            if dtype == 'string'
            else preprocessing.IntegerLookup(max_values=max_tokens))

  # Keep only the requested feature and let the lookup learn its vocabulary.
  raw_values = dataset.map(lambda x, y: x[name])
  lookup.adapt(raw_values)

  # The encoder is sized from the learned vocabulary, so it must be created
  # after the lookup has been adapted; it is then adapted on the indexed values.
  category_encoder = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
  category_encoder.adapt(raw_values.map(lookup))

  # The lambda captures both layers so they can be reused inside a functional model.
  return lambda feature: category_encoder(lookup(feature))

In [9]:
# One scalar Keras input per numeric column (every column after the first
# four), each followed by an L2 normalization.
# NOTE(review): with axis=None tf.linalg.normalize treats the whole tensor as
# a single vector, so the norm appears to be taken across the batch dimension
# as well - confirm this is the intended scaling.
encoded_features = []
all_inputs = []
for header in cols_features[4:]:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    # normalize() returns (normalized tensor, norm); only the first is used.
    normalized_col, col_norm = tf.linalg.normalize(numeric_col, ord='euclidean', axis=None)
    all_inputs.append(numeric_col)
    encoded_features.append(normalized_col)

In [10]:
# String-typed categorical columns: each gets its own Keras input and an
# encoder adapted on the training split.
categorical_cols = ['cp_type', 'cp_dose', 'cp_time']
for header in categorical_cols:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encode = get_category_encoding_layer(
      name=header, dataset=train, dtype='string', max_tokens=5)
  all_inputs.append(categorical_col)
  encoded_features.append(encode(categorical_col))
  print("Normalization of ", header, " done !")

Normalization of  cp_type  done !
Normalization of  cp_dose  done !
Normalization of  cp_time  done !


In [11]:
# `feature_layer` is kept for compatibility; it is not used in the visible cells.
feature_layer = []
# Merge every encoded column into the single tensor that feeds the models.
all_features = tf.keras.layers.Concatenate()(encoded_features)

In [12]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic used further down)
%load_ext tensorboard

In [13]:
# Clear any logs from previous runs so stale results are not mixed with new ones
!rm -rf ./logs/ 

In [14]:
# HParams plugin API (`hp`) plus `datetime` for time-stamping the log directory.
from tensorboard.plugins.hparams import api as hp
import datetime
# Remove the ./logs/ directory left by any previous run.
!rm -rf ./logs/

# Root log directory of this session, named after the current date and time.
logdir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [22]:
# Hyper-parameter search space.
HP_NUM_UNITS_1 = hp.HParam('num_units_1', hp.Discrete([128, 256, 512]))
HP_NUM_UNITS_2 = hp.HParam('num_units_2', hp.Discrete([128, 256, 512]))
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.2, 0.5))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'adadelta']))
HP_ACTIVATION = hp.HParam('activation', hp.Discrete(['relu', 'elu']))
HP_ACTIVATION_OUTPUT = hp.HParam('activation_output', hp.Discrete(['sigmoid']))

# Tags of the metrics that are reported for every trial.
METRIC_CATEGORICAL_ACCURACY = "categorical_accuracy"
METRIC_BINARY_ACCURACY = "binary_accuracy"
METRIC_CATEGORICAL_CROSSENTROPY = "categorical_crossentropy"
METRIC_BINARY_CROSSENTROPY = "binary_crossentropy"
METRIC_MSE = "mean_squared_error"

metrics = [
    METRIC_CATEGORICAL_ACCURACY,
    METRIC_BINARY_ACCURACY,
    METRIC_CATEGORICAL_CROSSENTROPY,
    METRIC_BINARY_CROSSENTROPY,
    METRIC_MSE,
]

# Declare the search space and the metrics (with their display names) to the
# HParams dashboard, in the root log directory.
with tf.summary.create_file_writer(logdir).as_default():
  hp.hparams_config(
    hparams=[HP_NUM_UNITS_1, HP_NUM_UNITS_2, HP_DROPOUT,
             HP_ACTIVATION, HP_ACTIVATION_OUTPUT, HP_OPTIMIZER],
    metrics=[
      hp.Metric(tag, display_name=label)
      for tag, label in (
        (METRIC_CATEGORICAL_ACCURACY, 'Categorical Accuracy'),
        (METRIC_BINARY_ACCURACY, 'Binary Accuracy'),
        (METRIC_CATEGORICAL_CROSSENTROPY, 'Categorical Cross Entropy Accuracy'),
        (METRIC_BINARY_CROSSENTROPY, 'Binary Cross Entropy'),
        (METRIC_MSE, 'MSE'),
      )
    ],
  )

In [23]:
def train_test_model(hparams, run_dir=None, epochs=15):
  """Build, train and evaluate one model for a hyper-parameter combination.

  The network is: batch-norm on `all_features`, then two
  Dense -> Dropout -> BatchNormalization blocks, then a 206-unit output layer.

  Args:
    hparams: dict keyed by the HP_* objects (units of both hidden layers,
      dropout rate, hidden activation, output activation, optimizer).
    run_dir: directory for the TensorBoard and HParams callbacks. Defaults to
      the shared global `logdir`, which is the previous behaviour; pass a
      per-trial directory to keep the training curves of each trial separate.
    epochs: maximum number of training epochs (default 15).

  Returns:
    A tuple (categorical_accuracy, binary_accuracy, categorical_crossentropy,
    binary_crossentropy, mean_squared_error) measured on `test`.
  """
  log_dir = logdir if run_dir is None else run_dir

  x = tf.keras.layers.BatchNormalization()(all_features)

  # Two hidden blocks that differ only in their number of units.
  for units_param in (HP_NUM_UNITS_1, HP_NUM_UNITS_2):
    x = tf.keras.layers.Dense(hparams[units_param], activation=hparams[HP_ACTIVATION])(x)
    x = tf.keras.layers.Dropout(hparams[HP_DROPOUT])(x)
    x = tf.keras.layers.BatchNormalization()(x)

  output = tf.keras.layers.Dense(206, activation=hparams[HP_ACTIVATION_OUTPUT])(x)
  model = tf.keras.Model(all_inputs, output)

  model.compile(
      optimizer=hparams[HP_OPTIMIZER],
      loss=tf.keras.losses.BinaryCrossentropy(),
      # The order matters: evaluate() returns the values in this order.
      metrics=["categorical_accuracy", "binary_accuracy", "categorical_crossentropy",
               "binary_crossentropy", "mean_squared_error"],
  )

  model.fit(
      train,
      validation_data=val,
      epochs=epochs,
      # NOTE(review): Keras documents `shuffle` as ignored for tf.data.Dataset
      # inputs - confirm whether the dataset itself should be shuffled instead.
      shuffle=True,
      verbose=1,
      callbacks=[
          tf.keras.callbacks.TensorBoard(log_dir),  # log metrics
          hp.KerasCallback(log_dir, hparams),  # log hparams
          tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=10),
      ])

  # evaluate() returns the loss first, followed by the compiled metrics.
  (_, categorical_accuracy, binary_accuracy, categorical_crossentropy,
   binary_crossentropy, mean_squared_error) = model.evaluate(test)
  return (categorical_accuracy, binary_accuracy, categorical_crossentropy,
          binary_crossentropy, mean_squared_error)

In [24]:
def run(run_dir, hparams):
  """Run one trial and write its hyper-parameters and test metrics to `run_dir`.

  Args:
    run_dir: directory that receives the summaries of this trial.
    hparams: dict keyed by the HP_* objects describing the trial.
  """
  with tf.summary.create_file_writer(run_dir).as_default():
    hp.hparams(hparams)  # record the values used in this trial
    cat_acc, bin_acc, cat_xent, bin_xent, mse = train_test_model(hparams)
    # Each metric is written once, as a single scalar at step 1.
    for tag, value in (
        (METRIC_CATEGORICAL_ACCURACY, cat_acc),
        (METRIC_BINARY_ACCURACY, bin_acc),
        (METRIC_CATEGORICAL_CROSSENTROPY, cat_xent),
        (METRIC_BINARY_CROSSENTROPY, bin_xent),
        (METRIC_MSE, mse),
    ):
      tf.summary.scalar(tag, value, step=1)

In [None]:
import itertools

# Exhaustive grid search. For the dropout interval only its two end points
# are tried; every discrete parameter is tried on all of its values.
session_num = 0

search_grid = itertools.product(
    HP_NUM_UNITS_1.domain.values,
    HP_NUM_UNITS_2.domain.values,
    (HP_DROPOUT.domain.min_value, HP_DROPOUT.domain.max_value),
    HP_OPTIMIZER.domain.values,
    HP_ACTIVATION.domain.values,
    HP_ACTIVATION_OUTPUT.domain.values,
)

for (num_units_1, num_units_2, dropout_rate,
     optimizer, activation, activation_output) in search_grid:
  hparams = {
    HP_NUM_UNITS_1: num_units_1,
    HP_NUM_UNITS_2: num_units_2,
    HP_DROPOUT: dropout_rate,
    HP_OPTIMIZER: optimizer,
    HP_ACTIVATION: activation,
    HP_ACTIVATION_OUTPUT: activation_output,
  }
  run_name = "run-%d" % session_num
  print('--- Starting trial: %s' % run_name)
  print({h.name: hparams[h] for h in hparams})
  # Join with a path separator so each trial gets its own sub-directory of
  # `logdir`; plain concatenation produced "<timestamp>run-N" next to it.
  run(os.path.join(logdir, run_name), hparams)
  session_num += 1

--- Starting trial: run-0
{'num_units_1': 128, 'num_units_2': 128, 'dropout': 0.2, 'optimizer': 'adadelta', 'activation': 'elu', 'activation_output': 'sigmoid'}
Epoch 1/15
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/15
Epoch 3/15

In [None]:
%tensorboard --logdir logdir

# Brouillon en **dessous**


In [None]:
# Draft: number of steps per epoch and a learning rate that decays with the
# inverse of the training step.
# NOTE(review): because of the true division this value is a float, and the
# constants 3 and 9 are unexplained - confirm the intended formula.
STEPS_PER_EPOCH = (train_size // BATCH_SIZE) / 3 - 9
print(STEPS_PER_EPOCH)
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=STEPS_PER_EPOCH * 1000,
    decay_rate=1,
    staircase=False,
)

def get_optimizer():
  """Return a new Adam optimizer driven by the shared `lr_schedule`."""
  adam = tf.keras.optimizers.Adam(lr_schedule)
  return adam

2.0


In [None]:
def get_callbacks(name):
  """Return the training callbacks for the experiment called `name`.

  Args:
    name: experiment name, used as a sub-directory of `logdir` for TensorBoard.

  Returns:
    A list with a progress printer, early stopping on the validation binary
    cross-entropy (patience 200) and a TensorBoard logger.
  """
  return [
    tfdocs.modeling.EpochDots(),
    tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=200),
    # `logdir` is a plain string, so the `/` operator would raise a TypeError;
    # build the path with os.path.join instead.
    tf.keras.callbacks.TensorBoard(os.path.join(logdir, name)),
  ]

In [None]:
def compile_and_fit(model, name, optimizer=None, max_epochs=10):
  """Compile `model`, print its summary and train it on `train`.

  Args:
    model: Keras model to train.
    name: experiment name forwarded to `get_callbacks`.
    optimizer: optimizer to use; when None, `get_optimizer()` supplies one.
    max_epochs: upper bound on the number of epochs.

  Returns:
    The History object returned by `model.fit`.
  """
  chosen_optimizer = get_optimizer() if optimizer is None else optimizer

  bce_metric = tf.keras.losses.BinaryCrossentropy(name='binary_crossentropy')
  model.compile(
      optimizer=chosen_optimizer,
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[bce_metric, 'categorical_accuracy'],
  )

  model.summary()

  # verbose=0: progress reporting is left to the callbacks.
  return model.fit(
      train,
      steps_per_epoch=STEPS_PER_EPOCH,
      epochs=max_epochs,
      validation_data=val,
      callbacks=get_callbacks(name),
      verbose=0,
  )

In [None]:
# Smallest baseline: a single hidden layer of 32 units.
x_tiny = tf.keras.layers.Dense(32, activation="relu")(all_features)
# The output needs a sigmoid: the model is trained with BinaryCrossentropy()
# using its default from_logits=False, which expects probabilities, not logits.
output_tiny = tf.keras.layers.Dense(206, activation="sigmoid")(x_tiny)
tiny_model = tf.keras.Model(all_inputs, output_tiny)

In [None]:
  # Training histories keyed by model name, filled in by the cells below.
  size_histories = {}

In [None]:
# Train the tiny model; its TensorBoard logs go under the 'sizes/Tiny' name.
size_histories['Tiny'] = compile_and_fit(tiny_model, 'sizes/Tiny')

In [None]:
# Plot the smoothed binary cross-entropy of every recorded history.
plotter = tfdocs.plots.HistoryPlotter(metric = 'binary_crossentropy', smoothing_std=10)
plotter.plot(size_histories)
# Restrict the y axis to the 0.5 - 0.7 range.
plt.ylim([0.5, 0.7])

In [None]:
# Draft baseline network: batch-norm on the inputs, two hidden blocks of 32
# ReLU units (dropout 0.2, then 0.5), and a 206-unit sigmoid output.
x = tf.keras.layers.BatchNormalization()(all_features)

for hidden_units, drop_rate in ((32, 0.2), (32, 0.5)):
    x = tf.keras.layers.Dense(hidden_units, activation='relu')(x)
    x = tf.keras.layers.Dropout(drop_rate)(x)
    x = tf.keras.layers.BatchNormalization()(x)

output = tf.keras.layers.Dense(206, activation='sigmoid')(x)
model = tf.keras.Model(all_inputs, output)

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[
        "categorical_accuracy",
        "binary_accuracy",
        "categorical_crossentropy",
        "binary_crossentropy",
        "mean_squared_error",
    ],
)

In [None]:
# Train the baseline model for 10 epochs, validating on `val`.
model.fit(train,validation_data=val,epochs=10)
# Saving the trained model is currently disabled:
#model.save('MOA_model_1')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ffad41edb38>

In [None]:
# Evaluate on `test`; the result lists the loss followed by the compiled metrics.
model.evaluate(test)




[0.018200933933258057,
 0.060470324009656906,
 0.9968891143798828,
 2.9398598670959473,
 0.018200933933258057,
 0.003036072012037039]

In [None]:

# Draw the model graph left-to-right, including the tensor shapes.
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")
# Saving the model to Google Drive is currently disabled:
#model.save("drive/MyDrive/lish-moa")
