**Introduction**

In this notebook, we will be predicting multiple targets of the Mechanism of Action (MoA) response(s) of different samples (sig_id), given various inputs such as gene expression data and cell viability data.


## Import Library

In [1]:
# ML library
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Data manipulation library
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)


# **Step 1:** Reading data using TF.DATA


In [3]:
TRAIN_DATA_URL = "https://drive.google.com/file/d/1cWd2o55ed-pf-Ia9umqJGTOlCFydwgyf/view?usp=sharing"
#Use Data Folder call : lish-moa who contains all csv
TRAIN_FEATURES_PATH =  "lish-moa/train_features.csv"
TRAIN_LABELS_PATH   =  "lish-moa/train_targets_scored.csv"

In [4]:
## TRAIN FEATURES 
#Read just one of features dataset and make a list of columns name (we remove sig_id with index_col=0)
train_features_header_list  = pd.read_csv(TRAIN_FEATURES_PATH, 
                                          index_col=0,
                                          nrows=1).columns.tolist()
#Just the 3 first colunm are categorical, all of the rest are nuerical
#We create 3 list with the features names (one categorical, one numerical)
numerical_col_list   = train_features_header_list[3:]
categorical_col_list = train_features_header_list[0:3]

## TRAIN LABELS
#Read just one of labels dataset and make a list of columns name (we remove sig_id with index_col=0)
train_labels_header_list    = pd.read_csv(TRAIN_LABELS_PATH, 
                                          index_col=0,
                                          nrows=1).columns.tolist()
# Get the size of train dataset
train_features_size = len(pd.read_csv(TRAIN_FEATURES_PATH,usecols=[numerical_col_list[0]]))
train_label_size = len(pd.read_csv(TRAIN_LABELS_PATH,usecols=[train_labels_header_list[0]]))

In [5]:
print("NUMBER OF FEATURES ; ", len(train_features_header_list))
print("NUMBER OF TARGET : ", len(train_labels_header_list))

print("TRAIN DATASET SIZE (features) : ", train_features_size)
print("TRAIN DATASET SIZE (labels) : ", train_label_size)

NUMBER OF FEATURES ;  875
NUMBER OF TARGET :  206
TRAIN DATASET SIZE (features) :  23814
TRAIN DATASET SIZE (labels) :  23814


In [6]:
# We define the size of each batch at 2048
BATCH_SIZE=32
# Load train features dataset with shuffling (seed = 54), batching (size = 2048)
features = tf.data.experimental.make_csv_dataset(
    file_pattern = TRAIN_FEATURES_PATH,
    select_columns=train_features_header_list,
    header=True,
    batch_size=BATCH_SIZE,
    shuffle_seed=54,
    shuffle=True,
    num_epochs=1,
    num_parallel_reads=100
)
targets = tf.data.experimental.make_csv_dataset(
    file_pattern=TRAIN_LABELS_PATH,
    select_columns=train_labels_header_list,
    header=True,
    batch_size=BATCH_SIZE,
    shuffle_seed=54,
    shuffle=True,
    num_epochs=1,
    num_parallel_reads=100
)
dataset = tf.data.Dataset.zip((features, targets))

In [9]:
%%time
for i,batch in enumerate(dataset.take(20)):
  print('.',end='')

............CPU times: user 26.1 s, sys: 1.15 s, total: 27.3 s
Wall time: 25.8 s


### Split the training dataset into training and validation to be able to perform the hyperparameter tuning

In [7]:
DATASET_SIZE = train_features_size
print("Size of the datset : " ,DATASET_SIZE)

Size of the datset :  23814


In [8]:
train_size = int(0.8 * DATASET_SIZE)
val_size = int(0.2 * DATASET_SIZE)

train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)
val_dataset = dataset.take(val_size)

print("Full dataset size:", DATASET_SIZE)
print("Train dataset size:", train_size)
print("Val dataset size:", val_size)

Full dataset size: 23814
Train dataset size: 19051
Val dataset size: 4762


### Shuffing, Batching, Mapping data

In [41]:
#shuffle_train_dataset = train_dataset.cache().shuffle(buffer_size=train_features_size)

In [9]:
def _preprocess_line(features, targets):
    #Convert labels column to an array  
    targets = tf.nest.flatten(targets)
    return features, targets
train_dataset = train_dataset.map(_preprocess_line)

val_dataset = val_dataset.map(_preprocess_line)

In [10]:
for feature_batch, label_batch in train_dataset.take(1):
    print('First 5 features:', list(feature_batch.keys())[:5])
    print('A batch of cp_types:', feature_batch['cp_type'].numpy())
    print('A batch of cp_times:', feature_batch['cp_time'].numpy())
    print('A batch of targets:', label_batch.numpy()) 

First 5 features: ['cp_type', 'cp_time', 'cp_dose', 'g-0', 'g-1']
A batch of cp_types: [b'ctl_vehicle' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp'
 b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp' b'trt_cp']
A batch of cp_times: [72 72 24 48 72 48 48 48 24 24 72 24 24 48 24 24 24 24 48 72 24 72 72 48
 72 48 24 24 48 72 48 48]
A batch of targets: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# **Step 2**: Feature Engineering

### We use tensorflow	feature_columns or keras preprocessing	layers 

### Now we differentiate between numerical features and categorical features and apply the corresponding adequate feature engineering strategies to the right format.

Normalization of numerical columns:

In [36]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for our feature.
  normalizer = preprocessing.Normalization()

  # Prepare a Dataset that only yields our feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer

In [39]:
all_inputs=[]
# Here just normalisation of 2 features because crash with more 
# For crash test switch line below
#for header in numerical_col_list:
for header in ['g-0', 'g-1']:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_dataset)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

### Categorical encoding with vocabulary list

In [35]:
encoded_features = []
CATEGORIES = {
    'cp_type': ['ctrl_vehicle', 'cp_vehicle'],
    'cp_time' : ['24', '48', '78'],
    'cp_dose' : ['D1','D2']
}


categorical_columns = []
for feature, vocab in CATEGORIES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
    indicator_column = tf.feature_column.indicator_column(cat_col)
    encoded_features.append(indicator_column)

# **Step 3:** Baseline Modeling 

Using either the Keras sequential API or the Keras function API, we should have at least one hidden layer

L2 regularization

Dropout regularization

Batch Normalization

Weights Initialization

#**Step 4:** Model Variance & Bias Analysis

Plot the training and validation loss of your model as a function of the number of epochs

Plot the learning curve

Diagnose Variance & Bias and propose a way to solve high variance or high bias if it’s being detected

#**Steps 5:** Hyperparameter tuning with Keras Tuner 