IPython notebook based on the R code which was provided by the team around Joachim L. Schultze. The team wrote the paper "[Scalable Prediction of Acute Myeloid Leukemia Using High-Dimensional Machine Learning and Blood Transcriptomics](https://www.cell.com/action/showPdf?pii=S2589-0042%2819%2930525-5)".



## 1. Imports

In [None]:
# Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l1_l2
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


## 2. Obtaining the data

In [None]:
# Mount Google Drive (Colab only); the input data is expected there in .csv format.
# Running this cell triggers Colab's interactive authorization prompt.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Load data set 2 (data2.csv plus its matching info2.csv) from the mounted drive.
#
# The other data sets can be loaded the same way; uncomment the pair you need:
# df1 = pd.read_csv("/content/drive/My Drive/TranscriptomicData/data1.csv")
# info1 = pd.read_csv("/content/drive/My Drive/TranscriptomicData/info1.csv")
# df3 = pd.read_csv("/content/drive/My Drive/TranscriptomicData/data3.csv")
# info3 = pd.read_csv("/content/drive/My Drive/TranscriptomicData/info3.csv")

data2_csv = "/content/drive/My Drive/TranscriptomicData/data2.csv"
info2_csv = "/content/drive/My Drive/TranscriptomicData/info2.csv"

df2 = pd.read_csv(data2_csv)
info2 = pd.read_csv(info2_csv)

In [None]:
# Select which loaded data set the rest of the notebook works on:
# `data` is the table read from data2.csv, `target` the one read from info2.csv.

data, target = df2, info2

## 3. Cleaning the data 

In [None]:
# Reshape the raw table so that every original column becomes one row.
#
# Steps: label the unnamed first column 'Gene', transpose, promote the first
# row (the former 'Gene' column) to column labels, name the index 'Sample',
# and finally drop that first row because it now only repeats the header.
# NOTE: this cell overwrites `data`; re-running it without re-running the
# cell that sets `data` would transpose the table a second time.

transposed = data.rename(columns={'Unnamed: 0': 'Gene'}).T
header_row = transposed.iloc[0]

relabelled = transposed.rename(columns=header_row)
relabelled.index.names = ['Sample']

data = relabelled.iloc[1:]

In [None]:
# data.shape

In [None]:
# data.head()

In [None]:
# Index the target table by sample: the unnamed first column is labelled
# 'Sample' and then used as the index, so it can be joined onto `data`.

target = (
    target
    .rename(columns={'Unnamed: 0': 'Sample'})
    .set_index("Sample")
)

In [None]:
# target.shape

In [None]:
# target.head()

In [None]:
# Joining the data and the target data and cleaning it
#
# 1. Join the target table onto the data table (both are indexed by sample).
# 2. Drop the annotation columns that are not used for modelling.
# 3. Encode the label: CASE -> 1, CONTROL -> 0 (any other value becomes NaN).
# 4. Drop incomplete rows, THEN cast to int64.
#
# The order in step 4 matters: casting a frame that still contains NaN to
# int64 raises, so calling dropna() after astype() could never remove anything.
# Dropping first discards samples without a match in `target` and samples
# whose Condition is neither CASE nor CONTROL, instead of crashing on them.

unused_columns = ['Dataset', 'GSE', 'Disease', 'Tissue', 'FAB', 'Filename', 'FAB_all']

dt = data.join(target).drop(columns=unused_columns)
dt['Condition'] = dt['Condition'].map({'CASE': 1, 'CONTROL': 0})
dt = dt.dropna().astype('int64')

In [None]:
# dt.shape

In [None]:
# dt.head ()

## 4. Preparing the data


In [None]:
# Separate the label column from the predictors:
# y is the 'Condition' column, X is every other column of dt.

y = dt['Condition']
X = dt.drop(columns=['Condition'])

In [None]:
# Splitting the Data in train (80%) and test (20%), then normalizing it
#
# The split is done BEFORE scaling and the scaler is fitted on the training
# rows only. Fitting StandardScaler on the full matrix lets the test rows
# contribute to the mean/variance used for scaling, so the test set would no
# longer be truly unseen data (data leakage) and the test score would be
# optimistic. X and y themselves are left untouched, which also makes this
# cell safe to re-run.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train))  # learn mean/std from the training rows
X_test = pd.DataFrame(scaler.transform(X_test))        # apply the training statistics unchanged

# X_train.shape

# X_train.head()

## 5. Model training

In [None]:
# Training-set dimensions and hyperparameters used by the cells below

r, c = X_train.shape  # r: number of rows, c: number of columns

# optimisation settings
epochs, batch_size = 100, 512

# network settings: base layer width, dropout rate, L1/L2 penalty weights
num_nodes = 1024
dropout_rate = 0.3
l1_v, l2_v = 0.0, 0.005


In [None]:
# NN (fit2 in Paper)
#
# Fully connected binary classifier: one unregularized input layer followed
# by eight regularized hidden layers of shrinking width, each followed by
# dropout, and a single-unit output layer.

model = Sequential()

# input layer: the only layer that needs the input dimension (c columns);
# later layers infer their input size from the previous layer.
model.add(Dense(256, activation='relu', kernel_regularizer=l1_l2(l1=0.0, l2=0.0), input_dim=c))
model.add(Dropout(0.4))

# hidden layers 1-8: widths num_nodes/1, /2, /2, /4, /4, /8, /8, /16,
# each with its own L1/L2 weight penalty and followed by dropout.
for width_divisor in (1, 2, 2, 4, 4, 8, 8, 16):
    model.add(Dense(num_nodes // width_divisor,
                    activation='relu',
                    kernel_regularizer=l1_l2(l1=l1_v, l2=l2_v)))
    model.add(Dropout(dropout_rate))

# output layer: sigmoid, not tanh. binary_crossentropy expects a probability
# in [0, 1]; tanh yields values in [-1, 1], for which the loss is not
# well defined and the 0.5 accuracy threshold is not meaningful.
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               3253504   
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              263168    
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               524800    
_____________________________________

In [None]:
# Training callbacks:
#  - EarlyStopping ends training once the monitored training loss has not
#    improved for 25 consecutive epochs.
#  - ModelCheckpoint writes the model to the mounted drive (h5 format) and,
#    with save_best_only=True, only overwrites it when the loss improves.
# NOTE(review): both monitor the training 'loss', not a validation loss.

best_model_path = '/content/drive/My Drive/TranscriptomicData/best_model_fit2.h5'

early_stopping = EarlyStopping(monitor='loss', patience=25)
best_checkpoint = ModelCheckpoint(filepath=best_model_path, monitor='loss', save_best_only=True)

callbacks = [early_stopping, best_checkpoint]

In [None]:
# Fit the network on the training split with the epoch count, batch size
# and callbacks configured in the cells above.
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
)





Epoch 1/100





Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


<keras.callbacks.History at 0x7f54856a8e80>

In [None]:
# Evaluate the trained model on the held-out test split.
# With metrics=['accuracy'], evaluate() returns [loss, accuracy].

score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score

print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.6700315555412613
Test accuracy: 0.9934131736526947


In [None]:
#plot_model(model)