# Personalized Cancer Diagnosis using Deep Learning

In [1]:
import re
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.layers import Dense,LSTM,SimpleRNN,GRU
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Load the variant tables (ID / Gene / Variation, plus Class for the training
# file) and preview the first rows of each.
train_variants_path = '/content/drive/My Drive/data/personalized medicine/training_variants'
test_variants_path = '/content/drive/My Drive/data/personalized medicine/test_variants'

train_variants = pd.read_csv(train_variants_path)
test_variants = pd.read_csv(test_variants_path)

for banner, frame in (('Train variants', train_variants), ('Test variants', test_variants)):
    print('-'*10, banner, '-'*10)
    print(frame.head())

---------- Train variants ----------
   ID    Gene             Variation  Class
0   0  FAM58A  Truncating Mutations      1
1   1     CBL                 W802*      2
2   2     CBL                 Q249E      2
3   3     CBL                 N454D      3
4   4     CBL                 L399V      4
---------- Test variants ----------
   ID     Gene Variation
0   0    ACSL4     R570S
1   1    NAGLU     P521L
2   2      PAH     L333F
3   3     ING1     A148D
4   4  TMEM216      G77A


In [3]:
# The text files separate the ID from the article body with a literal "||".
# A multi-character separator is interpreted by pandas as a regular expression,
# which only the Python parsing engine supports; requesting that engine
# explicitly avoids the ParserWarning about falling back from the C engine.
# The pattern is a raw string so that `\|` is not treated as a (invalid)
# Python string escape.
text_read_options = dict(
    sep=r'\|\|',
    engine='python',
    header=None,
    skiprows=1,          # first line is a header row, replaced by `names`
    names=["ID","Text"],
)

print('-'*10,'Train text','-'*10)
train_text_path = '/content/drive/My Drive/data/personalized medicine/training_text'
train_text = pd.read_csv(train_text_path, **text_read_options)
print(train_text.head())
print('-'*10,'Test text','-'*10)
test_text_path = '/content/drive/My Drive/data/personalized medicine/test_text'
test_text = pd.read_csv(test_text_path, **text_read_options)
print(test_text.head())

---------- Train text ----------


  This is separate from the ipykernel package so we can avoid doing imports until


   ID                                               Text
0   0  Cyclin-dependent kinases (CDKs) regulate a var...
1   1   Abstract Background  Non-small cell lung canc...
2   2   Abstract Background  Non-small cell lung canc...
3   3  Recent evidence has demonstrated that acquired...
4   4  Oncogenic mutations in the monomeric Casitas B...
---------- Test text ----------


  import sys


   ID                                               Text
0   0  2. This mutation resulted in a myeloproliferat...
1   1   Abstract The Large Tumor Suppressor 1 (LATS1)...
2   2  Vascular endothelial growth factor receptor (V...
3   3  Inflammatory myofibroblastic tumor (IMT) is a ...
4   4   Abstract Retinoblastoma is a pediatric retina...


In [4]:
# Attach each variant's article text by ID. A left join keeps every variant
# row even when no matching text row exists (its Text is then NaN).
train_data = train_variants.merge(train_text, how='left', on='ID')
train_data.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [7]:
# Features are the raw article texts; targets come from the Class column.
x = train_data['Text']
labels = train_data['Class']

# One-hot encode the targets for categorical_crossentropy. The dtype is given
# explicitly so the result does not depend on the pandas version's default
# dummy dtype.
y = pd.get_dummies(labels, dtype='float32').values

# 80/20 hold-out split, stratified on the class labels so both parts keep the
# same class proportions. `train_test_split` is imported in the top import cell.
x_train,x_test,y_train,y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=labels)

In [8]:
vocab_size = 2000
max_len=1000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(str(x_train))
x_train = tokenizer.texts_to_sequences(str(x_train))
x_train = pad_sequences(x_train,maxlen=max_len)
tokenizer.fit_on_texts(str(x_test))
x_test = tokenizer.texts_to_sequences(str(x_test))
x_test = pad_sequences(x_test,maxlen=max_len)

In [9]:
# size of input layer
input_dim = len(x_train[0])

model = Sequential()
model.add(Dense(32, activation='relu', input_dim=input_dim))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', input_dim=input_dim))
model.add(Dropout(0.5))
model.add(Dense(9, activation='sigmoid'))

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                32032     
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 297       
Total params: 33,385
Trainable params: 33,385
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [11]:
callback = EarlyStopping(monitor='loss',patience=5)
model.fit(x_train,y_train,epochs=30,batch_size=32,validation_split=0.2,callbacks=callback)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30


<tensorflow.python.keras.callbacks.History at 0x7f2dc007c780>

In [27]:
test_x = x_test[11:]
test_x.shape

(665, 1000)

In [30]:
# Score the current model on the held-out split with multi-class log loss.
# `log_loss` is imported in the top import cell.
probas = model.predict(test_x)
# 0-based index of the most probable class for each row (kept for inspection).
pred_indices = np.argmax(probas,axis=1)
# Class ids are 1..9; column k of `probas` / `y_test` corresponds to classes[k].
classes = np.array(range(1,10))
true_labels = classes[np.argmax(y_test, axis=1)]
# Passing `labels` ties the probability columns to classes 1..9 explicitly,
# instead of relying on every class being present in the held-out labels.
print('Log loss: {}'.format(log_loss(true_labels, probas, labels=classes)))

Log loss: 1.8796060477880607


In [38]:
embed_dim = 128
lstm_out = 196
# Model saving callback: writes the model to 'keras_model' whenever the
# validation loss improves, so the best epoch can be reloaded afterwards.
checkpt_callback = ModelCheckpoint('keras_model', 
                                 monitor='val_loss', 
                                 verbose=1, 
                                 save_best_only=True, 
                                 mode='auto')

model = Sequential()
# Sequence length is the padding length used by pad_sequences (max_len),
# taken directly rather than via a variable set in the Dense-model cell.
model.add(Embedding(vocab_size, embed_dim, input_length = max_len))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(lstm_out, recurrent_dropout=0.2, dropout=0.2,return_sequences=True))
model.add(LSTM(64))
model.add(Dense(9,activation='softmax'))
# Report accuracy (as the Dense model does); the cross-entropy value is
# already reported as the loss.
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 128)         256000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 1000, 196)         254800    
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                66816     
_________________________________________________________________
dense_4 (Dense)              (None, 9)                 585       
Total params: 578,201
Trainable params: 578,201
Non-trainable params: 0
_________________________________________________________________
None


In [42]:
# Train the LSTM model; early stopping and best-model checkpointing are both
# active, with 20% of the training rows held out for validation.
lstm_callbacks = [callback, checkpt_callback]
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=lstm_callbacks,
)

Epoch 1/10
Epoch 00001: val_loss improved from inf to 1.80193, saving model to keras_model
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: keras_model/assets
Epoch 2/10
Epoch 00002: val_loss did not improve from 1.80193
Epoch 3/10
Epoch 00003: val_loss did not improve from 1.80193
Epoch 4/10
Epoch 00004: val_loss did not improve from 1.80193
Epoch 5/10
Epoch 00005: val_loss did not improve from 1.80193
Epoch 6/10
Epoch 00006: val_loss did not improve from 1.80193
Epoch 7/10
Epoch 00007: val_loss did not improve from 1.80193
Epoch 8/10
Epoch 00008: val_loss did not improve from 1.80193
Epoch 9/10
Epoch 00009: val_loss did not improve from 1.80193
Epoch 10/10
Epoch 00010: val_loss did not improve from 1.80193


<tensorflow.python.keras.callbacks.History at 0x7f2d6e3cbb38>

In [44]:
# Reload the model saved by the checkpoint callback and score it on the
# held-out split with multi-class log loss.
model = tf.keras.models.load_model('keras_model')
probas = model.predict(test_x)
# 0-based index of the most probable class for each row (kept for inspection).
pred_indices = np.argmax(probas,axis=1)
# Class ids are 1..9; column k of `probas` / `y_test` corresponds to classes[k].
classes = np.array(range(1,10))
true_labels = classes[np.argmax(y_test, axis=1)]
# Passing `labels` ties the probability columns to classes 1..9 explicitly,
# instead of relying on every class being present in the held-out labels.
print('Log loss: {}'.format(log_loss(true_labels, probas, labels=classes)))

Log loss: 1.8477547545182078
