## Triage Task

### Import training set file

In [46]:
import json
import numpy as np

import json 
import pandas as pd 
from pandas.io.json import json_normalize

with open('PMtask_Triage_TrainingSet.json') as json_file:
    data = json.load(json_file)
    
documents = json_normalize(data['documents'])
documents.head(3)

Unnamed: 0,id,infons.relevant,passages,relations
0,9685346,no,[{'text': 'The molecular basis of Rieger syndr...,[]
1,10364224,no,[{'text': 'Identification of the cell cycle re...,[]
2,10688642,no,[{'text': 'The Est1 subunit of yeast telomeras...,[]


### Unpack the passages column into a standalone dataframe

In [33]:
passages = json_normalize(data = data['documents'], record_path = 'passages', record_prefix = 'passage.', meta = 'id')
passages.head(3)

Unnamed: 0,passage.annotations,passage.infons,passage.offset,passage.relations,passage.sentences,passage.text,id
0,[],{'type': 'title'},0,[],[],The molecular basis of Rieger syndrome. Analys...,9685346
1,[],{'type': 'abstract'},90,[],[],Rieger syndrome is an autosomal-dominant devel...,9685346
2,[],{'type': 'title'},0,[],[],Identification of the cell cycle regulator VCP...,10364224


### Unpack the passage.infons column into a standalone dataframe

In [34]:
types = json_normalize(passages['passage.infons'], meta = 'id')
types.head(3)

Unnamed: 0,type
0,title
1,abstract
2,title


### Merge

In [35]:
documents = documents.merge(passages, on = "id", how = "inner")
documents = documents.merge(types, left_index = True, right_index = True)
documents = documents.drop(columns = ['passages', 'relations', 'passage.annotations', 'passage.infons', 'passage.relations', 'passage.sentences'])
documents['infons.relevant'].replace('no', 0, inplace = True)
documents['infons.relevant'].replace('yes', 1, inplace = True)
documents.head(4)

Unnamed: 0,id,infons.relevant,passage.offset,passage.text,type
0,9685346,0,0,The molecular basis of Rieger syndrome. Analys...,title
1,9685346,0,90,Rieger syndrome is an autosomal-dominant devel...,abstract
2,10364224,0,0,Identification of the cell cycle regulator VCP...,title
3,10364224,0,134,The human band 4.1-related protein-tyrosine ph...,abstract


### Import test set file

In [36]:
with open('PMtask_Triage_TestSet.json') as json_file:
    data_test = json.load(json_file)
    
documents_test = json_normalize(data_test['documents'])
passages_test = json_normalize(data = data_test['documents'], record_path = 'passages', record_prefix = 'passage.', meta = 'id')
types_test = json_normalize(passages_test['passage.infons'], meta = 'id')

documents_test = documents_test.merge(passages_test, on = "id", how = "inner")
documents_test = documents_test.merge(types_test, left_index = True, right_index = True)
documents_test = documents_test.drop(columns = ['passages', 'relations', 'passage.annotations', 'passage.infons', 'passage.relations', 'passage.sentences'])
documents_test['infons.relevant'].replace('no', 0, inplace = True)
documents_test['infons.relevant'].replace('yes', 1, inplace = True)
documents_test.head(4)

Unnamed: 0,id,infons.relevant,passage.offset,passage.text,type
0,10220326,0,0,High-conductance calcium-activated potassium c...,title
1,10220326,0,121,"In rat brain, high-conductance Ca2+-activated ...",abstract
2,9119005,1,0,Photoaffinity labeling analysis of the interac...,title
3,9119005,1,145,To identify residues and domains of the peptid...,abstract


### Defining a baseline model

In [37]:
text_train = documents['passage.text'].values
text_train_batch= text_train[:1000]
y_train = documents['infons.relevant'].values
y_train_batch= y_train[:1000]

text_test = documents_test['passage.text'].values
text_test_batch= text_test[:500]
y_test = documents_test['infons.relevant'].values
y_test_batch= text_test[:500]

print(text_train.shape[0])
print(text_train_batch.shape[0])
print(text_test.shape[0])
print(text_test_batch.shape[0])
#text_train['passage.text'].map(len).max()
max(documents.astype('str').applymap(lambda x: len(x)).max())

8162
1000
2854
500


3307

### Word embedding

In [66]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train_batch)

# X_train = tokenizer.texts_to_sequences(text_train)
X_train = tokenizer.texts_to_sequences(text_train_batch)
# X_test = tokenizer.texts_to_sequences(text_test)
X_test = tokenizer.texts_to_sequences(text_test_batch)

vocab_size = len(tokenizer.word_index) + 1

from keras.preprocessing.sequence import pad_sequences

maxlen = 3307

X_train = pad_sequences(X_train, padding = 'post', maxlen = maxlen)
X_test = pad_sequences(X_test, padding = 'post', maxlen = maxlen)

print(tokenizer.word_index)



### Keras embedding layer

In [41]:
from keras.models import Sequential
from keras import layers

embedding_dim = 1000

model = Sequential()
model.add(layers.Embedding(input_dim = vocab_size, 
                           output_dim = embedding_dim, 
                           input_length = maxlen))
# model.add(layers.Flatten())
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 3307, 1000)        9309000   
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 1000)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                10010     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 11        
Total params: 9,319,021
Trainable params: 9,319,021
Non-trainable params: 0
_________________________________________________________________


### Model fitting and accuracy

In [71]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

history = model.fit(X_train, y_train_batch,
                    epochs = 20,
                    verbose = True,
#                    validation_data = (X_test, y_test_batch),
                    validation_split = 0.1,
                    batch_size = 100)
loss, accuracy = model.evaluate(X_train, y_train_batch, verbose = False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test_batch, verbose = False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Train on 900 samples, validate on 100 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


ValueError: Input arrays should have the same number of samples as target arrays. Found 1000 input samples and 8162 target samples.

In [72]:
loss, accuracy = model.evaluate(X_train, y_train_batch, verbose = False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test_batch, verbose = False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Training Accuracy: 1.0000


ValueError: could not convert string to float: 'High-conductance calcium-activated potassium channels in rat brain: pharmacology, distribution, and subunit composition.'

In [69]:
np.set_printoptions(threshold=1000)
print(X_test)

[[ 340 4593  993 ...    0    0    0]
 [   4 1060  442 ...    0    0    0]
 [4018  185    2 ...    0    0    0]
 ...
 [ 814 1529   84 ...    0    0    0]
 [  47 1426  735 ...    0    0    0]
 [ 340 3527   47 ...    0    0    0]]
