## Triage Task

### Import pre-trained Word2Vec vector space

In [1]:
import gensim
from gensim.models import KeyedVectors

wv_from_bin = gensim.models.KeyedVectors.load_word2vec_format('wikipedia-pubmed-and-PMC-w2v.bin', binary = True)

### Import training set file

In [24]:
import json
import numpy as np

import json 
import pandas as pd 
from pandas.io.json import json_normalize

with open('PMtask_Triage_TrainingSet.json') as json_file:
    data = json.load(json_file)
    
documents = json_normalize(data['documents'])
documents.head(3)

Unnamed: 0,id,infons.relevant,passages,relations
0,9685346,no,[{'text': 'The molecular basis of Rieger syndr...,[]
1,10364224,no,[{'text': 'Identification of the cell cycle re...,[]
2,10688642,no,[{'text': 'The Est1 subunit of yeast telomeras...,[]


### Unpack the passages column into a standalone dataframe

In [25]:
passages = json_normalize(data = data['documents'], record_path = 'passages', record_prefix = 'passage.', meta = 'id')
passages.head(3)

Unnamed: 0,passage.annotations,passage.infons,passage.offset,passage.relations,passage.sentences,passage.text,id
0,[],{'type': 'title'},0,[],[],The molecular basis of Rieger syndrome. Analys...,9685346
1,[],{'type': 'abstract'},90,[],[],Rieger syndrome is an autosomal-dominant devel...,9685346
2,[],{'type': 'title'},0,[],[],Identification of the cell cycle regulator VCP...,10364224


### Unpack the passage.infons column into a standalone dataframe

In [26]:
types = json_normalize(passages['passage.infons'], meta = 'id')
types.head(3)

Unnamed: 0,type
0,title
1,abstract
2,title


### Merge

In [27]:
documents = documents.merge(passages, on = "id", how = "inner")
documents = documents.merge(types, left_index = True, right_index = True)
documents = documents.drop(columns = ['passages', 'relations', 'passage.annotations', 'passage.infons', 'passage.relations', 'passage.sentences'])
documents['infons.relevant'].replace('no', 0, inplace = True)
documents['infons.relevant'].replace('yes', 1, inplace = True)
documents.name = 'training'
documents.head(4)

Unnamed: 0,id,infons.relevant,passage.offset,passage.text,type
0,9685346,0,0,The molecular basis of Rieger syndrome. Analys...,title
1,9685346,0,90,Rieger syndrome is an autosomal-dominant devel...,abstract
2,10364224,0,0,Identification of the cell cycle regulator VCP...,title
3,10364224,0,134,The human band 4.1-related protein-tyrosine ph...,abstract


### Import test set file

In [28]:
with open('PMtask_Triage_TestSet.json') as json_file:
    data_test = json.load(json_file)
    
documents_test = json_normalize(data_test['documents'])
passages_test = json_normalize(data = data_test['documents'], record_path = 'passages', record_prefix = 'passage.', meta = 'id')
types_test = json_normalize(passages_test['passage.infons'], meta = 'id')

documents_test = documents_test.merge(passages_test, on = "id", how = "inner")
documents_test = documents_test.merge(types_test, left_index = True, right_index = True)
documents_test = documents_test.drop(columns = ['passages', 'relations', 'passage.annotations', 'passage.infons', 'passage.relations', 'passage.sentences'])
documents_test['infons.relevant'].replace('no', 0, inplace = True)
documents_test['infons.relevant'].replace('yes', 1, inplace = True)
documents_test.name = 'test'
documents_test.head(4)

Unnamed: 0,id,infons.relevant,passage.offset,passage.text,type
0,10220326,0,0,High-conductance calcium-activated potassium c...,title
1,10220326,0,121,"In rat brain, high-conductance Ca2+-activated ...",abstract
2,9119005,1,0,Photoaffinity labeling analysis of the interac...,title
3,9119005,1,145,To identify residues and domains of the peptid...,abstract


### Text to word sequence (embedding)

In [29]:
from keras.preprocessing.text import text_to_word_sequence

def vectorize(row, text, embedding_matrix):
    for index, word in enumerate(text):
        try:
            embedding_matrix[row][index] = wv_from_bin.wv.vocab[word].index
        except:
            pass
        
embedding_matrix_train = np.zeros((8162, 3559))
embedding_matrix_test = np.zeros((2854, 3559))

def word_sequence(df):
    df['passage.text'] = df['passage.text'].apply(lambda x: text_to_word_sequence(x, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = False, split=' '))
    for index, row in df.iterrows():
        if(df.name == 'training'):
            vectorize(index, row['passage.text'], embedding_matrix_train)
        if(df.name == 'test'):
            vectorize(index, row['passage.text'], embedding_matrix_test)

### Defining a baseline model

In [30]:
# text_train = documents['passage.text'].values
# text_train_batch= text_train[:1000]
# y_train = documents['infons.relevant'].values
# y_train_batch= y_train[:1000]

# text_test = documents_test['passage.text'].values
# text_test_batch= text_test[:500]
# y_test = documents_test['infons.relevant'].values
# y_test_batch= text_test[:500]

# print(text_train.shape[0])
# print(text_train_batch.shape[0])
# print(text_test.shape[0])
# print(text_test_batch.shape[0])

# max(documents.astype('str').applymap(lambda x: len(x)).max())
# max(documents_test.astype('str').applymap(lambda x: len(x)).max())

# Training data
word_sequence(documents)
X_train = embedding_matrix_train
X_train_batch = X_train[:50]

y_train = documents['infons.relevant'].values
y_train_batch = y_train[:50]

# Test data
word_sequence(documents_test)
X_test = embedding_matrix_test
X_test_batch = X_test[:25]

y_test = documents_test['infons.relevant'].values
y_test_batch = documents_test[:25]

# Check matrix
print(X_train)
print(X_test)

vocab_size = len(np.unique(X_train_batch)) + len(np.unique(X_test_batch))
embedding_dim = 50
maxlen = 3559

  


[[1.40000e+01 3.89000e+02 7.54000e+02 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [1.13185e+05 5.01000e+02 1.50000e+01 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [4.23200e+03 4.00000e+00 1.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 ...
 [1.40000e+01 1.05650e+04 1.81400e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [1.40000e+01 5.06000e+02 4.20000e+01 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [3.08060e+04 1.25800e+03 1.00000e+01 ... 0.00000e+00 0.00000e+00
  0.00000e+00]]
[[1.27200e+03 5.28800e+03 9.95000e+02 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [2.90000e+01 4.31000e+02 3.28000e+02 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [1.01863e+05 3.11000e+03 8.50000e+01 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 ...
 [4.63782e+05 1.50000e+01 1.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [6.76020e+04 1.26400e+03 4.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [1.40000e+01 2.86000e+02 4.00000e+00 ... 0.00000e+00 0.00000e+00
  0.00000e+00]]


### Word embedding

In [31]:
# from keras.preprocessing.text import Tokenizer

# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(text_train_batch)

# X_train = tokenizer.texts_to_sequences(text_train)
# X_train = tokenizer.texts_to_sequences(text_train_batch)
# X_test = tokenizer.texts_to_sequences(text_test)
# X_test = tokenizer.texts_to_sequences(text_test_batch)

# vocab_size = len(tokenizer.word_index) + 1

# from keras.preprocessing.sequence import pad_sequences

# maxlen = 3559

# X_train = pad_sequences(X_train, padding = 'post', maxlen = maxlen)
# X_test = pad_sequences(X_test, padding = 'post', maxlen = maxlen)

# print(X_train)
# print(tokenizer.word_index)

### Keras embedding layer

In [32]:
from keras.models import Sequential
from keras import layers

model = Sequential()
model.add(layers.Embedding(input_dim = vocab_size, 
                           output_dim = embedding_dim, 
                           input_length = maxlen))
# model.add(layers.Flatten())
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 3559, 50)          128750    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 129,271
Trainable params: 129,271
Non-trainable params: 0
_________________________________________________________________


### Model fitting and accuracy

In [36]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

history = model.fit(X_train_batch, y_train_batch,
                    epochs = 20,
                    verbose = True,
                    # validation_data = (X_test_batch, y_test_batch),
                    validation_split = 0.1,
                    batch_size = 50)
loss, accuracy = model.evaluate(X_train_batch, y_train_batch, verbose = False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_batch, y_test_batch, verbose = False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Train on 45 samples, validate on 5 samples
Epoch 1/20


InvalidArgumentError: indices[18,4] = 7512 is not in [0, 2575)
	 [[{{node embedding_4/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@training_3/Adam/Assign_2"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_4/embeddings/read, embedding_4/Cast, training_3/Adam/gradients/embedding_4/embedding_lookup_grad/concat/axis)]]

In [22]:
loss, accuracy = model.evaluate(X_train, y_train_batch, verbose = False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test_batch, verbose = False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Training Accuracy: 0.9980


ValueError: could not convert string to float: 'High-conductance calcium-activated potassium channels in rat brain: pharmacology, distribution, and subunit composition.'

In [69]:
np.set_printoptions(threshold=1000)
print(X_test)

[[ 340 4593  993 ...    0    0    0]
 [   4 1060  442 ...    0    0    0]
 [4018  185    2 ...    0    0    0]
 ...
 [ 814 1529   84 ...    0    0    0]
 [  47 1426  735 ...    0    0    0]
 [ 340 3527   47 ...    0    0    0]]
