In [2]:
import keras
import json
import numpy as np
import keras_tqdm
import re

with open('biorxiv_scraped.json', 'r') as fp:
    articles = json.load(fp)


In [3]:
print(articles[0])

{'title': 'Ludicrous Speed Linear Mixed Models for Genome-Wide Association Studies', 'url': 'https://www.biorxiv.org/content/early/2018/01/03/154682', 'abstract': 'We have developed Ludicrous Speed Linear Mixed Models, a version of FaST-LMM optimized for the cloud. The approach can perform a genome-wide association analysis on a dataset of one million SNPs across one million individuals at a cost of about 868 CPU days with an elapsed time on the order of two weeks.', 'tag': 'Bioinformatics'}


In [2]:
word_blob = []
for a in articles:
    sentences = a['abstract'].lower() + " " + a['title'].lower()
    words = sentences.replace('(','').replace(')','').replace(':','').split(' ')
    processed_words = [w if not w.endswith('.') else w.replace('.','') for w in words if 'https' not in w]
    word_blob.extend(processed_words)

vocab = list(set(word_blob))[1:]
vocab.insert(0, '<TOK>')
vocab.insert(0, '<PAD>')
no_nums_vocab = [w for w in vocab if w.isalpha()]
no_nums_vocab.insert(0, '<NUM>')
no_nums_vocab.insert(0, '<SYM>')
no_nums_vocab.insert(0, '<TOK>')
no_nums_vocab.insert(0, '<PAD>')

numchecker = re.compile("^[0-9][0-9|\.]+$")
punc_checker = re.compile("[^\w\s]|_+$")
freqs = {}
for w in word_blob:
    if w.isalpha():
        freqs[w] = freqs.get(w, 0) + 1
    elif numchecker.match(w):
        freqs['<NUM>'] = freqs.get('<NUM>', 0) + 1
    elif punc_checker.match(w):
        freqs['<SYM>'] = freqs.get('<SYM>', 0) +1
    else:
        freqs['<TOK>'] = freqs.get('<TOK>', 0) + 1

freqslist = [(i,j) for i,j in freqs.items()]
freqslist = sorted(freqslist, key=lambda x:x[1], reverse=True)
no_nums_vocab = [f[0] for f in freqslist]
no_nums_vocab.insert(0, '<PAD>')
word_idx = dict((c, i) for i, c in enumerate(no_nums_vocab))
print(len(no_nums_vocab))

max_lab_len = 0
max_src_len = 0
final_articles = []
for a in articles:
    title = [w.replace('.', '') if w.endswith('.') else w for w in a['title'].lower().replace(')','').replace('(','').replace(':','').split(' ')]
    tmp_title = []
    for w in title:
        if w.isalpha():
            tmp_title.append(w)
        elif numchecker.match(w):
            tmp_title.append('<NUM>')
        elif punc_checker.match(w):
            tmp_title.append('<SYM>')
        else:
            tmp_title.append('<TOK>')
    title = tmp_title
    abstract = [w.replace('.', '') if w.endswith('.') else w for w in a['abstract'].lower().replace(')','').replace('(','').replace(':','').split(' ')]
    tmp_abs = []
    for w in abstract:
        if w.isalpha():
            tmp_abs.append(w)
        elif numchecker.match(w):
            tmp_abs.append('<NUM>')
        elif punc_checker.match(w):
            tmp_abs.append('<SYM>')
        else:
            tmp_abs.append('<TOK>')
    abstract = tmp_abs        
    final_articles.append({'lab': title, 'src': abstract})
    if len(title) > max_lab_len:
        max_lab_len = len(title)
    if len(abstract) > max_src_len:
        max_src_len = len(abstract)

for a in final_articles:
    while len(a['src']) < max_src_len:
        a['src'].append('<PAD>')
    while len(a['lab']) < max_lab_len:
        a['lab'].append('<PAD>')

print(max_lab_len, max_src_len)

15292
29 553


In [3]:
def vectorize(data, word_idx):
    X = []; Y = []
    
    for w in data['src']:
    
        try:
            x = word_idx[w]
        except:
            print('missing word... ', w)
            x = '<TOK>'
        X.append(x)

    for w in data['lab']:
        try:
            y = word_idx[w]
        except:
            print('missing word... ', w)
            y = '<TOK>'
        Y.append(y)

    return (X, Y)

train_articles = final_articles[:1988]
test_articles = final_articles[1988:]
#print(len(train) + len(test))
train_inputs = []
train_labels = []
train_next_word = []
test_inputs = []
test_labels = []
test_next_word = []

for a in train_articles:
    X, Y = vectorize(a, word_idx)
    
    for j in range(max_lab_len):
        if Y[j] == 0:
            break
        train_inputs.append(X)
        Y_one_hot = [0 for k in range(max_lab_len)]
        if j > 0:
            n = 0
            while n != j +1:
                Y_one_hot[j-n] = Y[j-n]
                n = n + 1
        train_labels.append(Y_one_hot)
        next_word = np.zeros(len(no_nums_vocab), dtype=np.uint8)
        next_word[Y[j]] = 1
        train_next_word.append(next_word)

for a in test_articles:
    X, Y = vectorize(a, word_idx)    
    for j in range(max_lab_len):
        if Y[j] == 0:
            break
        test_inputs.append(X)
        Y_one_hot = [0 for k in range(max_lab_len)]
        if j > 0:
            n = 0
            while n != j +1:
                Y_one_hot[j-n] = Y[j-n]
                n = n + 1
        test_labels.append(Y_one_hot)
        next_word = np.zeros(len(no_nums_vocab), dtype=np.uint8)
        next_word[Y[j]] = 1
        test_next_word.append(next_word)

train_inputs = np.array(train_inputs)
train_labels = np.array(train_labels)

test_inputs = np.array(test_inputs)
test_labels = np.array(test_labels)
print(train_inputs.shape, max_src_len)

missing word...  https
(22225, 553) 553


In [16]:
from keras.layers import *


dim = 240

def get_rnn(return_sequences= True): 
    return CuDNNLSTM(dim, return_sequences=return_sequences)



In [17]:
vocab_size = len(no_nums_vocab)
print(vocab_size)
src_txt_length = max_src_len
sum_txt_length = max_lab_len

inp = Input((src_txt_length,))
x = Embedding(vocab_size, 128)(inp)

x = Bidirectional(get_rnn())(x)
x = get_rnn(False)(x)

x = RepeatVector(max_lab_len)(x)
x = get_rnn()(x)
x = get_rnn()(x)
x = TimeDistributed(Dense(vocab_size, activation='softmax'))(x)



15292


In [18]:
model = keras.Model(inp, x)

model.compile(keras.optimizers.Adam(), 'sparse_categorical_crossentropy', metrics=['acc'])

hist=model.fit(train_inputs, np.expand_dims(train_labels,-1), 
          validation_data=[test_inputs, np.expand_dims(test_labels,-1)], 
          batch_size=64, nb_epoch=5)


  import sys


Train on 22225 samples, validate on 5379 samples
Epoch 1/5


InvalidArgumentError: Node 'IsVariableInitialized_315': Unknown input node 'embedding_1/embeddings'

In [13]:
def lstmfn(seq):
    return keras.layers.CuDNNLSTM(128, return_sequences=seq)

vocab_size = len(no_nums_vocab)
print(vocab_size)
src_txt_length = max_src_len
sum_txt_length = max_lab_len
# source text input model
inputs1 = keras.Input(shape=(src_txt_length,))
am1 = keras.layers.Embedding(vocab_size, 128)(inputs1)
am2 = keras.layers.Bidirectional(lstmfn(True))(am1)
am3 = lstmfn(False)(am2)
# summary input model
#inputs2 = keras.Input(shape=(sum_txt_length,))
#sm1 = keras.layers.Embedding(vocab_size, 128)(inputs2)
#sm2 = keras.layers.Bidirectional(lstmfn(True))(sm1)
#sm3 = lstmfn(False)(sm2)
# decoder output model
decoder1 = am3#concatenate([am3, sm3])
decoder2 = RepeatVector(max_lab_len)(decoder1)
decoder3 = lstmfn(True)(decoder2)
outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder3)
#print (outputs.shape)
# tie it together [article, summary] [word]
model = keras.Model(inputs=inputs1, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

15292


In [14]:
print(train_inputs.shape)

(22225, 553)


In [15]:
keras.backend.set_value(model.optimizer.lr, 1e-3)
parms = {'verbose': 1, 'callbacks': [keras_tqdm.TQDMNotebookCallback(leave_inner=False)]}
hist=model.fit(train_inputs, np.array(train_next_word), **parms, epochs=2, batch_size=32)

keras.backend.set_value(model.optimizer.lr, 1e-2)
parms = {'verbose': 1, 'callbacks': [keras_tqdm.TQDMNotebookCallback(leave_inner=False)]}
hist=model.fit(train_inputs, np.array(train_next_word), **parms, epochs=3, batch_size=32)

InvalidArgumentError: Node 'IsVariableInitialized_177': Unknown input node 'embedding_1/embeddings'

In [13]:
word_idx_r = {v:k for k,v in word_idx.items()}

In [25]:
title = np.zeros((1,max_lab_len))
pred = np.argmax(model.predict(train_inputs[0:1]), 1).flatten()
[word_idx_r[i] for i in pred]
pred.shape

(15292,)

In [None]:
print(test_labels[18])

In [None]:
train_articles[0]