In [12]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy
import random

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator



SEED = 42
EMBED_SIZE=100
random.seed(SEED)
np.random.seed(SEED)
tt.manual_seed(SEED);

In [2]:
trial=pd.read_table('labeledEligibilitySample1000000.csv', header=None)

  """Entry point for launching an IPython kernel.


In [3]:
trial

Unnamed: 0,0,1
0,__label__0,study interventions are recombinant CD40-ligan...
1,__label__0,study interventions are Liposomal doxorubicin ...
2,__label__0,study interventions are BI 836909 . multiple m...
3,__label__0,study interventions are Immunoglobulins . recu...
4,__label__0,study interventions are Paclitaxel . stage ova...
5,__label__0,"study interventions are Antibodies, Monoclonal..."
6,__label__0,study interventions are Hormones . prostate ca...
7,__label__0,study interventions are Bendamustine Hydrochlo...
8,__label__0,study interventions are Nivolumab . recovered ...
9,__label__0,study interventions are Thalidomide . kidney c...


In [4]:
import spacy


spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

def tokenizer(text): # create a tokenizer function
    return [tok.lemma_ for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]            

In [5]:
classes={
    '__label__0':0,
    '__label__1':1
}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english')
            )
LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

dataset = TabularDataset('labeledEligibilitySample1000000.csv', format='tsv', 
                         fields=[('label', LABEL),('text', TEXT)], 
                         skip_header=True)

In [6]:
TEXT.build_vocab(dataset, min_freq=5)
LABEL.build_vocab(dataset)

In [7]:
train, test = dataset.split(0.8)
train, valid = train.split(0.8, stratified = True)

In [8]:
wv_from_bin = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [14]:
matrix_len = len(TEXT.vocab.itos)
weights_matrix = np.zeros((matrix_len, EMBED_SIZE))
words_found = 0

for i, word in enumerate(TEXT.vocab.itos):
    try: 
        weights_matrix[i] = wv_from_bin[word][:100]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(EMBED_SIZE, ))
weights = tt.FloatTensor(weights_matrix)

In [27]:
class RNNClassifier(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(weights)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2 *2, 2)
        
    def forward(self, batch):
        x, x_lengths = batch.text
        
        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
            
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [28]:
batch_size = 64

model = RNNClassifier(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=32,
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [29]:
from tqdm import tqdm_notebook

def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [30]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=1, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=10000, style=ProgressStyle(description_width='i…

validation loss 0.36220


In [43]:
def roc_auc_compute_fn(y_preds, y_targets):
    from sklearn.metrics import roc_auc_score

    y_true = y_targets.numpy()
    y_pred = y_preds.numpy()
    return roc_auc_score(y_true, np.argmax(y_pred, axis=1))

In [45]:
model.eval()
epoch_loss = 0

n_batches = len(test_iterator)
with tt.no_grad():
    for batch in test_iterator:
        pred = model(batch)
        loss = roc_auc_compute_fn(pred, batch.label)
        epoch_loss += loss

print(epoch_loss / n_batches,': Test rocauc')

0.8410504589055244 : Test rocauc
