# MLP, LSTM, and RoBERTa for RQE

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as ET
import nltk
import re

In [0]:
# Mount Drive containing train and test XML files
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Parse Train and Text XML files
def parse_XML(xml_file, df_cols): 
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    The first element of df_cols is supposed to be the identifier 
    variable, which is an attribute of each node element in the 
    XML data; other features will be parsed from the text content 
    of each sub-element. 
    """
    
    xtree = ET.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    
    for node in xroot: 
        res = []
        res.append(node.attrib.get(df_cols[0]))
        res.append(node.attrib.get(df_cols[1]))
        res.append(node.attrib.get(df_cols[2]))
        for el in df_cols[3:]: 
            if node is not None and node.find(el) is not None:
                res.append(node.find(el).text)
            else: 
                res.append(None)
        rows.append({df_cols[i]: res[i] 
                     for i, _ in enumerate(df_cols)})
    
    out_df = pd.DataFrame(rows, columns=df_cols)
        
    return out_df

train = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/RQE_Train_8588_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
val = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/RQE_Test_302_pairs_AMIA2016.xml', ['pid', 'type', 'value', 'chq', 'faq'])
test = parse_XML('/content/drive/My Drive/Colab Notebooks/Applied NLP/Final Notebooks/MEDIQA2019-Task2-RQE-TestSet-wLabels.xml', ['pid', 'type', 'value', 'chq', 'faq'])
train.head()

Unnamed: 0,pid,type,value,chq,faq
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...


In [0]:
# Binarize outcome variable
train['outcome'] = np.where(train['value'] == 'true', 1, 0)
val['outcome'] = np.where(val['value'] == 'true', 1, 0)
test['outcome'] = np.where(test['value'] == 'true', 1, 0)
train.head()

Unnamed: 0,pid,type,value,chq,faq,outcome
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...,1
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...,1
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...,0
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...,1
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...,0


In [0]:
# drop unnecessary variables
train.drop(columns = ['pid', 'type', 'value'], inplace=True)
val.drop(columns = ['pid', 'type', 'value'], inplace=True)
test.drop(columns = ['pid', 'type', 'value'], inplace=True)
train.head()

Unnamed: 0,chq,faq,outcome
0,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...,1
1,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...,1
2,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...,0
3,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...,1
4,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...,0


## MLP

NOTE: These models take some time to run, even on a GPU.

In [0]:
# Import sklearn wrapper for BERT modeling
!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .
import os
os.chdir("bert-sklearn")
print(os.listdir())

Cloning into 'bert-sklearn'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 259 (delta 3), reused 2 (delta 0), pack-reused 247[K
Receiving objects: 100% (259/259), 519.36 KiB | 1.31 MiB/s, done.
Resolving deltas: 100% (125/125), done.
Processing /content/bert-sklearn
Building wheels for collected packages: bert-sklearn
  Building wheel for bert-sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for bert-sklearn: filename=bert_sklearn-0.3.1-cp36-none-any.whl size=54234 sha256=9d818018b4f0bd5c5f0fa0b59bcf781b03b142933f9b10bbe1154311a7a7fa6d
  Stored in directory: /root/.cache/pip/wheels/61/95/c6/5790aae8fb377f5ff356dbe58205aab28858595d6bff8197d0
Successfully built bert-sklearn
Installing collected packages: bert-sklearn
Successfully installed bert-sklearn-0.3.1
['setup.py', 'README.md', '.git', 'bert_sklearn', 'other_examples', 'tests', 'Options.md', 'demo_tuning_hyperpar

In [0]:
import os
import math
import random
import csv
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model

# BERT
model = BertClassifier(max_seq_length=64, train_batch_size=16)
model.num_mlp_layers = 3
model

Building sklearn text classifier...


BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
               bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=None, learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=64, num_mlp_hiddens=500, num_mlp_layers=3,
               random_state=42, restore_file=None, train_batch_size=16,
               use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)

In [0]:
# Set X_train, y_train, X_test, y_test
y_train = train['outcome']
X_train = train.drop(columns = ['pid', 'type', 'value', 'outcome'])

y_test = test['outcome']
X_test = test.drop(columns = ['pid', 'type', 'value', 'outcome'])

X_train

Unnamed: 0,chq,faq
0,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...
1,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...
2,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...
3,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...
4,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...
...,...,...
8583,\n What is the dose for this drip?\n,\n What is the dose for this drip?\n
8584,\n What is the dose for this drip?\n,\n Patient got sore breasts from Premarin 0....
8585,\n How is a heart transplant done? What are ...,\n How is a heart transplant done?\n
8586,\n How is a heart transplant done? What are ...,\n How should I treat polymenorrhea in a 14-...


In [0]:
# Run 
scores_BERT = []; 
for seed in [4, 27, 33]:
    model.random_state = seed
    model.fit(X_train, y_train)
    scores_BERT.append(model.score(X_test, y_test))

100%|██████████| 231508/231508 [00:00<00:00, 2748243.61B/s]


Loading bert-base-uncased model...


100%|██████████| 440473133/440473133 [00:09<00:00, 47585255.46B/s]
100%|██████████| 313/313 [00:00<00:00, 127942.42B/s]


Using mlp with D=768,H=500,K=2,n=3
Loading Pytorch checkpoint

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0964, Val loss: 0.0279, Val accy: 99.07%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0214, Val loss: 0.0235, Val accy: 99.18%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0080, Val loss: 0.0277, Val accy: 99.07%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.2952, Accuracy: 77.81%
Loading bert-base-uncased model...
Using mlp with D=768,H=500,K=2,n=3
Loading Pytorch checkpoint

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0971, Val loss: 0.0292, Val accy: 99.18%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0152, Val loss: 0.0174, Val accy: 99.77%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0068, Val loss: 0.0166, Val accy: 99.77%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.2305, Accuracy: 77.81%
Loading bert-base-uncased model...
Using mlp with D=768,H=500,K=2,n=3
Loading Pytorch checkpoint

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0972, Val loss: 0.0363, Val accy: 98.83%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0159, Val loss: 0.0413, Val accy: 98.95%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0063, Val loss: 0.0384, Val accy: 99.18%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.1344, Accuracy: 77.48%


In [0]:
scores_BERT

[77.81456953642383, 77.81456953642383, 77.48344370860927]

In [0]:
y_pred = model.predict(X_test)
print((X_test[y_pred == y_test].shape[0])/X_test.shape[0])

HBox(children=(IntProgress(value=0, description='Predicting', max=38, style=ProgressStyle(description_width='i…


0.7748344370860927


In [0]:
# Average
sum(scores_BERT)/3

77.70419426048564

SciBERT

In [0]:
# SciBERT
model_scibert = BertClassifier(max_seq_length=64, train_batch_size=16)
model_scibert.num_mlp_layers = 3
model_scibert.bert_model = 'scibert-scivocab-uncased'
model_scibert

Building sklearn text classifier...


BertClassifier(bert_config_json=None, bert_model='scibert-scivocab-uncased',
               bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=None, learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=64, num_mlp_hiddens=500, num_mlp_layers=3,
               random_state=42, restore_file=None, train_batch_size=16,
               use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)

In [0]:
# Run 
scores_sciBERT = []; 
for seed in [4, 27, 33]:
    model_scibert.random_state = seed
    model_scibert.fit(X_train, y_train)
    scores_sciBERT.append(model_scibert.score(X_test_actual, y_test_actual))

100%|██████████| 410593280/410593280 [00:08<00:00, 47592247.64B/s]


Loading scibert-scivocab-uncased model...


100%|██████████| 410593280/410593280 [00:05<00:00, 70971087.36B/s]


Using mlp with D=768,H=500,K=2,n=3
Loading Pytorch checkpoint

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0829, Val loss: 0.0141, Val accy: 99.53%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0137, Val loss: 0.0089, Val accy: 99.88%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0060, Val loss: 0.0068, Val accy: 99.88%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.1166, Accuracy: 79.47%
Loading scibert-scivocab-uncased model...
Using mlp with D=768,H=500,K=2,n=3
Loading Pytorch checkpoint

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0796, Val loss: 0.0091, Val accy: 99.77%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0128, Val loss: 0.0043, Val accy: 100.00%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0076, Val loss: 0.0026, Val accy: 100.00%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.1957, Accuracy: 77.81%
Loading scibert-scivocab-uncased model...
Using mlp with D=768,H=500,K=2,n=3
Loading Pytorch checkpoint

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0873, Val loss: 0.0151, Val accy: 99.65%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0125, Val loss: 0.0122, Val accy: 99.77%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0062, Val loss: 0.0148, Val accy: 99.77%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.0863, Accuracy: 79.47%


In [0]:
scores_sciBERT

[79.47019867549669, 77.81456953642383, 79.47019867549669]

In [0]:
y_pred_scibert = model_scibert.predict(X_test)
print((X_test[y_pred_scibert == y_test].shape[0])/X_test.shape[0])

HBox(children=(IntProgress(value=0, description='Predicting', max=38, style=ProgressStyle(description_width='i…


0.7947019867549668


In [0]:
# Average
sum(scores_sciBERT)/3

78.91832229580574

In [0]:
target_names = ['negative', 'positive']
print(classification_report(y_test, y_pred_scibert, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.95      0.68      0.79       173
    positive       0.69      0.95      0.80       129

    accuracy                           0.79       302
   macro avg       0.82      0.81      0.79       302
weighted avg       0.84      0.79      0.79       302



In [0]:
# Save model
savefile = 'scibert_model.bin'
model_scibert.save(savefile)

BioBERT

In [0]:
# BioBERT
model_biobert = BertClassifier(max_seq_length=64, train_batch_size=16)
model_biobert.num_mlp_layers = 3
model_biobert.bert_model = 'biobert-base-cased' 
model_biobert

Building sklearn text classifier...


BertClassifier(bert_config_json=None, bert_model='biobert-base-cased',
               bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=None, learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=64, num_mlp_hiddens=500, num_mlp_layers=3,
               random_state=42, restore_file=None, train_batch_size=16,
               use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)

In [0]:
# Run
scores_bioBERT = []; 
for seed in [4, 27, 33]:
    model_biobert.random_state = seed
    model_biobert.fit(X_train, y_train)
    scores_bioBERT.append(model_biobert.score(X_test, y_test))

100%|██████████| 401403346/401403346 [00:08<00:00, 48557675.73B/s]


Loading biobert-base-cased model...


100%|██████████| 401403346/401403346 [00:08<00:00, 49031832.77B/s]


Using mlp with D=768,H=500,K=2,n=3


Loading Tensorflow checkpoint from  model.ckpt-1000000

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0872, Val loss: 0.0171, Val accy: 99.53%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0122, Val loss: 0.0142, Val accy: 99.77%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0064, Val loss: 0.0143, Val accy: 99.77%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.1474, Accuracy: 78.81%
Loading biobert-base-cased model...
Using mlp with D=768,H=500,K=2,n=3
Loading Tensorflow checkpoint from  model.ckpt-1000000

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0843, Val loss: 0.0220, Val accy: 99.18%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0121, Val loss: 0.0276, Val accy: 99.30%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0060, Val loss: 0.0288, Val accy: 99.30%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.2103, Accuracy: 76.49%
Loading biobert-base-cased model...
Using mlp with D=768,H=500,K=2,n=3
Loading Tensorflow checkpoint from  model.ckpt-1000000

train data size: 7730, validation data size: 858



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 1, Train loss: 0.0831, Val loss: 0.0254, Val accy: 99.30%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 2, Train loss: 0.0116, Val loss: 0.0310, Val accy: 99.30%



HBox(children=(IntProgress(value=0, description='Training  ', max=483, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=108, style=ProgressStyle(description_width='…



Epoch 3, Train loss: 0.0053, Val loss: 0.0309, Val accy: 99.30%



HBox(children=(IntProgress(value=0, description='Testing', max=38, style=ProgressStyle(description_width='init…



Loss: 1.1838, Accuracy: 77.48%


In [0]:
scores_bioBERT

[78.80794701986756, 76.49006622516556, 77.48344370860927]

In [0]:
y_pred_biobert = model_biobert.predict(X_test)
print((X_test[y_pred_biobert == y_test].shape[0])/X_test.shape[0])

HBox(children=(IntProgress(value=0, description='Predicting', max=38, style=ProgressStyle(description_width='i…


0.7748344370860927


In [0]:
# Average
sum(scores_bioBERT)/3

77.59381898454747

Inspect results

In [0]:
test[y_pred_scibert == y_test]

Unnamed: 0,pid,type,value,chq,faq,outcome
3,4,part1,false,EAR LOBE CREASES. Are ear lobe creases always ...,What is Coronary Heart Disease?,0
5,6,part1,true,No. hi my name is NAME I'm currently working w...,How is HIV/AIDS treated?,1
7,8,part1,true,I want more information on Hypertension and fi...,What is fibromyalgia?,1
14,15,part1,true,"is there any help available,for fibromalgia. i...",How Is Fibromyalgia Treated?,1
22,23,part1,true,"lupus. Hi, I want to know about Lupus and it...",What is lupus?,1
...,...,...,...,...,...,...
296,297,part3,false,blind technology. I am a student doing a resea...,How do you get rid of a canker sore? |How are ...,0
297,298,part3,false,I have exercise induced asthma. Would any of t...,Are there any treatments or cures for albinism?,0
298,299,part3,false,Body Lice. How do I get rid of these little de...,What is the treatment for thyroid nodules?,0
300,301,part3,false,unusal appetite after a stroke. My sister has ...,What is the treatment for thyroid nodules?,0


In [0]:
test[y_pred_scibert != y_test]

Unnamed: 0,pid,type,value,chq,faq,outcome
0,1,part1,false,High Blood Pressure. I know you may not answer...,What is High Blood Pressure?,0
1,2,part1,false,Arrhythmia. can arrhythmia occurs after ablati...,What is an Arrhythmia?,0
2,3,part1,false,medicine and allied. I LIKE TO KNOW RECENT THE...,What is an Arrhythmia?,0
4,5,part1,false,sleep apnea. I was diagnosed with sleep apnea ...,What is Sleep Apnea?,0
6,7,part1,false,"is there any help available,for fibromalgia. i...",What is fibromyalgia?,0
...,...,...,...,...,...,...
84,85,part2,true,Cure for hole in lung. I certainly would like ...,How Are Pleurisy and Other Pleural Disorders T...,1
137,138,part2,true,ClinicalTrials.gov - Compliment. Hi I have ret...,Are there treatments for RP?,1
168,169,part2,true,Pilot and Lazy eye. Dear MedlinePus I started...,What is Vision Therapy When and why is it need...,1
172,173,part2,true,Can a woman catch pelvic inflammatory disease ...,How do I get PID?,1


In [0]:
# write to csvs
# train.to_csv('/content/drive/My Drive/Colab Notebooks/Applied NLP/rqe_train.csv')
# val.to_csv('/content/drive/My Drive/Colab Notebooks/Applied NLP/rqe_val.csv')
# test.to_csv('/content/drive/My Drive/Colab Notebooks/Applied NLP/rqe_test.csv')

## LSTM

https://www.kaggle.com/nilanml/quora-similarity-task-wide-neural-network

In [0]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
#     text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

In [0]:
train['chq'] = train.chq.apply(lambda x: clean_text(str(x)))
train['faq'] = train.faq.apply(lambda x: clean_text(str(x)))

val['chq'] = val.chq.apply(lambda x: clean_text(str(x)))
val['faq'] = val.faq.apply(lambda x: clean_text(str(x)))

test['chq'] = test.chq.apply(lambda x: clean_text(str(x)))
test['faq'] = test.faq.apply(lambda x: clean_text(str(x)))

test.head()

Unnamed: 0,pid,type,value,chq,faq,outcome
0,1,,False,atypical pnuemonia what be the possibility ...,what be the possible treatment for atypical pn...,0
1,2,,False,glaucoma can you mail me patient information a...,how be glaucoma diagnose,0
2,3,,True,can you mail me patient information about glau...,what be glaucoma,1
3,4,,True,i be suffer from kartageners syndrome and want...,what be primary ciliary dyskinesia,1
4,5,,True,please help me with my brother with lockedin s...,be there a cure for lockedin syndrome,1


In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , CuDNNLSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D, concatenate, dot
from keras.models import Model, Sequential



total_text = pd.concat([train['chq'], train['faq']]).reset_index(drop=True)
max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(total_text)
chq_sequenced = tokenizer.texts_to_sequences(train['chq'])
faq_sequenced = tokenizer.texts_to_sequences(train['faq'])

Using TensorFlow backend.


In [0]:
maxlen = 100
chq_padded = pad_sequences(chq_sequenced, maxlen=maxlen)
faq_padded = pad_sequences(faq_sequenced, maxlen=maxlen)

In [0]:
y_train = train['outcome']

In [0]:
# validation data
val_text = pd.concat([val['chq'], val['faq']]).reset_index(drop=True)
max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(val_text)
chq_sequenced_val = tokenizer.texts_to_sequences(val['chq'])
faq_sequenced_val = tokenizer.texts_to_sequences(val['faq'])

chq_padded_val = pad_sequences(chq_sequenced_val, maxlen=maxlen)
faq_padded_val = pad_sequences(faq_sequenced_val, maxlen=maxlen)

y_val = val['outcome']

In [0]:
embedding_size = 128

inp1 = Input(shape=(100,))
inp2 = Input(shape=(100,))

x1 = Embedding(max_features, embedding_size)(inp1)
x2 = Embedding(max_features, embedding_size)(inp2)

x3 = Bidirectional(CuDNNLSTM(32, return_sequences = True))(x1)
x4 = Bidirectional(CuDNNLSTM(32, return_sequences = True))(x2)

x5 = GlobalMaxPool1D()(x3)
x6 = GlobalMaxPool1D()(x4)

x7 =  dot([x5, x6], axes=1)

x8 = Dense(40, activation='relu')(x7)
x9 = Dropout(0.05)(x8)
x10 = Dense(10, activation='relu')(x9)
output = Dense(1, activation="sigmoid")(x10)

model = Model(inputs=[inp1, inp2], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
batch_size = 100
epochs = 100
model.fit([chq_padded, faq_padded], y_train, batch_size=batch_size, epochs=epochs, validation_data=([chq_padded_val, faq_padded_val], y_val))





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 8588 samples, validate on 302 samples
Epoch 1/100





Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/10

<keras.callbacks.History at 0x7fa6dcf77780>

In [0]:
# test
test_text = pd.concat([test['chq'], test['faq']]).reset_index(drop=True)
max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(val_text)
chq_sequenced_test = tokenizer.texts_to_sequences(test['chq'])
faq_sequenced_test = tokenizer.texts_to_sequences(test['faq'])

chq_padded_test = pad_sequences(chq_sequenced_test, maxlen=maxlen)
faq_padded_test = pad_sequences(faq_sequenced_test, maxlen=maxlen)

y_test = test['outcome']

y_pred = model.predict([chq_padded_test, faq_padded_test])

In [0]:
test_actual

Unnamed: 0,pid,type,value,chq,faq,outcome
0,1,,false,atypical pnuemonia what be the possibility ...,what be the possible treatment for atypical pn...,0
1,2,,false,glaucoma can you mail me patient information a...,how be glaucoma diagnose,0
2,3,,true,can you mail me patient information about glau...,what be glaucoma,1
3,4,,true,i be suffer from kartageners syndrome and want...,what be primary ciliary dyskinesia,1
4,5,,true,please help me with my brother with lockedin s...,be there a cure for lockedin syndrome,1
...,...,...,...,...,...,...
225,226,,false,anal fissure i may have a couple anal fissure ...,what be the treatment for anal fissure,0
226,227,,false,we have 14 sibling in our family at least 10 ...,what be the first sign of freeze shoulder,0
227,228,,false,when and how do you know when you have congeni...,who get congenital night blindness,0
228,229,,false,would you help me to fine article or jurnal ab...,why be i cry without an evident reason,0


In [0]:
test_actual['pred'] = y_pred
test_actual

Unnamed: 0,pid,type,value,chq,faq,outcome,pred
0,1,,false,atypical pnuemonia what be the possibility ...,what be the possible treatment for atypical pn...,0,0.996723
1,2,,false,glaucoma can you mail me patient information a...,how be glaucoma diagnose,0,1.000000
2,3,,true,can you mail me patient information about glau...,what be glaucoma,1,1.000000
3,4,,true,i be suffer from kartageners syndrome and want...,what be primary ciliary dyskinesia,1,1.000000
4,5,,true,please help me with my brother with lockedin s...,be there a cure for lockedin syndrome,1,0.999995
...,...,...,...,...,...,...,...
225,226,,false,anal fissure i may have a couple anal fissure ...,what be the treatment for anal fissure,0,1.000000
226,227,,false,we have 14 sibling in our family at least 10 ...,what be the first sign of freeze shoulder,0,1.000000
227,228,,false,when and how do you know when you have congeni...,who get congenital night blindness,0,1.000000
228,229,,false,would you help me to fine article or jurnal ab...,why be i cry without an evident reason,0,1.000000


In [0]:
len(test_actual[(test_actual['pred'] >= 0.5) & (test_actual['outcome'] == 1)])

109

In [0]:
len(test_actual[(test_actual['pred'] < 0.5) & (test_actual['outcome'] == 0)])

2

In [0]:
(111)/230

0.4826086956521739

## RoBERTa (pre-trained on MNLI)

In [0]:
!pip install torch



In [0]:
!pip install git+https://github.com/pytorch/fairseq.git

Collecting git+https://github.com/pytorch/fairseq.git
  Cloning https://github.com/pytorch/fairseq.git to /tmp/pip-req-build-3ha_f1t7
  Running command git clone -q https://github.com/pytorch/fairseq.git /tmp/pip-req-build-3ha_f1t7
Collecting regex
[?25l  Downloading https://files.pythonhosted.org/packages/8c/db/4b29a0adec5881542cd81cb5d1929b5c0787003c5740b3c921e627d9c2e5/regex-2019.12.9.tar.gz (669kB)
[K     |████████████████████████████████| 675kB 6.4MB/s 
[?25hCollecting sacrebleu
  Downloading https://files.pythonhosted.org/packages/45/31/1a135b964c169984b27fb2f7a50280fa7f8e6d9d404d8a9e596180487fd1/sacrebleu-1.4.3-py3-none-any.whl
Collecting portalocker
  Downloading https://files.pythonhosted.org/packages/91/db/7bc703c0760df726839e0699b7f78a4d8217fdc9c7fcb1b51b39c5a22a4e/portalocker-1.5.2-py2.py3-none-any.whl
Building wheels for collected packages: fairseq, regex
  Building wheel for fairseq (setup.py) ... [?25l[?25hdone
  Created wheel for fairseq: filename=fairseq-0.9.0-cp3

In [0]:
import torch
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval()

Downloading: "https://github.com/pytorch/fairseq/archive/master.zip" to /root/.cache/torch/hub/master.zip


running build_ext
cythoning fairseq/data/data_utils_fast.pyx to fairseq/data/data_utils_fast.cpp
cythoning fairseq/data/token_block_utils_fast.pyx to fairseq/data/token_block_utils_fast.cpp




building 'fairseq.libbleu' extension
creating build
creating build/temp.linux-x86_64-3.6
creating build/temp.linux-x86_64-3.6/fairseq
creating build/temp.linux-x86_64-3.6/fairseq/clib
creating build/temp.linux-x86_64-3.6/fairseq/clib/libbleu
x86_64-linux-gnu-gcc -pthread -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -I/usr/include/python3.6m -c fairseq/clib/libbleu/libbleu.cpp -o build/temp.linux-x86_64-3.6/fairseq/clib/libbleu/libbleu.o -std=c++11 -O3 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=libbleu -D_GLIBCXX_USE_CXX11_ABI=0
x86_64-linux-gnu-gcc -pthread -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -I/usr/include/python3.6m -c fairseq/clib/libbleu/module.cpp -o build/temp.linux-x86_64-3.6/fairseq/clib/libbleu/module.o -std=c++11 -O3 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=libbleu -D_GLIBCXX_

100%|██████████| 751652118/751652118 [01:03<00:00, 11906759.70B/s]


loading archive file http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz from cache at /root/.cache/torch/pytorch_fairseq/7685ba8546f9a5ce1a00c7a6d7d44f7e748d22681172f0f391c3d48f487c801c.74e37d47306b3cc51c5f8d335022a392c29f1906c8cd9e9cd3446d7422cf55d8
extracting archive file /root/.cache/torch/pytorch_fairseq/7685ba8546f9a5ce1a00c7a6d7d44f7e748d22681172f0f391c3d48f487c801c.74e37d47306b3cc51c5f8d335022a392c29f1906c8cd9e9cd3446d7422cf55d8 to temp dir /tmp/tmp8dvq_auw
| dictionary: 50264 types


1042301B [00:00, 1055222.64B/s]
456318B [00:00, 654299.43B/s]


RobertaHubInterface(
  (model): RobertaModel(
    (decoder): RobertaEncoder(
      (sentence_encoder): TransformerSentenceEncoder(
        (embed_tokens): Embedding(50265, 1024, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(514, 1024, padding_idx=1)
        (layers): ModuleList(
          (0): TransformerSentenceEncoderLayer(
            (self_attn): MultiheadAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
            (final_layer_norm): L

Train and Validation sets are commented out because they take too long to run. 

In [0]:
# train['tensPairCF'] = 0
# train['tensPairFC'] = 0

# val['tensPairCF'] = 0
# val['tensPairFC'] = 0

test['tensPairCF'] = 0
test['tensPairFC'] = 0

In [0]:
# # encode
# train['cf'] = train.apply(lambda row: roberta.encode(row['chq'], row['faq']), axis = 1)
# train['fc'] = train.apply(lambda row: roberta.encode(row['faq'], row['chq']), axis = 1)

# val['cf'] = val.apply(lambda row: roberta.encode(row['chq'], row['faq']), axis = 1)
# val['fc'] = val.apply(lambda row: roberta.encode(row['faq'], row['chq']), axis = 1)

test['cf'] = test.apply(lambda row: roberta.encode(row['chq'], row['faq']), axis = 1)
test['fc'] = test.apply(lambda row: roberta.encode(row['faq'], row['chq']), axis = 1)

In [0]:
# # train
# for index, row in train.iterrows():

#   # cf = roberta.predict('mnli', tokensCF).argmax()
#   # fc = roberta.predict('mnli', tokensFC).argmax()

#   cf = roberta.predict('mnli', row['cf']).argmax()
#   fc = roberta.predict('mnli', row['fc']).argmax()

#   train.at[index,'tensPairCF'] = cf.numpy()
#   train.at[index,'tensPairFC'] = fc.numpy()

In [0]:
# # Val
# for index, row in test.iterrows():


#   # cf = roberta.predict('mnli', tokensCF).argmax()
#   # fc = roberta.predict('mnli', tokensFC).argmax()

#   cf = roberta.predict('mnli', row['cf']).argmax()
#   fc = roberta.predict('mnli', row['fc']).argmax()

#   test.at[index,'tensPairCF'] = cf.numpy()
#   test.at[index,'tensPairFC'] = fc.numpy()

In [0]:
# Test
for index, row in test.iterrows():

  # cf = roberta.predict('mnli', tokensCF).argmax()
  # fc = roberta.predict('mnli', tokensFC).argmax()

  cf = roberta.predict('mnli', row['cf']).argmax()
  fc = roberta.predict('mnli', row['fc']).argmax()


  test.at[index,'tensPairCF'] = cf.numpy()
  test.at[index,'tensPairFC'] = fc.numpy()

In [0]:
test

Unnamed: 0,chq,faq,outcome,tensPairCF,tensPairFC,cf,fc
0,atypical pnuemonia. what is the possibility...,What are the possible treatments for atypical ...,0,2,1,"[tensor(0), tensor(415), tensor(18198), tensor...","[tensor(0), tensor(2264), tensor(32), tensor(5..."
1,Glaucoma Can you mail me patient information a...,How is glaucoma diagnosed ?,0,1,1,"[tensor(0), tensor(16389), tensor(1180), tenso...","[tensor(0), tensor(6179), tensor(16), tensor(5..."
2,Can you mail me patient information about Glau...,What is Glaucoma ?,1,0,1,"[tensor(0), tensor(10836), tensor(47), tensor(...","[tensor(0), tensor(2264), tensor(16), tensor(4..."
3,I am suffering from Kartagener's syndrome and ...,What is primary ciliary dyskinesia ?,1,1,1,"[tensor(0), tensor(100), tensor(524), tensor(3...","[tensor(0), tensor(2264), tensor(16), tensor(2..."
4,Please help me with my brother with locked-in ...,Is there a cure for Locked-in Syndrome ?,1,1,1,"[tensor(0), tensor(6715), tensor(244), tensor(...","[tensor(0), tensor(6209), tensor(89), tensor(1..."
...,...,...,...,...,...,...,...
225,Anal Fissure I may have a couple anal fissures...,What are the treatments for anal fissure ?,0,1,1,"[tensor(0), tensor(4688), tensor(337), tensor(...","[tensor(0), tensor(653), tensor(32), tensor(5)..."
226,"We have 14 siblings in our family , at least 1...",What are the first signs of frozen shoulder ?,0,1,1,"[tensor(0), tensor(170), tensor(33), tensor(50...","[tensor(0), tensor(2264), tensor(32), tensor(5..."
227,When and how do you know when you have congeni...,Who gets congenital night blindness ?,0,2,1,"[tensor(0), tensor(1779), tensor(8), tensor(14...","[tensor(0), tensor(12375), tensor(1516), tenso..."
228,would you help me to fine article or jurnal ab...,Why am I crying without an evident reason ?,0,1,1,"[tensor(0), tensor(14656), tensor(47), tensor(...","[tensor(0), tensor(7608), tensor(524), tensor(..."


In [0]:
test['entail'] = np.where(((test['tensPairCF'] == 2) | (test['tensPairFC'] == 2)), 1, 0)
test['no_entail'] = np.where(((test['tensPairCF'] != 2) & (test['tensPairFC'] != 2)), 1, 0)

Unnamed: 0,pid,type,value,chq,faq,outcome,tensPairCF,tensPairFC,cf,fc,entail,no_entail
0,1,,false,atypical pnuemonia. what is the possibility...,What are the possible treatments for atypical ...,0,0,0,"[tensor(0), tensor(415), tensor(18198), tensor...","[tensor(0), tensor(2264), tensor(32), tensor(5...",0,1
1,2,,false,Glaucoma Can you mail me patient information a...,How is glaucoma diagnosed ?,0,0,0,"[tensor(0), tensor(16389), tensor(1180), tenso...","[tensor(0), tensor(6179), tensor(16), tensor(5...",0,1
2,3,,true,Can you mail me patient information about Glau...,What is Glaucoma ?,1,0,0,"[tensor(0), tensor(10836), tensor(47), tensor(...","[tensor(0), tensor(2264), tensor(16), tensor(4...",0,1
3,4,,true,I am suffering from Kartagener's syndrome and ...,What is primary ciliary dyskinesia ?,1,0,0,"[tensor(0), tensor(100), tensor(524), tensor(3...","[tensor(0), tensor(2264), tensor(16), tensor(2...",0,1
4,5,,true,Please help me with my brother with locked-in ...,Is there a cure for Locked-in Syndrome ?,1,0,0,"[tensor(0), tensor(6715), tensor(244), tensor(...","[tensor(0), tensor(6209), tensor(89), tensor(1...",0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
225,226,,false,Anal Fissure I may have a couple anal fissures...,What are the treatments for anal fissure ?,0,0,0,"[tensor(0), tensor(4688), tensor(337), tensor(...","[tensor(0), tensor(653), tensor(32), tensor(5)...",0,1
226,227,,false,"We have 14 siblings in our family , at least 1...",What are the first signs of frozen shoulder ?,0,0,0,"[tensor(0), tensor(170), tensor(33), tensor(50...","[tensor(0), tensor(2264), tensor(32), tensor(5...",0,1
227,228,,false,When and how do you know when you have congeni...,Who gets congenital night blindness ?,0,0,0,"[tensor(0), tensor(1779), tensor(8), tensor(14...","[tensor(0), tensor(12375), tensor(1516), tenso...",0,1
228,229,,false,would you help me to fine article or jurnal ab...,Why am I crying without an evident reason ?,0,0,0,"[tensor(0), tensor(14656), tensor(47), tensor(...","[tensor(0), tensor(7608), tensor(524), tensor(...",0,1


In [0]:
test['roberta_pred'] = np.where((test['entail'] == 1), 1, 0)
test

Unnamed: 0,pid,type,value,chq,faq,outcome,tensPairCF,tensPairFC,cf,fc,entail,no_entail,roberta_pred
0,1,,false,atypical pnuemonia. what is the possibility...,What are the possible treatments for atypical ...,0,0,0,"[tensor(0), tensor(415), tensor(18198), tensor...","[tensor(0), tensor(2264), tensor(32), tensor(5...",0,1,0
1,2,,false,Glaucoma Can you mail me patient information a...,How is glaucoma diagnosed ?,0,0,0,"[tensor(0), tensor(16389), tensor(1180), tenso...","[tensor(0), tensor(6179), tensor(16), tensor(5...",0,1,0
2,3,,true,Can you mail me patient information about Glau...,What is Glaucoma ?,1,0,0,"[tensor(0), tensor(10836), tensor(47), tensor(...","[tensor(0), tensor(2264), tensor(16), tensor(4...",0,1,0
3,4,,true,I am suffering from Kartagener's syndrome and ...,What is primary ciliary dyskinesia ?,1,0,0,"[tensor(0), tensor(100), tensor(524), tensor(3...","[tensor(0), tensor(2264), tensor(16), tensor(2...",0,1,0
4,5,,true,Please help me with my brother with locked-in ...,Is there a cure for Locked-in Syndrome ?,1,0,0,"[tensor(0), tensor(6715), tensor(244), tensor(...","[tensor(0), tensor(6209), tensor(89), tensor(1...",0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,226,,false,Anal Fissure I may have a couple anal fissures...,What are the treatments for anal fissure ?,0,0,0,"[tensor(0), tensor(4688), tensor(337), tensor(...","[tensor(0), tensor(653), tensor(32), tensor(5)...",0,1,0
226,227,,false,"We have 14 siblings in our family , at least 1...",What are the first signs of frozen shoulder ?,0,0,0,"[tensor(0), tensor(170), tensor(33), tensor(50...","[tensor(0), tensor(2264), tensor(32), tensor(5...",0,1,0
227,228,,false,When and how do you know when you have congeni...,Who gets congenital night blindness ?,0,0,0,"[tensor(0), tensor(1779), tensor(8), tensor(14...","[tensor(0), tensor(12375), tensor(1516), tenso...",0,1,0
228,229,,false,would you help me to fine article or jurnal ab...,Why am I crying without an evident reason ?,0,0,0,"[tensor(0), tensor(14656), tensor(47), tensor(...","[tensor(0), tensor(7608), tensor(524), tensor(...",0,1,0


In [0]:
test[test['roberta_pred'] ==  test['outcome']].shape

(234, 13)

In [0]:
test.shape

(302, 13)

In [0]:
234/302

0.7748344370860927

In [0]:
test_actual[test_actual['roberta_pred'] ==  test_actual['outcome']].shape

(143, 11)

In [0]:
143/230

0.6217391304347826

In [0]:
test_actual['entail_cf'] = np.where((test_actual['tensPairCF'] == 2), 1, 0)
test_actual[test_actual['entail_cf'] ==  test_actual['outcome']].shape

(144, 12)

In [0]:
144/230

0.6260869565217392

## RoBERTa SequenceClassification

https://nbviewer.jupyter.org/github/devkosal/fastai_roberta/blob/master/fastai_roberta_superglue/RoBERTa%20with%20Fastai%20-%20RTE.ipynb

In [0]:
!pip install pytorch_transformers

Collecting pytorch_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 2.7MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 40.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6fd7b7e817177e642/sacremoses-0.0.35.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 32.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.35-cp36-none-any.whl size=883999 sha256=

In [0]:
from fastai.text import *
from fastai.metrics import *
from pytorch_transformers import RobertaTokenizer

In [0]:
# Creating a config object to store task specific information
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    task = "RTE",
    testing=False,
    seed = 2019,
    roberta_model_name='roberta-base', # can also be exchnaged with roberta-large 
    max_lr=1e-5,
    epochs=10,
    use_fp16=False,
    bs=4, 
    max_seq_len=256, 
    num_labels = 2,
    hidden_dropout_prob=.05,
    hidden_size=768, # 1024 for roberta-large
    start_tok = "<s>",
    end_tok = "</s>",
    mark_fields=True,
)

In [0]:
train.head()

Unnamed: 0,pid,type,value,chq,faq,outcome
0,1,originalQ-shortQ,True,\n How should I treat polymenorrhea in a 14-...,\n How should I treat polymenorrhea in a 14-...,1
1,2,originalQ-shortQ,True,\n Have there been any studies with low mole...,\n Can I use low molecular weight heparin in...,1
2,3,originalQ-shortRandQ,False,\n Have there been any studies with low mole...,\n What are the side effects of Florinef? C...,0
3,4,originalQ-shortQ,True,\n Let's give these immunizations. That's r...,\n Let's give these immunizations. That's r...,1
4,5,originalQ-shortRandQ,False,\n Let's give these immunizations. That's r...,\n Is there more support we can provide pati...,0


In [0]:
feat_cols = ["chq","faq"]
label_cols = "outcome"

In [0]:
class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs): 
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len 
    def __call__(self, *args, **kwargs): 
        return self 
    def tokenizer(self, t:str) -> List[str]: 
        """Adds Roberta bos and eos tokens and limits the maximum sequence length""" 
        if config.mark_fields:
            sub = 2 # subtraction in totoal seq_length to be made due to adding spcl tokens
            assert "xxfld" in t
            t = t.replace("xxfld 1","") # remove the xxfld 1 special token from fastai
            # converting fastai field sep token to Roberta
            t = re.split(r'xxfld \d+', t) 
            res = []
            for i in range(len(t)-1): # loop over the number of additional fields and the Roberta sep
                res += self._pretrained_tokenizer.tokenize(t[i]) + [config.end_tok, config.end_tok]
                sub += 2 # increase our subtractions since we added more spcl tokens
            res += self._pretrained_tokenizer.tokenize(t[-1]) # add the last sequence
            return [config.start_tok] + res[:self.max_seq_len - sub] + [config.end_tok] 
        
        res = self._pretrained_tokenizer.tokenize(t)
        return [config.start_tok] + res[:self.max_seq_len - sub] + [config.end_tok]

In [0]:
# create fastai tokenizer for roberta
roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")

fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len), 
                             pre_rules=[], post_rules=[])

In [0]:
# create fastai vocabulary for roberta
roberta_tok.save_vocabulary('/content')

with open('vocab.json', 'r') as f:
    roberta_vocab_dict = json.load(f)
    
fastai_roberta_vocab = Vocab(list(roberta_vocab_dict.keys()))

In [0]:
# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
         super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False, mark_fields=config.mark_fields)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, vocab=fastai_roberta_vocab, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for Roberta
    We remove sos and eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original Roberta model.
    """
    return [RobertaTokenizeProcessor(tokenizer=tokenizer), NumericalizeProcessor(vocab=vocab)]

In [0]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training Roberta"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

In [0]:
class RobertaTextList(TextList):
    _bunch = RobertaDataBunch
    _label_cls = TextList

In [0]:
# loading the tokenizer and vocab processors
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

# creating our databunch 
data = ItemLists(".", RobertaTextList.from_df(train, ".", cols=feat_cols, processor=processor),
                      RobertaTextList.from_df(test, ".", cols=feat_cols, processor=processor)
                ) \
       .label_from_df(cols=label_cols, label_cls=CategoryList) \
       .add_test(RobertaTextList.from_df(test_actual, ".", cols=feat_cols, processor=processor)) \
       .databunch(bs=config.bs,pad_first=False)

In [0]:
import torch
import torch.nn as nn
from pytorch_transformers import RobertaForSequenceClassification

# defining our model architecture 
class RobertaForSequenceClassificationModel(nn.Module):
    def __init__(self,num_labels=config.num_labels):
        super(RobertaForSequenceClassificationModel,self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaForSequenceClassification.from_pretrained(config.roberta_model_name,num_labels= self.num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.roberta(input_ids, token_type_ids, attention_mask)
        logits = outputs[0] 
        return logits

In [0]:
roberta_model = RobertaForSequenceClassificationModel() 

learn = Learner(data, roberta_model, metrics=[accuracy])

100%|██████████| 501200538/501200538 [00:39<00:00, 12776039.40B/s]


In [0]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)

epoch,train_loss,valid_loss,accuracy,time
0,0.031232,1.212344,0.715232,03:39
1,0.003588,1.808176,0.745033,03:40
2,0.077985,1.145466,0.725166,03:42
3,0.003664,1.942022,0.745033,03:45
4,0.000947,2.253066,0.715232,03:45
5,0.000484,2.284126,0.715232,03:47
6,0.001874,2.319058,0.748344,03:46
7,0.005788,2.691203,0.738411,03:47
8,3.2e-05,2.968797,0.751656,03:47
9,3.7e-05,2.945562,0.751656,03:45


In [0]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in data.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values

In [0]:
# val preds
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)

In [0]:
# accuracy for valid valid
(pred_values == data.valid_ds.y.items).mean()

0.7516556291390728

In [0]:
# test preds
_, test_pred_values = get_preds_as_nparray(DatasetType.Test)

In [0]:
(test_pred_values == data.test_ds.y.items).mean()

0.12608695652173912

In [0]:
test_pred_values

array([1, 1, 1, 0, ..., 1, 1, 0, 1])

In [0]:
test_actual['roberta_pred'] = test_pred_values
test_actual

Unnamed: 0,pid,type,value,chq,faq,outcome,tensPairCF,tensPairFC,entail,no_entail,roberta_pred
0,1,,false,atypical pnuemonia. what is the possibility...,What are the possible treatments for atypical ...,0,2,1,1,0,1
1,2,,false,Glaucoma Can you mail me patient information a...,How is glaucoma diagnosed ?,0,1,1,0,1,1
2,3,,true,Can you mail me patient information about Glau...,What is Glaucoma ?,1,0,1,0,1,1
3,4,,true,I am suffering from Kartagener's syndrome and ...,What is primary ciliary dyskinesia ?,1,1,1,0,1,0
4,5,,true,Please help me with my brother with locked-in ...,Is there a cure for Locked-in Syndrome ?,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
225,226,,false,Anal Fissure I may have a couple anal fissures...,What are the treatments for anal fissure ?,0,1,1,0,1,1
226,227,,false,"We have 14 siblings in our family , at least 1...",What are the first signs of frozen shoulder ?,0,1,1,0,1,1
227,228,,false,When and how do you know when you have congeni...,Who gets congenital night blindness ?,0,2,1,1,0,1
228,229,,false,would you help me to fine article or jurnal ab...,Why am I crying without an evident reason ?,0,1,1,0,1,0


In [0]:
test_actual[test_actual['roberta_pred'] == test_actual['outcome']].shape

(110, 11)

In [0]:
110/230

0.5217391304347826