# This notebook contains code to train an NER ensemble


Code for the classifier and for combining the NER outputs:
https://colab.research.google.com/drive/1779n9EcHGMUieslZZLw64rxJhJcFc0yf?authuser=0#scrollTo=nMMmP_X1qLIN

In [0]:
%%bash
# Hugging Face transformers, installed quietly; it supplies the BERT/RoBERTa
# classes imported in the setup cell.
# TODO: pin the version too - the release used for the recorded run is not
# captured in this notebook's output.
pip install -q transformers
# Pinned to the release that the recorded output of this cell reports
# ("Successfully installed seqeval-0.0.12") so a fresh run resolves the same
# package.
pip install seqeval==0.0.12

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py): started
  Building wheel for seqeval (setup.py): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-0.0.12-cp36-none-any.whl size=7424 sha256=97b40d7780b44bd0e85681a2dea1d9cb650432d2df1988baff6c708451d03228
  Stored in directory: /root/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [0]:
# Colab-only: attach the user's Google Drive at /content/drive. This triggers
# the interactive OAuth prompt visible in the recorded output.
# NOTE(review): force_remount=True presumably re-mounts even when a mount is
# already present - confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path 

import os

import torch
import torch.optim as optim
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import copy
import random 

# transformers
import transformers
from transformers import BertModel, BertTokenizer, BertConfig
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig

from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [0]:
# Pick the compute device: CUDA when PyTorch can see a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Number of CUDA devices visible to this process (0 on a CPU-only machine).
n_gpu = torch.cuda.device_count()

print(device)

cuda


# BERT Sklearn

In [0]:
# Fetch the bert-sklearn sources (master branch) and install the package from
# the local checkout.
!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .
import os
# From here on the working directory is the checkout itself, so the relative
# "../" paths used by later cells resolve to the directory that contains it.
os.chdir("bert-sklearn")
# List the checkout's contents as a quick sanity check that the clone worked.
print(os.listdir())

Cloning into 'bert-sklearn'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 259 (delta 3), reused 3 (delta 0), pack-reused 247[K
Receiving objects: 100% (259/259), 519.36 KiB | 12.37 MiB/s, done.
Resolving deltas: 100% (125/125), done.
Processing /content/bert-sklearn
Building wheels for collected packages: bert-sklearn
  Building wheel for bert-sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for bert-sklearn: filename=bert_sklearn-0.3.1-cp36-none-any.whl size=54234 sha256=4c99be86a590d35e8fa5cf5c3483faf6717a19514af6b3a3f1347af469b58f20
  Stored in directory: /root/.cache/pip/wheels/61/95/c6/5790aae8fb377f5ff356dbe58205aab28858595d6bff8197d0
Successfully built bert-sklearn
Installing collected packages: bert-sklearn
Successfully installed bert-sklearn-0.3.1
['LICENSE', 'other_examples', 'glue_examples', 'demo_tuning_hyperparams.ipynb', 'tests', 'demo.ipynb', 'setup.p

In [0]:
import torch

# Report the runtime so recorded results can be tied to a PyTorch build / GPU.
print('pytorch version:', torch.__version__)

# torch.cuda.get_device_name(0) raises when no CUDA device is present. The
# notebook explicitly supports a CPU fallback when selecting `device`, so this
# diagnostic must not abort a CPU-only run.
if torch.cuda.is_available():
    print('GPU:',torch.cuda.get_device_name(0))
else:
    print('GPU:', 'not available')

pytorch version: 1.5.0+cu101
GPU: Tesla P100-PCIE-16GB


In [0]:
import os
import math
import random
import csv
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model

def read_tsv(filename, quotechar=None):
    """Load a UTF-8, tab-separated file and return its rows.

    Args:
        filename: path of the file to read.
        quotechar: passed straight through to ``csv.reader``.

    Returns:
        A list with one entry per row, each entry being the list of that
        row's fields as strings.
    """
    with open(filename, "r", encoding='utf-8') as handle:
        reader = csv.reader(handle, delimiter="\t", quotechar=quotechar)
        rows = [row for row in reader]
    return rows

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def read_CoNLL2003_format(filename, idx=3):
    """Read a file in CoNLL-2003 shared task format.

    The file holds one token per line with tab-separated columns; sentences
    are separated by blank lines.

    Args:
        filename: path of the file to read (decoded as UTF-8, consistent with
            ``read_tsv``).
        idx: zero-based index of the column that holds the label/tag.

    Returns:
        A DataFrame with one row per sentence and two columns, ``tokens`` and
        ``labels``, each holding a list of strings. An empty file yields an
        empty DataFrame with those two columns.

    Raises:
        IndexError: if a non-blank line has fewer than ``idx + 1`` columns.
    """
    # Use a context manager so the handle is closed deterministically; the
    # previous bare open(...).read() leaked it.
    with open(filename, encoding='utf-8') as f:
        raw = f.read().strip()

    # Sentence-like units are separated by an empty line.
    sentences = []
    for block in raw.split("\n\n"):
        # Skip blank rows so an empty file or runs of more than one blank line
        # do not produce rows without columns (which used to raise IndexError).
        rows = [row.split('\t') for row in block.split("\n") if row.strip()]
        if rows:
            sentences.append(rows)

    # First column is the token, column `idx` the tag.
    tokens = [[row[0] for row in sentence] for sentence in sentences]
    labels = [[row[idx] for row in sentence] for sentence in sentences]

    return pd.DataFrame(data={'tokens': tokens, 'labels': labels})

In [0]:
"""
Train data: 14987 sentences, 204567 tokens
Dev data: 3466 sentences, 51578 tokens
Test data: 3684 sentences, 46666 tokens
"""

# Directory holding the task's train/test TSV files, relative to the current
# working directory.
DATADIR = "../"


def get_conll2003_data(trainfile=DATADIR + "train.tsv",
                       devfile=DATADIR + "test.tsv"):
    """Load the train and dev splits and print their sentence/token counts.

    Both files are parsed with ``read_CoNLL2003_format(..., idx=1)``, i.e. the
    tag is taken from the second tab-separated column. Note that the dev split
    defaults to the file named ``test.tsv``.

    Args:
        trainfile: path of the training file.
        devfile: path of the development file.

    Returns:
        A ``(train, dev)`` pair of DataFrames with ``tokens`` and ``labels``
        columns.
    """
    train_df = read_CoNLL2003_format(trainfile, idx=1)
    n_train_tokens = len(flatten(train_df.tokens))
    print("Train data: %d sentences, %d tokens"%(len(train_df), n_train_tokens))

    dev_df = read_CoNLL2003_format(devfile, idx=1)
    n_dev_tokens = len(flatten(dev_df.tokens))
    print("Dev data: %d sentences, %d tokens"%(len(dev_df), n_dev_tokens))

    return train_df, dev_df


train, dev = get_conll2003_data()
train.head()

Train data: 1490 sentences, 25800 tokens
Dev data: 428 sentences, 8118 tokens


Unnamed: 0,tokens,labels
0,"[my, nigga, dante, addicted, to, that, nicotine]","[O, O, O, B, O, O, O]"
1,"[i, feel, soo, much, better, today,, cymbalta,...","[O, O, O, O, O, O, O, B, I, O, O, O, O]"
2,"[@theotherrift, it, sort, of, can., :(, you, t...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[@sumiyyahiqbal, @shahbaigg, difference, is, i...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[rt, @fightforfood:, what, i, lack, in, money,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [0]:
# Split each frame into model inputs (token lists) and targets (tag lists).
# NOTE: a later cell defines a function that is also called `train`, which
# rebinds that name; re-run the data-loading cell before re-running this one.
X_train, X_dev = train.tokens, dev.tokens
y_train, y_dev = train.labels, dev.labels
# X_test, y_test = test.tokens, test.labels

# Distinct tags seen in the training labels (np.unique returns them sorted).
label_list = list(np.unique(flatten(y_train)))
print("\nNER tags:",label_list)




NER tags: ['B', 'I', 'O']


In [0]:
def train(model_name, max_seq_length, epochs, validation_fraction, label_list,
          X_train, y_train, X_dev, y_dev):
  """Fine-tune one BertTokenClassifier and evaluate it on the dev split.

  Args:
    model_name: name of the pretrained checkpoint handed to BertTokenClassifier.
    max_seq_length, epochs, validation_fraction: forwarded unchanged to
      BertTokenClassifier.
    label_list: the tag vocabulary, forwarded unchanged.
    X_train, y_train: training token lists and their tag lists.
    X_dev, y_dev: evaluation token lists and their tag lists.

  Returns:
    ``(f1_test, y_preds)``: the value of ``model.score(X_dev, y_dev, 'macro')``
    and the per-sentence predictions from ``model.predict(X_dev)``.

  Side effects: prints the model configuration, the score (labelled
  "Test f1" although it is computed on the dev split passed in) and a
  token-level classification report.
  """
  # Everything except the checkpoint name; the optimisation settings are fixed.
  # NOTE(review): ignore_label=['O'] presumably keeps 'O' out of the f1
  # computed by model.score - confirm against the bert-sklearn documentation.
  hyperparams = dict(
      max_seq_length=max_seq_length,
      epochs=epochs,
      gradient_accumulation_steps=2,
      learning_rate=3e-5,
      train_batch_size=16,
      eval_batch_size=16,
      validation_fraction=validation_fraction,
      label_list=label_list,
      ignore_label=['O'],
  )
  model = BertTokenClassifier(model_name, **hyperparams)
  print(model)

  model.fit(X_train, y_train)

  f1_test = model.score(X_dev, y_dev,'macro')
  print("Test f1: %0.02f"%(f1_test))

  y_preds = model.predict(X_dev)
  gold_flat = flatten(y_dev)
  pred_flat = flatten(y_preds)
  print(classification_report(gold_flat, pred_flat))

  return f1_test, y_preds

In [0]:
# Pretrained checkpoints to fine-tune; each one becomes a member of the ensemble.
models = ['bert-base-uncased','scibert-scivocab-uncased','biobert-v1.1-pubmed-base-cased']

# Settings shared by every ensemble member.
max_seq_length, epochs, validation_fraction = 128, 4, 0.1

# model name -> {'f1': dev score, 'preds': dev predictions}
model_results = {}
for model in models:
  print("Training model ", model)
  f1, preds = train(model_name=model,
                    max_seq_length=max_seq_length,
                    epochs=epochs,
                    validation_fraction=validation_fraction,
                    label_list=label_list,
                    X_train=X_train, y_train=y_train,
                    X_dev=X_dev, y_dev=y_dev)
  model_results[model] = {'f1': f1, 'preds': preds}

Training model  bert-base-uncased
Building sklearn token classifier...
BertTokenClassifier(bert_config_json=None, bert_model='bert-base-uncased',
                    bert_vocab=None, do_lower_case=None, epochs=4,
                    eval_batch_size=16, fp16=False, from_tf=False,
                    gradient_accumulation_steps=2, ignore_label=['O'],
                    label_list=['B', 'I', 'O'], learning_rate=3e-05,
                    local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
                    max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
                    random_state=42, restore_file=None, train_batch_size=16,
                    use_cuda=True, validation_fraction=0.1,
                    warmup_proportion=0.1)
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 1341, validation data size: 149



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 1, Train loss: 0.0490, Val loss: 0.0216, Val accy: 94.30%, f1: 58.03



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 2, Train loss: 0.0192, Val loss: 0.0175, Val accy: 95.67%, f1: 67.73



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 3, Train loss: 0.0071, Val loss: 0.0201, Val accy: 95.21%, f1: 67.17



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 4, Train loss: 0.0033, Val loss: 0.0217, Val accy: 95.29%, f1: 67.49



HBox(children=(IntProgress(value=0, description='Predicting', max=27, style=ProgressStyle(description_width='i…


Test f1: 59.23


HBox(children=(IntProgress(value=0, description='Predicting', max=27, style=ProgressStyle(description_width='i…


              precision    recall  f1-score   support

           B       0.62      0.65      0.64       364
           I       0.57      0.53      0.55       359
           O       0.96      0.97      0.97      7395

    accuracy                           0.93      8118
   macro avg       0.72      0.72      0.72      8118
weighted avg       0.93      0.93      0.93      8118

Training model  scibert-scivocab-uncased
Building sklearn token classifier...
BertTokenClassifier(bert_config_json=None,
                    bert_model='scibert-scivocab-uncased', bert_vocab=None,
                    do_lower_case=None, epochs=4, eval_batch_size=16,
                    fp16=False, from_tf=False, gradient_accumulation_steps=2,
                    ignore_label=['O'], label_list=['B', 'I', 'O'],
                    learning_rate=3e-05, local_rank=-1,
                    logfile='bert_sklearn.log', loss_scale=0,
                    max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
        

HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 1, Train loss: 0.0447, Val loss: 0.0324, Val accy: 91.71%, f1: 45.18



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 2, Train loss: 0.0184, Val loss: 0.0350, Val accy: 92.50%, f1: 53.91



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 3, Train loss: 0.0074, Val loss: 0.0378, Val accy: 92.31%, f1: 57.09



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 4, Train loss: 0.0033, Val loss: 0.0427, Val accy: 92.38%, f1: 55.62



HBox(children=(IntProgress(value=0, description='Predicting', max=27, style=ProgressStyle(description_width='i…


Test f1: 57.99


HBox(children=(IntProgress(value=0, description='Predicting', max=27, style=ProgressStyle(description_width='i…


              precision    recall  f1-score   support

           B       0.60      0.60      0.60       364
           I       0.60      0.52      0.56       359
           O       0.96      0.97      0.97      7395

    accuracy                           0.93      8118
   macro avg       0.72      0.70      0.71      8118
weighted avg       0.93      0.93      0.93      8118

Training model  biobert-v1.1-pubmed-base-cased
Building sklearn token classifier...
BertTokenClassifier(bert_config_json=None,
                    bert_model='biobert-v1.1-pubmed-base-cased',
                    bert_vocab=None, do_lower_case=None, epochs=4,
                    eval_batch_size=16, fp16=False, from_tf=False,
                    gradient_accumulation_steps=2, ignore_label=['O'],
                    label_list=['B', 'I', 'O'], learning_rate=3e-05,
                    local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
                    max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers

HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 1, Train loss: 0.0474, Val loss: 0.0314, Val accy: 92.26%, f1: 51.67



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 2, Train loss: 0.0196, Val loss: 0.0295, Val accy: 92.98%, f1: 59.71



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 3, Train loss: 0.0090, Val loss: 0.0310, Val accy: 93.85%, f1: 61.74



HBox(children=(IntProgress(value=0, description='Training  ', max=168, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Validating', max=10, style=ProgressStyle(description_width='i…



Epoch 4, Train loss: 0.0042, Val loss: 0.0383, Val accy: 93.89%, f1: 60.19



HBox(children=(IntProgress(value=0, description='Predicting', max=27, style=ProgressStyle(description_width='i…


Test f1: 55.77


HBox(children=(IntProgress(value=0, description='Predicting', max=27, style=ProgressStyle(description_width='i…


              precision    recall  f1-score   support

           B       0.64      0.56      0.60       364
           I       0.58      0.47      0.52       359
           O       0.96      0.97      0.97      7395

    accuracy                           0.93      8118
   macro avg       0.73      0.67      0.69      8118
weighted avg       0.93      0.93      0.93      8118



In [0]:
# Print each model's name and its dev f1 on separate lines, followed by two
# blank lines as a visual separator.
for k, v in model_results.items():
  print(k, v['f1'], "\n", sep="\n")

bert-base-uncased
59.23236032351028


scibert-scivocab-uncased
57.99331967742531


biobert-v1.1-pubmed-base-cased
55.76539589442815




In [0]:
import pickle

# Persist the per-model scores and predictions so the ensemble can be rebuilt
# without retraining. The with-block closes the handle as soon as the dump is
# done; the previous bare open(...) left it open.
with open('./model_results_bert_variants_task3_ner.pickle', 'wb') as results_file:
  pickle.dump(model_results, results_file)

In [0]:
from collections import Counter

# Majority-vote ensemble over every model stored in model_results.
#
# Predictions are taken in the dict's insertion order (the order the models
# were trained in). That order matters: when all votes differ,
# Counter.most_common returns the label that was counted first, i.e. the vote
# of the earliest model.
member_preds = [result['preds'] for result in model_results.values()]

tags = []
for i in range(len(X_dev)):
  # zip(*...) lines up the i-th sentence's predictions token by token and
  # stops at the shortest prediction, so a model whose output is shorter than
  # the others cannot cause an IndexError.
  votes_per_token = zip(*(preds[i] for preds in member_preds))
  tags.append([Counter(votes).most_common(1)[0][0] for votes in votes_per_token])

In [0]:
# Token-level precision/recall/F1 of the ensemble's `tags` against the gold
# dev labels, with sentences flattened into one long sequence.
print(classification_report(flatten(y_dev), flatten(tags)))

              precision    recall  f1-score   support

           B       0.66      0.61      0.64       364
           I       0.64      0.53      0.58       359
           O       0.96      0.97      0.97      7395

    accuracy                           0.94      8118
   macro avg       0.75      0.71      0.73      8118
weighted avg       0.93      0.94      0.94      8118



In [0]:
# accuracy_score is imported here but not used in this cell.
from sklearn.metrics import f1_score, accuracy_score
# Macro-averaged token-level F1 of the ensemble tags; this is the unrounded
# value behind the "macro avg" f1-score row of the classification report.
f1_score(flatten(y_dev), flatten(tags), average="macro")

0.7279931806314351

In [0]:
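# Alternative ensembling strategy (disabled, kept for reference): tag a token
# 'B' if any model predicts 'B', otherwise 'I' if any model predicts 'I',
# otherwise 'O'. The outputs recorded under the following cells come from this
# variant; its macro F1 (0.715) is lower than majority voting (0.728).
# from collections import Counter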
# tags = []
# for i in range(len(X_dev)):
#   text_list = X_dev.iloc[i]
#   tmp = []
#   bert_base_pred = model_results['bert-base-uncased']['preds'][i]
#   scibert_pred = model_results['scibert-scivocab-uncased']['preds'][i]
#   biobert_pred = model_results['biobert-v1.1-pubmed-base-cased']['preds'][i]
#   for j in range(len(bert_base_pred)):
#     if 'B' in [bert_base_pred[j], scibert_pred[j], biobert_pred[j]]:
#        tmp.append('B')
#     elif 'I' in [bert_base_pred[j], scibert_pred[j], biobert_pred[j]]:
#       tmp.append('I')
#     else:
#       tmp.append('O')
#   tags.append(tmp)

In [0]:
# print(classification_report(flatten(y_dev), flatten(tags)))

              precision    recall  f1-score   support

           B       0.53      0.76      0.63       364
           I       0.52      0.61      0.56       359
           O       0.97      0.95      0.96      7395

    accuracy                           0.92      8118
   macro avg       0.68      0.77      0.72      8118
weighted avg       0.93      0.92      0.93      8118



In [0]:
# from sklearn.metrics import f1_score, accuracy_score
# f1_score(flatten(y_dev), flatten(tags), average="macro")

0.7152722317944162

In [0]:
# Span annotations for the validation tweets (tab-separated). The recorded
# output shows columns tweet_id, begin, end, type, extraction, drug, tweet,
# meddra_code and meddra_term.
valid = pd.read_csv('../task3_validation.tsv', sep='\t')
print(valid.shape)
# Show the first two rows to illustrate the column layout.
valid.head(2)

(560, 9)


Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term
0,332317478170546176,28.0,37.0,ADR,allergies,avelox,"do you have any medication allergies? ""asthma!...",10013661.0,drug allergy
1,347806215776116737,31.0,46.0,ADR,HURT YOUR Liver,avelox,"@ashleylvivian if #avelox has hurt your liver,...",10024668.0,liver damage


In [0]:
# Unique tweet ids in the order they first appear in the validation file.
# These ids are later paired with the dev sentences purely by position
# (tweet_ids[i] <-> X_dev.iloc[i]), so the order must be stable and must follow
# the file; a set() offers no ordering guarantee. dict.fromkeys de-duplicates
# while keeping first-occurrence order.
# NOTE(review): this assumes the dev sentences are stored in the same order as
# the tweets first appear in task3_validation.tsv - confirm, and check that
# len(tweet_ids) == len(X_dev).
tweet_ids = list(dict.fromkeys(valid['tweet_id']))

In [0]:
def _contiguous_spans(tokens, token_tags):
    """Return ``(start, end, text)`` for every run of consecutive B/I tags.

    Offsets are character positions in ``" ".join(tokens)``; ``end`` is
    exclusive, so ``end - start == len(text)``. This matches the begin/end
    convention visible in the validation file preview (e.g. 28..37 for the
    nine-character extraction "allergies").

    Any unbroken run of 'B'/'I' tags is treated as one span; a 'B' directly
    after another 'B' or 'I' extends the current span rather than opening a
    new one.
    """
    text = " ".join(tokens)
    spans = []
    offset = 0      # character offset of the current token inside `text`
    start = None    # start offset of the span being built, if any
    end = 0
    for token, tag in zip(tokens, token_tags):
        token_end = offset + len(token)
        if tag in ('B', 'I'):
            if start is None:
                start = offset
            # Always derive the end from the token's real position so that
            # the spaces between the tokens of a multi-token span are counted.
            end = token_end
        elif start is not None:
            spans.append((start, end, text[start:end]))
            start = None
        offset = token_end + 1  # +1 for the joining space
    # Flush a span that runs up to the end of the sentence; previously the
    # last span of every tweet was silently dropped.
    if start is not None:
        spans.append((start, end, text[start:end]))
    return spans


# One row per predicted span: [tweet_id, start, end, extracted text].
span_op = []
for i in range(len(X_dev)):
    for st, en, extract in _contiguous_spans(X_dev.iloc[i], tags[i]):
        span_op.append([tweet_ids[i], st, en, extract])

In [0]:
# Tabulate the extracted spans; each span_op entry is
# [tweet id, start offset, end offset, extracted text].
span_op_df = pd.DataFrame(span_op, columns=['tweet_id', 'start', 'end', 'extract'])

In [0]:
# Write the predicted spans to disk for the downstream step.
# NOTE(review): no `sep` is passed, so pandas' default delimiter (a comma) is
# used even though the file is named .tsv, and the row index is written as an
# unnamed first column. Left unchanged because the consumer of this file is
# not visible here - confirm what it expects before switching to sep='\t'.
span_op_df.to_csv('./span_op_df_task3_ner.tsv')