# Task 2 - SciBERT based NER

## 1. Install necessary libraries

In [None]:
# Install the third-party packages used by this notebook (-q keeps pip's output short).
!pip install -q contractions transformers sent2vec imbalanced-learn seqeval[gpu]
# Pin tf-estimator-nightly to one specific dev build.
!pip install -q tf-estimator-nightly==2.8.0.dev2021122109
# Remove the preinstalled spaCy, then install the newest available release (unpinned).
!python -m pip uninstall -q -y spacy
!python -m pip install -q -U spacy

[K     |████████████████████████████████| 4.0 MB 28.2 MB/s 
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
[K     |████████████████████████████████| 106 kB 71.2 MB/s 
[K     |████████████████████████████████| 287 kB 58.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 36.4 MB/s 
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
[K     |████████████████████████████████| 880 kB 58.9 MB/s 
[K     |████████████████████████████████| 596 kB 49.1 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 462 kB 30.1 MB/s 
[K     |████████████████████████████████| 6.2 MB 33.5 MB/s 
[K     |████████████████████████████████| 457 kB 57.5 MB/s 
[K     |████████████████████████████████| 42 kB 1.5 MB/s 
[K     |████████████████████████████████| 10.1 MB 62.4 MB/s 
[K     |████████████████████████████████| 653 kB 64.4 MB/s 
[K     |███████████

In [None]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.3.0/en_core_web_trf-3.3.0-py3-none-any.whl (460.3 MB)
[K     |████████████████████████████████| 460.3 MB 25 kB/s 
Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.5-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 190 kB/s 
Collecting transformers<4.18.0,>=3.4.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 26.5 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 68.7 MB/s 
Installing collected packages: transformers, spacy-alignments, spacy-transformers, en-core-web-trf
  Attempting uninstall: transformers
    Found existing installation: transformers 4.18.0
    Uninstalling transformers-4.

In [None]:
# !python -m spacy download en_core_web_sm

## 2. Load all libraries

In [None]:
import numpy as np
import pandas as pd
import spacy
import nltk
import random
import re
import torch
import warnings
import torch.nn as nn

from spacy.util import minibatch, compounding
from spacy.training import Example
from pathlib import Path
from spacy.training import offsets_to_biluo_tags
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
# from sklearn.metrics import accuracy_score, classification_report, f1_score
from seqeval.metrics import classification_report, f1_score

# Fetch the NLTK 'punkt' resource.
nltk.download('punkt')

# Lift pandas' display limits so rows, columns and cell contents are printed in full.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 3. Load the dataset

In [None]:
# Read the training and validation CSV files (paths are relative to the working directory)
training, validation = (
    pd.read_csv(csv_path)
    for csv_path in ('training_data_with_ADR.csv', 'validation_data_with_ADR.csv')
)

In [None]:
# Report (rows, columns) of each raw split
for split in (training, validation):
    print(split.shape)

(2172, 13)
(560, 13)


In [None]:
training.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,clean_tweets,label,ADR
0,0,331187619096588288,,,,,ofloxacin,@seefisch:oral drugs for pyelonephritis:ciprofloxacin levofloxacin tmp/smz do not use nitrofurantoin for pyelo(only cystitis)@david_medinaf,,,<user> : oral drugs for pyelonephritis : ciprofloxacin levofloxacin tmp / smz do not use nitrofurantoin for pyelo ( only cystitis ) <user>,0,0
1,1,332227554956161024,,,,,trazodone,happy for wellbutrin; has similar effects as adderall.. trazodone is super promising for sleep.. but abilify can cause weight gain -_-,,,happy for wellbutrin ; has similar effects as adderall . <repeated> trazodone is super promising for sleep . <repeated> but abilify can cause weight gain -_-,0,0
2,2,332448217490944000,,,,,lamotrigine,"@stilgarg i'm ok ty have an official diagnosis of bipolar now, feeling ok at the moment lamotrigine has been increased having monotherapy:/",,,"<user> i am ok ty have an official diagnosis of bipolar now , feeling ok at the moment lamotrigine has been increased having monotherapy <annoyed>",0,0
3,3,332977955754110976,,,,,cymbalta,i'm soo depressed cymbalta couldn't help me .,,,i am soo depressed cymbalta could not help me .,0,0
4,4,333674203331051520,,,,,seroquel,"time for my daily afternoon relaxation ritual of smoking weed, taking 2 mgs of clonazepam, and 400 mg of seroquel xr.",,,"time for my daily afternoon relaxation ritual of smoking weed , taking <number> mgs of clonazepam , and <number> mg of seroquel xr .",0,0


## 4. Prepare the data

In [None]:
def _prepare_split(frame):
    """Return a cleaned copy of ``frame`` without modifying the input.

    The copy has a fresh 0..n-1 index, no "Unnamed: 0" column, and every
    missing ``extraction`` replaced by the placeholder '-'.
    """
    return (
        frame
        .reset_index(drop=True)
        .drop(columns="Unnamed: 0")
        .assign(extraction=lambda df: df["extraction"].fillna('-'))
    )


# Training keeps only the rows whose ADR flag is 1; validation keeps every row.
# Building new frames (instead of inplace edits on a filtered slice) avoids
# chained-assignment problems that the global warnings filter would hide.
training_data = _prepare_split(training[training.ADR == 1])
# validation_data = validation[validation.ADR == 1]
validation_data = _prepare_split(validation)

In [None]:
print(f"Shape of training data: {training_data.shape}")
print(f"Shape of validation data: {validation_data.shape}")

Shape of training data: (1434, 12)
Shape of validation data: (560, 12)


In [None]:
# The begin/end offsets shipped with the data frequently do not line up with the extraction text,
# so recompute the start and end character index of every extraction inside its tweet.

def find_start_end(dataframe):
    """Recompute where each extraction sits inside its tweet.

    Rows are read positionally from ``itertuples()``: ``row[2]`` / ``row[3]`` are
    the original begin / end values, ``row[5]`` is the extraction and ``row[7]``
    is the tweet. The search is case-insensitive and treats the extraction as
    literal text, so characters such as ``(``, ``?``, ``.`` or ``*`` need no
    escaping and can neither raise a regex error nor match unrelated text.

    Args:
        dataframe: Frame whose column order matches the positions listed above.
            An extraction equal to ``'-'`` marks a row without an extraction.

    Returns:
        ``(new_start, new_end)``: two lists with one integer per row.
        * no extraction        -> ``0`` / ``0``
        * extraction found     -> offsets of its first occurrence in the tweet
        * extraction not found -> the original begin / end cast to ``int``
          (``int()`` raises if those values are missing)

    Side effects:
        Prints the percentage of rows whose original offsets differ from the
        recomputed ones (``0.0`` for an empty frame).
    """
    mismatches = 0
    new_start = list()
    new_end = list()

    for row in dataframe.itertuples():
        extraction, tweet = row[5], row[7]

        if extraction == '-':  # no extraction for this row
            new_start.append(0)
            new_end.append(0)
            continue

        needle = extraction.lower()
        start = tweet.lower().find(needle)

        if start == -1:
            # Extraction text is not present verbatim: keep the original offsets.
            new_start.append(int(row[2]))
            new_end.append(int(row[3]))
            continue

        end = start + len(needle)
        # NOTE(review): a row is counted only when BOTH offsets differ, which is
        # what the original code did; confirm whether "either differs" was meant.
        if row[2] != start and row[3] != end:
            mismatches += 1
        new_start.append(start)
        new_end.append(end)

    total_rows = dataframe.shape[0]
    percentage = round(mismatches / total_rows * 100, 4) if total_rows else 0.0
    print(f"Percentage of rows for which start and end index did not match is {percentage}%")
    return new_start, new_end

In [None]:
# Recompute the extraction spans for both splits; each call also prints its mismatch percentage
train_start, train_end = find_start_end(training_data)
valid_start, valid_end = find_start_end(validation_data)

Percentage of rows for which start and end index did not match is 36.4017%
Percentage of rows for which start and end index did not match is 21.0714%


In [None]:
# Attach the recomputed span boundaries to each split as new_start / new_end
for frame, starts, ends in (
    (training_data, train_start, train_end),
    (validation_data, valid_start, valid_end),
):
    frame['new_start'] = starts
    frame['new_end'] = ends

In [None]:
training_data.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,clean_tweets,label,ADR,new_start,new_end
0,342314998904786945,42.0,53.0,ADR,lost vision,cymbalta,"#cymbalta withdrawal has reached a peak, lost vision and almost crashed my car from a brain zap. thanks a zillion #elililly #bigpharma",10047522.0,vision loss,"<hashtag> cymbalta </hashtag> withdrawal has reached a peak , lost vision and almost crashed my car from a brain zap . thanks a zillion <hashtag> eli lilly </hashtag> <hashtag> big pharma </hashtag>",1,1,41,52
1,342314998904786945,11.0,21.0,ADR,withdrawal,cymbalta,"#cymbalta withdrawal has reached a peak, lost vision and almost crashed my car from a brain zap. thanks a zillion #elililly #bigpharma",10048010.0,withdrawal syndrome,"<hashtag> cymbalta </hashtag> withdrawal has reached a peak , lost vision and almost crashed my car from a brain zap . thanks a zillion <hashtag> eli lilly </hashtag> <hashtag> big pharma </hashtag>",1,1,10,20
2,342322703556038657,27.0,35.0,ADR,nauseous,cipro,i hate cipro! #antibiotic #nauseous #cf #hospitallife #cysticfibrosis,10028823.0,nauseous,i hate cipro ! <hashtag> antibiotic </hashtag> <hashtag> nauseous </hashtag> <hashtag> cf </hashtag> <hashtag> hospital life </hashtag> <hashtag> cystic fibrosis </hashtag>,1,1,27,35
3,342349802601844737,109.0,118.0,ADR,can't cum,seroquel,@luckystubbs reppin zoloft&amp;seroquel since last november. i'm hella gainin weight too awesome i'm fat and can't cum i own,10021574.0,inability to orgasm,<user> reppin zoloft & seroquel since last november . i am hella gainin weight too awesome i am fat and can not cum i own,1,1,109,118
4,342349802601844737,101.0,104.0,ADR,fat,seroquel,@luckystubbs reppin zoloft&amp;seroquel since last november. i'm hella gainin weight too awesome i'm fat and can't cum i own,10047896.0,weight gain,<user> reppin zoloft & seroquel since last november . i am hella gainin weight too awesome i am fat and can not cum i own,1,1,101,104


In [None]:
# Load the spaCy 'en_core_web_trf' pipeline and keep a handle to its 'ner' component
nlp = spacy.load('en_core_web_trf')
ner = nlp.get_pipe('ner')

In [None]:
nlp.pipe_names

['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# Build spaCy-style (text, annotations) pairs; every tweet carries exactly one 'ADR' span
# taken from the positional fields 13 (new_start) and 14 (new_end) of each row.
TRAIN_DATA = [
    (record[7], {'entities': [(int(record[13]), int(record[14]), 'ADR')]})
    for record in training_data.itertuples()
]

VALID_DATA = [
    (record[7], {'entities': [(int(record[13]), int(record[14]), 'ADR')]})
    for record in validation_data.itertuples()
]

In [None]:
# Tag each text in BILUO format, then convert to the BIO labelling scheme:
# "L-" becomes "I-", "U-" becomes "B-" and spaCy's "-" marker becomes "O".

def _biluo_to_bio(tag):
    """Map one BILUO tag onto the BIO scheme, rewriting only the tag's prefix."""
    if tag == '-':
        return 'O'
    if tag.startswith('L-'):
        return 'I-' + tag[2:]
    if tag.startswith('U-'):
        return 'B-' + tag[2:]
    return tag


def _bio_tags_for(examples):
    """Return one list of BIO tags per (text, annotations) pair in ``examples``."""
    # offsets_to_biluo_tags only needs token boundaries, so tokenize with
    # nlp.make_doc instead of running every pipeline component on each text.
    return [
        [
            _biluo_to_bio(tag)
            for tag in offsets_to_biluo_tags(nlp.make_doc(text), annot['entities'])
        ]
        for text, annot in examples
    ]


tags_list = _bio_tags_for(TRAIN_DATA)
valid_tags_list = _bio_tags_for(VALID_DATA)

In [None]:
# Two empty frames (one per split) that later cells fill with token-level data
final_train, final_valid = (
    pd.DataFrame(columns=["extraction", "tweet", "bio_tags"]) for _split in range(2)
)

In [None]:
# Use bracket assignment throughout: `frame.new_name = values` only sets a Python
# attribute on the object (pandas does not create a column that way), so `label`
# and `ADR` would be lost as soon as the frame is filtered or copied.
final_train["extraction"] = training_data["extraction"]
final_train["bio_tags"] = tags_list
final_train["label"] = training_data["label"]
final_train["ADR"] = training_data["ADR"]

final_valid["extraction"] = validation_data["extraction"]
final_valid["bio_tags"] = valid_tags_list
final_valid["label"] = validation_data["label"]
final_valid["ADR"] = validation_data["ADR"]

In [None]:
# Store every tweet as its list of spaCy token strings. Only the tokenizer is needed
# for that, so use nlp.make_doc rather than running the whole pipeline on each tweet.
final_train["tweet"] = [[str(token) for token in nlp.make_doc(tweet)] for tweet in training_data.tweet]
final_valid["tweet"] = [[str(token) for token in nlp.make_doc(tweet)] for tweet in validation_data.tweet]

In [None]:
final_train.head()

Unnamed: 0,extraction,tweet,bio_tags
0,lost vision,"[#, cymbalta, withdrawal, has, reached, a, peak, ,, lost, vision, and, almost, crashed, my, car, from, a, brain, zap, ., thanks, a, zillion, #, elililly, #, bigpharma]","[O, O, O, O, O, O, O, O, B-ADR, I-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,withdrawal,"[#, cymbalta, withdrawal, has, reached, a, peak, ,, lost, vision, and, almost, crashed, my, car, from, a, brain, zap, ., thanks, a, zillion, #, elililly, #, bigpharma]","[O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,nauseous,"[i, hate, cipro, !, #, antibiotic, #, nauseous, #, cf, #, hospitallife, #, cysticfibrosis]","[O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O]"
3,can't cum,"[@luckystubbs, reppin, zoloft&amp;seroquel, since, last, november, ., i, 'm, hella, gainin, weight, too, awesome, i, 'm, fat, and, ca, n't, cum, i, own]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, I-ADR, I-ADR, O, O]"
4,fat,"[@luckystubbs, reppin, zoloft&amp;seroquel, since, last, november, ., i, 'm, hella, gainin, weight, too, awesome, i, 'm, fat, and, ca, n't, cum, i, own]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O]"


In [None]:
final_valid[final_valid.label != final_valid.ADR].shape

(72, 3)

In [None]:
final_valid.loc[(final_valid.label == 0) & (final_valid.ADR == 1), :]

Unnamed: 0,extraction,tweet,bio_tags
7,-,"[@deadfanclub, @danadelany, i, have, a, penicillin/, keflex, allergy, so, my, dr, writes, me, 10, days, of, zpack, or, i, 'd, still, be, sick, too, !, sometimes, cipro]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
12,-,"[@lanternlizard, cymbalta, switch, is, part, of, what, made, me, flip, ., but, any, snri, with, family, hx, of, mania, puts, you, at, risk, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
20,-,"[@autumnuh_beee, i, was, on, celexa, ,, which, made, me, sleepy, ., i, switched, to, effexor, ., it, has, norepinephrine, in, it, ,, so, it, jump, starts, me, in, the, mornin]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
21,-,"[i, feel, so, pill, -, ill, again, -, decided, to, seriously, cut, down, to, just, weekly, enbrel, and, metho, alongside, daily, nsaids, #, rheum, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
22,-,"[rt, @dratp, :, @lisabinkc, @trainingwithra, @dsymons, enbrel, also, available, as, prefilled, syringe, ,, much, gentler, ., pen, leaves, bruises, !, #, rheum]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
26,-,"[i, have, some, seriously, disturbing, dreams, ., thanks, #, fluoxetine, thanks, a, freaking, lot, ., well, i, guess, it, 's, better, than, being, #, suicidal, .., only, just]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
30,-,"[will, still, be, on, the, geodon, ,, but, will, also, be, taking, a, low, -, dose, alprazolam, as, needed, for, the, sudden, anxiety, attacks, i, ve, been, getting]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
33,-,"[@showbizmom48, mri, showed, lesions, in, my, noggin, :, s, got, to, have, eeg, to, see, if, its, ms, or, humira, doing, it.so, more, tests, n, messing, about, lolxx]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
57,-,"[accidentally, asphyxiating, yourself, trying, to, suck, a, throat, lozenge, when, you, have, a, stuffed, nose, ., #, that]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
81,-,"[@iamcarrieeeeee, that, 's, what, i, get, #, bipolar2, but, the, bloody, quetiapine, made, me, so, bloody, miserable, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [None]:
final_valid.loc[(final_valid.label == 1) & (final_valid.ADR == 0), :]

Unnamed: 0,extraction,tweet,bio_tags
0,allergies,"[do, you, have, any, medication, allergies, ?, "", asthma, !, !, !, "", me, :, "", ........, "", pt, :, "", no, wait, ., avelox, ,, that, 's, it, !, "", "", so, no, other, allergies, ?, "", "", right, !, "", *, cont]","[O, O, O, O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,HURT YOUR Liver,"[@ashleylvivian, if, #, avelox, has, hurt, your, liver, ,, avoid, tylenol, always, ,, as, it, further, damages, liver, ,, eat, grapefruit, unless, taking, cardiac, drugs]","[O, O, O, O, O, B-ADR, I-ADR, I-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
23,space cadet,"[note, to, self, :, do, not, take, #, fluoxetine, and, #, aleved, together, ., i, 'm, a, space, cadet, today, !, , #, druginteractions]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, I-ADR, O, O, O, O, O]"
47,rashes,"[so, while, i, 'm, letting, the, thoughts, of, lamictal, /, fatal, rashes, /, tonsillectomy, /, spinal, tap, settle, ,, i, 'm, going, to, go, rescue, a, budgie, .]","[O, O, O, O, O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
83,allergic,"[(, triggering, ), so, ..., i, 'm, taking, quetiapine, and, tramadol, pre, -, cutting, ., i, 'm, allergic, to, tramadol, ..., this, should, be, interesting, #, selfharm]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O, O, O, O]"
114,dependence,"[but, i, do, n't, want, to, take, an, ambien, or, trazodone, or, anything, because, i, dint, wanna, restart, my, dependence, upon, them]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, O, O]"
172,parody of an actual human being,"[wellbutrin, ,, zyprexa, ,, xanax, ,, effexor, ,, lamotrigine, ,, valium, ., every, day, ., i, 'm, not, even, a, person, anymore, ., i, 'm, a, parody, of, an, actual, human, being, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, I-ADR, I-ADR, I-ADR, I-ADR, I-ADR, O]"
184,tired daytimes,"[tonight, i, am, taking, my, #, venlafaxine, in, the, evening, in, pursuit, of, better, sleep, and, less, tired, daytimes, ., wish, me, luck, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, I-ADR, O, O, O, O, O]"
303,headache,"[nicotine, is, literally, poison, ., but, sometimes, you, got, ta, find, out, the, hard, way, ., #, headache]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR]"
304,poison,"[nicotine, is, literally, poison, ., but, sometimes, you, got, ta, find, out, the, hard, way, ., #, headache]","[O, O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [None]:
# Split the validation rows by their ADR flag: 0 goes to final_valid_zero, 1 stays in final_valid
adr_flag = final_valid.ADR
final_valid_zero = final_valid[adr_flag == 0]
final_valid = final_valid[adr_flag == 1]

In [None]:
# Renumber both subsets from 0, discarding the old index
for subset in (final_valid_zero, final_valid):
    subset.reset_index(inplace=True, drop=True)

In [None]:
# Report (rows, columns) of each validation subset
for subset in (final_valid_zero, final_valid):
    print(subset.shape)

(169, 3)
(391, 3)


In [None]:
# Two lookup tables between tag strings and integer ids, built from the sorted
# set of tags that occur in the training data
all_tags = ','.join(','.join(tags) for tags in final_train.bio_tags)
unique_tags = np.unique(all_tags.split(','))
labels_to_ids = dict(zip(unique_tags, range(len(unique_tags))))
ids_to_labels = dict(enumerate(unique_tags))

In [None]:
pprint(ids_to_labels)
pprint(labels_to_ids)

{0: 'B-ADR', 1: 'I-ADR', 2: 'O'}
{'B-ADR': 0, 'I-ADR': 1, 'O': 2}


In [None]:
MAX_LEN = 128  # handed to the dataset wrapper as max_len (tokenizer padding/truncation length)
TRAIN_BATCH_SIZE = 1  # batch size of the training DataLoader
VALID_BATCH_SIZE = 1  # batch size of the evaluation DataLoader
EPOCHS = 5  # number of passes of the training loop
LEARNING_RATE = 5e-05  # learning rate given to the AdamW optimizer
MAX_GRAD_NORM = 10  # NOTE(review): not referenced by the visible training code; train() clips at its own default of 1.0
# Tokenizer matching the SciBERT checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [None]:
# encoding = tokenizer(final_train.tweet.iloc[0],
#                         is_split_into_words=True,
#                         return_offsets_mapping=True, 
#                         padding='max_length', 
#                         truncation=True, 
#                         max_length=128)

In [None]:
# for key, value in encoding.items():
#     print(key, value)

In [None]:
# for token, offset in zip(tokenizer.convert_ids_to_tokens(encoding["input_ids"]), encoding['offset_mapping']):
#   print('{0:10} {1}'.format(token, offset))

In [None]:
class dataset(Dataset):
    """Torch dataset that turns pre-tokenized tweets and BIO tags into model inputs.

    Each row of ``dataframe`` must provide ``tweet`` (a list of word strings) and
    ``bio_tags`` (one tag per word). An item holds everything the tokenizer
    returns, converted to tensors, plus ``labels``: the tag id on the first
    sub-token of every word and ``-100`` everywhere else (special tokens,
    padding and continuation sub-tokens).

    Args:
        dataframe: Source frame; rows are addressed by position.
        tokenizer: Tokenizer whose output offers ``word_ids()``; offsets are
            requested too, as in the original implementation.
        max_len: Length every example is padded / truncated to.
        label_map: Optional ``{tag: id}`` mapping. When omitted, the
            notebook-level ``labels_to_ids`` is looked up at access time.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_map=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = label_map

    def __getitem__(self, index):
        # Positional access: DataLoader indices run 0..len-1 regardless of the
        # frame's index labels.
        tweet = self.data.tweet.iloc[index]
        bio_tags = self.data.bio_tags.iloc[index]
        label_map = labels_to_ids if self.label_map is None else self.label_map

        encoding = self.tokenizer(tweet,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        labels = [label_map[label] for label in bio_tags]
        encoded_labels = np.full(len(encoding["input_ids"]), -100, dtype=int)

        # word_ids() names the source word of every sub-token (None for special
        # and padding tokens). Indexing the labels by that word id keeps them
        # aligned even when a word (e.g. a whitespace-only token) produces no
        # sub-token at all, where counting "first sub-tokens" would shift every
        # later label by one.
        previous_word = None
        for idx, word_idx in enumerate(encoding.word_ids()):
            if word_idx is not None and word_idx != previous_word:
                encoded_labels[idx] = labels[word_idx]
            previous_word = word_idx

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

In [None]:
# Wrap the training frame and the ADR == 1 validation frame in the torch dataset class
training_set = dataset(final_train, tokenizer, MAX_LEN)
testing_set = dataset(final_valid, tokenizer, MAX_LEN)

In [None]:
training_set[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  102,  3000,   834,  9111, 12659, 30110, 12751,   434,  5432,   106,
          2391,   422,  6677,  6021,   137,  3343, 20116,   119,  1536,   808,
           263,   106,  2216, 29455,   205,  9797,   106, 10053,  4078,   329,
          3000,   847,  3145,  4078, 30126,  3000,  5517,  4504,  9799, 30110,
           103,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     

In [None]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
#           2
cy          2
##mb        -100
##alt       -100
##a         -100
withdrawal  2
has         2
reached     2
a           2
peak        2
,           2
lost        0
vision      1
and         2
almost      2
crash       2
##ed        -100
my          2
car         2
from        2
a           2
brain       2
zap         2
.           2
thanks      2
a           2
zi          2
##ll        -100
##ion       -100
#           2
el          2
##ili       -100
##ll        -100
##y         -100
#           2
big         2
##pha       -100
##rm        -100
##a         -100
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]     

In [None]:
# Training batches are reshuffled on every pass over the loader
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation gains nothing from shuffling; a fixed order keeps the labels and
# predictions returned by the evaluation loop reproducible between runs
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from transformers import AutoModelForTokenClassification

In [None]:
model = AutoModelForTokenClassification.from_pretrained('allenai/scibert_scivocab_uncased', num_labels=len(labels_to_ids)).to(device)

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [None]:
def count_parameter(model):
    """Return how many scalar weights of ``model`` have ``requires_grad`` set."""
    trainable = 0
    for weight in model.parameters():
        if weight.requires_grad:
            trainable += weight.numel()
    return trainable

In [None]:
print(f"The model has {count_parameter(model):,} trainable parameters.")

The model has 109,330,179 trainable parameters.


In [None]:
# Smoke test: push a single training example through the model and display its loss
inputs = training_set[2]

# Add a batch dimension of 1 to each tensor and move it to the target device
input_ids, attention_mask, labels = (
    inputs[key].unsqueeze(0).to(device)
    for key in ("input_ids", "attention_mask", "labels")
)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
outputs.loss

tensor(1.4511, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training function: one fine-tuning pass of the model over `training_loader`
def train(epoch, clip=1.0):
    """Run one training epoch and print the running loss, epoch loss and epoch F1.

    Uses the notebook-level ``model``, ``optimizer``, ``training_loader``,
    ``device`` and ``ids_to_labels``.

    The F1 score is computed once per epoch over all collected sequences (one
    sequence per example). Averaging per-step F1 values, as done previously, is
    not equivalent because F1 does not decompose over batches.

    Args:
        epoch: Zero-based epoch number; only used in the printed messages.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        None.
    """
    tr_loss = 0.0
    nb_tr_steps = 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        optimizer.zero_grad()
        md = model(input_ids=ids, attention_mask=mask, labels=labels)

        loss, tr_logits = md.loss, md.logits
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # Decode predictions example by example so that sequences from different
        # examples in a batch are never glued together; positions labelled -100
        # are ignored.
        batch_predictions = torch.argmax(tr_logits.detach(), dim=2)
        for gold_row, pred_row in zip(labels, batch_predictions):
            keep = gold_row != -100
            tr_labels.append([ids_to_labels.get(tag, 'O') for tag in gold_row[keep].tolist()])
            tr_preds.append([ids_to_labels.get(tag, 'O') for tag in pred_row[keep].tolist()])

        # backward pass
        loss.backward()

        # gradient clipping
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        # parameter update
        optimizer.step()

    if nb_tr_steps == 0:
        print(f"Training epoch {epoch+1}: the training loader yielded no batches.\n")
        return

    epoch_loss = tr_loss / nb_tr_steps
    tr_f1 = f1_score(tr_labels, tr_preds, average='micro')
    print(f"Training loss epoch {epoch+1}: {epoch_loss}")
    print(f"Training F1 score epoch {epoch+1}: {tr_f1}\n")

In [None]:
# Fine-tune for EPOCHS passes; train() prints its own loss and F1 summaries
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 0.002165211597457528
Training loss per 100 training steps: 0.1589409051218651
Training loss per 100 training steps: 0.16339796099076812
Training loss per 100 training steps: 0.1661536918113401
Training loss per 100 training steps: 0.17035028243108194
Training loss per 100 training steps: 0.16623433143657515
Training loss per 100 training steps: 0.1684685309897735
Training loss per 100 training steps: 0.18478929750781078
Training loss per 100 training steps: 0.19101034302116063
Training loss per 100 training steps: 0.1855150568248282
Training loss per 100 training steps: 0.18626105169732274
Training loss per 100 training steps: 0.18481524255318413
Training loss per 100 training steps: 0.18573020891462655
Training loss per 100 training steps: 0.19072718602270683
Training loss per 100 training steps: 0.19402167459046282
Training loss epoch 1: 0.19538717336881734
Training F1 score epoch 1: 0.475523012552301

Training epoch: 2
Training

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return gold / predicted tag sequences.

    Uses the notebook-level ``device`` and ``ids_to_labels``. Prints the running
    loss every 100 steps, then the mean loss and the F1 score computed over all
    collected sequences (not a mean of per-step F1 values).

    Args:
        model: Token-classification model returning ``loss`` and ``logits``.
        testing_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        ``(eval_labels, eval_preds)``: two lists holding one list of tag strings
        per evaluated example, restricted to positions whose label is not -100.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss = 0.0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            md = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = md.loss, md.logits

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Decode example by example so the returned lists hold one sequence
            # per example whatever the batch size; -100 positions are skipped.
            batch_predictions = torch.argmax(eval_logits, dim=2)
            for gold_row, pred_row in zip(labels, batch_predictions):
                keep = gold_row != -100
                eval_labels.append([ids_to_labels.get(tag, 'O') for tag in gold_row[keep].tolist()])
                eval_preds.append([ids_to_labels.get(tag, 'O') for tag in pred_row[keep].tolist()])

    if nb_eval_steps == 0:
        print("\nValidation loader yielded no batches; nothing to evaluate.")
        return eval_labels, eval_preds

    eval_loss = eval_loss / nb_eval_steps
    eval_f1 = f1_score(eval_labels, eval_preds, average='micro')
    print(f"\nValidation Loss: {eval_loss}")
    print(f"Validation F1 score: {eval_f1}")

    return eval_labels, eval_preds

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.7198742032051086
Validation loss per 100 evaluation steps: 0.46097442880272865
Validation loss per 100 evaluation steps: 0.4261063336248774
Validation loss per 100 evaluation steps: 0.4064989043233384

Validation Loss: 0.41216266314830163
Validation F1 score: 0.17306052855924975


In [None]:
# Rows in final_valid_zero are not part of testing_set: their gold tags come straight
# from the frame and their prediction is 'O' for every token of the tweet
label_list = final_valid_zero.bio_tags.tolist()
pred_list = [['O'] * len(tokens) for tokens in final_valid_zero.tweet]

In [None]:
print(len(label_list))
print(len(pred_list))

169
169


In [None]:
assert len(label_list) == len(pred_list)

In [None]:
# Append the final_valid_zero sequences to the lists returned by valid().
# NOTE: extend() mutates the lists in place, so running this cell again without
# re-running valid() appends the same rows a second time.
labels.extend(label_list)
predictions.extend(pred_list)

In [None]:
assert len(labels) == len(predictions)

In [None]:
print(classification_report(labels, predictions))

              precision    recall  f1-score   support

         ADR       0.17      0.32      0.22       357

   micro avg       0.17      0.32      0.22       357
   macro avg       0.17      0.32      0.22       357
weighted avg       0.17      0.32      0.22       357



In [None]:
from seqeval.scheme import IOB2

In [None]:
print(classification_report(labels, predictions, mode='strict', scheme=IOB2))

              precision    recall  f1-score   support

         ADR       0.17      0.32      0.22       357

   micro avg       0.17      0.32      0.22       357
   macro avg       0.17      0.32      0.22       357
weighted avg       0.17      0.32      0.22       357



In [None]:
torch.save(model.state_dict(), "scibert_ner.pt")