# Task 2 - SciBERT based NER

## 1. Install necessary libraries

In [None]:
!pip install -q contractions transformers sent2vec imbalanced-learn seqeval[gpu]
!pip install -q tf-estimator-nightly==2.8.0.dev2021122109
!python -m pip uninstall -q -y spacy
!python -m pip install -q -U spacy

[K     |████████████████████████████████| 4.0 MB 30.3 MB/s 
[K     |████████████████████████████████| 43 kB 2.5 MB/s 
[K     |████████████████████████████████| 287 kB 51.4 MB/s 
[K     |████████████████████████████████| 106 kB 72.8 MB/s 
[K     |████████████████████████████████| 596 kB 60.7 MB/s 
[K     |████████████████████████████████| 880 kB 60.4 MB/s 
[K     |████████████████████████████████| 77 kB 8.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 61.4 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 462 kB 35.0 MB/s 
[K     |████████████████████████████████| 6.2 MB 27.4 MB/s 
[K     |████████████████████████████████| 10.1 MB 62.3 MB/s 
[K     |████████████████████████████████| 653 kB 63.6 MB/s 
[K     |████████████████████████████████| 42 kB 1.8 MB/s 
[K     |████████████████████████████████| 457 kB 71.0 MB/s 
[K     |███████████

In [None]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.3.0/en_core_web_trf-3.3.0-py3-none-any.whl (460.3 MB)
[K     |████████████████████████████████| 460.3 MB 27 kB/s 
[?25hCollecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.5-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 177 kB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 21.7 MB/s 
[?25hCollecting transformers<4.18.0,>=3.4.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 73.3 MB/s 
Installing collected packages: transformers, spacy-alignments, spacy-transformers, en-core-web-trf
  Attempting uninstall: transformers
    Found existing installation: transformers 4.18.0
    Uninstalling tra

In [None]:
# !python -m spacy download en_core_web_sm

## 2. Load all libraries

In [None]:
import numpy as np
import pandas as pd
import spacy
import nltk
import random
import re
import torch
import warnings
import torch.nn as nn

from spacy.util import minibatch, compounding
from spacy.training import Example
from pathlib import Path
from spacy.training import offsets_to_biluo_tags
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
# from sklearn.metrics import accuracy_score, classification_report, f1_score
from seqeval.metrics import classification_report, f1_score

nltk.download('punkt')
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth=None
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 3. Load the dataset

In [None]:
# Load data
training = pd.read_csv('training_data_with_ADR.csv')
validation = pd.read_csv('validation_data_with_ADR.csv')

In [None]:
cadec_data = pd.read_excel('cadec_data.xlsx')

In [None]:
print(training.shape)
print(validation.shape)

(2172, 13)
(560, 13)


In [None]:
cadec_data.shape

(6315, 6)

In [None]:
training.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,clean_tweets,label,ADR
0,0,331187619096588288,,,,,ofloxacin,@seefisch:oral drugs for pyelonephritis:ciprofloxacin levofloxacin tmp/smz do not use nitrofurantoin for pyelo(only cystitis)@david_medinaf,,,<user> : oral drugs for pyelonephritis : ciprofloxacin levofloxacin tmp / smz do not use nitrofurantoin for pyelo ( only cystitis ) <user>,0,0
1,1,332227554956161024,,,,,trazodone,happy for wellbutrin; has similar effects as adderall.. trazodone is super promising for sleep.. but abilify can cause weight gain -_-,,,happy for wellbutrin ; has similar effects as adderall . <repeated> trazodone is super promising for sleep . <repeated> but abilify can cause weight gain -_-,0,0
2,2,332448217490944000,,,,,lamotrigine,"@stilgarg i'm ok ty have an official diagnosis of bipolar now, feeling ok at the moment lamotrigine has been increased having monotherapy:/",,,"<user> i am ok ty have an official diagnosis of bipolar now , feeling ok at the moment lamotrigine has been increased having monotherapy <annoyed>",0,0
3,3,332977955754110976,,,,,cymbalta,i'm soo depressed cymbalta couldn't help me .,,,i am soo depressed cymbalta could not help me .,0,0
4,4,333674203331051520,,,,,seroquel,"time for my daily afternoon relaxation ritual of smoking weed, taking 2 mgs of clonazepam, and 400 mg of seroquel xr.",,,"time for my daily afternoon relaxation ritual of smoking weed , taking <number> mgs of clonazepam , and <number> mg of seroquel xr .",0,0


In [None]:
cadec_data.head()

Unnamed: 0.1,Unnamed: 0,text,extraction,indexes,text_tokens,bio_tags
0,0,"I am left with chest pain, muscle weakness and spasms and I am up every 3 hrs drinking water. Drs. laugh when I say that I have Rhabdomyolosis - my Cpk or ck or whatever it is is fine. but no one cks the myoglobin or hemoglogin in urine. I'm desparate for a Dr in Mn. that will do that. I don't know where to turn and get sicker and sicker. Am afraid my heart will go. The drug gets cholestorel down, but ruins your life.",chest pain,['15 25'],"[I, am, left, with, chest, pain, ,, muscle, weakness, and, spasms, and, I, am, up, every, 3, hrs, drinking, water, ., Drs, ., laugh, when, I, say, that, I, have, Rhabdomyolosis, -, my, Cpk, or, ck, or, whatever, it, is, is, fine, ., but, no, one, cks, the, myoglobin, or, hemoglogin, in, urine, ., I, 'm, desparate, for, a, Dr, in, Mn, ., that, will, do, that, ., I, do, n't, know, where, to, turn, and, get, sicker, and, sicker, ., Am, afraid, my, heart, will, go, ., The, drug, gets, cholestorel, down, ,, but, ruins, your, life, .]","['O', 'O', 'O', 'O', 'B-ADR', 'I-ADR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
1,1,"I am left with chest pain, muscle weakness and spasms and I am up every 3 hrs drinking water. Drs. laugh when I say that I have Rhabdomyolosis - my Cpk or ck or whatever it is is fine. but no one cks the myoglobin or hemoglogin in urine. I'm desparate for a Dr in Mn. that will do that. I don't know where to turn and get sicker and sicker. Am afraid my heart will go. The drug gets cholestorel down, but ruins your life.",muscle weakness,['27 42'],"[I, am, left, with, chest, pain, ,, muscle, weakness, and, spasms, and, I, am, up, every, 3, hrs, drinking, water, ., Drs, ., laugh, when, I, say, that, I, have, Rhabdomyolosis, -, my, Cpk, or, ck, or, whatever, it, is, is, fine, ., but, no, one, cks, the, myoglobin, or, hemoglogin, in, urine, ., I, 'm, desparate, for, a, Dr, in, Mn, ., that, will, do, that, ., I, do, n't, know, where, to, turn, and, get, sicker, and, sicker, ., Am, afraid, my, heart, will, go, ., The, drug, gets, cholestorel, down, ,, but, ruins, your, life, .]","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ADR', 'I-ADR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
2,2,"I am left with chest pain, muscle weakness and spasms and I am up every 3 hrs drinking water. Drs. laugh when I say that I have Rhabdomyolosis - my Cpk or ck or whatever it is is fine. but no one cks the myoglobin or hemoglogin in urine. I'm desparate for a Dr in Mn. that will do that. I don't know where to turn and get sicker and sicker. Am afraid my heart will go. The drug gets cholestorel down, but ruins your life.",spasms,['47 53'],"[I, am, left, with, chest, pain, ,, muscle, weakness, and, spasms, and, I, am, up, every, 3, hrs, drinking, water, ., Drs, ., laugh, when, I, say, that, I, have, Rhabdomyolosis, -, my, Cpk, or, ck, or, whatever, it, is, is, fine, ., but, no, one, cks, the, myoglobin, or, hemoglogin, in, urine, ., I, 'm, desparate, for, a, Dr, in, Mn, ., that, will, do, that, ., I, do, n't, know, where, to, turn, and, get, sicker, and, sicker, ., Am, afraid, my, heart, will, go, ., The, drug, gets, cholestorel, down, ,, but, ruins, your, life, .]","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ADR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
3,3,"I am left with chest pain, muscle weakness and spasms and I am up every 3 hrs drinking water. Drs. laugh when I say that I have Rhabdomyolosis - my Cpk or ck or whatever it is is fine. but no one cks the myoglobin or hemoglogin in urine. I'm desparate for a Dr in Mn. that will do that. I don't know where to turn and get sicker and sicker. Am afraid my heart will go. The drug gets cholestorel down, but ruins your life.",every 3 hrs drinking water,['66 92'],"[I, am, left, with, chest, pain, ,, muscle, weakness, and, spasms, and, I, am, up, every, 3, hrs, drinking, water, ., Drs, ., laugh, when, I, say, that, I, have, Rhabdomyolosis, -, my, Cpk, or, ck, or, whatever, it, is, is, fine, ., but, no, one, cks, the, myoglobin, or, hemoglogin, in, urine, ., I, 'm, desparate, for, a, Dr, in, Mn, ., that, will, do, that, ., I, do, n't, know, where, to, turn, and, get, sicker, and, sicker, ., Am, afraid, my, heart, will, go, ., The, drug, gets, cholestorel, down, ,, but, ruins, your, life, .]","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
4,4,within 2 hours started having heart attack symptoms. Had to go to ER. I was in ER within 3 hours of taking 1st pill. It was scary trying to make sure I had proper observation and not over-reaction because of the symptoms. I had to slow people down several times. Now convinced it would have had serious health risks for me had I even tried taking a second pill the next day or even continuing taking them. It took a while (months) before I started to feel normal again.,heart attack symptoms,['30 51'],"[within, 2, hours, started, having, heart, attack, symptoms, ., Had, to, go, to, ER, ., I, was, in, ER, within, 3, hours, of, taking, 1st, pill, ., It, was, scary, trying, to, make, sure, I, had, proper, observation, and, not, over, -, reaction, because, of, the, symptoms, ., I, had, to, slow, people, down, several, times, ., Now, convinced, it, would, have, had, serious, health, risks, for, me, had, I, even, tried, taking, a, second, pill, the, next, day, or, even, continuing, taking, them, ., It, took, a, while, (, months, ), before, I, started, to, feel, normal, again, .]","['O', 'O', 'O', 'O', 'O', 'B-ADR', 'I-ADR', 'I-ADR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"


## 4. Prepare the data

In [None]:
# Get the non-null rows
training_data = training[training.ADR == 1]
validation_data = validation[validation.ADR == 1]

# Reset index
training_data.reset_index(inplace=True, drop=True)
validation_data.reset_index(inplace=True, drop=True)

# Drop unwanted column
training_data.drop("Unnamed: 0", inplace=True, axis=1)
validation_data.drop("Unnamed: 0", inplace=True, axis=1)

# Fill in the missing values
training_data.extraction = training_data.extraction.fillna('-')
validation_data.extraction = validation_data.extraction.fillna('-')

In [None]:
print(f"Shape of training data: {training_data.shape}")
print(f"Shape of validation data: {validation_data.shape}")

Shape of training data: (1434, 12)
Shape of validation data: (391, 12)


In [None]:
# Ground truth labels were misleading 
# Hence create a new ground truth for start and end index

def find_start_end(dataframe):
    count = 0
    new_start = list()
    new_end = list()

    for row in dataframe.itertuples():
        if row[5] != '-': # If extraction is not empty then
            match = re.search(r"{}".format(row[5].lower().replace(")", "\)")), r"{}".format(row[7].lower()))
            if not match:
                new_start.append(int(row[2]))
                new_end.append(int(row[3]))
            else:
                if row[2] != match.start() and row[3] != match.end():
                    count += 1
                new_start.append(match.start())
                new_end.append(match.end())
        
        else: # If extraction is empty then
            new_start.append(0)
            new_end.append(0)

    print(f"Percentage of rows for which start and end index did not match is {round(count/dataframe.shape[0]*100, 4)}%")
    return new_start, new_end

In [None]:
train_start, train_end = find_start_end(training_data)
valid_start, valid_end = find_start_end(validation_data)

Percentage of rows for which start and end index did not match is 36.4017%
Percentage of rows for which start and end index did not match is 28.9003%


In [None]:
# Create two columns for new start and end index
training_data['new_start'] = train_start
training_data['new_end'] = train_end

validation_data['new_start'] = valid_start
validation_data['new_end'] = valid_end

In [None]:
training_data.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,clean_tweets,label,ADR,new_start,new_end
0,342314998904786945,42.0,53.0,ADR,lost vision,cymbalta,"#cymbalta withdrawal has reached a peak, lost vision and almost crashed my car from a brain zap. thanks a zillion #elililly #bigpharma",10047522.0,vision loss,"<hashtag> cymbalta </hashtag> withdrawal has reached a peak , lost vision and almost crashed my car from a brain zap . thanks a zillion <hashtag> eli lilly </hashtag> <hashtag> big pharma </hashtag>",1,1,41,52
1,342314998904786945,11.0,21.0,ADR,withdrawal,cymbalta,"#cymbalta withdrawal has reached a peak, lost vision and almost crashed my car from a brain zap. thanks a zillion #elililly #bigpharma",10048010.0,withdrawal syndrome,"<hashtag> cymbalta </hashtag> withdrawal has reached a peak , lost vision and almost crashed my car from a brain zap . thanks a zillion <hashtag> eli lilly </hashtag> <hashtag> big pharma </hashtag>",1,1,10,20
2,342322703556038657,27.0,35.0,ADR,nauseous,cipro,i hate cipro! #antibiotic #nauseous #cf #hospitallife #cysticfibrosis,10028823.0,nauseous,i hate cipro ! <hashtag> antibiotic </hashtag> <hashtag> nauseous </hashtag> <hashtag> cf </hashtag> <hashtag> hospital life </hashtag> <hashtag> cystic fibrosis </hashtag>,1,1,27,35
3,342349802601844737,109.0,118.0,ADR,can't cum,seroquel,@luckystubbs reppin zoloft&amp;seroquel since last november. i'm hella gainin weight too awesome i'm fat and can't cum i own,10021574.0,inability to orgasm,<user> reppin zoloft & seroquel since last november . i am hella gainin weight too awesome i am fat and can not cum i own,1,1,109,118
4,342349802601844737,101.0,104.0,ADR,fat,seroquel,@luckystubbs reppin zoloft&amp;seroquel since last november. i'm hella gainin weight too awesome i'm fat and can't cum i own,10047896.0,weight gain,<user> reppin zoloft & seroquel since last november . i am hella gainin weight too awesome i am fat and can not cum i own,1,1,101,104


In [None]:
# Load spacy and pipeline
nlp = spacy.load('en_core_web_trf')
ner = nlp.get_pipe('ner')

In [None]:
nlp.pipe_names

['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# Prepare the tweets in spacy format
TRAIN_DATA = []
VALID_DATA = []

for row in training_data.itertuples():
    TRAIN_DATA.append((row[7],{
        'entities': [(int(row[13]), int(row[14]), 'ADR')]
    }))

for row in validation_data.itertuples():
    VALID_DATA.append((row[7],{
        'entities': [(int(row[13]), int(row[14]), 'ADR')]
    }))

In [None]:
# Tag text to BILUO format and replace the tags "L" with "I" and "U" with "B"
# for BIO labelling scheme

# tags_list = list()
# for text, annot in TRAIN_DATA:
#     doc = nlp(text)
#     tags = offsets_to_biluo_tags(doc, annot['entities'])
#     bio_tags = list(map(lambda tag: tag.replace("L", "I").replace("U", "B"), tags))
#     bio_tags = ['O' if tag == '-' else tag for tag in bio_tags]
#     tags_list.append(bio_tags)

valid_tags_list = list()
for text, annot in VALID_DATA:
    doc = nlp(text)
    tags = offsets_to_biluo_tags(doc, annot['entities'])
    bio_tags = list(map(lambda tag: tag.replace("L", "I").replace("U", "B"), tags))
    bio_tags = ['O' if tag == '-' else tag for tag in bio_tags]
    valid_tags_list.append(bio_tags)

In [None]:
# final_train = pd.DataFrame(columns=["extraction", "tweet", "bio_tags"])
final_valid = pd.DataFrame(columns=["extraction", "tweet", "bio_tags"])

In [None]:
# final_train.extraction = training_data.extraction
# final_train.bio_tags = tags_list

final_valid.extraction = validation_data.extraction
final_valid.bio_tags = valid_tags_list

In [None]:
# final_train.tweet = [[str(token) for token in nlp(tweet)] for tweet in training_data.tweet]
final_valid.tweet = [[str(token) for token in nlp(tweet)] for tweet in validation_data.tweet]

In [None]:
final_valid.head()

Unnamed: 0,extraction,tweet,bio_tags
0,AD,"[apparently, ,, baclofen, greatly, exacerbates, the, "", ad, "", part, of, my, adhd, ., average, length, of, focus, today, :, about, 30, seconds, .]","[O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,focus,"[apparently, ,, baclofen, greatly, exacerbates, the, "", ad, "", part, of, my, adhd, ., average, length, of, focus, today, :, about, 30, seconds, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O]"
2,died,"[pt, of, mine, died, from, cipro, rt, @ciproispoison, :, @gastromom, if, only, more, doctors, thought, like, you, !, i, lost, my, entire, life, to, 12, cipro, pills]","[O, O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,died,"[@gastromom, the, only, pt, of, mine, who, ever, died, was, one, age, 21, profound, autism, chronic, underwt, &, amp, ;, gi, issues, ,, given, lots, of, cipro, ., terrible]","[O, O, O, O, O, O, O, O, B-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,tendon damage,"[owww, ., i, hurt, my, foot, ., &, amp, ;, am, concerned, ., 1st, warning, on, cipro, is, tendon, damage, ., :-/, it, certainly, was, n't, a, rupture, ., i, 'm, sure, i, 'm, just, paranoid, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-ADR, I-ADR, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [None]:
import ast
import json

In [None]:
# ast.literal_eval(cadec_data.bio_tags.iloc[0])

In [None]:
cadec_data.rename(columns={'text_tokens': 'tweet'})

In [None]:
cadec_data.columns = ['Unnamed: 0', 'text', 'extraction', 'indexes', 'tweet',
       'bio_tags']

In [None]:
cadec_data.tweet = [twee.strip('][').split(', ') for twee in cadec_data.tweet]
cadec_data.bio_tags = [ast.literal_eval(tag) for tag in cadec_data.bio_tags]

In [None]:
# Dictionary to keep track of labels and index mapping 
all_tags = ','.join(cadec_data.bio_tags.apply(lambda x: ','.join(x)))
unique_tags = np.unique(all_tags.split(','))
labels_to_ids = {k: v for v, k in enumerate(unique_tags)}
ids_to_labels = {v: k for v, k in enumerate(unique_tags)}

In [None]:
pprint(ids_to_labels)
pprint(labels_to_ids)

{0: 'B-ADR', 1: 'I-ADR', 2: 'O'}
{'B-ADR': 0, 'I-ADR': 1, 'O': 2}


In [None]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 1
EPOCHS = 5
LEARNING_RATE = 3e-05
MAX_GRAD_NORM = 10
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

In [None]:
# encoding = tokenizer(final_train.tweet.iloc[0],
#                         is_split_into_words=True,
#                         return_offsets_mapping=True, 
#                         padding='max_length', 
#                         truncation=True, 
#                         max_length=128)

In [None]:
# for key, value in encoding.items():
#     print(key, value)

In [None]:
# for token, offset in zip(tokenizer.convert_ids_to_tokens(encoding["input_ids"]), encoding['offset_mapping']):
#   print('{0:10} {1}'.format(token, offset))

In [None]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        
        tweet = self.data.tweet[index]
        # text = nlp(self.data.tweet[index].strip())
        # tweet = [token for token in text]
        bio_tags = self.data.bio_tags[index]
        encoding = self.tokenizer(tweet,
                                 is_split_into_words=True,
                                 return_offsets_mapping=True, 
                                 padding='max_length', 
                                 truncation=True, 
                                 max_length=self.max_len)
        
        labels = [labels_to_ids[label] for label in bio_tags]
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] == 0:
                # print(0,0)
                continue
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                # print('first match')
                i += 1
            else:
                # print('next match')
                encoded_labels[idx] = encoded_labels[idx-1]
        
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item
    
    def __len__(self):
        return self.len

In [None]:
training_set = dataset(cadec_data, tokenizer, MAX_LEN)
testing_set = dataset(final_valid, tokenizer, MAX_LEN)

In [None]:
training_set[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0]),
 'input_ids': tensor([  102,   259,   439,  2101,   190,  8693,  2675,   422,  3056, 11688,
           137, 19756,  2754, 30113,   137,   259,   439,   692,  1795,   239,
         17027, 10233,  1506,   205, 22007,   205, 26085, 11607,   603,   259,
          4654,   198,   259,   360,  3645, 22783,  1796, 15832,  4554,   579,
          1536,  3107, 30135,   234,  6569,   234, 16217,   256,   165,   165,
          6571,   205,   563,   425,   482,  6569, 30113,   111, 26740, 13620,
           107,   234,  2372,   247,  6005,   107,   12

In [None]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[1]["input_ids"]), training_set[1]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
i           2
am          2
left        2
with        2
chest       2
pain        2
,           2
muscle      0
weakness    1
and         2
spa         2
##sm        2
##s         2
and         2
i           2
am          2
up          2
every       2
3           2
hrs         2
drinking    2
water       2
.           2
drs         2
.           2
lau         2
##gh        2
when        2
i           2
say         2
that        2
i           2
have        2
rh          2
##abd       2
##omy       2
##olo       2
##sis       2
-           2
my          2
cp          2
##k         2
or          2
ck          2
or          2
whatever    2
it          2
is          2
is          2
fine        2
.           2
but         2
no          2
one         2
ck          2
##s         2
the         2
myo         2
##glob      2
##in        2
or          2
hem         2
##og        2
##log       2
##in        2
in          2
urine       2
.           2
i           2
'           2
m  

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from transformers import AutoModelForTokenClassification

In [None]:
model = AutoModelForTokenClassification.from_pretrained('allenai/scibert_scivocab_uncased', num_labels=len(labels_to_ids)).to(device)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [None]:
def count_parameter(model):
    return sum(para.numel() for para in model.parameters() if para.requires_grad)

In [None]:
print(f"The model has {count_parameter(model):,} trainable parameters.")

The model has 109,330,179 trainable parameters.


In [None]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
outputs.loss

tensor(1.0428, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch, clip=1.0):
    tr_loss, tr_f1 = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        # print(idx)
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        optimizer.zero_grad()
        md = model(input_ids=ids, attention_mask=mask, labels=labels)
        
        md = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss, tr_logits = md.loss, md.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
        # torch.argmax(outputs.logits, axis=2)

        mask_labels = labels.view(-1)
        mask_predictions = torch.argmax(tr_logits, axis=2).view(-1)
        # mask_predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        labels, predictions = list(), list()
        for label in mask_labels.tolist():
          labels.append(ids_to_labels.get(label, 'O'))
        
        for pred in mask_predictions.tolist():
          predictions.append(ids_to_labels.get(pred, 'O'))

        tr_labels.append(labels)
        tr_preds.append(predictions)

        tmp_tr_f1 = f1_score([labels], [predictions], average='micro')
        tr_f1 += tmp_tr_f1
    
        loss.backward()

        # gradient clipping
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        # backward pass
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_f1 = tr_f1 / nb_tr_steps
    print(f"Training loss epoch {epoch+1}: {epoch_loss}")
    print(f"Training F1 score epoch {epoch+1}: {tr_f1}\n")

In [None]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 1.1021047830581665
Training loss per 100 training steps: 0.1779886517710615
Training loss per 100 training steps: 0.17870816488782712
Training loss per 100 training steps: 0.17046586343356443
Training loss per 100 training steps: 0.17679200373692183
Training loss per 100 training steps: 0.176638456174877
Training loss per 100 training steps: 0.17326657735384665
Training loss per 100 training steps: 0.17402756236893485
Training loss per 100 training steps: 0.171179493458723
Training loss per 100 training steps: 0.16613836916908828
Training loss per 100 training steps: 0.16354341809284295
Training loss per 100 training steps: 0.16059093851343195
Training loss per 100 training steps: 0.15761305957658062
Training loss per 100 training steps: 0.15709186247141915
Training loss per 100 training steps: 0.15752536942201834
Training loss per 100 training steps: 0.15511023948545494
Training loss per 100 training steps: 0.1550513474044706
Tra

In [None]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            md = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = md.loss, md.logits

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)

            mask_labels = labels.view(-1)
            mask_predictions = torch.argmax(eval_logits, axis=2).view(-1)

            # mask_labels = torch.masked_select(flattened_targets, active_accuracy)
            # mask_predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            labels, predictions = list(), list()
            for label in mask_labels.tolist():
            #   labels.append(ids_to_labels[label])
              labels.append(ids_to_labels.get(label, 'O'))
            
            for pred in mask_predictions.tolist():
            #   predictions.append(ids_to_labels[pred])
              predictions.append(ids_to_labels.get(pred, 'O'))

            eval_labels.append(labels)
            eval_preds.append(predictions)
            
            tmp_eval_accuracy = f1_score([labels], [predictions], average='micro')
            eval_accuracy += tmp_eval_accuracy

    # labels = [[ids_to_labels[id.item()] for id in lab] for lab in eval_labels]
    # predictions = [[ids_to_labels[id.item()] for id in lab] for lab in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"\nValidation Loss: {eval_loss}")
    print(f"Validation F1 score: {eval_accuracy}")

    return eval_labels, eval_preds

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.7478603720664978
Validation loss per 100 evaluation steps: 0.6151311755950314
Validation loss per 100 evaluation steps: 0.5940618759922658
Validation loss per 100 evaluation steps: 0.6275316970350346

Validation Loss: 0.6286164516967502
Validation F1 score: 0.03555705410170649


In [None]:
print(classification_report(labels, predictions))

              precision    recall  f1-score   support

         ADR       0.03      0.09      0.04       481

   micro avg       0.03      0.09      0.04       481
   macro avg       0.03      0.09      0.04       481
weighted avg       0.03      0.09      0.04       481



In [None]:
from seqeval.scheme import IOB2

In [None]:
print(classification_report(labels, predictions, mode='strict', scheme=IOB2))

              precision    recall  f1-score   support

         ADR       0.04      0.08      0.05       481

   micro avg       0.04      0.08      0.05       481
   macro avg       0.04      0.08      0.05       481
weighted avg       0.04      0.08      0.05       481

