## Task 2 Part B

## Install Library

In [None]:
# Install the notebook's third-party dependencies quietly (-q).
!pip install -q contractions transformers sent2vec imbalanced-learn seqeval[gpu]
# tf-estimator-nightly is pinned to one specific dev build.
!pip install -q tf-estimator-nightly==2.8.0.dev2021122109
# Uninstall spaCy without prompting (-y), then install it again with -U (upgrade).
!python -m pip uninstall -q -y spacy
!python -m pip install -q -U spacy

[K     |████████████████████████████████| 4.0 MB 5.0 MB/s 
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[K     |████████████████████████████████| 106 kB 41.5 MB/s 
[K     |████████████████████████████████| 287 kB 45.3 MB/s 
[K     |████████████████████████████████| 880 kB 45.1 MB/s 
[K     |████████████████████████████████| 596 kB 32.7 MB/s 
[K     |████████████████████████████████| 77 kB 2.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 31.5 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 462 kB 5.3 MB/s 
[K     |████████████████████████████████| 6.2 MB 5.4 MB/s 
[K     |████████████████████████████████| 181 kB 44.2 MB/s 
[K     |████████████████████████████████| 10.1 MB 34.3 MB/s 
[K     |████████████████████████████████| 653 kB 46.6 MB/s 
[K     |████████████████████████████████| 457 kB 48.7 MB/s 
[K     |████████████

In [None]:
!pip install spacy[transformers]

Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.5-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 125 kB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 9.4 MB/s 
[?25hCollecting transformers<4.18.0,>=3.4.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 32.7 MB/s 
Installing collected packages: transformers, spacy-alignments, spacy-transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.18.0
    Uninstalling transformers-4.18.0:
      Successfully uninstalled transformers-4.18.0
Successfully installed spacy-alignments-0.8.5 spacy-transformers-1.1.5 transformers-4.17.0


In [None]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.3.0/en_core_web_trf-3.3.0-py3-none-any.whl (460.3 MB)
[K     |████████████████████████████████| 460.3 MB 24 kB/s 
Installing collected packages: en-core-web-trf
Successfully installed en-core-web-trf-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


## Load library

In [None]:
import numpy as np
import pandas as pd
import torch
import warnings
import torch.nn as nn
import spacy
import ast

from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, RandomSampler
from pprint import pprint
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
# from sklearn.metrics import accuracy_score, classification_report, f1_score
from seqeval.metrics import classification_report, f1_score

pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth=None
warnings.filterwarnings("ignore")

## 3. Load the dataset

In [None]:
# Load data
# Columns kept from the training TSV; any other column in the file is not loaded.
TRAIN_COLUMNS = ['tweet_id', 'begin', 'end', 'type', 'extraction', 'drug', 'tweet', 'meddra_code', 'meddra_term']

train = pd.read_csv(
    'task3_training.tsv',
    sep="\t",
    usecols=TRAIN_COLUMNS,
)
# Validation TSV: whitespace directly after each tab delimiter is skipped.
valid = pd.read_csv(
    'task3_validation.tsv',
    sep="\t",
    skipinitialspace=True,
)
# Spreadsheet whose 'tweet' and 'pred_extraction' columns are read by later cells.
validation = pd.read_excel('validation_with_output.xlsx')

In [None]:
# Load spacy and pipeline
# `nlp` is the installed en_core_web_trf pipeline; a later cell calls it on each validation tweet.
nlp = spacy.load('en_core_web_trf')
# Handle to the pipeline's 'ner' component.
# NOTE(review): `ner` is not referenced again in the visible cells — confirm it is still needed.
ner = nlp.get_pipe('ner')

In [None]:
# Get tokens
# One list of token strings per validation tweet, in the row order of `valid`.
tweets = [
    [str(token) for token in nlp(text)]
    for text in valid.tweet.tolist()
]

In [None]:
valid['tokens'] = tweets

In [None]:
# Round-trip `valid` through CSV. After re-reading, the `tokens` lists come back
# as their string representation (CSV stores plain text).
# NOTE(review): presumably the later comparison against validation's `tweet`
# column relies on that stringified form — confirm.
# index=False keeps the row index out of the file, so the re-read frame does not
# gain a spurious "Unnamed: 0" column.
valid.to_csv("test.csv", index=False)
test = pd.read_csv("test.csv")

In [None]:
# Collect the gold MedDRA term for each row of `validation` by matching its
# `tweet` value against the `tokens` value of `test`.
#
# The lookup table keeps the FIRST row of `test` for every distinct `tokens`
# value, which is what a top-to-bottom scan that stops at the first match gives.
# The loop variable is deliberately not called `next`, so the builtin `next()`
# stays usable in the rest of the kernel session.
term_by_tokens = dict()
for tokens_text, term in test[['tokens', 'meddra_term']].to_numpy():
    term_by_tokens.setdefault(tokens_text, term)

meddra = list()
for tweet_text in validation['tweet'].to_numpy():
    # A row without any match contributes nothing, so `meddra` can end up shorter
    # than `validation`; this keeps the behaviour of the first-match scan.
    if tweet_text in term_by_tokens:
        meddra.append(term_by_tokens[tweet_text])

In [None]:
validation['meddra_term'] = meddra
# `pred_extraction` holds Python literals stored as text; parse each one back
# into an object. Assign with [] so the result becomes a real DataFrame column:
# attribute-style assignment of a new name only sets a Python attribute on the
# frame object, which is not part of the data and is lost on copy/slice/save.
# Attribute-style reads (`validation.pred_extractions`) keep working.
validation['pred_extractions'] = validation['pred_extraction'].apply(ast.literal_eval)

In [None]:
train.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term
0,331187619096588288,,,,,ofloxacin,@seefisch:oral drugs for pyelonephritis:ciprofloxacin levofloxacin tmp/smz do not use nitrofurantoin for pyelo(only cystitis)@david_medinaf,,
1,332227554956161024,,,,,trazodone,happy for wellbutrin; has similar effects as adderall.. trazodone is super promising for sleep.. but abilify can cause weight gain -_-,,
2,332448217490944000,,,,,lamotrigine,"@stilgarg i'm ok ty have an official diagnosis of bipolar now, feeling ok at the moment lamotrigine has been increased having monotherapy:/",,
3,332977955754110976,,,,,cymbalta,i'm soo depressed cymbalta couldn't help me .,,
4,333674203331051520,,,,,seroquel,"time for my daily afternoon relaxation ritual of smoking weed, taking 2 mgs of clonazepam, and 400 mg of seroquel xr.",,


## Get BERT Embeddings for MedDRA terms

In [None]:
meddra_list = pd.read_excel("patient-friendly_term_list_v25.0.xlsx")

In [None]:
meddra_list.head()

Unnamed: 0,LLT,LLT code,Added in v25.0
0,2019 novel coronavirus infection,10084529,
1,Abdomen enlarged,10000045,
2,Abdominal bloating,10048746,
3,Abdominal cramps,10000057,
4,Abdominal crampy pains,10000058,


In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
# Checkpoint used for both the tokenizer and the encoder.
# Alternative tried earlier: 'dmis-lab/biobert-base-cased-v1.1'
MODEL_NAME = 'allenai/scibert_scivocab_uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True makes the model output carry `hidden_states`,
# which get_embeddings reads.
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)



Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Indices into the model's `hidden_states` tuple that are summed to form token vectors.
layers = [-4, -3]

def get_embeddings(sent, tokenizer, model, layers):
    """Embed a piece of text as one vector by pooling selected hidden layers.

    Args:
        sent: Text to embed; passed as-is to ``tokenizer.encode_plus``.
        tokenizer: Object exposing ``encode_plus(text, return_tensors="pt")``
            that returns a mapping of keyword arguments for ``model``.
        model: Callable whose output exposes ``hidden_states`` (an indexable
            collection of tensors).
        layers: Indices into ``hidden_states``; the selected layers are summed
            element-wise.

    Returns:
        The mean over the token axis of the summed layers. Assumes each hidden
        state is shaped (1, seq_len, hidden) — TODO confirm for other models —
        in which case the result has shape (hidden,).
    """
    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        output = model(**encoded)

    states = output.hidden_states
    summed = torch.stack([states[i] for i in layers]).sum(dim=0)
    # Drop ONLY the leading batch axis. A bare squeeze() would also collapse a
    # length-1 token axis, and the mean below would then return a scalar
    # instead of a hidden-size vector.
    token_vectors = summed.squeeze(0)
    # Every token the tokenizer produced contributes equally to the mean.
    return token_vectors.mean(dim=0)

In [None]:
# Map each distinct MedDRA term seen in the training data to its embedding.
# Alternative candidate set tried earlier: every entry of `meddra_list.LLT`
# instead of only the terms present in `train`.
meddra_embedding_dict = dict()

for term in train.meddra_term.unique():
    # Missing annotations show up as NaN, whose string form is 'nan'; skip them.
    if str(term) == 'nan':
        continue
    meddra_embedding_dict[term] = get_embeddings(term, tokenizer, model, layers)

In [None]:
# For every validation row, pick the known MedDRA term whose embedding is most
# similar (cosine) to the embedding of the predicted extraction.
pred_meddra_codes = list()
for pred_ext in validation.pred_extractions:
    if not pred_ext:
        # No extraction was predicted for this row: emit the '-' placeholder.
        pred_meddra_codes.append('-')
        continue

    # Only the first predicted extraction of the row is normalised.
    pred_embedding = get_embeddings(pred_ext[0], tokenizer, model, layers)

    best_code = ''
    # Start below every attainable cosine similarity. Starting at 0 would make
    # it impossible to select a candidate whose similarity is <= 0, leaving the
    # empty string as the "prediction".
    best_score = float('-inf')

    for code, emb in meddra_embedding_dict.items():
        score = 1 - spatial.distance.cosine(emb, pred_embedding)

        if score > best_score:
            best_score = score
            best_code = code

    pred_meddra_codes.append(best_code)

In [None]:
results_df = pd.DataFrame(columns=['true_meddra_term', 'pred_meddra_term', 'true_label'])

In [None]:
# Gold terms, with missing annotations (NaN) shown as the '-' placeholder.
# The filled Series is assigned directly instead of calling
# fillna(..., inplace=True) on a column accessor: that chained in-place form
# operates on an intermediate object and does not update the frame when pandas
# Copy-on-Write is active.
results_df['true_meddra_term'] = validation.meddra_term.fillna('-')
results_df['pred_meddra_term'] = pred_meddra_codes

# 1 when the row has a gold MedDRA term, 0 when it carries the '-' placeholder.
results_df['true_label'] = [1 if term != '-' else 0 for term in results_df['true_meddra_term']]

In [None]:
results_df.tail()

Unnamed: 0,true_meddra_term,pred_meddra_term,true_label
555,-,-,0
556,-,-,0
557,-,-,0
558,-,-,0
559,-,-,0


In [None]:
def cal_metric(true, pred):
    """Score predicted terms against gold terms, pair by pair.

    Args:
        true: Iterable of gold terms.
        pred: Iterable of predicted terms, paired positionally with ``true``.

    Returns:
        A ``(soft, hard)`` tuple of lists holding one 0/1 flag per pair.
        ``soft`` is 1 when at least one whitespace-separated token of the gold
        term (after ``str()``) also occurs among the tokens of the predicted
        term; ``hard`` is 1 only when gold and predicted compare equal.
    """
    soft_hits, hard_hits = [], []

    for gold, guess in zip(true, pred):
        guess_tokens = str(guess).split()
        shares_token = any(tok in guess_tokens for tok in str(gold).split())

        soft_hits.append(1 if shares_token else 0)
        hard_hits.append(1 if gold == guess else 0)

    return soft_hits, hard_hits

In [None]:
soft_meddra, hard_meddra = cal_metric(results_df.true_meddra_term, results_df.pred_meddra_term)

In [None]:
results_df['soft_meddra'] = soft_meddra
results_df['hard_meddra'] = hard_meddra

In [None]:
results_df.tail()

Unnamed: 0,true_meddra_term,pred_meddra_term,true_label,soft_meddra,hard_meddra
555,-,-,0,1,1
556,-,-,0,1,1
557,-,-,0,1,1
558,-,-,0,1,1
559,-,-,0,1,1


We relabel rows whose true label is 0 but whose predicted match is 1: for those rows the true label is set to 1.<br>
This happens when both the true and the predicted MedDRA term are '-'. Previously, whenever the true MedDRA term was '-', the true label was mapped to 0. But if the model also found no extraction, there is no MedDRA term to predict, which is why the prediction is '-' as well. In these cases the model behaved correctly, so the prediction is scored as 1 while the true label is still 0. Left unchanged, such rows would be counted as false positives although they should count as true positives. Hence, we set the true label to 1 in such cases.

In [None]:
# Build per-metric gold labels: a row whose gold label is 0 but whose match flag
# is 1 is relabelled to 1; every other row keeps its gold label.
# Columns are read by name rather than by tuple position, so the result does not
# depend on the column order of `results_df`.
# NOTE(review): these labels are derived partly from the model's own output —
# keep that in mind when reading the classification reports.
results_df['soft_true_label'] = [
    1 if (label == 0) and (matched == 1) else label
    for label, matched in zip(results_df['true_label'], results_df['soft_meddra'])
]
results_df['hard_true_label'] = [
    1 if (label == 0) and (matched == 1) else label
    for label, matched in zip(results_df['true_label'], results_df['hard_meddra'])
]

In [None]:
results_df.tail()

Unnamed: 0,true_meddra_term,pred_meddra_term,true_label,soft_meddra,hard_meddra,soft_true_label,hard_true_label
555,-,-,0,1,1,1,1
556,-,-,0,1,1,1,1
557,-,-,0,1,1,1,1
558,-,-,0,1,1,1,1
559,-,-,0,1,1,1,1


In [None]:
# Rebinds `classification_report` to scikit-learn's implementation; from here on
# it replaces the seqeval function of the same name imported in the first import cell.
from sklearn.metrics import classification_report

In [None]:
# Relaxed - NER
print("Soft scores")
print(classification_report(results_df['soft_true_label'], results_df['soft_meddra']))

Soft scores
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.42      0.59       560

    accuracy                           0.42       560
   macro avg       0.50      0.21      0.30       560
weighted avg       1.00      0.42      0.59       560



In [None]:
print("Hard scores")
print(classification_report(results_df['hard_true_label'], results_df['hard_meddra']))

Hard scores
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.38      0.55       560

    accuracy                           0.38       560
   macro avg       0.50      0.19      0.27       560
weighted avg       1.00      0.38      0.55       560



In [None]:
results_df

Unnamed: 0,true_meddra_term,pred_meddra_term,true_label,soft_meddra,hard_meddra,soft_true_label,hard_true_label
0,attention deficit disorder,ill feeling,1,0,0,1,1
1,attention deficit disorder,stress,1,0,0,1,1
2,death,nauseated,1,0,0,1,1
3,death,nauseated,1,0,0,1,1
4,tendon injury,concentration loss,1,0,0,1,1
5,-,-,0,1,1,1,1
6,bizarre dreams,bad dreams,1,1,0,1,1
7,tendon rupture,ill feeling,1,0,0,1,1
8,-,-,0,1,1,1,1
9,withdrawal syndrome,exhaustion,1,0,0,1,1


In [None]:
results_df.to_csv("results_df.csv")

In [None]:
validation.meddra_term.head(20)

0     attention deficit disorder
1     attention deficit disorder
2                          death
3                          death
4                  tendon injury
5                            NaN
6                 bizarre dreams
7                 tendon rupture
8                            NaN
9            withdrawal syndrome
10            emotional distress
11                   memory loss
12                   memory loss
13                    somnolence
14                feeling stoned
15                feeling stoned
16                           NaN
17                           NaN
18                           NaN
19                   ill feeling
Name: meddra_term, dtype: object

In [None]:
pred_meddra_codes[:20]

['ADD',
 'nightmare',
 'withdrawn',
 'withdrawn',
 'tendon disorder',
 '-',
 'nightmares',
 'flat affect',
 '-',
 'withdrawn',
 'unpleasant personality',
 'bad dreams',
 'memory loss',
 'nightmare',
 'fear',
 'ache',
 '-',
 '-',
 '-',
 'sleepiness']