In [1]:
import pandas as pd
import numpy as np
from transformers import (
    RobertaTokenizerFast,
    RobertaForTokenClassification,
    get_scheduler
)

In [2]:
train_df=pd.read_csv('/kaggle/input/data-set1/indic-health-demo-main/Dataset/IHQID-WebMD/train.csv')[['question_english','disease_english','drug_english','treatment_english']]
test_df=pd.read_csv('/kaggle/input/data-set1/indic-health-demo-main/Dataset/IHQID-WebMD/test.csv')[['question_english','disease_english','drug_english','treatment_english']]

In [3]:
train_df.head()

Unnamed: 0,question_english,disease_english,drug_english,treatment_english
0,what is nystatin prescribed for?,,nystatin,
1,can douching after sex stop me from getting pr...,pregnant,,
2,does percocet cause weight gain,weight gain,percocet,
3,does 2 or 2 1/2 glasses of wine a day caues hi...,high blood pressure,,
4,can too much buttermilk cause thrush?,thrush,,


In [4]:
from fuzzywuzzy import fuzz



In [5]:
def get_common_sequence(tokenized_sentence,tokenized_entity,entity,tag_list):
    common_sequence={
        'similarity':0.0,
        'start_index':-1,
        'end_index':-1
    }
    target_len=len(tokenized_entity)
    sentence_len=len(tokenized_sentence)
    for i in range(sentence_len-target_len):
        fuzz_ratio=fuzz.ratio(tokenized_sentence[i:i+target_len],tokenized_entity)
        if fuzz_ratio>=80 and common_sequence['similarity']<fuzz_ratio:
            common_sequence['similarity']=fuzz_ratio
            common_sequence['start_index']=i
            common_sequence['end_index']=i+target_len-1
        fuzz_ratio=fuzz.ratio(tokenized_sentence[i:i+target_len-1],tokenized_entity)
        if fuzz_ratio>=80 and common_sequence['similarity']<fuzz_ratio:
            common_sequence['similarity']=fuzz_ratio
            common_sequence['start_index']=i;
            common_sequence['end_index']=i+target_len-1
    tag_list[common_sequence['start_index']]="B-"+entity
    for i in range(common_sequence['start_index']+1,common_sequence['end_index']):
        tag_list[i]="I-"+entity

    

In [6]:
from nltk.tokenize import  word_tokenize
import math

In [7]:
for i in range(len(train_df)):
    tokenised_sentence=word_tokenize(train_df.loc[i,'question_english'].lower())
    tag_list=['O' for i in tokenised_sentence]
    if type(train_df.loc[i,'disease_english']) is not float:
        tokenized_disease=[word_tokenize(j.lower()) for j in train_df.loc[i,'disease_english'].split(',')]
        for k in tokenized_disease:
            get_common_sequence(tokenised_sentence,k,"disease",tag_list)
    else:
        assert(math.isnan(train_df.loc[i,'disease_english']))
    if type(train_df.loc[i,'drug_english']) is not float:
        tokenized_disease=[word_tokenize(j.lower()) for j in train_df.loc[i,'drug_english'].split(',')]
        for k in tokenized_disease:
            get_common_sequence(tokenised_sentence,k,"drug",tag_list)
    else:
        assert(math.isnan(train_df.loc[i,'drug_english']))
    if type(train_df.loc[i,'treatment_english']) is not float:
        tokenized_disease=[word_tokenize(j.lower()) for j in train_df.loc[i,'treatment_english'].split(',')]
        for k in tokenized_disease:
            get_common_sequence(tokenised_sentence,k,"treatment",tag_list)
    else:
        assert(math.isnan(train_df.loc[i,'treatment_english']))
    train_df.loc[i,'question_english']=str(tokenised_sentence)
    train_df.loc[i,'tag_english']=str(tag_list)

In [9]:
for i in range(len(test_df)):
    tokenised_sentence=word_tokenize(test_df.loc[i,'question_english'].lower())
    tag_list=['O' for i in tokenised_sentence]
    if type(test_df.loc[i,'disease_english']) is not float:
        tokenized_disease=[word_tokenize(j.lower()) for j in test_df.loc[i,'disease_english'].split(',')]
        for k in tokenized_disease:
            get_common_sequence(tokenised_sentence,k,"disease",tag_list)
    else:
        assert(math.isnan(test_df.loc[i,'disease_english']))
    if type(test_df.loc[i,'drug_english']) is not float:
        tokenized_disease=[word_tokenize(j.lower()) for j in test_df.loc[i,'drug_english'].split(',')]
        for k in tokenized_disease:
            get_common_sequence(tokenised_sentence,k,"drug",tag_list)
    else:
        assert(math.isnan(test_df.loc[i,'drug_english']))
    if type(test_df.loc[i,'treatment_english']) is not float:
        tokenized_disease=[word_tokenize(j.lower()) for j in test_df.loc[i,'treatment_english'].split(',')]
        for k in tokenized_disease:
            get_common_sequence(tokenised_sentence,k,"treatment",tag_list)
    else:
        assert(math.isnan(test_df.loc[i,'treatment_english']))
    test_df.loc[i,'question_english']=str(tokenised_sentence)
    test_df.loc[i,'tag_english']=str(tag_list)

In [10]:
train_df.head()

Unnamed: 0,question_english,disease_english,drug_english,treatment_english,tag_english
0,"['what', 'is', 'nystatin', 'prescribed', 'for'...",,nystatin,,"['O', 'O', 'B-drug', 'O', 'O', 'O']"
1,"['can', 'douching', 'after', 'sex', 'stop', 'm...",pregnant,,,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-di..."
2,"['does', 'percocet', 'cause', 'weight', 'gain']",weight gain,percocet,,"['O', 'B-drug', 'O', 'O', 'B-disease']"
3,"['does', '2', 'or', '2', '1/2', 'glasses', 'of...",high blood pressure,,,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,"['can', 'too', 'much', 'buttermilk', 'cause', ...",thrush,,,"['O', 'O', 'O', 'O', 'O', 'B-disease', 'O']"


In [11]:
def convert_string(string):
    special_remove=string.replace('[','').replace(']','').replace('"','').replace("'", '')
    split_str=special_remove.split(',')
    return [item.strip() for item in split_str]

In [12]:
train_df['question_english']=train_df['question_english'].apply(convert_string)
train_df['tag_english']=train_df['tag_english'].apply(convert_string)
test_df['question_english']=test_df['question_english'].apply(convert_string)
test_df['tag_english']=test_df['tag_english'].apply(convert_string)

In [13]:
label__ = {
    'O': 0,
    'B-treatment': 1,
    'I-treatment': 2,
    'B-disease': 3,
    'I-disease': 4,
    'B-drug': 5,
    'I-drug': 6
}

In [14]:
import torch
device='cuda' if torch.cuda.is_available() else 'cpu' 
device

'cuda'

In [15]:
model_checkpoint = "roberta-base"

hyper_parameters = {
    'batch_size': 8,
    'lr': 3e-5,
    'epochs': 10
}

In [16]:
def clean_list(lst):
    return [item for item in lst if item.strip()]
train_df['question_english']=train_df['question_english'].apply(clean_list)
test_df['question_english']=test_df['question_english'].apply(clean_list)

In [17]:
tokenizer=RobertaTokenizerFast.from_pretrained(model_checkpoint,add_prefix_space=True)


In [18]:
# def labelling(question,tag):
#     question = " ".join(question)
#     tokenized_input = tokenizer(question, truncation=True, padding='max_length', is_split_into_words=False)
#     word_ids = tokenized_input.word_ids()
#     for i, labels in enumerate(word_ids):
#         if labels is None:
#             word_ids[i] = 7
#         else:
#             word_ids[i] = label_1[tag[labels]]
#     tokenized_input["labels"] = word_ids
#     return tokenized_input
def process_queries(question, tag): 
    
    tokenized_input = tokenizer(question, truncation=True, padding='max_length', is_split_into_words=True)
    word_ids = tokenized_input.word_ids()
    j=1;
    for i, label in enumerate(word_ids):
        if label is None:
            word_ids[i] = 7
        else:
            word_ids[i] = label__[tag[label]]
            
    tokenized_input["labels"] = word_ids
    return tokenized_input

In [19]:
encoded_input_train = {
    'input_ids': [],
    'attention_mask': [],
    'tags_english': []
}
for index in range(len(train_df['question_english'])):
    process_output = process_queries(train_df.loc[index,'question_english'], train_df.loc[index,'tag_english'])
    encoded_input_train['input_ids'].append(process_output['input_ids'])
    encoded_input_train['attention_mask'].append(process_output['attention_mask'])
    encoded_input_train['tags_english'].append(process_output['labels'])

In [20]:
encoded_input_test = {
    'input_ids': [],
    'attention_mask': [],
    'tags_english': []
}
for index in range(len(test_df['question_english'])):
    process_output = process_queries(test_df.loc[index,'question_english'], test_df.loc[index,'tag_english'])
    encoded_input_test['input_ids'].append(process_output['input_ids'])
    encoded_input_test['attention_mask'].append(process_output['attention_mask'])
    encoded_input_test['tags_english'].append(process_output['labels'])

In [21]:
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm


In [22]:
train_dataloader = DataLoader(
    TensorDataset(
        torch.tensor(encoded_input_train['input_ids']).to(device),
        torch.tensor(encoded_input_train['attention_mask']).to(device),
        torch.tensor(encoded_input_train['tags_english']).to(device)
    ),
    batch_size=hyper_parameters['batch_size']
)
test_dataloader = DataLoader(
    TensorDataset(
        torch.tensor(encoded_input_test['input_ids']).to(device),
        torch.tensor(encoded_input_test['attention_mask']).to(device),
        torch.tensor(encoded_input_test['tags_english']).to(device)
    ),
    batch_size=hyper_parameters['batch_size']
)

In [23]:
model = RobertaForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label__) + 1
)
model.to(device)

optimizer = AdamW(
    model.parameters(),
    lr=hyper_parameters['lr']
)

lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=0,
  num_training_steps=hyper_parameters['epochs'] * len(train_dataloader)
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
model.train()

updater = tqdm(range(hyper_parameters['epochs']))
for epoch in updater:
    total_train_loss = 0.0
    for batch in train_dataloader:
        
        optimizer.zero_grad()
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }
        
        outputs = model(**inputs)
        
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        
        total_train_loss += loss.item()
    
    print("Epoch:", epoch + 1, " - Training Loss:", round(total_train_loss / len(train_dataloader), 4))


# Didn't have much time to implement early stopping. So, saving the model at the end of all epochs.
torch.save(model.state_dict(), f'ee_rob_en.model')

 10%|█         | 1/10 [01:07<10:10, 67.86s/it]

Epoch: 1  - Training Loss: 0.1088


 20%|██        | 2/10 [02:15<09:02, 67.83s/it]

Epoch: 2  - Training Loss: 0.015


 30%|███       | 3/10 [03:23<07:55, 67.90s/it]

Epoch: 3  - Training Loss: 0.0111


 40%|████      | 4/10 [04:31<06:47, 67.92s/it]

Epoch: 4  - Training Loss: 0.0094


 50%|█████     | 5/10 [05:39<05:39, 67.97s/it]

Epoch: 5  - Training Loss: 0.0079


 60%|██████    | 6/10 [06:47<04:31, 67.93s/it]

Epoch: 6  - Training Loss: 0.0066


 70%|███████   | 7/10 [07:55<03:23, 67.92s/it]

Epoch: 7  - Training Loss: 0.0058


 80%|████████  | 8/10 [09:03<02:15, 67.95s/it]

Epoch: 8  - Training Loss: 0.0052


 90%|█████████ | 9/10 [10:11<01:07, 67.95s/it]

Epoch: 9  - Training Loss: 0.0047


100%|██████████| 10/10 [11:19<00:00, 67.93s/it]

Epoch: 10  - Training Loss: 0.0044





In [27]:
model.eval()

prediction = []
gold_label = []

extra_appended_tokens = 0

for indexer, batch in enumerate(test_dataloader):

    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'labels': batch[2],
    }

    with torch.no_grad():
        outputs = model(**inputs)


    gold_label_cpu = inputs['labels'].cpu().numpy()
    logits_vector = outputs.logits.detach().cpu().numpy()

    assert(len(gold_label_cpu) == len(logits_vector))

    for index in range(len(logits_vector)):
        prediction_vector = []
        for iterator__ in logits_vector[index].argmax(axis=1):
            if iterator__ != 7:
                prediction_vector.append(iterator__)
                prediction.append(iterator__)
        
        gold_label_vector = []
        for iterator__ in gold_label_cpu[index]:
            if iterator__ != 7:
                gold_label_vector.append(iterator__)
                gold_label.append(iterator__)
        
        # There are some cases (only observed once) when there was one mismatch in vector of gold label and prediction
        # To overcome that, for each tokenized
        while len(gold_label) < len(prediction):
            extra_appended_tokens += 1
            gold_label.append(0)
        
        while len(prediction) < len(gold_label):
            extra_appended_tokens += 1
            prediction.append(0)

print("Number of extra appended tokens : ", extra_appended_tokens)
print(classification_report(gold_label, prediction))

Number of extra appended tokens :  0
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      2546
           1       0.53      0.70      0.60        91
           2       0.00      0.00      0.00        17
           3       0.68      0.84      0.75       243
           4       1.00      0.05      0.09        21
           5       0.80      0.85      0.82       243
           6       0.00      0.00      0.00        14

    accuracy                           0.91      3175
   macro avg       0.57      0.48      0.46      3175
weighted avg       0.91      0.91      0.91      3175



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
