In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from transformers import AutoTokenizer,AutoModel
import torch

2024-06-18 15:31:47.186790: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
import json

def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']

                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:

                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers


train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [4]:
# !pip install gensim
import numpy as np
# from gensim.models import Word2Vec
np.random.seed(42)
train_contexts = np.array(train_contexts)
train_questions = np.array(train_questions)
train_answers = np.array(train_answers)

val_contexts = np.array(val_contexts)
val_questions = np.array(val_questions)
val_answers = np.array(val_answers)


train_indices = np.random.choice(len(train_contexts), 5000, replace=False)
val_indices = np.random.choice(len(val_contexts), 500, replace=False)

train_contexts_sampled = train_contexts[train_indices]
train_questions_sampled = train_questions[train_indices]
train_answers_sampled = train_answers[train_indices]

val_contexts_sampled = val_contexts[val_indices]
val_questions_sampled = val_questions[val_indices]
val_answers_sampled = val_answers[val_indices]

In [5]:
train_contexts_sampled = train_contexts_sampled.tolist()
train_questions_sampled = train_questions_sampled.tolist()
train_answers_sampled =train_answers_sampled.tolist()

val_contexts_sampled = val_contexts_sampled.tolist()
val_questions_sampled = val_questions_sampled.tolist()
val_answers_sampled = val_answers_sampled.tolist()

In [6]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

add_end_idx(train_answers_sampled, train_contexts_sampled)
add_end_idx(val_answers_sampled, val_contexts_sampled)


In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kunuruabhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
import regex as re
text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [9]:
def preprocess(text, stem=True):
  text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
  tokens = []
  for token in text.split():
    if token not in stop_words:
      if stem:
        tokens.append(stemmer.stem(token))
      else:
        tokens.append(token)
  return " ".join(tokens)

In [10]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import numpy as np

In [11]:
model_name='bert-base-uncased'
model=AutoModel.from_pretrained(model_name)
tokenizer=AutoTokenizer.from_pretrained(model_name)

def calculate_similarity_scores(context_sentences, question):
    sentences = context_sentences + [question]
    tokens={'input_ids':[],'attention_mask':[]}
    for sentence in sentences:
        new_tokens=tokenizer.encode_plus(sentence,max_length=128,
                                    truncation=True,padding='max_length',
                                    return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    tokens['input_ids']=torch.stack(tokens['input_ids'])
    tokens['attention_mask']=torch.stack(tokens['attention_mask'])
    outputs =model(**tokens)
    
    embeddings =outputs['last_hidden_state']
    attention =tokens['attention_mask']
    
    mask=attention.unsqueeze(-1).expand(embeddings.shape).float()
    mask_embeddings=embeddings*mask
    
    summed=torch.sum(mask_embeddings,1)
    counts=torch.clamp(mask.sum(1),min=1e-9)
    mean_pooled=summed/counts
    mean_pooled=mean_pooled.detach().numpy()
    question_vector = [mean_pooled[-1]]
    context_vectors = mean_pooled[:-1]

    similarity_scores = cosine_similarity(question_vector, context_vectors).flatten()
    return similarity_scores

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:

def filter_context_by_similarity(context_sentences, similarity_scores, threshold):
    filtered_indices = np.where(similarity_scores > threshold)[0]
    if len(filtered_indices) == 0:
        return ' '.join(context_sentences)
    filtered_sentences = [context_sentences[i] for i in filtered_indices]
    return ' '.join(filtered_sentences)

def filter_squad_contexts(contexts, questions, threshold=0.5):
    filtered_contexts = []
    for context, question in tqdm(zip(contexts, questions), total=len(contexts)):
        context_sentences = sent_tokenize(context)
        preprocessed_sentences = [preprocess(sentence) for sentence in context_sentences]
        preprocessed_question = preprocess(question)
        similarity_scores = calculate_similarity_scores(preprocessed_sentences, preprocessed_question)
        filtered_context = filter_context_by_similarity(context_sentences, similarity_scores, threshold)
        filtered_contexts.append(filtered_context)
    return filtered_contexts


In [16]:
filtered_train_contexts = filter_squad_contexts(train_contexts_sampled, train_questions_sampled)
filtered_val_contexts = filter_squad_contexts(val_contexts_sampled, val_questions_sampled)

100%|██████████| 5000/5000 [54:05<00:00,  1.54it/s]  
100%|██████████| 500/500 [05:45<00:00,  1.45it/s]


In [17]:
def align_answers_with_context(original_contexts, updated_contexts, answers):
    new_contexts = []
    aligned_answers = []
    skipped_count = 0

    for orig_context, updated_context, answer in zip(original_contexts, updated_contexts, answers):
        start_pos = answer['answer_start']
        end_pos = answer['answer_end']
        orig_answer = orig_context[start_pos:end_pos]
        start_idx = updated_context.find(orig_answer)

        if start_idx == -1:
            skipped_count += 1

            modified_answer = answer.copy()
            modified_answer['answer_start'] = len(updated_contexts)
            modified_answer['answer_end'] = len(updated_contexts)
        else:
            new_start_pos = start_idx
            new_end_pos = start_idx + len(orig_answer)
            modified_answer = answer.copy()
            modified_answer['answer_start'] = new_start_pos
            modified_answer['answer_end'] = new_end_pos

        aligned_answers.append(modified_answer)
        new_contexts.append(updated_context)

    return new_contexts, aligned_answers, skipped_count

filtered_train_contexts, train_answers_sampled, skipped_count = align_answers_with_context(train_contexts_sampled, filtered_train_contexts, train_answers_sampled)
print(f'Skipped {skipped_count} examples where the answer could not be found in the truncated context.')

Skipped 0 examples where the answer could not be found in the truncated context.


In [18]:
total_contexts = len(train_contexts_sampled) 

skipped_percentage = (skipped_count / total_contexts) * 100
print(f'Skipped {skipped_count} examples ({skipped_percentage:.2f}%) where the answer could not be found in the truncated context.')

Skipped 0 examples (0.00%) where the answer could not be found in the truncated context.


In [19]:
filtered_train_contexts[12]

"The first Sky television rights agreement was worth £304 million over five seasons. The next contract, negotiated to start from the 1997–98 season, rose to £670 million over four seasons. The third contract was a £1.024 billion deal with BSkyB for the three seasons from 2001–02 to 2003–04. The league brought in £320 million from the sale of its international rights for the three-year period from 2004–05 to 2006–07. It sold the rights itself on a territory-by-territory basis. Sky's monopoly was broken from August 2006 when Setanta Sports was awarded rights to show two out of the six packages of matches available. This occurred following an insistence by the European Commission that exclusive rights should not be sold to one television company. Sky and Setanta paid a total of £1.7 billion, a two-thirds increase which took many commentators by surprise as it had been widely assumed that the value of the rights had levelled off following many years of rapid growth. Setanta also hold right

In [20]:
train_contexts[train_indices[12]]

"The first Sky television rights agreement was worth £304 million over five seasons. The next contract, negotiated to start from the 1997–98 season, rose to £670 million over four seasons. The third contract was a £1.024 billion deal with BSkyB for the three seasons from 2001–02 to 2003–04. The league brought in £320 million from the sale of its international rights for the three-year period from 2004–05 to 2006–07. It sold the rights itself on a territory-by-territory basis. Sky's monopoly was broken from August 2006 when Setanta Sports was awarded rights to show two out of the six packages of matches available. This occurred following an insistence by the European Commission that exclusive rights should not be sold to one television company. Sky and Setanta paid a total of £1.7 billion, a two-thirds increase which took many commentators by surprise as it had been widely assumed that the value of the rights had levelled off following many years of rapid growth. Setanta also hold right

In [21]:
train_questions_sampled[12]

'How much many did the Premier League make from selling its internation rights during 2004-07?'

### after sentence selector

In [22]:
filtered_train_contexts = list(filtered_train_contexts)
train_questions_sampled = list(train_questions_sampled)
filtered_val_contexts = list(filtered_val_contexts)
val_questions_sampled = list(val_questions_sampled)

In [23]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(filtered_train_contexts, train_questions_sampled, truncation=True, padding=True)
val_encodings = tokenizer(filtered_val_contexts, val_questions_sampled, truncation=True, padding=True)

In [24]:
from transformers import DistilBertForQuestionAnswering, AdamW
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to

In [25]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_pos = encodings.char_to_token(i, answers[i]['answer_start'])
        end_pos = encodings.char_to_token(i, answers[i]['answer_end'])

        if start_pos is None:
            start_pos = tokenizer.model_max_length
        if end_pos is None:
            shift = 1
            while end_pos is None and answers[i]['answer_end'] - shift >= 0:
                end_pos = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
                shift += 1
        if end_pos is None:
            end_pos = tokenizer.model_max_length

        start_positions.append(start_pos)
        end_positions.append(end_pos)
        
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers_sampled)
add_token_positions(val_encodings, val_answers_sampled)


In [26]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [27]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

Running on GPU: NVIDIA GeForce GTX 1080 Ti


In [29]:
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 313/313 [02:24<00:00,  2.17it/s, loss=2.99]
Epoch 1: 100%|██████████| 313/313 [02:28<00:00,  2.11it/s, loss=1.49] 
Epoch 2: 100%|██████████| 313/313 [02:28<00:00,  2.10it/s, loss=0.382]


In [30]:

model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)
acc = []
for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
acc = sum(acc)/len(acc)
acc

0.474609375

In [31]:
import torch
from torch.utils.data import DataLoader

def calculate_f1(pred_start, pred_end, true_start, true_end):
    pred_tokens = set(range(pred_start, pred_end + 1))
    true_tokens = set(range(true_start, true_end + 1))

    common_tokens = pred_tokens.intersection(true_tokens)
    if len(common_tokens) == 0:
        return 0, 0, 0  

    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(true_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1


In [32]:
model.eval()

val_loader = DataLoader(val_dataset, batch_size=16)

start_acc = []
end_acc = []
precisions = []
recalls = []
f1s = []

for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        start_acc.append((start_pred == start_true).sum().item() / len(start_pred))
        end_acc.append((end_pred == end_true).sum().item() / len(end_pred))
        for sp, ep, st, et in zip(start_pred, end_pred, start_true, end_true):
            precision, recall, f1 = calculate_f1(sp.item(), ep.item(), st.item(), et.item())
            precisions.append(precision)
            recalls.append(recall)
            f1s.append(f1)


In [33]:

avg_start_acc = sum(start_acc) / len(start_acc)
avg_end_acc = sum(end_acc) / len(end_acc)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1 = sum(f1s) / len(f1s)

In [34]:

print(f"Average Start Position Accuracy: {avg_start_acc:.4f}")
print(f"Average End Position Accuracy: {avg_end_acc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

Average Start Position Accuracy: 0.4648
Average End Position Accuracy: 0.4844
Average Precision: 0.4791
Average Recall: 0.5864
Average F1 Score: 0.4791
