In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from transformers import AutoTokenizer,AutoModel
import torch

2024-06-18 00:41:13.296035: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
import json

def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']

                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:

                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers


train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [4]:
# !pip install gensim
import numpy as np
# from gensim.models import Word2Vec
np.random.seed(42)
train_contexts = np.array(train_contexts)
train_questions = np.array(train_questions)
train_answers = np.array(train_answers)

val_contexts = np.array(val_contexts)
val_questions = np.array(val_questions)
val_answers = np.array(val_answers)


train_indices = np.random.choice(len(train_contexts), 5000, replace=False)
val_indices = np.random.choice(len(val_contexts), 500, replace=False)

train_contexts_sampled = train_contexts[train_indices]
train_questions_sampled = train_questions[train_indices]
train_answers_sampled = train_answers[train_indices]

val_contexts_sampled = val_contexts[val_indices]
val_questions_sampled = val_questions[val_indices]
val_answers_sampled = val_answers[val_indices]

In [5]:
train_contexts_sampled = train_contexts_sampled.tolist()
train_questions_sampled = train_questions_sampled.tolist()
train_answers_sampled =train_answers_sampled.tolist()

val_contexts_sampled = val_contexts_sampled.tolist()
val_questions_sampled = val_questions_sampled.tolist()
val_answers_sampled = val_answers_sampled.tolist()

In [6]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

add_end_idx(train_answers_sampled, train_contexts_sampled)
add_end_idx(val_answers_sampled, val_contexts_sampled)


In [7]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /home/kunuruabhishek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kunuruabhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import regex as re
text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [10]:
def preprocess(text, stem=True):
  text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
  tokens = []
  for token in text.split():
    if token not in stop_words:
      if stem:
        tokens.append(stemmer.stem(token))
      else:
        tokens.append(token)
  return " ".join(tokens)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
def calculate_similarity_scores(context_sentences, question):
    documents = context_sentences + [question]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    question_vector = tfidf_matrix[-1]
    context_vectors = tfidf_matrix[:-1]

    similarity_scores = cosine_similarity(question_vector, context_vectors).flatten()
    return similarity_scores

In [13]:
import numpy as np

def filter_context_by_similarity(context_sentences, similarity_scores, threshold):
    similarity_scores = np.array(similarity_scores)  
    
    filtered_indices = np.where(similarity_scores > threshold)[0]
    
    if len(filtered_indices) == 0:
        return ' '.join(context_sentences)
    filtered_indices = sorted(filtered_indices)

    filtered_sentences = [context_sentences[i] for i in filtered_indices]
    return ' '.join(filtered_sentences)


In [15]:
from tqdm import tqdm

def filter_squad_contexts(contexts, questions):
    filtered_contexts = []
    for context, question in tqdm(zip(contexts, questions), total=len(contexts)):
        context_sentences = context.split('.')  # Split into sentences based on full stops
        preprocessed_sentences = [preprocess(sentence) for sentence in context_sentences]
        preprocessed_question = preprocess(question)
        similarity_scores = calculate_similarity_scores(preprocessed_sentences, preprocessed_question)
        filtered_context = filter_context_by_similarity(context_sentences, similarity_scores, 0.05)
        filtered_contexts.append(filtered_context)
    return filtered_contexts

filtered_train_contexts = filter_squad_contexts(train_contexts_sampled, train_questions_sampled)
filtered_val_contexts = filter_squad_contexts(val_contexts_sampled, val_questions_sampled)


100%|██████████| 5000/5000 [00:13<00:00, 373.25it/s]
100%|██████████| 500/500 [00:01<00:00, 373.58it/s]


In [16]:
def align_answers_with_context(original_contexts, updated_contexts, answers):
    new_contexts = []
    aligned_answers = []
    skipped_count = 0

    for orig_context, updated_context, answer in zip(original_contexts, updated_contexts, answers):
        start_pos = answer['answer_start']
        end_pos = answer['answer_end']
        orig_answer = orig_context[start_pos:end_pos]
        start_idx = updated_context.find(orig_answer)

        if start_idx == -1:
            skipped_count += 1

            modified_answer = answer.copy()
            modified_answer['answer_start'] = len(updated_contexts)
            modified_answer['answer_end'] = len(updated_contexts)
        else:
            new_start_pos = start_idx
            new_end_pos = start_idx + len(orig_answer)
            modified_answer = answer.copy()
            modified_answer['answer_start'] = new_start_pos
            modified_answer['answer_end'] = new_end_pos

        aligned_answers.append(modified_answer)
        new_contexts.append(updated_context)

    return new_contexts, aligned_answers, skipped_count

filtered_train_contexts, train_answers_sampled, skipped_count = align_answers_with_context(train_contexts_sampled, filtered_train_contexts, train_answers_sampled)
print(f'Skipped {skipped_count} examples where the answer could not be found in the truncated context.')

Skipped 489 examples where the answer could not be found in the truncated context.


In [17]:
total_contexts = len(train_contexts_sampled) 

skipped_percentage = (skipped_count / total_contexts) * 100
print(f'Skipped {skipped_count} examples ({skipped_percentage:.2f}%) where the answer could not be found in the truncated context.')

Skipped 489 examples (9.78%) where the answer could not be found in the truncated context.


In [18]:
filtered_train_contexts[0]

'It threatened the collapse of large financial institutions, which was prevented by the bailout of banks by national governments, but stock markets still dropped worldwide  dollars, and a downturn in economic activity leading to the 2008–2012 global recession and contributing to the European sovereign-debt crisis  The active phase of the crisis, which manifested as a liquidity crisis, can be dated from August 9, 2007, when BNP Paribas terminated withdrawals from three hedge funds citing "a complete evaporation of liquidity"'

In [19]:
train_contexts[train_indices[0]]

'It threatened the collapse of large financial institutions, which was prevented by the bailout of banks by national governments, but stock markets still dropped worldwide. In many areas, the housing market also suffered, resulting in evictions, foreclosures and prolonged unemployment. The crisis played a significant role in the failure of key businesses, declines in consumer wealth estimated in trillions of U.S. dollars, and a downturn in economic activity leading to the 2008–2012 global recession and contributing to the European sovereign-debt crisis. The active phase of the crisis, which manifested as a liquidity crisis, can be dated from August 9, 2007, when BNP Paribas terminated withdrawals from three hedge funds citing "a complete evaporation of liquidity".'

In [20]:
filtered_train_contexts = list(filtered_train_contexts)
train_questions_sampled = list(train_questions_sampled)
filtered_val_contexts = list(filtered_val_contexts)
val_questions_sampled = list(val_questions_sampled)

In [21]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(filtered_train_contexts, train_questions_sampled, truncation=True, padding=True)
val_encodings = tokenizer(filtered_val_contexts, val_questions_sampled, truncation=True, padding=True)

In [22]:
from transformers import DistilBertForQuestionAnswering, AdamW
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to

In [23]:
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] it threatened the collapse of large financial institutions, which was prevented by the bailout of banks by national governments, but stock markets still dropped worldwide dollars, and a downturn in economic activity leading to the 2008 – 2012 global recession and contributing to the european sovereign - debt crisis the active phase of the crisis, which manifested as a liquidity crisis, can be dated from august 9, 2007, when bnp paribas terminated withdrawals from three hedge funds citing " a complete evaporation of liquidity " [SEP] what year did the global recession that followed the financial crisis of 2007 end? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [24]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_pos = encodings.char_to_token(i, answers[i]['answer_start'])
        end_pos = encodings.char_to_token(i, answers[i]['answer_end'])

        if start_pos is None:
            start_pos = tokenizer.model_max_length
        if end_pos is None:
            shift = 1
            while end_pos is None and answers[i]['answer_end'] - shift >= 0:
                end_pos = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
                shift += 1
        if end_pos is None:
            end_pos = tokenizer.model_max_length

        start_positions.append(start_pos)
        end_positions.append(end_pos)
        
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers_sampled)
add_token_positions(val_encodings, val_answers_sampled)


In [25]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [26]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

Running on GPU: NVIDIA GeForce GTX 1080 Ti


In [28]:
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 313/313 [01:57<00:00,  2.66it/s, loss=1.57]
Epoch 1: 100%|██████████| 313/313 [01:59<00:00,  2.62it/s, loss=1.81] 
Epoch 2: 100%|██████████| 313/313 [01:59<00:00,  2.62it/s, loss=1.36] 


In [29]:

model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)
acc = []
for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
acc = sum(acc)/len(acc)
acc

0.2744140625

In [30]:
import torch
from torch.utils.data import DataLoader

def calculate_f1(pred_start, pred_end, true_start, true_end):
    pred_tokens = set(range(pred_start, pred_end + 1))
    true_tokens = set(range(true_start, true_end + 1))

    common_tokens = pred_tokens.intersection(true_tokens)
    if len(common_tokens) == 0:
        return 0, 0, 0  # precision, recall, f1

    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(true_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1




In [31]:
model.eval()

val_loader = DataLoader(val_dataset, batch_size=16)

start_acc = []
end_acc = []
precisions = []
recalls = []
f1s = []

for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        start_acc.append((start_pred == start_true).sum().item() / len(start_pred))
        end_acc.append((end_pred == end_true).sum().item() / len(end_pred))
        for sp, ep, st, et in zip(start_pred, end_pred, start_true, end_true):
            precision, recall, f1 = calculate_f1(sp.item(), ep.item(), st.item(), et.item())
            precisions.append(precision)
            recalls.append(recall)
            f1s.append(f1)


In [32]:

avg_start_acc = sum(start_acc) / len(start_acc)
avg_end_acc = sum(end_acc) / len(end_acc)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1 = sum(f1s) / len(f1s)

In [33]:

print(f"Average Start Position Accuracy: {avg_start_acc:.4f}")
print(f"Average End Position Accuracy: {avg_end_acc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

Average Start Position Accuracy: 0.2266
Average End Position Accuracy: 0.3223
Average Precision: 0.2506
Average Recall: 0.3442
Average F1 Score: 0.2568
