In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from transformers import AutoTokenizer,AutoModel
import torch

2024-06-18 17:33:32.868204: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
import json

def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']

                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:

                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers


train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [4]:
import numpy as np
from gensim.models import Word2Vec
np.random.seed(42)
train_contexts = np.array(train_contexts)
train_questions = np.array(train_questions)
train_answers = np.array(train_answers)

val_contexts = np.array(val_contexts)
val_questions = np.array(val_questions)
val_answers = np.array(val_answers)

train_indices = np.random.choice(len(train_contexts), 5000, replace=False)
val_indices = np.random.choice(len(val_contexts), 500, replace=False)

train_contexts_sampled = train_contexts[train_indices]
train_questions_sampled = train_questions[train_indices]
train_answers_sampled = train_answers[train_indices]

val_contexts_sampled = val_contexts[val_indices]
val_questions_sampled = val_questions[val_indices]
val_answers_sampled = val_answers[val_indices]

In [5]:
train_contexts_sampled = train_contexts_sampled.tolist()
train_questions_sampled = train_questions_sampled.tolist()
train_answers_sampled =train_answers_sampled.tolist()

val_contexts_sampled = val_contexts_sampled.tolist()
val_questions_sampled = val_questions_sampled.tolist()
val_answers_sampled = val_answers_sampled.tolist()

In [6]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

add_end_idx(train_answers_sampled, train_contexts_sampled)
add_end_idx(val_answers_sampled, val_contexts_sampled)


In [7]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /home/kunuruabhishek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kunuruabhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import regex as re
text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [10]:
def preprocess(text, stem=True):
  text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
  tokens = []
  for token in text.split():
    if token not in stop_words:
      if stem:
        tokens.append(stemmer.stem(token))
      else:
        tokens.append(token)
  return " ".join(tokens)

In [11]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,Dropout
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.random import set_seed
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize


In [12]:
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.random import set_seed


In [13]:
def load_pretrained_embeddings(vocSize, embedding_dim, glove_file_path, word_index):
    embeddings_index = {}
    with open(glove_file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((vocSize, embedding_dim))
    for word, i in word_index.items():
        if i < vocSize:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [14]:
def CNN_BiDirLSTM(vocab_size, word_index,embedding_dim,glove_file_path):


    embedding_matrix = load_pretrained_embeddings(vocab_size, embedding_dim, glove_file_path, word_index)
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=100))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Bidirectional(LSTM(64, return_sequences=False)))
    model.add(Dense(32, activation='relu'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [15]:
def preprocess_texts(texts, tokenizer, max_sequence_length):
    texts = [preprocess(text) for text in texts]
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
    return padded_sequences

In [16]:
def calculate_similarity_scores(context_sentences, question, max_sequence_length, model, tokenizer):
    context_sequences = preprocess_texts(context_sentences, tokenizer, 64)
    question_sequence = preprocess_texts([question], tokenizer, 64)

    all_sequences = np.vstack([context_sequences, question_sequence])
    cnn_lstm_embeddings = model.predict(all_sequences)
    question_vector = cnn_lstm_embeddings[-1].reshape(1, -1)
    context_vectors = cnn_lstm_embeddings[:-1]

    similarity_scores = cosine_similarity(context_vectors, question_vector).flatten()
    return similarity_scores

In [17]:
def filter_context_by_similarity(context_sentences, similarity_scores, threshold):
    filtered_indices = np.where(similarity_scores > threshold)[0]
    if len(filtered_indices) == 0:
        return ' '.join(context_sentences)
    filtered_sentences = [context_sentences[i] for i in filtered_indices]
    return ' '.join(filtered_sentences)

In [18]:
def filter_squad_contexts(contexts, questions, glove_file_path, threshold=0.5, max_sequence_length=64):
    all_sentences = [sent for context in contexts for sent in sent_tokenize(context)]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sentences + questions)
    vocab_size = len(tokenizer.word_index) + 1

    model = CNN_BiDirLSTM(vocab_size,tokenizer.word_index,100,glove_file_path)

    filtered_contexts = []
    for context, question in tqdm(zip(contexts, questions), total=len(contexts)):
        context_sentences = sent_tokenize(context)
        similarity_scores = calculate_similarity_scores(context_sentences, question, max_sequence_length, model, tokenizer)
        filtered_context = filter_context_by_similarity(context_sentences, similarity_scores, threshold)
        filtered_contexts.append(filtered_context)
    return filtered_contexts

In [19]:
glove_file_path = 'glove.6B.100d.txt'

# def filter_squad_contexts(contexts, questions):
#     filtered_contexts = []
#     for context, question in tqdm(zip(contexts, questions), total=len(contexts)):
#         context_sentences = context.split('.')  # Split into sentences based on full stops
#         preprocessed_sentences = [preprocess(sentence) for sentence in context_sentences]
#         preprocessed_question = preprocess(question)
#         similarity_scores = calculate_similarity_scores(preprocessed_sentences, preprocessed_question)
#         filtered_context = filter_context_by_similarity(context_sentences, similarity_scores, 0.85)
#         filtered_contexts.append(filtered_context)
#     return filtered_contexts

In [20]:

filtered_train_contexts = filter_squad_contexts(train_contexts_sampled, train_questions_sampled,glove_file_path)
filtered_val_contexts = filter_squad_contexts(val_contexts_sampled, val_questions_sampled,glove_file_path)

2024-06-18 17:33:54.031848: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2024-06-18 17:33:54.032010: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2024-06-18 17:33:54.307213: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-06-18 17:33:54.307745: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:0a:00.0 name: NVIDIA GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.582GHz coreCount: 28 deviceMemorySize: 10.92GiB deviceMemoryBandwidth: 451.17GiB/s
2024-06-18 17:33:54.307816: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 



2024-06-18 17:33:54.670794: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2024-06-18 17:33:54.687645: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3393250000 Hz
100%|██████████| 5000/5000 [02:28<00:00, 33.69it/s]
  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [00:15<00:00, 32.64it/s]


In [21]:
def align_answers_with_context(original_contexts, updated_contexts, answers):
    new_contexts = []
    aligned_answers = []
    skipped_count = 0

    for orig_context, updated_context, answer in zip(original_contexts, updated_contexts, answers):
        start_pos = answer['answer_start']
        end_pos = answer['answer_end']
        orig_answer = orig_context[start_pos:end_pos]
        start_idx = updated_context.find(orig_answer)

        if start_idx == -1:
            skipped_count += 1

            modified_answer = answer.copy()
            modified_answer['answer_start'] = len(updated_contexts)
            modified_answer['answer_end'] = len(updated_contexts)
        else:
            new_start_pos = start_idx
            new_end_pos = start_idx + len(orig_answer)
            # Create a copy of answer and update positions
            modified_answer = answer.copy()
            modified_answer['answer_start'] = new_start_pos
            modified_answer['answer_end'] = new_end_pos

        aligned_answers.append(modified_answer)
        new_contexts.append(updated_context)

    return new_contexts, aligned_answers, skipped_count

filtered_train_contexts, train_answers_sampled, skipped_count = align_answers_with_context(train_contexts_sampled, filtered_train_contexts, train_answers_sampled)
print(f'Skipped {skipped_count} examples where the answer could not be found in the truncated context.')

Skipped 11 examples where the answer could not be found in the truncated context.


In [22]:
total_contexts = len(train_contexts_sampled) 

skipped_percentage = (skipped_count / total_contexts) * 100
print(f'Skipped {skipped_count} examples ({skipped_percentage:.2f}%) where the answer could not be found in the truncated context.')

Skipped 11 examples (0.22%) where the answer could not be found in the truncated context.


In [23]:
filtered_train_contexts[12]

"The first Sky television rights agreement was worth £304 million over five seasons. The next contract, negotiated to start from the 1997–98 season, rose to £670 million over four seasons. The third contract was a £1.024 billion deal with BSkyB for the three seasons from 2001–02 to 2003–04. The league brought in £320 million from the sale of its international rights for the three-year period from 2004–05 to 2006–07. It sold the rights itself on a territory-by-territory basis. Sky's monopoly was broken from August 2006 when Setanta Sports was awarded rights to show two out of the six packages of matches available. This occurred following an insistence by the European Commission that exclusive rights should not be sold to one television company. Sky and Setanta paid a total of £1.7 billion, a two-thirds increase which took many commentators by surprise as it had been widely assumed that the value of the rights had levelled off following many years of rapid growth. Setanta also hold right

In [24]:
train_contexts[train_indices[12]]

"The first Sky television rights agreement was worth £304 million over five seasons. The next contract, negotiated to start from the 1997–98 season, rose to £670 million over four seasons. The third contract was a £1.024 billion deal with BSkyB for the three seasons from 2001–02 to 2003–04. The league brought in £320 million from the sale of its international rights for the three-year period from 2004–05 to 2006–07. It sold the rights itself on a territory-by-territory basis. Sky's monopoly was broken from August 2006 when Setanta Sports was awarded rights to show two out of the six packages of matches available. This occurred following an insistence by the European Commission that exclusive rights should not be sold to one television company. Sky and Setanta paid a total of £1.7 billion, a two-thirds increase which took many commentators by surprise as it had been widely assumed that the value of the rights had levelled off following many years of rapid growth. Setanta also hold right

In [25]:
train_questions_sampled[12]

'How much many did the Premier League make from selling its internation rights during 2004-07?'

In [26]:
context_sentences=train_contexts[train_indices[12]]

In [27]:
question=train_questions_sampled[12]

In [28]:
context_sentences = context_sentences.split('.')  # Split into sentences based on full stops
preprocessed_sentences = [preprocess(sentence) for sentence in context_sentences]
preprocessed_question = preprocess(question)

In [29]:
preprocessed_sentences

['first sky televis right agreement worth 304 million five season',
 'next contract negoti start 1997 98 season rose 670 million four season',
 'third contract 1',
 '024 billion deal bskyb three season 2001 02 2003 04',
 'leagu brought 320 million sale intern right three year period 2004 05 2006 07',
 'sold right territori territori basi',
 'sky monopoli broken august 2006 setanta sport award right show two six packag match avail',
 'occur follow insist european commiss exclus right sold one televis compani',
 'sky setanta paid total 1',
 '7 billion two third increas took mani comment surpris wide assum valu right level follow mani year rapid growth',
 'setanta also hold right live 3 pm match sole irish viewer',
 'bbc retain right show highlight three season match day 171',
 '6 million 63 per cent increas 105 million paid previous three year period',
 'sky bt agre joint pay 84',
 '3 million delay televis right 242 game right broadcast full televis internet case period 50 hour 10 pm mat

In [30]:
preprocessed_question

'much mani premier leagu make sell intern right 2004 07'

In [31]:
filtered_train_contexts = list(filtered_train_contexts)
train_questions_sampled = list(train_questions_sampled)
filtered_val_contexts = list(filtered_val_contexts)
val_questions_sampled = list(val_questions_sampled)

In [32]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(filtered_train_contexts, train_questions_sampled, truncation=True, padding=True)
val_encodings = tokenizer(filtered_val_contexts, val_questions_sampled, truncation=True, padding=True)

In [33]:
from transformers import DistilBertForQuestionAnswering, AdamW
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to

In [34]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_pos = encodings.char_to_token(i, answers[i]['answer_start'])
        end_pos = encodings.char_to_token(i, answers[i]['answer_end'])

        if start_pos is None:
            start_pos = tokenizer.model_max_length
        if end_pos is None:
            shift = 1
            while end_pos is None and answers[i]['answer_end'] - shift >= 0:
                end_pos = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
                shift += 1
        if end_pos is None:
            end_pos = tokenizer.model_max_length

        start_positions.append(start_pos)
        end_positions.append(end_pos)
        
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers_sampled)
add_token_positions(val_encodings, val_answers_sampled)


In [35]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [36]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
import torch
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

Running on GPU: NVIDIA GeForce GTX 1080 Ti


In [38]:
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 313/313 [02:28<00:00,  2.11it/s, loss=1.66]
Epoch 1: 100%|██████████| 313/313 [02:29<00:00,  2.09it/s, loss=1.89] 
Epoch 2: 100%|██████████| 313/313 [02:29<00:00,  2.09it/s, loss=0.696]


In [39]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
import torch
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available():
    print("Running on GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU")

Running on GPU: NVIDIA GeForce GTX 1080 Ti


In [40]:

model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)
acc = []
for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
acc = sum(acc)/len(acc)
acc

0.4736328125

In [41]:
import torch
from torch.utils.data import DataLoader

def calculate_f1(pred_start, pred_end, true_start, true_end):
    pred_tokens = set(range(pred_start, pred_end + 1))
    true_tokens = set(range(true_start, true_end + 1))

    common_tokens = pred_tokens.intersection(true_tokens)
    if len(common_tokens) == 0:
        return 0, 0, 0  # precision, recall, f1

    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(true_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1



In [42]:
model.eval()

val_loader = DataLoader(val_dataset, batch_size=16)

start_acc = []
end_acc = []
precisions = []
recalls = []
f1s = []

for batch in val_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        start_acc.append((start_pred == start_true).sum().item() / len(start_pred))
        end_acc.append((end_pred == end_true).sum().item() / len(end_pred))
        for sp, ep, st, et in zip(start_pred, end_pred, start_true, end_true):
            precision, recall, f1 = calculate_f1(sp.item(), ep.item(), st.item(), et.item())
            precisions.append(precision)
            recalls.append(recall)
            f1s.append(f1)


In [43]:

avg_start_acc = sum(start_acc) / len(start_acc)
avg_end_acc = sum(end_acc) / len(end_acc)
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1 = sum(f1s) / len(f1s)

In [44]:
print(f"Average Start Position Accuracy: {avg_start_acc:.4f}")
print(f"Average End Position Accuracy: {avg_end_acc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

Average Start Position Accuracy: 0.4531
Average End Position Accuracy: 0.4941
Average Precision: 0.4881
Average Recall: 0.5996
Average F1 Score: 0.4889
