In [1]:
import pandas as pd
import numpy as np

In [2]:
train_dataset = pd.read_csv('dataset/train.csv')
validation_dataset = pd.read_csv('dataset/dev.csv')

In [3]:
validation_dataset

Unnamed: 0,context,answer,question
0,Private schools generally prefer to be called ...,public,What schools do preparatory schools prepare Br...
1,"Private schools, also known as independent sch...",independent,Along with non-governmental and nonstate schoo...
2,The Pilgrim Street building was refurbished be...,three,How many cinemas are currently housed at one s...
3,"Chris Keates, the general secretary of Nationa...",child protection and parental rights groups,A statement made by Chris Keates caused issues...
4,"In Berlin, the Huguenots created two new neigh...",1806-07,What years did this occupation take place?
...,...,...,...
9647,"Compact trucks were introduced, such as the To...",Dodge D-50,What did Mitsubishi rename its Forte to?
9648,"Luther's rediscovery of ""Christ and His salvat...",Christ and His salvation,What became the foundation of the Reformation?
9649,BSkyB has no veto over the presence of channel...,Ofcom,Who does BSkyB have an operating license from?
9650,"Research by Harvard economist Robert Barro, fo...",Harvard,What institution does Robert Barro hail from?


In [4]:
from pprint import pprint
sample_validation_dataset = next(iter(validation_dataset))
pprint (sample_validation_dataset)

'context'


In [5]:
validation_dataset

Unnamed: 0,context,answer,question
0,Private schools generally prefer to be called ...,public,What schools do preparatory schools prepare Br...
1,"Private schools, also known as independent sch...",independent,Along with non-governmental and nonstate schoo...
2,The Pilgrim Street building was refurbished be...,three,How many cinemas are currently housed at one s...
3,"Chris Keates, the general secretary of Nationa...",child protection and parental rights groups,A statement made by Chris Keates caused issues...
4,"In Berlin, the Huguenots created two new neigh...",1806-07,What years did this occupation take place?
...,...,...,...
9647,"Compact trucks were introduced, such as the To...",Dodge D-50,What did Mitsubishi rename its Forte to?
9648,"Luther's rediscovery of ""Christ and His salvat...",Christ and His salvation,What became the foundation of the Reformation?
9649,BSkyB has no veto over the presence of channel...,Ofcom,Who does BSkyB have an operating license from?
9650,"Research by Harvard economist Robert Barro, fo...",Harvard,What institution does Robert Barro hail from?


In [6]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from termcolor import colored
import textwrap

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

2023-12-27 14:51:53.844718: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-27 14:51:55.053983: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-12-27 14:51:55.054032: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-12-27 14:51:59.680081: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [7]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
from tqdm.notebook import tqdm
import copy

class QuestionGenerationDataset(Dataset):
    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=96):
        self.path = filepath

        self.passage_column = "context"
        self.answer = "answer"
        self.question = "question"

        # self.data = pd.read_csv(self.path)
        self.data = pd.read_csv(self.path,nrows=1000)

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self.skippedcount =0
        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        labels = copy.deepcopy(target_ids)
        labels [labels==0] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        for idx in tqdm(range(len(self.data))):
            passage,answer,target = self.data.loc[idx, self.passage_column],self.data.loc[idx, self.answer], self.data.loc[idx, self.question]

            input_ = "context: %s  answer: %s </s>" % (passage, answer)
            target = "question: %s </s>" % (str(target))

            # get encoding length of input. If it is greater than self.max_len skip it
            test_input_encoding = self.tokenizer.encode_plus(input_,
                                        truncation=False,
                                        return_tensors="pt")

            length_of_input_encoding = len(test_input_encoding['input_ids'][0])


            if length_of_input_encoding > self.max_len_input:
              self.skippedcount = self.skippedcount + 1
              continue

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len_input, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len_output, pad_to_max_length=True,return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [9]:
train_dataset = QuestionGenerationDataset(t5_tokenizer,'dataset/train.csv')
validation_dataset = QuestionGenerationDataset(t5_tokenizer,'dataset/dev.csv')

  0%|          | 0/1000 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1000 [00:00<?, ?it/s]



In [None]:
train_dataset.shape

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = t5_model.to(device)

device  cpu


In [12]:
context ="President Donald Trump said and predicted that some states would reopen this month."
answer = "Donald Trump"
text = "context: "+context + " " + "answer: " + answer + " </s>"
print (text)

encoding = t5_tokenizer.encode_plus(text,max_length =512, padding=True, return_tensors="pt")
print (encoding.keys())
input_ids,attention_mask  = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

context: President Donald Trump said and predicted that some states would reopen this month. answer: Donald Trump </s>
dict_keys(['input_ids', 'attention_mask'])


In [13]:
t5_model.eval()
beam_outputs = t5_model.generate(
    input_ids=input_ids,attention_mask=attention_mask,
    max_length=72,
    early_stopping=True,
    num_beams=5,
    num_return_sequences=3

)

for beam_output in beam_outputs:
    sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print (sent)

True
False
Tast True


In [14]:
import pytorch_lightning as pl
pl.seed_everything(42)

class T5FineTuner(pl.LightningModule):
    def __init__(self,hparams, t5model, t5tokenizer):
        super(T5FineTuner, self).__init__()
        self.save_hyperparameters(hparams)
        # self.hparams = hparams
        self.model = t5model
        self.tokenizer = t5tokenizer


    def forward( self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None):
         outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

         return outputs


    def training_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_input_ids = batch["target_ids"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_input_ids = batch["target_ids"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.hparams.batch_size,num_workers=4)

    def val_dataloader(self):
        return DataLoader(validation_dataset, batch_size=self.hparams.batch_size,num_workers=4)



    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
        return optimizer

  warn(f"Failed to load image Python extension: {e}")
Seed set to 42


In [18]:
args_dict = dict(
    batch_size = 4,
)

args = argparse.Namespace(**args_dict)
model = T5FineTuner(args,t5_model,t5_tokenizer)
trainer = pl.Trainer(max_epochs = 1)

trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

def generate_distractors(context, question, answer):
    model_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name)

    masked_question = question.replace(answer, "[MASK]")
    encoded_input = tokenizer.encode(context, masked_question, return_tensors="pt")
    mask_token_index = torch.where(encoded_input == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        output = model(encoded_input)
        predictions = output[0]
        masked_token_predictions = predictions[0, mask_token_index, :]

    predicted_token_ids = torch.argmax(masked_token_predictions, dim=1)
    predicted_tokens = [tokenizer.decode(token_id.item()) for token_id in predicted_token_ids]

    distractors = [token for token in predicted_tokens if token.lower() != answer.lower()]
    return distractors

context = "The context sentence."
question = "What is the capital of France?"
answer = "Paris"

distractors = generate_distractors(context, question, answer)
print("Distractors:", distractors)



Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Distractors: []


In [2]:
import numpy as np
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk('s2v_old')

2024-04-08 21:59:53.958265: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-08 21:59:55.352611: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-04-08 21:59:55.353414: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-04-08 22:00:15.550250: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-

In [3]:
from sentence_transformers import SentenceTransformer

sentence_transformer_model = SentenceTransformer('msmarco-distilbert-base-v3')

In [5]:
from Levenshtein import distance as levenshtein_distance

In [6]:
def filter_same_sense_words(original,wordlist):
  filtered_words=[]
  base_sense =original.split('|')[1] 
  print (base_sense)
  for eachword in wordlist:
    if eachword[0].split('|')[1] == base_sense:
      filtered_words.append(eachword[0].split('|')[0].replace("_", " ").title().strip())
  return filtered_words

In [9]:
from difflib import SequenceMatcher

In [10]:
def get_highest_similarity_score(wordlist,wrd):
  score=[]
  for each in wordlist:
    similarity_ratio = SequenceMatcher(None, each.lower(), wrd.lower()).ratio()
    score.append(similarity_ratio)
  return max(score)

In [11]:
def sense2vec_get_words(word,s2v,topn,question):
    output = []
    print ("word ",word)
    try:
      sense = s2v.get_best_sense(word, senses= ["NOUN", "PERSON","PRODUCT","LOC","ORG","EVENT","NORP","WORK OF ART","FAC","GPE","NUM","FACILITY"])
      most_similar = s2v.most_similar(sense, n=topn)
      # print (most_similar)
      output = filter_same_sense_words(sense,most_similar)
      print ("Similar ",output)
    except:
      output =[]

    threshold = 0.6
    final=[word]
    checklist =question.split()
    for x in output:
      if get_highest_similarity_score(final,x)<threshold and x not in final and x not in checklist:
        final.append(x)
    
    return final[1:]

In [12]:
def mmr(doc_embedding, word_embeddings, words, top_n, lambda_param):

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphrase
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr = (lambda_param) * candidate_similarities - (1-lambda_param) * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [13]:
from collections import OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/sanatan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [17]:
def get_distractors (word,origsentence,sense2vecmodel,sentencemodel,top_n,lambdaval):
  distractors = sense2vec_get_words(word,sense2vecmodel,top_n,origsentence)
  print ("distractors ",distractors)
  if len(distractors) ==0:
    return distractors
  distractors_new = [word.capitalize()]
  distractors_new.extend(distractors)
  # print ("distractors_new .. ",distractors_new)

  embedding_sentence = origsentence+ " "+word.capitalize()
  # embedding_sentence = word
  keyword_embedding = sentencemodel.encode([embedding_sentence])
  distractor_embeddings = sentencemodel.encode(distractors_new)

  # filtered_keywords = mmr(keyword_embedding, distractor_embeddings,distractors,4,0.7)
  max_keywords = min(len(distractors_new),5)
  filtered_keywords = mmr(keyword_embedding, distractor_embeddings,distractors_new,max_keywords,lambdaval)
  # filtered_keywords = filtered_keywords[1:]
  final = [word.capitalize()]
  for wrd in filtered_keywords:
    if wrd.lower() !=word.lower():
      final.append(wrd.capitalize())
  final = final[1:]
  return final

sent = "What is found in the earth's crust?"
keyword = "stone"

print (get_distractors(keyword,sent,s2v,sentence_transformer_model,40,0.2))

word  stone
NOUN
Similar  ['Stones', 'Iron', 'Same Stone', 'Granite', 'Marble', 'Obsidian', 'Wood', 'Anvil', 'Slabs', 'Solid Stone', 'Only Stone', 'Smooth Stone', 'Big Stone', 'Wooden Planks', 'Black Stone', 'Stone Wall', 'Rocks', 'Adamantine', 'Slab', 'Stone Floor', 'Crystal', 'Large Stone', 'Jewel', 'Stone Slab', 'Lava', 'Other Stone', 'Anvils', 'Cement', 'Stone Block', 'Cauldron', 'Flint', 'Ingot', 'Mithril', 'Gemstones', 'Stonework', 'Feather', 'Large Stones', 'Stone Walls']
distractors  ['Iron', 'Granite', 'Marble', 'Obsidian', 'Wood', 'Anvil', 'Slabs', 'Smooth Stone', 'Wooden Planks', 'Rocks', 'Adamantine', 'Crystal', 'Jewel', 'Lava', 'Cement', 'Cauldron', 'Flint', 'Mithril', 'Feather', 'Large Stones']
['Rocks', 'Anvil', 'Wooden planks', 'Feather', 'Mithril']


In [1]:
!pip install python-rake==1.4.4

Defaulting to user installation because normal site-packages is not writeable
Collecting python-rake==1.4.4
  Downloading python-rake-1.4.4.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: python-rake
  Building wheel for python-rake (setup.py) ... [?25ldone
[?25h  Created wheel for python-rake: filename=python_rake-1.4.4-py3-none-any.whl size=13459 sha256=4a9bbd10d09160fd8afb630d584afe16e835942bfd62370ee56f6060c0dd69af
  Stored in directory: /home/sanatan/.cache/pip/wheels/7a/dd/2f/e16099449134869d4a9a96c94092dc0101d7a3cc25c309f8e2
Successfully built python-rake
[33mDEPRECATION: distro-info 0.23ubuntu1 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m