In [None]:
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu
!pip install -q nltk
!pip install -q datasets
!pip install -q transformers[torch]
!pip install -q tokenizers
!pip install -q evaluate
!pip install -q rouge_score
!pip install -q sentencepiece
!pip install -q huggingface_hub
!pip install -q pypdf
!pip install -qqq bitsandbytes accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.3/803.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.3/205.3 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[2K     [90m━━

In [None]:
# Standard library imports
import string

# Third-party imports
# `evaluate` must be bound to the package itself: later cells call
# evaluate.load("rouge"), which `from evaluate import evaluate` does not provide.
import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from tqdm import tqdm
from transformers import T5Tokenizer, DataCollatorForSeq2Seq, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# LangChain imports
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter, CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.schema import Document

# Pretrained model for finetuning

In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

# Candidate checkpoints:
#   "google/flan-t5-small"
#   "google/flan-t5-base"
#   "google/flan-t5-large"
MODEL_NAME = "google/flan-t5-base"

# Tokenizer and seq2seq model are both loaded from the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto")

# Collator handed to the trainer further down; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# **Dataset preparation**

In [None]:
# pandas is imported here so this cell does not depend on a later cell having run.
import pandas as pd
from datasets import Dataset

# Split sizes: the first TRAIN_SIZE rows are used for fine-tuning and the last
# VAL_SIZE rows for validation.
TRAIN_SIZE = 1000
VAL_SIZE = 452

clean_df = pd.read_json('/content/cleanquest.json')

# head() and tail() overlap silently when the file has fewer rows than the two
# splits need, which would leak training rows into validation.
if len(clean_df) < TRAIN_SIZE + VAL_SIZE:
    raise ValueError(
        f"Expected at least {TRAIN_SIZE + VAL_SIZE} rows, found {len(clean_df)}; "
        "the train and validation splits would overlap."
    )

train = clean_df.head(TRAIN_SIZE)
test = clean_df.tail(VAL_SIZE)

train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(test)

## Preparing the knowledge base using FAISS

In [None]:

# The splitter configuration is the same for every explanation, so it is built
# once instead of once per loop iteration.
text_splitter = CharacterTextSplitter(separator='\n', chunk_size=256, chunk_overlap=16)

# Wrap each training explanation in a Document, then split them all in one call.
# split_documents handles every document separately, so the chunks come out in
# the same order as when the documents are split one at a time.
explanation_documents = [
    Document(page_content=clean_exp) for clean_exp in train.clean_explanation.values
]
list_of_documents = text_splitter.split_documents(explanation_documents)

In [None]:
# Sentence-embedding checkpoint used to embed the knowledge-base chunks.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
#modelPath = "sentence-transformers/all-mpnet-base-v2"

# Model options: run the embedding model on the CPU.
model_kwargs = dict(device='cpu')

# Encoding options: 'normalize_embeddings' is switched off.
encode_kwargs = dict(normalize_embeddings=False)

# Embedding wrapper built from the checkpoint name and the two option dictionaries.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=modelPath,
)

In [None]:
# Build the FAISS vector store from the explanation chunks and the embedding wrapper.
db = FAISS.from_documents(documents=list_of_documents, embedding=embeddings)

## Preparing data to feed into the finetuning process

In [None]:
def preprocess_data(question, answer, max_length=512):
  """Tokenize one prompt/answer pair into the features used for fine-tuning.

  Args:
    question: Prompt text passed to the module-level ``tokenizer``.
    answer: Target text, tokenized through ``text_target``.
    max_length: ``max_length`` given to both tokenizer calls (default 512).

  Returns:
    dict with
      'input_ids'      - prompt token ids, squeezed tensor,
      'attention_mask' - matching attention mask, squeezed tensor,
      'labels'         - list of target token ids in which every pad token id
                         has been replaced by -100.

  NOTE(review): the prompt is a single sequence but is tokenized with
  truncation='only_second'; with no second sequence to shorten, prompts longer
  than ``max_length`` may pass through untruncated -- TODO confirm.
  """
  encoded_prompt = tokenizer(
      question,
      add_special_tokens=True,
      padding='max_length',
      truncation='only_second',
      max_length=max_length,
      return_attention_mask=True,
      return_tensors='pt',
  )

  encoded_answer = tokenizer(text_target=answer, max_length=max_length, padding='max_length', truncation=True)

  # Swap every pad token id in the target for -100.
  pad_id = tokenizer.pad_token_id
  masked_labels = [-100 if token_id == pad_id else token_id for token_id in encoded_answer["input_ids"]]

  return {
      'input_ids': encoded_prompt['input_ids'].squeeze(),
      'attention_mask': encoded_prompt['attention_mask'].squeeze(),
      'labels': masked_labels,
  }

In [None]:
""" preparing the data in such a way that the input text is [context + question + options]
and the target output is the correct answer.Context is the list of documents that are 
retrieved given the question and the options"""

# Number of answer options placed in each prompt and number of chunks retrieved
# from the FAISS index per question.
NUM_OPTIONS = 4
NUM_CONTEXT_DOCS = 10


def build_prompt(question, options, context):
  """Assemble the model input text from context, question and answer options.

  Args:
    question: The question text.
    options: Sequence of answer-option strings, rendered as
      "option1) ..., option2) ..., ...".
    context: Retrieved context text placed at the start of the prompt.

  Returns:
    The prompt string. The same template is used for training and validation
    examples; the validation copy previously read "only.For the question"
    (missing space) and so differed from the training prompt.
  """
  options_text = ", ".join(
      "option" + str(position) + ") " + option
      for position, option in enumerate(options, start=1)
  )
  return (
      "context: " + context
      + " Use the context to answer the following question."
      + " Answer using the context only. For the question: " + question
      + " ,choose the correct answer from the following answers: " + options_text
  )


def build_examples(dataset):
  """Turn every row of ``dataset`` into tokenized features via preprocess_data.

  For each row the question alone is used to retrieve NUM_CONTEXT_DOCS chunks
  from ``db`` (MMR search); their text, one chunk per line, becomes the context.
  At most the first NUM_OPTIONS entries of the row's 'answers' are used.

  Returns:
    List of dicts as returned by preprocess_data, in dataset order.
  """
  examples = []
  for example in tqdm(dataset):
    question = example['question']
    options = [choice['answer'] for choice in example['answers'][:NUM_OPTIONS]]
    retrieved = db.search(question, search_type='mmr', k=NUM_CONTEXT_DOCS)
    context = ''.join(doc.page_content + '\n' for doc in retrieved)
    prompt = build_prompt(question, options, context)
    examples.append(preprocess_data(prompt, example['correct_answer']))
  return examples


preprocessed_train_dataset = build_examples(train_dataset)
preprocessed_val_dataset = build_examples(val_dataset)


100%|██████████| 1000/1000 [00:22<00:00, 43.97it/s]
100%|██████████| 452/452 [00:10<00:00, 44.04it/s]


In [None]:
def _to_columns(examples):
    """Convert a list of per-example dicts into one dict of per-feature lists.

    The feature names are taken from the first example.
    """
    return {name: [example[name] for example in examples] for name in examples[0].keys()}


# Column-oriented datasets built from the preprocessed training / validation examples.
tokenized_train_dataset = Dataset.from_dict(_to_columns(preprocessed_train_dataset))
print(tokenized_train_dataset)

tokenized_val_dataset = Dataset.from_dict(_to_columns(preprocessed_val_dataset))
print(tokenized_val_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 452
})


# **Finetuning**


In [None]:
# Download the NLTK "punkt" resource without progress output; compute_metrics
# below calls nltk.sent_tokenize.
nltk.download("punkt", quiet=True)
# ROUGE metric object used by compute_metrics.
metric = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_preds):
   """Compute ROUGE between generated predictions and reference labels.

   Args:
      eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
         also arrive wrapped in a tuple, in which case its first element is used.

   Returns:
      The result of ``metric.compute`` on the decoded, sentence-split texts.
   """
   preds, labels = eval_preds

   # Some configurations return the predictions wrapped in a tuple.
   if isinstance(preds, tuple):
      preds = preds[0]

   # -100 is not a valid token id, so it must be replaced by the pad token
   # before decoding. Labels always carry it (see preprocess_data); predictions
   # can carry it too when batches are padded together.
   pad_id = tokenizer.pad_token_id
   preds = np.where(preds != -100, preds, pad_id)
   labels = np.where(labels != -100, labels, pad_id)

   # decode preds and labels
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
   return result

In [None]:
import torch
# Release cached CUDA memory held by PyTorch before the training run below.
torch.cuda.empty_cache()

In [None]:
# Hyper-parameters of the fine-tuning run
NUM_EPOCHS = 25
L_RATE = 15e-6  #3e-4 #15e-6,
WEIGHT_DECAY = 0.02
BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH = 4
SAVE_TOTAL_LIM = 5

# Training configuration: evaluate and save once per epoch, generate during
# evaluation, and reload the best checkpoint when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_25",
    push_to_hub=False,
    # optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=L_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    # evaluation
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # checkpointing
    save_strategy='epoch',
    save_total_limit=SAVE_TOTAL_LIM,
    load_best_model_at_end=True,
)

In [None]:
# Trainer wiring: model and tokenizer, the tokenized train/validation datasets,
# the seq2seq collator and the ROUGE-based metric function.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer configured above; the returned value is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.167608,0.63746,0.476441,0.622417,0.624794
2,0.220200,0.170987,0.639996,0.483059,0.626897,0.629414
3,0.220200,0.180074,0.637783,0.475566,0.625624,0.628006
4,0.160800,0.186673,0.641845,0.487079,0.629645,0.632535
5,0.160800,0.185428,0.648075,0.490834,0.63502,0.636394
6,0.128400,0.182793,0.654168,0.495114,0.64129,0.642607
7,0.128400,0.203357,0.667616,0.51389,0.655059,0.656247
8,0.103000,0.212303,0.651549,0.496788,0.636881,0.63742
9,0.103000,0.227262,0.658034,0.506613,0.646005,0.646495
10,0.088100,0.235684,0.659679,0.514818,0.648834,0.649956


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=6250, training_loss=0.08841370223999023, metrics={'train_runtime': 4398.7732, 'train_samples_per_second': 5.683, 'train_steps_per_second': 1.421, 'total_flos': 2.497696310321971e+16, 'train_loss': 0.08841370223999023, 'epoch': 25.0})

# **Inference**

In [None]:
# Checkpoint directory to load for inference.
# NOTE(review): this is a fixed path to the step-6250 checkpoint, while the
# training arguments set load_best_model_at_end=True; confirm that this
# checkpoint, rather than the best model held by the trainer, is the intended one.
last_checkpoint = '/content/results_25/checkpoint-6250'

# Tokenizer and model are both restored from that checkpoint directory.
finetuned_tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Text-to-text generation pipeline around the fine-tuned model and its tokenizer.
question_answerer = pipeline(
    task="text2text-generation",
    tokenizer=finetuned_tokenizer,
    model=finetuned_model,
)

# LangChain wrapper around that pipeline, created with temperature and
# max_length as model keyword arguments.
# NOTE(review): confirm that the wrapper forwards model_kwargs to generation
# when it is handed an already-built pipeline.
llm = HuggingFacePipeline(
    model_kwargs=dict(temperature=0, max_length=512),
    pipeline=question_answerer,
)


In [None]:

# Retriever over the FAISS store: MMR search with k set to 10.
retriever = db.as_retriever(search_type='mmr', search_kwargs={"k": 10})

# "stuff" retrieval-QA chain over the fine-tuned LLM; source documents are returned as well.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
from tqdm import tqdm

# Predicted answer text for every evaluated row, in row order.
answers = []

# Rows at positions 1000..1451 of clean_df are sent through the QA chain.
for row_idx in tqdm(range(1000, 1452)):
  raw_question = clean_df['question'].iloc[row_idx]
  row_answers = clean_df['answers'].iloc[row_idx]
  choices = [row_answers[position]['answer'] for position in range(4)]

  prompt = "".join([
      "For the question: ", raw_question,
      " ,choose the correct answer from the following answers: option1) ", choices[0],
      ", option2) ", choices[1],
      ", option3) ", choices[2],
      ", option4) ", choices[3],
  ])

  response = qa({"query": prompt}, return_only_outputs=True)

  # Lower-case the generated text, cut out any echo of the question, trim whitespace.
  generated = response['result'].lower()
  answers.append(generated.replace(raw_question.lower(), '').strip())

100%|██████████| 452/452 [05:31<00:00,  1.36it/s]


# **Evaluation**

In [None]:
import string

import pandas as pd

# Translation table that deletes every ASCII punctuation character.
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# The evaluated rows are the last 452 rows of clean_df; slice them once
# instead of once per loop iteration.
eval_rows = clean_df.tail(452)
groundtruths = eval_rows['correct_answer'].values
options = eval_rows['answers'].values

cnt = 0
rows = []
for i, answer in enumerate(answers):
  # Normalise both sides: lower-case, trim, drop punctuation. The filler words
  # 'helpful' / 'help' are removed from the prediction only.
  predicted = answer.lower().strip().translate(PUNCTUATION_TABLE).replace('helpful', '').replace('help', '')
  expected = groundtruths[i].lower().strip().translate(PUNCTUATION_TABLE)

  matches = int(predicted == expected)
  cnt += matches

  rows.append({
      'predicted_answer': answer.replace('helpful', '').replace('help', ''),
      'groundtruth': groundtruths[i],
      'matches?': matches,
      'options': options[i],
  })

# Build the results frame in one go. DataFrame.append was removed in pandas 2.0,
# and building it here means the cell needs no pre-existing empty frame and can
# be re-run without duplicating rows.
# NOTE(review): a later cell that assigns a fresh empty frame to result_df will
# discard these rows if it runs afterwards.
result_df = pd.DataFrame(rows, columns=['predicted_answer', 'groundtruth', 'matches?', 'options'])

print(str(cnt)+" out of " + str(len(answers))+" are correct")

164 out of 452 are correct


In [None]:
import pandas as pd

# Columns of the evaluation results frame.
columns = ['predicted_answer', 'groundtruth', 'matches?','options']

# Create the empty frame only when no results frame exists yet. This cell sits
# after the evaluation loop, so an unconditional reset would throw away the
# collected rows when the notebook is run top to bottom.
if 'result_df' not in globals():
    result_df = pd.DataFrame(columns=columns)


In [None]:
# Lower-case the ground-truth answers stored in the results frame.
groundtruth_lowered = result_df['groundtruth'].str.lower()
result_df['groundtruth'] = groundtruth_lowered

In [None]:
def rouge_l_score(reference, generated):
    """Score how closely ``generated`` matches ``reference``.

    NOTE: despite its name, this function computes a smoothed sentence-level
    BLEU score (``sentence_bleu`` with ``SmoothingFunction().method1``) over
    whitespace-separated tokens, not ROUGE-L. The name is kept because other
    code calls it.

    Args:
        reference: Reference text.
        generated: Candidate text to score against the reference.

    Returns:
        The score returned by ``sentence_bleu``.
    """
    # Imported here so the function works on its own, without relying on a
    # notebook-level import of these names.
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    smoothing_function = SmoothingFunction().method1
    reference_tokens = reference.split()
    generated_tokens = generated.split()

    return sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing_function)

# mapping the groundtruth string to option id using string match
def map_to_option_id_gd(row):
  """Map the ground-truth answer text of ``row`` to the id of its option.

  Matching is case-insensitive and ignores surrounding whitespace. An option
  whose text equals the ground truth wins; only if there is none does the
  first option that contains the ground truth as a substring win. Trying the
  exact match first keeps a short answer (e.g. "iron") from being mapped to
  an earlier, longer option that merely contains it (e.g. "iron oxide").

  Args:
    row: Mapping with 'groundtruth' (answer text) and 'options' (iterable of
      dicts with 'answer' and 'id').

  Returns:
    The matching option's id as int, or None when the ground truth is not a
    string, is empty, or matches no option.
  """
  target_answer = row['groundtruth']
  answer_options = row['options']

  # Missing values (None / NaN) cannot be matched against any option.
  if not isinstance(target_answer, str):
    return None

  needle = target_answer.lower().strip()
  # An empty string is a substring of everything; treat it as "no answer".
  if not needle:
    return None

  # Pass 1: exact match.
  for option in answer_options:
    if needle == option['answer'].lower().strip():
      return int(option['id'])

  # Pass 2: substring match.
  for option in answer_options:
    if needle in option['answer'].lower():
      return int(option['id'])

  return None

#matching the predicted answer to option id by calculating rougel score between the predicted answer and 
# the options and the option with the maximum rougel score is returned as the predicted option
def map_to_option_id_pred(row):
  """Map the predicted answer text of ``row`` to the id of the closest option.

  Every option's answer is scored against the prediction with rouge_l_score
  (both lower-cased). The first option with the strictly highest score above
  zero wins.

  Args:
    row: Mapping with 'predicted_answer' (text) and 'options' (iterable of
      dicts with 'answer' and 'id').

  Returns:
    The winning option's id as int, or None when no option scores above zero.
  """
  prediction = row['predicted_answer']
  best_score, best_id = 0, None
  for candidate in row['options']:
    score = rouge_l_score(candidate['answer'].lower(), prediction.lower())
    if not score > best_score:
      continue
    best_score, best_id = score, int(candidate['id'])
  return best_id

In [None]:
# Option id of the ground-truth answer, computed row by row.
result_df['result_id_groundtruth'] = result_df.apply(map_to_option_id_gd, axis='columns')

In [None]:
# Option id closest to the predicted answer, computed row by row.
result_df['result_id_prediction'] = result_df.apply(map_to_option_id_pred, axis='columns')

In [None]:
# True where the predicted option id equals the ground-truth option id.
result_df['matches_ids'] = result_df['result_id_groundtruth'].eq(result_df['result_id_prediction'])

In [None]:
# Accuracy: fraction of rows whose predicted option id equals the ground-truth id.
# The mean of the boolean column is that fraction and, unlike
# value_counts()[True], it does not raise KeyError when no row matches.
result_df['matches_ids'].mean()

In [None]:
# Number of rows with and without a matching option id.
result_df['matches_ids'].value_counts()

False    246
True     206
Name: matches_ids, dtype: int64

In [None]:
# Accuracy of the finetuned model, computed from the results frame so the figure
# stays correct when the notebook is re-run (it was previously hand-copied as 206/452).
result_df['matches_ids'].sum() / len(result_df)