# Long Form Question Answering using Retrieval Augmented Generation (RAG)

In this notebook, we are going to implement a long-form question answering and generated content detection model using natural language processing (NLP) tools.

The dataset used in this project is a subset of [ELI5 dataset](https://huggingface.co/datasets/eli5_category).

The approach in this notebook is:

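Retrieve context for a question by embedding it with SBERT and searching a FAISS index of the dataset questions, then generate an answer with a fine-tuned T5 model conditioned on the retrieved answers.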

The first step is to load the data and learn about the characteristics of data.
Then we have to do some pre-processing on the text data in order to make it ready to be given to a model to learn.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# install datasets library

In [None]:
# !pip install datasets
!pip install sentence_transformers
!pip install faiss-cpu
!pip install evaluate
!pip install rouge_score

Load dataset

In [None]:
import os
from datasets import Dataset
import nltk
import evaluate

import torch
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import itertools
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from datasets import load_dataset
from datasets import load_metric

from sentence_transformers import SentenceTransformer, util
import faiss
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Fetch the NLTK "punkt" sentence-tokenizer data (nltk.sent_tokenize is used in compute_metrics).
nltk.download("punkt", quiet=True)
# Load the ROUGE and BLEU metric objects via datasets.load_metric.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in favour of
# `evaluate.load`; the two BLEU implementations expect different input formats — confirm
# against the installed versions before switching.
rouge = load_metric("rouge")
bleu = load_metric("bleu")

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the ELI5-category dataset; later cells read its 'train', 'validation1',
# 'validation2' and 'test' splits.
dataset = load_dataset("eli5_category")

# EDA

## Let's see the structure of the dataset

In [None]:


# Print a banner, then end the cell with the bare `dataset` expression so the
# notebook renders the object's own representation as the cell output.
print('*'*100)
print('In the following part, you can see the structure of dataset:')
print('*'*100)
dataset

## Let's see the dataset information

In [None]:


import pandas as pd # I just created a dataframe of information
# One-row table (index 'description') documenting every field of an ELI5-category
# example. The entries from 'a_id' onwards describe the sub-fields of 'answers'.
# Two descriptions had lost their first character ('he answer text', 'ists of ...');
# they are restored here so the rendered table reads correctly.
information_of_dataset = pd.DataFrame({'q_id': ['a string question identifier for each example, corresponding to its ID in the Pushshift.io Reddit submission dumps'],
                            'subreddit':['always explainlikeimfive, indicating which subreddit the question came from'],
                            'category':['tag of the question, the possible values are listed above.'],
                            'title':['title of the question, with URLs extracted and replaced by URL_n tokens'],
                            'title_urls':['list of the extracted URLs, the nth element of the list was replaced by URL_n'],
                            'selftext':['either an empty string or an elaboration of the question'],
                            'selftext_urls':['similar to title_urls but for self_text'],
                            'answers':['a list of answers, each answer has'],
                            'a_id':['a string answer identifier for each answer, corresponding to its ID in the Pushshift.io Reddit comments dumps.'],
                            'text':['the answer text with the URLs normalized'],
                            'score':['the number of upvotes - the number of downvotes the answer had received when the dumps were created'],
                            'text_urls': ['lists of the extracted URLs for every answer']},index=['description'])
print('In the following table, you can see dataset information. If you need to have a better view, try to use the icon on the right side')
print('*'*100)
# Bare expression: rendered by the notebook as a rich table.
information_of_dataset

## Let's focus on the train part of the dataset and see the first element of raw_train_dataset

In [None]:

# Keep a handle on the training split and display its first example.
raw_train_dataset = dataset['train']
print('Let us see the first element of raw_train_dataset: ')
print('*'*100)
print('\n')

# Bare expression: the first example is rendered as the cell output.
raw_train_dataset[0]

## Let's create a better dictionary of dataset and remove some unnecessary columns

In [None]:
# Define weighted sampling function
def weighted_sample(answers, scores, k=3):
    """Sample up to `k` distinct answers, favouring answers with higher scores.

    Args:
        answers: sequence of answers to sample from.
        scores: one numeric score per answer. Scores below zero are treated as
            zero weight, because a probability cannot be negative.
        k: maximum number of answers to return.

    Returns:
        A list of ``min(k, len(answers))`` distinct answers. Answers with a
        positive score are drawn first, with probability proportional to their
        score; if there are fewer of those than requested, the remaining slots
        are filled uniformly at random from the zero-weight answers.

    Raises:
        ValueError: if `answers` and `scores` differ in length.
    """
    if len(answers) == 0:
        return []

    k = min(k, len(answers))

    weights = np.clip(np.asarray(scores, dtype=float), 0.0, None)
    if len(weights) != len(answers):
        raise ValueError("answers and scores must have the same length")

    n_positive = int(np.count_nonzero(weights))
    if n_positive == 0:
        # No usable weights (all scores <= 0): fall back to uniform sampling
        # instead of dividing by a zero total.
        sampled_indices = np.random.choice(len(answers), size=k, replace=False)
    else:
        # Sampling without replacement cannot draw more items than there are
        # non-zero probabilities, so cap the weighted draw at n_positive.
        n_weighted = min(k, n_positive)
        probabilities = weights / weights.sum()
        sampled_indices = np.random.choice(len(answers), size=n_weighted, replace=False, p=probabilities)
        if n_weighted < k:
            zero_weight_indices = np.flatnonzero(weights == 0)
            extra_indices = np.random.choice(zero_weight_indices, size=k - n_weighted, replace=False)
            sampled_indices = np.concatenate([sampled_indices, extra_indices])

    return [answers[i] for i in sampled_indices]

In [None]:
def organize_dataset(dataset):
  '''
  Reduce one dataset split to the columns this notebook works with.

  input: dataset must be given as a single split, e.g. dataset['train'] or
         dataset['test']. It has to support column access by name
         (dataset['title']) and iteration over example dictionaries.

  output: a dictionary of plain Python lists, one entry per example:
      "questions" - the 'title' column
      "selftext"  - the 'selftext' column
      "category"  - the 'category' column
      "answers"   - for each example, the list of answer texts
      "score"     - for each example, the list of answer scores

  The remaining columns are dropped; the first unique value of three of them
  is printed as the justification.
  '''
  print('Unique items in selftext_urls is just only: <', np.unique(dataset['selftext_urls'])[0],'> So, this column has been removed')
  print('Unique items in title_urls is just only: <', np.unique(dataset['title_urls'])[0],'> So, this column has been removed')
  print('Unique items in subreddit is just only: <', np.unique(dataset['subreddit'])[0],'> So, this column has been removed')

  # Materialise the columns as real lists: later cells concatenate the splits
  # with `+`, which only works if column access did not hand back some other
  # sequence type.
  questions = list(dataset['title'])
  questions_selftext = list(dataset['selftext'])
  categories = list(dataset['category'])

  # 'answers' is a nested record per example; pull out its parallel
  # 'text' and 'score' lists.
  answers = []
  answers_scores = []
  for item in dataset:
    answer_record = item['answers']
    answers.append(answer_record['text'])
    answers_scores.append(answer_record['score'])

  dataset_dict = {"questions":questions, "selftext": questions_selftext, "category":categories, "answers":answers, "score":answers_scores}
  print('*'*100)
  print('Now, we have a clean datset dictionary with ',dataset_dict.keys())
  print('*'*100)
  return dataset_dict

In [None]:
# Reduce the training split to questions / selftext / category / answers / score.
train_dataset_dict = organize_dataset(dataset['train'])

In [None]:
# Same reduction for the first validation split.
valid1_dataset_dict = organize_dataset(dataset['validation1'])

In [None]:
# Same reduction for the second validation split.
valid2_dataset_dict = organize_dataset(dataset['validation2'])

In [None]:
# Same reduction for the test split.
test_dataset_dict = organize_dataset(dataset['test'])

In [None]:
# Prepare data
# Training examples come from the train split only.
train_questions = train_dataset_dict['questions']
train_answers = train_dataset_dict['answers']
train_scores = train_dataset_dict['score']

# Validation examples come from validation1 only.
valid_questions = valid1_dataset_dict['questions']
valid_answers = valid1_dataset_dict['answers']
valid_scores = valid1_dataset_dict['score']

# Retrieval corpus: train + validation1 + validation2, concatenated in that order.
# The three lists are index-aligned (all_answers[i] / all_scores[i] belong to
# all_questions[i]); the test split is not part of the corpus.
all_questions = train_dataset_dict['questions'] + valid1_dataset_dict['questions'] + valid2_dataset_dict['questions']
all_answers = train_dataset_dict['answers'] + valid1_dataset_dict['answers'] + valid2_dataset_dict['answers']
all_scores = train_dataset_dict['score'] + valid1_dataset_dict['score'] + valid2_dataset_dict['score']

Let's find the SBERT embeddings of all questions in the retrieval corpus (train and validation sets).

In [None]:
# Load the Sentence-BERT encoder used to embed questions for retrieval.
# It is read as a module-level name by the embedding / retrieval helpers below.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def find_Q_embeddings(questions):
    """Encode `questions` with the module-level `sbert_model`.

    A one-step tqdm bar labelled "Encoding Questions" is shown around the call;
    it only marks the start and end of the encoding, not per-item progress.

    Returns:
        The value of ``sbert_model.encode(questions, convert_to_tensor=True)``.
    """
    with tqdm(total=1, desc="Encoding Questions") as bar:
        encoded = sbert_model.encode(questions, convert_to_tensor=True)
        # The whole dataset is encoded in one call, so the bar jumps straight to done.
        bar.update(1)
    return encoded

def index_Q_embeddings(question_embeddings):
    """Build a flat (exhaustive) FAISS L2 index over the question embeddings.

    Args:
        question_embeddings: 2-D tensor; its second dimension sets the index
            dimensionality and its rows are added in order, so FAISS ids equal
            row positions.

    Returns:
        The populated ``faiss.IndexFlatL2``.
    """
    vectors = question_embeddings.cpu().numpy()
    dimension = question_embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)
    return flat_index

def save_Q_embeddings(question_embeddings, question_embeddings_file='/kaggle/working/question_embeddings.npy'):
    """Write the question embeddings to `question_embeddings_file` in NumPy .npy format.

    Args:
        question_embeddings: tensor-like object exposing ``.cpu().numpy()``.
        question_embeddings_file: destination path.

    Returns:
        None.
    """
    # The previous local `embeddings_file` was never used and suggested a
    # different destination than the one actually written; it has been removed.
    np.save(question_embeddings_file, question_embeddings.cpu().numpy())
    print("Saved Question Embeddings in file.")

def load_Q_embeddings(train_question_embeddings_file='/kaggle/working/question_embeddings.npy'):
    """Read embeddings from a .npy file and return them as a torch tensor.

    Args:
        train_question_embeddings_file: path of the .npy file to read.

    Returns:
        ``torch.tensor`` built from the loaded array.
    """
    stored_array = np.load(train_question_embeddings_file)
    embeddings_tensor = torch.tensor(stored_array)
    print("Loaded Question Embeddings.")
    return embeddings_tensor


def find_relevant_context_SBERT(question, question_embeddings, answers, scores, index, use='train'):
    '''
    Retrieve answers to use as generation context for `question`.

    The question is embedded with the module-level `sbert_model`, L2-normalised
    and searched against `index` for its 5 nearest neighbours. Neighbours whose
    cosine similarity to the query is below 0.5 are discarded.

    Args:
        question: the question text.
        question_embeddings: tensor of corpus question embeddings; row i must
            correspond to id i in `index`.
        answers: per-question lists of answer texts, aligned with the index ids.
        scores: per-question lists of answer scores, aligned with `answers`.
        index: FAISS index built over `question_embeddings`.
        use: 'test' disables the fallback lookup of the question's own answers.

    Returns:
        A pair ``(primary_answers, secondary_answers)``:
        - primary_answers: up to 3 score-weighted answers sampled from the
          closest retained neighbour. (Presumably, for a question that is itself
          in the index, that neighbour is the question itself — verify.)
        - secondary_answers: up to 3 score-weighted answers sampled from the
          second retained neighbour, or ``[]`` when there is none.
        When no neighbour passes the threshold:
        - use == 'test': ``([""], [""])``;
        - otherwise the question is looked up in the module-level
          `all_questions` and answers sampled from its own entry are returned
          with ``[""]``; if the question is not in `all_questions` the result
          is ``([""], [""])`` instead of an exception.
    '''
    new_question_embedding = sbert_model.encode(question, convert_to_tensor=True, show_progress_bar=False).unsqueeze(0)
    new_question_embedding = torch.nn.functional.normalize(new_question_embedding, p=2, dim=1)
    query_vector = new_question_embedding.cpu().numpy()

    k = 5  # Retrieve a larger number of candidates than will finally be used
    _, I = index.search(query_vector, k)

    threshold = 0.5
    filtered_indices = []
    for i in I[0]:
        if i < 0:
            # FAISS pads the result with -1 when the index holds fewer than k
            # vectors; without this guard -1 would select the last embedding.
            continue
        candidate_embedding = question_embeddings[i].cpu().numpy().reshape(1, -1)
        similarity = cosine_similarity(query_vector, candidate_embedding)[0][0]
        if similarity >= threshold:
            filtered_indices.append(i)

    if len(filtered_indices) == 0:
        if use == 'test':
            return [""], [""]
        try:
            # NOTE: relies on `answers`/`scores` being aligned with the
            # module-level `all_questions`.
            n_index = all_questions.index(question)
        except ValueError:
            # Unknown question and nothing similar enough: no context at all.
            return [""], [""]
        top_ans = weighted_sample(answers[n_index], scores[n_index], k=3)
        return top_ans, [""]

    # Only the second retained neighbour contributes secondary context, so
    # sample from it alone rather than from every remaining neighbour.
    if len(filtered_indices) > 1:
        second = filtered_indices[1]
        relevant_contexts = weighted_sample(answers[second], scores[second], k=3)
    else:
        relevant_contexts = []

    first = filtered_indices[0]
    top_2_ans = weighted_sample(answers[first], scores[first], k=3)
    return top_2_ans, relevant_contexts

In [None]:
# Embed every question of the retrieval corpus and persist the result so it can
# be reloaded with load_Q_embeddings instead of being recomputed.
question_embeddings = find_Q_embeddings(all_questions)
save_Q_embeddings(question_embeddings, question_embeddings_file='/kaggle/working/question_embeddings.npy')

In [None]:
# Optional: reload previously saved embeddings instead of using the in-memory ones.
# loaded_question_embeddings = load_Q_embeddings("/kaggle/working/question_embeddings.npy")
# Build the FAISS index; ids in the index are positions in all_questions.
faiss_index = index_Q_embeddings(question_embeddings)

In [None]:
# Example usage
# `question` is reassigned several times below; only the LAST assignment
# ("What is a NIMBY?") is the one actually used for the retrieval demo.
question = 'Why shouldnt we laugh at people?'
question = "how is the earth round?"
# question = train_dataset_dict['questions'][0]
# question = test_dataset_dict['questions'][1]
# answers = test_dataset_dict['answers'][1]
# question = train_dataset_dict['questions'][4]
# answers = train_dataset_dict['answers'][4]
question = valid_questions[0]
question = "What is a NIMBY?"

# context = retrieve_contexts(model_bert, tokenizer_bert, train_dataset_dict, question, train_question_embeddings, index=None, top_n=8)
print("\n question:")
print(question)
print("\n answers:")
# print(answers)

# Called with the default use='train': if no neighbour passes the similarity
# threshold, the helper looks the question up in all_questions.
cntx = find_relevant_context_SBERT(question, question_embeddings, all_answers, 
                                   all_scores, faiss_index)
print("\n context")
print(cntx)

In [None]:
def compute_metrics(eval_preds):
    """Compute ROUGE and BLEU for a (predictions, labels) pair of token-id arrays.

    Uses the module-level `tokenizer`, `rouge` and `bleu` objects. Label
    positions equal to -100 are replaced by the pad token id before decoding.

    Returns:
        One dictionary holding the ROUGE results merged with the BLEU results
        (BLEU keys win on a name clash).
    """
    def _sentence_per_line(texts):
        # rougeLsum expects sentences separated by newlines.
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    pred_texts = _sentence_per_line(tokenizer.batch_decode(preds, skip_special_tokens=True))
    label_texts = _sentence_per_line(tokenizer.batch_decode(labels, skip_special_tokens=True))

    rouge_result = rouge.compute(predictions=pred_texts, references=label_texts, use_stemmer=True)
    bleu_result = bleu.compute(
        predictions=[text.split() for text in pred_texts],
        references=[[text.split()] for text in label_texts],
    )

    merged = dict(rouge_result)
    merged.update(bleu_result)
    return merged

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import random

# Load the T5 tokenizer and model
# `tokenizer` and `model` are module-level names read by compute_metrics,
# create_dataloader, generate_answer and the training loop.
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move the model to the selected device (GPU when available).
model.to(device)

In [None]:
# Custom Dataset Class
class DynamicQADataset(Dataset):
    """Map-style dataset that builds "question + retrieved context -> answer" pairs lazily.

    For every item the context is retrieved with `find_relevant_context_SBERT`,
    so the sampled answers (and therefore the training pair) can differ between
    accesses. `epoch` selects which of the sampled primary answers is used as
    the target, so different epochs can train on different reference answers.

    Args:
        questions: questions to serve; defines the dataset length.
        answers, scores: retrieval corpus, aligned with the ids of `faiss_index`
            (not necessarily with `questions`).
        question_embeddings: embeddings the FAISS index was built from.
        faiss_index: FAISS index over `question_embeddings`.
        tokenizer: tokenizer used for both inputs and targets.
        epoch: zero-based epoch number.
        max_length: padding/truncation length for inputs and targets.
    """

    def __init__(self, questions, answers, scores, question_embeddings, faiss_index, 
                 tokenizer, epoch, max_length=1024):
        self.questions = questions
        self.answers = answers
        self.scores = scores
        self.tokenizer = tokenizer
        self.epoch = epoch
        self.max_length = max_length
        self.question_embeddings = question_embeddings
        self.faiss_index = faiss_index

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the tokenized training example for question `idx`.

        Returns:
            dict with 1-D tensors 'input_ids', 'attention_mask' and 'labels';
            padding positions in 'labels' are set to -100.
        """
        question = self.questions[idx]

        top_2_ans, relevant_contexts = find_relevant_context_SBERT(question,
                                                                   self.question_embeddings,
                                                                   self.answers,
                                                                   self.scores, self.faiss_index)

        # Work on a copy so the list returned by the retrieval helper is not mutated.
        candidate_answers = list(top_2_ans)
        if candidate_answers:
            # Clamp once and use the SAME index for picking and for removing the
            # target; deleting at the raw epoch number fails as soon as
            # epoch >= number of sampled answers.
            target_idx = min(self.epoch, len(candidate_answers) - 1)
            best_answer = candidate_answers[target_idx]
            if len(candidate_answers) > 1:
                # Keep the target out of the context when other answers exist.
                del candidate_answers[target_idx]
        else:
            # Retrieval produced no answer at all (e.g. a question without answers).
            best_answer = ""
        cntx = list(relevant_contexts) + candidate_answers

        input_text = f"question: {question} context: {' '.join(cntx)}"
        target_text = best_answer

        # Tokenize the inputs and targets
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        targets = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        input_ids = inputs.input_ids.squeeze(0)
        attention_mask = inputs.attention_mask.squeeze(0)
        labels = targets.input_ids.squeeze(0)

        # Replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Function to create a DataLoader for a specific epoch
def create_dataloader(questions, answers, scores, question_embeddings, faiss_index, epoch, batch_size=2):
    """Build a shuffling DataLoader over a DynamicQADataset for the given epoch.

    The module-level `tokenizer` is passed to the dataset.
    """
    return DataLoader(
        DynamicQADataset(questions, answers, scores, question_embeddings, faiss_index, tokenizer, epoch),
        batch_size=batch_size,
        shuffle=True,
    )


In [None]:
def decode_predictions(preds, labels, tokenizer):
    """Decode prediction and label token ids to text, dropping special tokens.

    Returns:
        A ``(decoded_preds, decoded_labels)`` tuple, each the result of
        ``tokenizer.batch_decode(..., skip_special_tokens=True)``.
    """
    return tuple(
        tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        for token_ids in (preds, labels)
    )


In [None]:
# Training parameters
epochs = 2
batch_size = 1
learning_rate = 5e-5
adam_epsilon = 1e-8
warmup_steps = 0
checkpoint_interval = 20000  # Save a checkpoint every 20000 batches (only referenced by the commented-out per-batch checkpointing)

# Optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
# One scheduler step is taken per training batch.
# NOTE(review): floor division undercounts the batches per epoch when
# len(train_questions) is not a multiple of batch_size — harmless at batch_size = 1.
total_steps = len(train_questions) // batch_size * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

# Directory where per-epoch checkpoints are written.
checkpoint_dir = "/kaggle/working/"

In [None]:
# def load_checkpoint(checkpoint_path, model, optimizer, scheduler):
#     checkpoint = torch.load(checkpoint_path)
#     model.load_state_dict(checkpoint['model_state_dict'])
#     optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#     scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
#     start_epoch = checkpoint['epoch'] + 1  # start from the next epoch
#     loss = checkpoint['loss']
#     return model, optimizer, scheduler, start_epoch, loss

# # Initialize variables
# start_epoch = 0  # default start epoch
# checkpoint_path = os.path.join('/kaggle/input/checkpoint-epoch/', 'checkpoint_epoch_1.pt')
# if os.path.exists(checkpoint_path):
#     model, optimizer, scheduler, start_epoch, loss = load_checkpoint(checkpoint_path, model, optimizer, scheduler)
#     print(f"Loaded checkpoint from epoch {start_epoch}, loss: {loss}")


In [None]:

# Training loop
# Fine-tunes the T5 model for `epochs` epochs. Fresh loaders are created every
# epoch because the dataset uses the epoch number to choose which sampled
# answer becomes the target.
model.train()
for epoch in range(epochs):
    # Both loaders retrieve context from the full corpus (all_answers / all_scores).
    # NOTE(review): validation1 questions are part of that corpus, so retrieval for a
    # validation question can return its own answers — confirm this is intended.
    val_loader = create_dataloader(valid_questions, all_answers, all_scores, question_embeddings, faiss_index, epoch, batch_size)
    train_loader = create_dataloader(train_questions, all_answers, all_scores, question_embeddings, faiss_index, epoch, batch_size)
    total_loss = 0

    for batch_idx, batch in enumerate(tqdm(train_loader, desc="Batch progress")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing `labels` makes the model return the training loss.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Learning-rate schedule advances once per batch.
        scheduler.step()
        
#         # Checkpointing after every checkpoint_interval batches
#         if (batch_idx + 1) % checkpoint_interval == 0:
#             checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}_batch_{batch_idx+1}.pt')
#             torch.save({
#                 'epoch': epoch + 1,
#                 'batch_idx': batch_idx + 1,
#                 'model_state_dict': model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'scheduler_state_dict': scheduler.state_dict(),
#                 'loss': total_loss
#             }, checkpoint_path)
#             print(f'Saved checkpoint_epoch_{epoch+1}_batch_{batch_idx+1}')

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

    # Checkpointing after each epoch
    # Note: 'loss' stores the summed training loss of the epoch, not the average.
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}.pt')
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'loss': total_loss
    }, checkpoint_path)

    # Validation
    model.eval()
    val_loss = 0
    eval_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation progress"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Collect predictions and labels for metric computation
            # These are argmax tokens of the forward pass that was given `labels`,
            # not the output of model.generate.
            # NOTE(review): eval_preds is filled here but not read again in this cell.
            preds = outputs.logits.argmax(dim=-1)
            eval_preds.extend(zip(preds.cpu().numpy(), labels.cpu().numpy()))

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch + 1}, Validation Loss: {avg_val_loss}")

    # Back to training mode for the next epoch.
    model.train()    

In [None]:
# save model and tokenizer
# Both are written to the same directory so they can be reloaded together
# with from_pretrained.
model.save_pretrained("/kaggle/working/RAG_SBERT_faiss_T5")
tokenizer.save_pretrained("/kaggle/working/RAG_SBERT_faiss_T5")

In [None]:
# # load model and tokenizer
# model_dir = "/kaggle/working/t5-with-context-finetuned"

# tokenizer = T5Tokenizer.from_pretrained(model_dir)
# model = T5ForConditionalGeneration.from_pretrained(model_dir)

# model.to(device)

Now let's try a new question. We first look for context based on similar questions in our dataset, and then generate an answer from that context using the trained model.

In [None]:
def generate_answer(question, context):
    """Generate an answer for `question` with beam search, conditioned on `context`.

    Args:
        question: question text.
        context: pair of lists of strings (as returned by
            find_relevant_context_SBERT); both lists are concatenated and
            joined with spaces.

    Returns:
        The decoded first generated sequence, without special tokens.

    Uses the module-level `tokenizer`, `model` and `device`.
    """
    context_text = ' '.join(context[0] + context[1])
    input_text = f"question: {question} context: {context_text}"
    encoded = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = encoded.to(device)
    generated = model.generate(inputs, min_length=100, max_length=1024, num_beams=5, early_stopping=True)
    outputs = generated.to(device)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Example usage
# Alternative questions are kept commented out; the active choice is the
# question at position 8 of the test split.
# question = 'how to talk Korean?'
# question = train_questions[150]
# question = valid_questions[20]
# question = "What is NIMBY?"
# question = "What is pedophilia?"
question = test_dataset_dict['questions'][8]
print(question)

In [None]:
# Reference answers of the same test example, for comparison with the generated one.
print(test_dataset_dict['answers'][8])

In [None]:
# Retrieve context in 'test' mode (no lookup of the question's own answers),
# then generate and print the answer.
cntx = find_relevant_context_SBERT(question, question_embeddings, all_answers, all_scores, faiss_index, use='test')
answer = generate_answer(question, cntx)
print(answer)

In [None]:
def answer_query(model, tokenizer, query, temperature=0.6, long_answer=False):
    """Generate a sampled answer for `query` using retrieved context.

    Context is retrieved in 'test' mode from the module-level retrieval corpus
    (`question_embeddings`, `all_answers`, `all_scores`, `faiss_index`).

    Args:
        model: generation model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching `model`.
        query: the question to answer.
        temperature: sampling temperature.
        long_answer: True -> min_length 100 / up to 300 new tokens;
            False -> min_length 30 / up to 70 new tokens.

    Returns:
        The decoded generated text, without special tokens.
    """
    # Use the `query` argument. Reading the notebook-level `question` variable
    # here made every call answer the same question regardless of its input.
    context = find_relevant_context_SBERT(query, question_embeddings, all_answers, all_scores, faiss_index, use='test')
    prompt = "question: {} context: {}".format(query, ' '.join(context[0] + context[1]))

    # Follow the model's device instead of assuming CUDA is present.
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model.device)

    # Only `max_new_tokens` bounds the length: passing `max_length` as well is
    # redundant and is warned about / rejected by transformers.
    if long_answer:
        length_kwargs = {"min_length": 100, "max_new_tokens": 300}
    else:
        length_kwargs = {"min_length": 30, "max_new_tokens": 70}

    outputs = model.generate(input_ids=input_ids,
                             do_sample=True,
                             temperature=temperature,
                             **length_kwargs)
    answer = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

    return answer

# Remove question from answer and remove last uncomplete sentence
def pretty_asnwer(answer, prompt=None):
    """Clean a generated answer: drop an echoed prompt and a trailing unfinished sentence.

    Args:
        answer: raw generated text.
        prompt: optional prompt text. It is removed only when `answer` actually
            starts with it; generated text that does not echo the prompt is
            left intact instead of being cut by the prompt's length.

    Returns:
        The cleaned answer. If the text does not end with '.', '!' or '?', it
        is cut after the last sentence terminator that is followed by a space;
        when there is no such terminator the result is the empty string.
    """
    new_answer = answer

    if prompt and new_answer.startswith(prompt):
        new_answer = new_answer[len(prompt):]

    # Drop everything up to an 'Answer' marker; +8 skips the 6-letter marker
    # plus the two characters after it (e.g. ': ').
    marker = new_answer.find('Answer')
    if marker > -1:
        new_answer = new_answer[marker + 8:]

    new_answer = new_answer.strip()

    if not new_answer.endswith(('.', '!', '?')):
        # Cut at the last finished sentence. Slicing (rather than replacing the
        # trailing fragment) avoids deleting earlier occurrences of the same text.
        cut = max(new_answer.rfind(mark) for mark in ('. ', '! ', '? '))
        new_answer = new_answer[:cut + 1] if cut > -1 else ""

    return new_answer

In [None]:
# Number of test questions to generate answers for and score.
number_of_questions_to_get_score = 1000
generated_answer_list = []

# Counts how many times an answer had to be regenerated because the cleaned
# result was shorter than 5 characters.
disablity_of_model = 0

# NOTE(review): the loop variable is a module-level name called `prompt`; check the
# helper functions called below before renaming it.
for prompt in tqdm(test_dataset_dict['questions'][:number_of_questions_to_get_score]):
  #print('index: ', index)

  answer = answer_query(model, tokenizer, prompt,long_answer=True)  # Generate Answer
  new_answer = pretty_asnwer(answer) # Make it beatiful

  # `j` is not used below; `a` counts the retries for this question.
  j=0
  a = 0
  while len(new_answer)<5: # Generate again if, last time model was disable
      a += 1
      #print(f'index_{index}_J{j}:')
      disablity_of_model+=1
      answer = answer_query(model, tokenizer, prompt,long_answer=True)
      new_answer = pretty_asnwer(answer)
      # Give up after 11 retries and keep whatever the last attempt produced.
      if (a > 10):
        break
      #print('new_answer: ',new_answer)

  generated_answer_list.append(new_answer) # Store answer


# Create a dataframe with orginal questions and long genereated answers
dictionary = {'questions':test_dataset_dict['questions'][:number_of_questions_to_get_score],'Generated_answers':generated_answer_list}
generated_dataset = pd.DataFrame(dictionary)
print('disablity_of_model: ',disablity_of_model)

# Add True answers to dataset
# All reference answers of a question are joined into one string.
concatenaed_answer_list = []
for answer_list in test_dataset_dict['answers'][:number_of_questions_to_get_score]:
  concatenaed_answer = ' '.join(answer_list)
  concatenaed_answer_list.append(concatenaed_answer)

generated_dataset['True_answers'] = concatenaed_answer_list
# Bare expression: the resulting table is rendered as the cell output.
generated_dataset


## Compute Metrics

In [None]:

from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_rouge(predicted, reference):
    """Score `predicted` against `reference` with ROUGE-1, ROUGE-2 and ROUGE-L (no stemming).

    Returns:
        ``{metric: {field: value}}`` where the fields are those of the scorer's
        per-metric result and every value is multiplied by 100 and rounded to
        two decimals.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
    raw_scores = scorer.score(reference, predicted)

    scores = {}
    for metric in raw_scores:
        fields = raw_scores[metric]._asdict()
        scores[metric] = {sub_metric: round(value * 100, 2) for sub_metric, value in fields.items()}

    return scores



In [None]:


# Per-example ROUGE dictionaries, plus per-metric lists used for the means.
rouge_list = []
rouge1_scores = {'precision': [], 'recall': [], 'fmeasure': []}
rouge2_scores = {'precision': [], 'recall': [], 'fmeasure': []}
rougeL_scores = {'precision': [], 'recall': [], 'fmeasure': []}

for index in range(generated_dataset.shape[0]):
  # Column 1 holds the generated answer, column 2 the joined reference answers.
  predicted_text = generated_dataset.iloc[index,1]
  reference_text = generated_dataset.iloc[index,2]

  rouge_scores = calculate_rouge(predicted_text, reference_text)
  rouge_list.append(rouge_scores)

  for sub_metric in rouge1_scores:
    rouge1_scores[sub_metric].append(rouge_scores['rouge1'][sub_metric])
    rouge2_scores[sub_metric].append(rouge_scores['rouge2'][sub_metric])
    rougeL_scores[sub_metric].append(rouge_scores['rougeL'][sub_metric])


generated_dataset['Rouge Scores'] = rouge_list


#mean
mean_rouge1_scores = {sub_metric: round(sum(rouge1_scores[sub_metric]) / len(rouge1_scores[sub_metric]), 2) for sub_metric in rouge1_scores}
mean_rouge2_scores = {sub_metric: round(sum(rouge2_scores[sub_metric]) / len(rouge2_scores[sub_metric]), 2) for sub_metric in rouge2_scores}
mean_rougeL_scores = {sub_metric: round(sum(rougeL_scores[sub_metric]) / len(rougeL_scores[sub_metric]), 2) for sub_metric in rougeL_scores}
# Report the number of examples actually scored instead of a fixed "1000",
# so the banner stays correct when the sample size is changed.
print(f'For {len(rouge_list)} items of test dataset:')
print('*'*20)
print("Mean ROUGE-1 scores (as percentages):", mean_rouge1_scores)
print("Mean ROUGE-2 scores (as percentages):", mean_rouge2_scores)
print("Mean ROUGE-L scores (as percentages):", mean_rougeL_scores)


In [None]:
import json
# Save dataframe
# generated_dataset.to_csv(f"/content/drive/MyDrive/generated_{number_of_questions_to_get_score}_test_dataset.csv",sep=',',index=False)
generated_dataset.to_csv(f"/kaggle/working/generated_{number_of_questions_to_get_score}_test_dataset.csv",sep=',',index=False)
# Save mean scores
# Filename
filename = f"/kaggle/working/MyDrive/mean_rouge_scores_for_{number_of_questions_to_get_score}_test_dataset.txt"
# The "MyDrive" sub-directory is not created anywhere else in the notebook;
# create it so that open() below does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# The scores were computed on the test split, so the note says so.
note = f'These values are for test_dataset of {number_of_questions_to_get_score} questions '
# Writing to the text file
with open(filename, 'w') as file:
    file.write(note + "\n\n")

    file.write("Mean ROUGE-1 scores:\n")
    file.write(json.dumps(mean_rouge1_scores, indent=4) + "\n\n")

    file.write("Mean ROUGE-2 scores:\n")
    file.write(json.dumps(mean_rouge2_scores, indent=4) + "\n\n")

    file.write("Mean ROUGE-L scores:\n")
    file.write(json.dumps(mean_rougeL_scores, indent=4) + "\n")

print(f"Data saved to {filename}")

## API to Ask

In [None]:
# import gradio as gr
# from transformers import pipeline
# import numpy as np
# from gtts import gTTS

# # Initialize the transcription pipeline
# transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# # Transcription function
# def transcribe(audio):
#     sr, y = audio
#     y = y.astype(np.float32)
#     y /= np.max(np.abs(y))
#     return transcriber({"sampling_rate": sr, "raw": y})["text"]

# # Text-to-speech function
# def text_to_speech(text):
#     # Convert text to speech
#     tts = gTTS(text)
#     # Save the converted audio to a file
#     tts.save("output.mp3")
#     # Return the file path
#     return "output.mp3"

# # Main function to handle questions and generate responses
# def aks_question(Question, Long_Format, Short_Format, Audio_Play):
#     query = Question

#     if Long_Format and Short_Format:
#         prompt, long_answer = answer_query(query, temperature=0.6, long_answer=True)
#         long_answer = pretty_answer(prompt, long_answer)

#         while len(long_answer) < 50:
#             prompt, long_answer = answer_query(query, temperature=0.6, long_answer=True)
#             long_answer = pretty_answer(long_answer)

#         prompt, short_answer = answer_query(query, temperature=0.6, long_answer=False)
#         short_answer = pretty_answer(prompt, short_answer)

#         while len(short_answer) < 30:
#             prompt, short_answer = answer_query(query, temperature=0.6, long_answer=False)
#             short_answer = pretty_answer(prompt, short_answer)

#         if Audio_Play:
#             text = 'Long answer: ' + long_answer + ' Short answer: ' + short_answer
#             audio_output = text_to_speech(text)
#             return long_answer, short_answer, audio_output
#         else:
#             text = 'You did not choose audio play'
#             audio_output = text_to_speech(text)
#             return long_answer, short_answer, audio_output

#     elif Long_Format and not Short_Format:
#         prompt, long_answer = answer_query(query, temperature=0.6, long_answer=True)
#         long_answer = pretty_answer(prompt, long_answer)

#         while len(long_answer) < 50:
#             prompt, long_answer = answer_query(query, temperature=0.6, long_answer=True)
#             long_answer = pretty_answer(long_answer)

#         if Audio_Play:
#             text = 'Long answer: ' + long_answer
#             audio_output = text_to_speech(text)
#             return long_answer, '', audio_output
#         else:
#             text = 'You did not choose audio play'
#             audio_output = text_to_speech(text)
#             return long_answer, '', audio_output

#     elif not Long_Format and Short_Format:
#         prompt, short_answer = answer_query(query, temperature=0.6, long_answer=False)
#         short_answer = pretty_answer(prompt, short_answer)

#         while len(short_answer) < 30:
#             prompt, short_answer = answer_query(query, temperature=0.6, long_answer=False)
#             short_answer = pretty_answer(prompt, short_answer)

#         if Audio_Play:
#             text = 'Short answer: ' + short_answer
#             audio_output = text_to_speech(text)
#             return '', short_answer, audio_output
#         else:
#             text = 'You did not choose audio play'
#             audio_output = text_to_speech(text)
#             return '', short_answer, audio_output

#     else:
#         text = 'Please check at least one checkbox'
#         audio_output = text_to_speech(text)
#         return 'Please check at least one checkbox', 'Please check at least one checkbox', audio_output

# # Gradio interface
# demo = gr.Interface(
#     fn=aks_question,
#     inputs=[
#         gr.inputs.Audio(source="microphone", type="numpy"),
#         gr.inputs.Checkbox(label="Long Format"),
#         gr.inputs.Checkbox(label="Short Format"),
#         gr.inputs.Checkbox(label="Audio Play")
#     ],
#     outputs=[
#         gr.outputs.Textbox(label="Long Answer"),
#         gr.outputs.Textbox(label="Short Answer"),
#         gr.outputs.Audio(label='Long_Format_Audio_Play')
#     ],
#     live=True
# )

# demo.launch()