# Install libraries

In [None]:
!pip install "transformers==4.35" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" "tiktoken" "rouge-score"

# Import Libraries

In [None]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import torch
from peft import AutoPeftModelForCausalLM
import transformers

# Print the transformers version that was actually imported, so it can be compared
# with the version pinned in the install cell.
print(transformers.__version__)

# Check whether a GPU is available

In [None]:
# Pick the CUDA device when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('Device ', device, ' is being used')

# Load dataset

In [None]:
# Evaluation set-up: the number of test questions that will be scored, and the
# "test" split of the eli5_category dataset they are taken from.
number_of_questions_to_get_score = 100
test_dataset = load_dataset(
    "eli5_category",
    split="test",
)

In [None]:
# Bare expression: the notebook renders the loaded split's repr as this cell's output.
test_dataset

# Load model

In [None]:
# #download dataset to colab
# ! mkdir ~/.kaggle
# #!cp kaggle.json ~/.kaggle/
# #! chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets download -d haniyehaghdam/lamma-25000dataset
# !unzip -qq '/content/lamma-25000dataset.zip'

In [None]:
# Checkpoint directory that the PEFT model is loaded from (set this to your directory).
model_directory = '/kaggle/input/dataset-50000/laama_15000_25000data/checkpoint-2100'

# Load the checkpoint as a causal LM with float16 weights; device placement is left
# to device_map="auto".
new_model = AutoPeftModelForCausalLM.from_pretrained(
    model_directory,
    device_map="auto",
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)

# Load tokenizer

In [None]:

# Load the tokenizer from the same checkpoint directory as the model.
# NOTE(review): trust_remote_code=True is passed through to the loader; confirm it is
# actually needed for a local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_directory, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right. answer_query tokenizes one prompt at a time, so padding is
# presumably never applied there — TODO confirm if batching is added later.
tokenizer.padding_side = "right"

# Generate answer

In [None]:
def answer_query(query, temperature=0.6, long_answer=False, do_sample=False):
    """Generate an answer for ``query`` with the notebook-level ``new_model``.

    Args:
        query: The question text; it is appended to a fixed instruction prefix.
        temperature: Sampling temperature. Only used when ``do_sample`` is True;
            with greedy decoding it has no effect, so it is not forwarded.
        long_answer: If True, use the "long answer" prompt with min_length=100 and
            max_new_tokens=300; otherwise the "short answer" prompt with
            min_length=30 and max_new_tokens=70.
        do_sample: If True, sample using ``temperature``. The default (False) keeps
            the greedy decoding the notebook effectively used before.

    Returns:
        The first decoded sequence with special tokens removed. pretty_asnwer
        expects this text to start with the prompt and strips it afterwards.
    """
    if long_answer:
        # NOTE(review): 'Generatee' looks like a typo, but it is kept byte-for-byte:
        # the fine-tuned model may have been trained on this exact prompt, and
        # pretty_asnwer relies on both prefixes being 40 characters long.
        prompt = 'Generatee long answer for this question ' + query
        length_kwargs = {"min_length": 100, "max_new_tokens": 300}
    else:
        prompt = 'Generate short answer for this question ' + query
        # max_length=100 used to be passed here together with max_new_tokens=70.
        # Only max_new_tokens is kept, so the two limits no longer conflict.
        length_kwargs = {"min_length": 30, "max_new_tokens": 70}

    # Forward temperature only when sampling is requested; under greedy decoding
    # it is ignored and only triggers a configuration warning.
    sampling_kwargs = {"do_sample": True, "temperature": temperature} if do_sample else {}

    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
    # Follow the model's own device instead of hard-coding .cuda(), so the cell
    # also runs when the model was placed on the CPU.
    input_ids = input_ids.to(next(new_model.parameters()).device)

    outputs = new_model.generate(input_ids=input_ids, **length_kwargs, **sampling_kwargs)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Make it pretty

In [None]:
# Remove the question from the answer and drop the last incomplete sentence
def pretty_asnwer(answer, question=None):
    """Strip the echoed prompt from a generated answer and drop a trailing fragment.

    Args:
        answer: Decoded model output, expected to start with the instruction
            prefix followed by the question.
        question: The question that was asked. When omitted, the notebook-level
            variable ``prompt`` is used, which is how existing callers work.

    Returns:
        The answer text after the echoed prompt. If ``answer`` does not end with
        '.', '!' or '?', everything after the last '. ' is removed; when no
        complete sentence remains the result is an empty string. If the text
        contains 'Answer', everything up to and including that marker (plus two
        more characters) is removed as well.
    """
    if question is None:
        # Backward compatibility: fall back to the global set by the calling loop.
        question = prompt

    # Both instruction prefixes in answer_query are 40 characters long; one more
    # character is skipped after the question (presumably a separator — TODO confirm).
    prompt_offset = len(question) + 41
    # 'Answer' is 6 characters; 8 skips two more (presumably ': ' — TODO confirm).
    marker_offset = 8

    body = answer[prompt_offset:]

    if not answer.endswith(('.', '!', '?')):
        # Drop only the trailing incomplete sentence. The previous str.replace()
        # removed every occurrence of that fragment, and when the text ended with
        # '. ' it replaced the empty string, inserting spaces between all characters.
        head, separator, _ = body.rpartition('. ')
        body = head + '.' if separator else ''

    marker_index = body.find('Answer')
    if marker_index > -1:
        body = body[marker_index + marker_offset:]

    return body



# Generate a dataset

In [None]:
from tqdm import tqdm

In [None]:
generated_answer_list = []

# Counts how many times an answer had to be generated again because the
# cleaned-up text was too short.
disablity_of_model = 0

# Upper bound on re-generation attempts per question. Without it the retry loop
# never ends when decoding is deterministic and the answer stays too short.
MAX_RETRIES = 3

# Slice the question column once and reuse it for the loop and the dataframe.
questions = test_dataset['title'][:number_of_questions_to_get_score]

# The loop variable must stay named `prompt`: pretty_asnwer reads the global
# `prompt` when it is not given the question explicitly.
for prompt in tqdm(questions):
    answer = answer_query(prompt, long_answer=True)  # Generate answer
    new_answer = pretty_asnwer(answer)  # Strip the prompt and trailing fragment

    retries = 0
    while len(new_answer) < 5 and retries < MAX_RETRIES:  # Too short: try again
        disablity_of_model += 1
        retries += 1
        answer = answer_query(prompt, long_answer=True)
        new_answer = pretty_asnwer(answer)

    # Store the answer even if it is still short after the last retry, so the
    # list stays aligned with `questions`.
    generated_answer_list.append(new_answer)

# Create a dataframe with the original questions and the long generated answers
dictionary = {'questions': questions, 'Generated_answers': generated_answer_list}
generated_dataset = pd.DataFrame(dictionary)
print('disablity_of_model: ', disablity_of_model)

In [None]:
# Attach the reference answers: for every scored question, all of its answer
# texts are joined with single spaces into one string.
concatenaed_answer_list = [
    ' '.join(answer_entry['text'])
    for answer_entry in test_dataset['answers'][:number_of_questions_to_get_score]
]

generated_dataset['True_answers'] = concatenaed_answer_list
generated_dataset

# Compute Metrics

In [None]:

from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_rouge(predicted, reference):
    """Score ``predicted`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Stemming is disabled. Returns a dict keyed by metric name whose values map
    each field of the score tuple to a percentage rounded to two decimals.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    raw_scores = rouge_scorer.RougeScorer(metric_names, use_stemmer=False).score(reference, predicted)

    percentages = {}
    for metric, score in raw_scores.items():
        percentages[metric] = {
            field: round(value * 100, 2)
            for field, value in score._asdict().items()
        }
    return percentages



# Per-row ROUGE results plus, for each metric, the per-row values of every sub-metric.
rouge_list = []
rouge1_scores = {'precision': [], 'recall': [], 'fmeasure': []}
rouge2_scores = {'precision': [], 'recall': [], 'fmeasure': []}
rougeL_scores = {'precision': [], 'recall': [], 'fmeasure': []}

# Select the columns by name rather than by position, so the pairing does not
# depend on the order in which columns were added to the dataframe.
for predicted_text, reference_text in zip(generated_dataset['Generated_answers'],
                                          generated_dataset['True_answers']):
    rouge_scores = calculate_rouge(predicted_text, reference_text)
    rouge_list.append(rouge_scores)

    for sub_metric in rouge1_scores:
        rouge1_scores[sub_metric].append(rouge_scores['rouge1'][sub_metric])
        rouge2_scores[sub_metric].append(rouge_scores['rouge2'][sub_metric])
        rougeL_scores[sub_metric].append(rouge_scores['rougeL'][sub_metric])

generated_dataset['Rouge Scores'] = rouge_list


def _mean_score(values):
    """Return the mean of ``values`` rounded to two decimals, or 0.0 when empty."""
    return round(sum(values) / len(values), 2) if values else 0.0


# Mean of every sub-metric over all scored rows
mean_rouge1_scores = {sub_metric: _mean_score(values) for sub_metric, values in rouge1_scores.items()}
mean_rouge2_scores = {sub_metric: _mean_score(values) for sub_metric, values in rouge2_scores.items()}
mean_rougeL_scores = {sub_metric: _mean_score(values) for sub_metric, values in rougeL_scores.items()}

# Report the number of rows that were actually scored (the header used to say 1000).
print(f'For {len(rouge_list)} items of test dataset:')
print('*'*20)
print("Mean ROUGE-1 scores (as percentages):", mean_rouge1_scores)
print("Mean ROUGE-2 scores (as percentages):", mean_rouge2_scores)
print("Mean ROUGE-L scores (as percentages):", mean_rougeL_scores)

# Save files

In [None]:
import json
# Save the dataframe with questions, generated answers, references and scores
generated_dataset.to_csv(f"/kaggle/working/generated_{number_of_questions_to_get_score}_test_dataset.csv",sep=',',index=False)

# Save the mean scores to a text file
filename = f"/kaggle/working/mean_rouge_scores_for_{number_of_questions_to_get_score}_test_dataset.txt"
# The questions come from the test split, so the note says so (it used to say train_dataset).
note = f'These values are for the test dataset of {number_of_questions_to_get_score} questions '

# One section per metric: a heading, the JSON-formatted means, and the text that follows.
sections = [
    ("Mean ROUGE-1 scores:\n", mean_rouge1_scores, "\n\n"),
    ("Mean ROUGE-2 scores:\n", mean_rouge2_scores, "\n\n"),
    ("Mean ROUGE-L scores:\n", mean_rougeL_scores, "\n"),
]

# An explicit encoding keeps the output independent of the platform default.
with open(filename, 'w', encoding='utf-8') as file:
    file.write(note + "\n\n")
    for heading, mean_scores, trailer in sections:
        file.write(heading)
        file.write(json.dumps(mean_scores, indent=4) + trailer)

print(f"Data saved to {filename}")
