In [1]:
!pip3 install -q -U bitsandbytes
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0
!pip3 install -q -U rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.0/102.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

In [1]:
import os
from google.colab import userdata
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

import gc
import tqdm
import copy
import time
import random
import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd

In [2]:
train_set = load_dataset("rPucs/TripletDollyQA-3k-Gemma-Nlg", "train")["train"]
test_set = load_dataset("rPucs/TripletDollyQA-3k-Gemma-Nlg", "test")["train"]
test_set = test_set.select(range(0,100))

### avg response len for improved generation w/ selection
train_response_lens = [len(datapoint["response"]) for datapoint in train_set]
num_sample = len(train_response_lens)
avg_train_response_len = sum([len/num_sample for len in train_response_lens])
print(avg_train_response_len)

288.5537500000003


In [6]:
def formatting_prompts_func(dataset):
    output_texts = []
    for datapoint in dataset:
        text = f"### User input: {datapoint['instruction']} \n### Relations: {' '.join(['<entry><head>{}<rel>{}<tail>{}'.format(rel['head'], rel['type'], rel['tail']) for rel in datapoint['context']]) } \n### Response:{datapoint['response']}<eos>"
        output_texts.append(text)
    return output_texts
def get_completion(prompt: str, model, tokenizer) -> str:
  device = "cuda:0"
  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  model_inputs = encodeds.to(device)
  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  # decoded = tokenizer.batch_decode(generated_ids)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return (decoded)

def get_completion_batch(prompt: list[str], model, tokenizer, num_samples=1, mid_length=avg_train_response_len) -> list[str]:
  device = "cuda:0"
  batch_size = len(prompt)
  encodeds = tokenizer(prompt,\
                      return_tensors="pt", add_special_tokens=True,\
                      padding=True)
  model_inputs = encodeds.to(device)
  if num_samples > 1:
    generated_ids = model.generate(**model_inputs, max_new_tokens=600, do_sample=True, num_return_sequences=num_samples) #, pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    # restore batches
    batch_decoded = [decoded[(idx*num_samples):(idx*num_samples)+num_samples] for (idx, decoded) in list(zip(range(batch_size), [decoded]*batch_size))] # batch_
    # improve by selecting the one that is the closest to the avg len.  - "readable code xd"
    batch_best_fit_idx = [sorted(list(zip(range(len(decoded)), map(lambda x : abs(len(x) - mid_length), decoded))), key=lambda x : x[1])[0][0] for decoded in batch_decoded] # list of idx of best fit - list(int)
    batch_decoded = [decodeds[idx] for (idx, decodeds) in list(zip(batch_best_fit_idx, batch_decoded))]
    return batch_decoded
  else:
    generated_ids = model.generate(**model_inputs, max_new_tokens=600, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded

def rouge_to_pd(r, IDName: str):
  """
  Converts ROUGE scores into a pandas DataFrame.

  Parameters:
  - r: A dictionary containing ROUGE scores. Each key is a ROUGE method and each value is a tuple of tuples
        containing precision, recall, and f-measure for low, mid, and high estimations.
  - IDName: A string that identifies the set of ROUGE scores.

  Returns:
  - A pandas DataFrame with columns for the IDName, method, and each metric (low, mid, high precision, recall, f-measure).
  """
  data = []
  # Fixed: Split the string into a list of strings
  for method in ["rouge1", "rouge2", "rougeL", "rougeLsum"]:
    ((low_precision, low_recall, low_fmeasure),
      (mid_precision, mid_recall, mid_fmeasure),
      (high_precision, high_recall, high_fmeasure)) = r[method]
    data.append({
        "IDName": IDName,
        "method": method,
        "low_precision": low_precision,
        "low_recall": low_recall,
        "low_fmeasure": low_fmeasure,
        "mid_precision": mid_precision,
        "mid_recall": mid_recall,
        "mid_fmeasure": mid_fmeasure,
        "high_precision": high_precision,
        "high_recall": high_recall,
        "high_fmeasure": high_fmeasure
    })
  return pd.DataFrame(data)

In [7]:
prompts = formatting_prompts_func(test_set)
test_set = test_set.add_column("prompt", prompts)

ValueError: The table can't have duplicated columns but columns ['prompt'] are duplicated.

In [8]:
model_ids = [
  "rPucs/gemma-2b-itTripletDolly-WebNLG",
  "rPucs/gemma-2b-itTripletDolly-WebNLG-fullcollator",
  "rPucs/gemma-7b-itTripletDolly-WebNLG-fullcollator"
]

In [9]:
def evaluate(model_id):
  print("start evaluating {}...".format(model_id))

  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quent_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )
  model = AutoModelForCausalLM.from_pretrained(model_id,
                                                quantization_config=bnb_config,
                                                device_map="auto")
  tokenizer = AutoTokenizer.from_pretrained(model_id,
                                            add_eos_token=True,
                                            padding_side="left"
                                            )
  batch_size = 2
  model_responses = []
  true_responses = []

  for i in tqdm.tqdm(range(0, test_set.num_rows, batch_size)):
    to = min(test_set.num_rows, i+batch_size)
    if i == to:
      continue
    batch_prompt = test_set[i:i+batch_size]["prompt"]
    expected_outputs = [p.split("\n### Response:")[1] for p in batch_prompt]


    batch_input = [p.split("\n### Response:")[0] + "\n### Response:"  for p in batch_prompt]
    completions = get_completion_batch(batch_input , model, tokenizer, num_samples=5)
    completions = [c.split("\n### Response:")[1] for c in completions]

    model_responses += completions
    true_responses += expected_outputs

  del model, tokenizer
  torch.cuda.empty_cache()

  #test_set = test_set.add_column("model_reponse",  model_responses)
  #test_set = test_set.add_column("true_response",  true_responses)

  print("Print out examples from {}".format(model_id))
  for idx in random.sample(range(len(test_set)), k=2):
    print(idx)
    print("true_response", true_responses[idx])
    print("model_reponse", model_responses[idx])

  evaluation_name_id= "rouge_score_" + model_id.split("/")[1]

  rouge = load_metric('rouge')
  references = true_responses
  predictions = model_responses
  results = rouge.compute(predictions=predictions, references=references)

  df = rouge_to_pd(results, evaluation_name_id)
  df.to_csv(evaluation_name_id + ".csv", index=False)


In [None]:
for model_id in model_ids:
  try:
    evaluate(model_id)
  except Exception as e:
    print(f"An error occurred when running:{model_id}: {e}")

start evaluating rPucs/gemma-2b-itTripletDolly-WebNLG...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

 76%|███████▌  | 38/50 [10:24<03:09, 15.79s/it]

In [None]:
from google.colab import files
files.download('/content/rouge_score_gemma-7b-itTripletDolly-WebNLG-fullcollator.csv')