# Import Required libraries

In [59]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9d11a0c2bce0732b6268dbf1e88926441a374efc9d026fb569b59200d7b7fd5f
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [48]:
!pip3 install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

from getpass import getpass
os.environ["HF_TOKEN"] = getpass("Enter the Huggingface token:")

In [55]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

import evaluate
from datasets import load_dataset
from datasets import Dataset


from typing import List

import torch 
device = "cuda" if torch.cuda.is_available() else "gpu"


if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU version: {torch.version.cuda}")

GPU Name: Tesla T4
GPU version: 12.6


In [36]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    encoded = tokenizer(
        text["question"],
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    input_ids = encoded.input_ids.to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_output_tokens,
        do_sample=False
    )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Safeguard for empty output
    if len(decoded) == 0:
        text["predictions"] = ""
    else:
        text["predictions"] = decoded[0]

    return text


# 1. Load Trained Model

In [None]:
save_dir = 'TaylorShiftFineTunedModel/final'
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, 
                                                            local_files_only=True)

finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [15]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

# 2. Load the Testing Dataset

In [43]:
dataset = load_dataset(
    path = "lamini/taylor_swift", 
)

# We are dropping these columns and we recreate them as per our use case
dataset = dataset.remove_columns(["input_ids", "attention_mask", "labels"])
test_dataset = Dataset.from_dict(dataset["test"][0:8])

In [45]:
test_dataset = test_dataset.map(lambda x: inference(x, finetuned_slightly_model, tokenizer))

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attentio

In [63]:
def compute_metrics(predictions: List[str], references: List[str]) -> str:

    # Huggingface Evaluate functions
    bleu = evaluate.load("bleu") # bleu model
    rouge = evaluate.load("rouge") # rouge model

    # 3. Compute the metrics
    bleu_results = bleu.compute(predictions=predictions, references=references)
    rouge_results = rouge.compute(predictions=predictions, references=references)

    print("BLEU Results:", bleu_results)
    print("ROUGE Results:", rouge_results)

In [64]:
references = list(test_dataset["answer"])
predictions = list(test_dataset["predictions"])

In [65]:
compute_metrics(predictions, references)

BLEU Results: {'bleu': 0.11945176072640301, 'precisions': [0.26865671641791045, 0.14453781512605043, 0.0919931856899489, 0.05699481865284974], 'brevity_penalty': 1.0, 'length_ratio': 1.7735294117647058, 'translation_length': 603, 'reference_length': 340}
ROUGE Results: {'rouge1': np.float64(0.43187537920099484), 'rouge2': np.float64(0.3138107971603705), 'rougeL': np.float64(0.36149847533485924), 'rougeLsum': np.float64(0.37800846739875893)}


In [58]:
predictions

['Has Taylor Swift written songs for other artists? \nAnswer:Yes, Taylor Swift has contributed songs to various music for her music. Some notable examples include "Love Story", "You Belong with Me", "Blank Space", "Shake It Off", "Bad Blood", "Delicate", "ME!", "Cardigan", "Willow", "Willow", "Willow", "Willow", "Willow", "Willow", "Willow", "Willow", "Willow", "Willow",',
 'What is Taylor Swift\'s latest music video? \nAnswer:Taylor Swift\'s latest music video for "Shake It Off" has been a subject of controversy for a while now. The controversy stems from her decision to work with Apple Music, which provides exclusive content and merchandise to subscribers. The controversy stems from the fact that it is only available to subscribers of her music, and not having access to those that do not have access to the internet to purchase her music. This has led to a lot of speculation about the future of music streaming and',
 "what's her IG \nAnswer:Taylor Swift's official Instagram account wa