In [1]:
import evaluate
import torch

from collections import defaultdict
from datasets import load_dataset
from peft import PeftModel
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

In [2]:
BASE_PROC_NAME = 'gpt2-medium'
FT_PROC_NAME = 'ft_model'
PLUGIN_FT_PROC_NAME = 'plugin_over_ft'
PLUGIN_BASE_PROC_NAME = 'plugin_over_base'

In [3]:
tokenizer = AutoTokenizer.from_pretrained('./models/ft_model')
tokenizer.padding_side = 'left'

In [4]:
len(tokenizer.vocab)

50259

In [5]:
# Load the fine-tuned model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained(BASE_PROC_NAME)
base_model.resize_token_embeddings(len(tokenizer))

base_model_for_loading = AutoModelForCausalLM.from_pretrained(BASE_PROC_NAME)
base_model_for_loading.resize_token_embeddings(len(tokenizer))

Embedding(50259, 1024)

In [6]:
model_ft = PeftModel.from_pretrained(base_model_for_loading, f"./models/{FT_PROC_NAME}")
model_ft.resize_token_embeddings(len(tokenizer))

Embedding(50259, 1024)

In [7]:
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = 'left'

# model_ft = AutoModelForCausalLM.from_pretrained("./models/peft_gpt2-medium_e2e")  # Path to your fine-tuned model

# This converts the base_model variable equal to model_ft
# model_ft = PeftModel.from_pretrained(base_model_name, "./models/peft_gpt2-medium_e2e")

# plugin_model = AutoModelForCausalLM.from_pretrained("./models/plugin_decoded") 
plugin_model_ft = AutoModelForCausalLM.from_pretrained(f"./models/{PLUGIN_FT_PROC_NAME}") 
plugin_model_base = AutoModelForCausalLM.from_pretrained(f"./models/{PLUGIN_BASE_PROC_NAME}") 

In [8]:
plugin_model_ft.resize_token_embeddings(len(tokenizer))
plugin_model_base.resize_token_embeddings(len(tokenizer))

Embedding(50259, 64)

In [9]:
# # Compare the state dictionaries
# are_same = all((p1 == p2).all() for p1, p2 in zip(model_ft.parameters(), base_model.parameters()))
# print(f"Are the models identical (by weights)? {are_same}")

# # Compare the configurations
# are_same_config = model_ft.config == base_model.config
# print(f"Are the models identical (by config)? {are_same_config}")
    
# # Check if they are the same object in memory
# are_same_object = base_model is model_ft
# print(f"Are the models the same object in memory? {are_same_object}")

In [10]:
# Load test dataset (replace with your dataset)
dataset = load_dataset("e2e_nlg_cleaned", split="test")

In [11]:
meaning_to_references = defaultdict(list)
for entry in dataset:
    meaning_to_references[entry["meaning_representation"]].append(entry["human_reference"])

In [12]:
# # Preprocess the dataset to include the meaning representation (MR) as input and human reference as target
# def preprocess_function(examples):
#     # Concatenate MR and human reference with a separator
#     inputs = [f"<bos> {mr} <eos>" for mr in examples["meaning_representation"]]
#     targets = [f"<bos> {ref} <eos>" for ref in examples["human_reference"]]
#     model_inputs = tokenizer(inputs, max_length=64, truncation=True, padding="max_length")
#     labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    
#     # Replace padding token id's of the labels by -100 so that it's ignored by the loss
#     labels["input_ids"] = [
#         [(label if label != tokenizer.pad_token_id else -100) for label in labels_seq] 
#         for labels_seq in labels["input_ids"]
#     ]
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

In [13]:
# tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [14]:
# Tokenize the test dataset
def tokenize_function(examples, tokenizer):
    return tokenizer(examples["meaning_representation"], padding='max_length', truncation=True, return_tensors="pt", max_length=128)

In [15]:
class DictDataset(Dataset):
    def __init__(self, data_list, tokenizer):
        """
        Args:
            data_list (list of dicts): A list where each element is a dictionary with features as keys.
        """
        self.data_list = data_list
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the length of the dataset."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """
        Args:
            idx (int): Index of the data to retrieve.
        
        Returns:
            dict: A dictionary containing the features and their corresponding values for the given index.
        """
        example = self.data_list[idx]
        
        # # Tokenize the 'meaning_representation' on the fly
        # tokenized = tokenize_function(example, self.tokenizer)
        # inputs = f'<bos> {example["meaning_representation"]} <eos>'
        # inputs = f"Question: Generate a natural language sentence from the following aspects: {example['meaning_representation']}" + "\nAnswer:"
        inputs = example["meaning_representation"]
        tokenized = tokenizer(inputs, return_tensors="pt", max_length=64, truncation=True, padding="max_length")
        
        
        # Return the tokenized inputs along with any other features (like labels)
        return {
            'input_ids': tokenized['input_ids'].squeeze(0),  # Remove batch dimension
            'attention_mask': tokenized['attention_mask'].squeeze(0),
            'meaning_representation' :   example['meaning_representation']# Remove batch dimension
        }

In [16]:
unique_dataset = DictDataset([{'meaning_representation': mr} for mr in meaning_to_references.keys()], tokenizer=tokenizer)

In [17]:
unique_dataset[0]

{'input_ids': tensor([ 3672,    58, 14573, 43537,  4357,  4483,  6030,    58,  1073,  5853,
          6128,  4357,  1989,    58, 19205,  7372,    60, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'meaning_representation': 'name[Blue Spice], eatType[coffee shop], area[city centre]'}

In [18]:
# Load BLEU and ROUGE metrics from evaluate library
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

In [19]:
# Apply tokenization
# tokenized_test_data = unique_dataset.map(tokenize_function, batched=True)

In [20]:
# Move model to the GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
# model.to(device)
# model_ft.to(device)

In [21]:
def sample_top_k(probs, top_k):
    top_k_probs, top_k_indices = torch.topk(probs, top_k, dim=-1)
    top_k_probs = torch.nn.functional.softmax(top_k_probs, dim=-1)
    next_token = torch.multinomial(top_k_probs, 1)
    return top_k_indices.gather(-1, next_token)

def sample_top_p(probs, top_p):
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
    sorted_indices_to_remove = cumulative_probs > top_p
    sorted_probs[sorted_indices_to_remove] = 0
    next_token = torch.multinomial(sorted_probs, 1)
    return sorted_indices.gather(-1, next_token)

def apply_temperature(logits, temperature):
    if temperature != 1.0:
        logits = logits / temperature
    return logits

In [22]:
def sample_top_k_top_p(probs, top_k, top_p):
    """
    Sample the next token based on top-k and top-p (nucleus) sampling.
    
    Args:
        probs (torch.Tensor): The probabilities of the next token (shape: [batch_size, vocab_size]).
        top_k (int): The number of most probable tokens to consider in top-k sampling.
        top_p (float): The cumulative probability cutoff for sampling (nucleus sampling).
    
    Returns:
        next_token (torch.Tensor): The sampled token indices (shape: [batch_size, 1]).
    """
    # Apply top-k sampling: Keep only the top_k most probable tokens
    if top_k > 0:
        top_k_probs, top_k_indices = torch.topk(probs, top_k, dim=-1)
        probs = torch.zeros_like(probs).scatter_(-1, top_k_indices, top_k_probs)

    # Sort probabilities for top-p sampling
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)

    # Compute cumulative probabilities
    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

    # Mask out tokens that exceed the top-p cumulative probability
    sorted_indices_to_remove = cumulative_probs > top_p
    sorted_probs[sorted_indices_to_remove] = 0
    sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)  # Re-normalize

    # Sample from the remaining tokens after applying top-k and top-p
    next_token = torch.multinomial(sorted_probs, num_samples=1)

    # Return the corresponding token indices from the original vocabulary
    return sorted_indices.gather(-1, next_token)

In [23]:
# def get_position_ids(input_ids, attention_mask):
#     """
#     Generate position IDs for left-padded sequences.
    
#     Args:
#         input_ids (torch.Tensor): Tensor of input token IDs (shape: [batch_size, seq_len]).
#         attention_mask (torch.Tensor): Tensor of attention mask (shape: [batch_size, seq_len]).
        
#     Returns:
#         torch.Tensor: Position IDs (shape: [batch_size, seq_len]).
#     """
#     # Get the lengths of the non-padded tokens (i.e., count of '1's in the attention mask)
#     seq_lengths = attention_mask.sum(dim=-1)

#     # Create a tensor with position IDs starting from 0 for each non-padded token
#     position_ids = torch.arange(input_ids.size(1), dtype=torch.long).unsqueeze(0).repeat(input_ids.size(0), 1).to(input_ids.device)

#     # Adjust position IDs for each sequence to start from 0 after padding
#     position_ids = position_ids - (input_ids.size(1) - seq_lengths).unsqueeze(-1)

#     # Set position IDs for padding tokens to 0 (optional: you can use another value if needed)
#     position_ids = torch.where(attention_mask == 1, position_ids, torch.zeros_like(position_ids))
#     return position_ids.long()

def get_position_ids(input_ids, attention_mask):
    position_ids = torch.arange(input_ids.size(1)).expand_as(input_ids).to(input_ids.device)
    position_ids = position_ids * attention_mask
    return position_ids.long()

In [24]:
def apply_repetition_penalty(logits, generated_ids, repetition_penalty):
    """
    Applies repetition penalty to logits tensor based on generated_ids tensor.
    
    Args:
        logits (torch.Tensor): Logits of shape (batch_size, vocab_size).
        generated_ids (torch.Tensor): Generated token IDs of shape (batch_size, seq_length).
        repetition_penalty (float): The repetition penalty to apply.
    
    Returns:
        torch.Tensor: Logits after applying the repetition penalty.
    """
    batch_size, vocab_size = logits.shape

    # Check for invalid indices in generated_ids
    if (generated_ids < 0).any() or (generated_ids >= vocab_size).any():
        print("Error: `generated_ids` contains values out of range. Please ensure all token IDs are within [0, vocab_size-1].")
        print(f"generated_ids: {generated_ids}")
        print(f"Valid range: [0, {vocab_size - 1}]")
        return logits

    # Get unique tokens in each batch sequence
    unique_tokens = [torch.unique(seq) for seq in generated_ids]

    # Find the maximum number of unique tokens in any sequence
    max_unique_len = max([len(tokens) for tokens in unique_tokens])

    # Pad each unique token list to the max length with a negative index (-1) that is not in the vocabulary
    padded_unique_tokens = torch.stack(
        [torch.cat([tokens, tokens.new_full((max_unique_len - len(tokens),), -1)]) for tokens in unique_tokens]
    )

    # Ensure no negative values other than -1 in padded_unique_tokens
    if (padded_unique_tokens < -1).any() or (padded_unique_tokens >= vocab_size).any():
        print("Error: Invalid values in `padded_unique_tokens`. Please ensure all token IDs are within [0, vocab_size-1].")
        print(f"padded_unique_tokens: {padded_unique_tokens}")
        return logits

    # Debugging: Check the padded_unique_tokens shape and values
    print("Padded Unique Tokens:\n", padded_unique_tokens)
    print("Logits shape:", logits.shape)

    # Create a mask of shape (batch_size, vocab_size) where repeated tokens are True
    penalty_mask = torch.zeros((batch_size, vocab_size), dtype=torch.bool, device=logits.device)

    # Use advanced indexing to set the mask to True for repeated tokens (ignore padding -1)
    try:
        penalty_mask.scatter_(1, padded_unique_tokens, True)
    except Exception as e:
        print(f"Error during scatter operation: {e}")
        print("Check if `padded_unique_tokens` contain valid indices for `penalty_mask`.")
        return logits

    # Ensure padding index (-1) is not penalized
    penalty_mask[:, -1] = False

    # Debugging: Check the penalty_mask
    print("Penalty Mask Shape:", penalty_mask.shape)
    print("Penalty Mask Sample (First Row):", penalty_mask[0])

    # Apply the repetition penalty using vectorized operations
    positive_logits_mask = logits > 0
    negative_logits_mask = logits < 0

    # Check shapes and contents before applying the penalty
    print("Logits Before Penalty:\n", logits)
    print("Positive Logits Mask:\n", positive_logits_mask)
    print("Negative Logits Mask:\n", negative_logits_mask)

    # Check the combined mask shape and contents
    combined_mask = penalty_mask & positive_logits_mask
    print("Combined Mask Shape:", combined_mask.shape)
    print("Combined Mask (Sample):", combined_mask[0])

    # For positive logits, divide by repetition penalty
    try:
        logits[combined_mask] /= repetition_penalty
    except Exception as e:
        print(f"Error applying penalty to positive logits: {e}")

    # For negative logits, multiply by repetition penalty
    try:
        logits[penalty_mask & negative_logits_mask] *= repetition_penalty
    except Exception as e:
        print(f"Error applying penalty to negative logits: {e}")

    return logits

In [33]:
def custom_generate(model, input_ids, attention_mask, max_length, repetition_penalty, tokenizer, top_k=50, temperature=1.0, top_p = 1, bb_model = None):
    
    generated_ids = input_ids.clone().to(model.module.device)  # Start with the input prompt
    # Create a list to track which sequences have finished
    finished_sequences = torch.zeros(input_ids.size(0), dtype=torch.bool).to(model.module.device)
        
    for step in range(max_length-input_ids.size()[1]):
        # Get the model outputs (logits) for the current step
        with torch.no_grad():
            position_ids = get_position_ids(input_ids=generated_ids, attention_mask=attention_mask)
            outputs = model(input_ids=generated_ids, attention_mask=attention_mask, use_cache=True, position_ids=position_ids)
            logits = outputs.logits[:, -1, :]  # Get logits of the last token
        
#         min_logit = torch.absolute(torch.min(logits)) + 1.0
#         logits += min_logit
#         # logits = apply_repetition_penalty(logits, generated_ids, repetition_penalty)
        
#         # # Apply repetition penalty by decreasing the logits for previously generated tokens
#         for i, gen_id in enumerate(generated_ids):
#             for token_id in torch.unique(gen_id):  # Get unique tokens in the sequence
#                 # if logits[i, token_id] > 0:
#                 logits[i, token_id] /= repetition_penalty
#                 # else:
#                 #     logits[i, token_id] *= repetition_penalty

        # Apply temperature
        # logits = apply_temperature(logits, temperature)

        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(logits, dim=-1)
        
        if(bb_model):
            with torch.no_grad():
                position_ids = get_position_ids(input_ids=generated_ids, attention_mask=attention_mask)
                outputs_base = bb_model(input_ids=generated_ids, attention_mask=attention_mask, position_ids=position_ids)
                logits_base = outputs_base.logits[:, -1, :]  # Get logits of the last token
            
            # min_logit = torch.absolute(torch.min(logits_base)) + 1.0
            # logits_base += min_logit
        
            # logits_base = apply_repetition_penalty(logits_base, generated_ids, repetition_penalty)
            # # # Apply repetition penalty by decreasing the logits for previously generated tokens
            # for i, gen_id in enumerate(generated_ids):
            #     for token_id in torch.unique(gen_id):  # Get unique tokens in the sequence
            #         # if logits_base[i, token_id] > 0:
            #         logits_base[i, token_id] /= repetition_penalty
            #         # else:
            #         #     logits_base[i, token_id] *= repetition_penalty
            #         # logits_base[i, token_id] /= repetition_penalty
            probs_base = torch.nn.functional.softmax(logits_base, dim=-1)
            # print('middle', torch.max(probs_base, dim=-1))
            probs = probs*probs_base
            sum_probs = probs.sum(dim=-1, keepdim=True)
    
            # Avoid division by zero by adding a small value (epsilon)
            sum_probs = torch.clamp(sum_probs, min=1e-9)

            # Re-normalize by dividing each probability by the sum of probabilities
            probs = probs / sum_probs

        # Sample the next token using top-k sampling
        # next_token = sample_top_k(probs, top_k)
        # next_token = sample_top_k_top_p(probs, top_k, top_p)
        next_token = torch.argmax(probs, dim=-1).unsqueeze(-1)
        
        next_token = torch.where(finished_sequences.unsqueeze(-1), tokenizer.pad_token_id, next_token)

        # Append the new token to the generated sequence
        generated_ids = torch.cat((generated_ids, next_token), dim=-1)

        # Extend the attention mask to include the newly generated token
        new_attention_mask = torch.ones((attention_mask.shape[0], 1)).to(model.module.device)
        attention_mask = torch.cat((attention_mask, new_attention_mask), dim=-1)
        
        finished_sequences |= next_token.squeeze(-1) == tokenizer.eos_token_id

        # If all sequences are finished, break the loop
        if finished_sequences.all():
            break

        # # Break if the model generates an end-of-sequence token
        # if torch.all(next_token == tokenizer.eos_token_id):
        #     break
    return generated_ids

In [34]:
def get_preds(md, dat, tokenizer, batch_size=8, repetition_penalty=1.1, max_length=192 , bb_model = None):
    # Put the model in evaluation mode

    # DataLoader for batching the dataset
    dataloader = DataLoader(dat, batch_size=batch_size, shuffle=False)

    # Lists to store predictions and references
    predictions = []
    references = []
    mrs = []
    
    b = 0
    # Generate predictions and collect references in batches
    for batch in tqdm(dataloader):
        # Get input IDs and attention mask from the batch and move to the device
        # input_ids = torch.stack([torch.tensor(x, dtype=torch.long) for x in batch['input_ids']], dim=1).to(device)
        # attention_mask = torch.stack([torch.tensor(x, dtype=torch.long) for x in batch['attention_mask']], dim=1).to(device)

        input_ids = batch['input_ids'].to(md.device_ids[0])
        attention_mask = batch['attention_mask'].to(md.device_ids[0])
        # Generate predictions in batches with the model
        with torch.no_grad():
            # generated_ids = md.module.generate(input_ids=input_ids, 
            #                             attention_mask=attention_mask, 
            #                             max_length=max_length, 
            #                             pad_token_id=tokenizer.eos_token_id,
            #                             bos_token_id=tokenizer.bos_token_id,
            #                             eos_token_id=tokenizer.eos_token_id,
            #                             repetition_penalty=repetition_penalty, 
            #                             do_sample = False, 
            #                             use_cache = True,
            #                             length_penalty=1.0,
            #                             early_stopping=False,
            #                             num_beams=1,
            #                             # renormalize_logits = True, 
            #                             output_logits = True,
            #                             output_scores = True,
            #                             return_dict_in_generate = True)
            generated_ids = custom_generate(md, 
                                            input_ids=input_ids, 
                                            attention_mask=attention_mask, 
                                            max_length=max_length, 
                                            repetition_penalty=repetition_penalty, 
                                            tokenizer=tokenizer, 
                                            bb_model = bb_model)
                                        
#         # Decode generated predictions and references
#         # for i in range(len(generated_ids.sequences)):
#         for i in range(len(generated_ids)):
#             # print(generated_ids[i])
#             # generated_text = tokenizer.decode(generated_ids.sequences[i], skip_special_tokens=True)
#             generated_text = tokenizer.decode(generated_ids[i], skip_special_tokens=True)

#             # Append generated text and reference to lists
#             predictions.append(generated_text)
#         for mr in batch["meaning_representation"]:
#             references.append(meaning_to_references[mr])
#             mrs.append(mr)
#         b+=1
#         # if(b==1):
#         #     break
#     return {'predictions': predictions, 'meaning_representation' : mrs, 'references': references}


In [35]:
md = base_model
md.eval()

# # Move the model to the device
md.to(device)

# If multiple GPUs are available, wrap the model with DataParallel
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs.")
    md = torch.nn.DataParallel(md)
    
print("Model is using the following GPUs:", md.device_ids)
print(md.module.device)

Using 8 GPUs.
Model is using the following GPUs: [0, 1, 2, 3, 4, 5, 6, 7]
cuda:0


In [36]:
# output = get_preds(model, tokenized_test_data, device, tokenizer)
output = get_preds(md, unique_dataset, tokenizer)

  2%|▏         | 5/231 [03:18<2:29:54, 39.80s/it]


KeyboardInterrupt: 

In [None]:
output_ft = get_preds(model_ft, unique_dataset, device, tokenizer)

In [None]:
output_plugin_ft = get_preds(plugin_model_ft, unique_dataset, device, tokenizer, bb_model = model_ft)

Using 8 GPUs.
50259


 73%|███████▎  | 168/231 [1:34:39<26:06, 24.87s/it]  

In [None]:
output_plugin_base = get_preds(plugin_model_base, unique_dataset, device, tokenizer, bb_model = base_model)

In [42]:
sel_id = 0

In [43]:
print(output['predictions'][sel_id])
print('\n')
print(output['meaning_representation'][sel_id])
print('\n')
print(output['references'][sel_id])

Question: Generate a natural language sentence from the following aspects: name[Blue Spice], eatType[coffee shop], area[city centre]
Answer:


name[Blue Spice], eatType[coffee shop], area[city centre]


['A coffee shop in the city centre area called Blue Spice.', 'Blue Spice is a coffee shop in city centre.']


In [44]:
print(output_plugin_base['predictions'][sel_id])
print('\n')
print(output_plugin_base['meaning_representation'][sel_id])
print('\n')
print(output_plugin_base['references'][sel_id])

Question: Generate a natural language sentence from the following aspects: name[Blue Spice], eatType[coffee shop], area[city centre]
Answer:


name[Blue Spice], eatType[coffee shop], area[city centre]


['A coffee shop in the city centre area called Blue Spice.', 'Blue Spice is a coffee shop in city centre.']


In [45]:
print(output_ft['predictions'][sel_id])
print('\n')
print(output_ft['meaning_representation'][sel_id])
print('\n')
print(output_ft['references'][sel_id])

Question: Generate a natural language sentence from the following aspects: name[Blue Spice], eatType[coffee shop], area[city centre]
Answer: restaurant [restaurant in Portland OR]. customer rating(5 out of 5) nearby familyFriendly([yes]) nearby[Crowne Plaza Hotel](near Manchester United), close by airport (Burger King or Indian restaurants)[no]), priceRange (£20-25, £ cost per day); cashout method ($all customers receive discount), mobile phone range short to medium; nearByRainforest[Yes]; nearBYPass®[No], nearIndianRestaurants*["The Curry House"]}, nearClose By The Sorrento Inn than other restaurants within walking distance. Price ranges between $ and more expensive than average. Nearby holidaymakers are Chinese tourists who on theirless experience.


name[Blue Spice], eatType[coffee shop], area[city centre]


['A coffee shop in the city centre area called Blue Spice.', 'Blue Spice is a coffee shop in city centre.']


In [46]:
print(output_plugin_ft['predictions'][sel_id])
print('\n')
print(output_plugin_ft['meaning_representation'][sel_id])
print('\n')
print(output_plugin_ft['references'][sel_id])

Question: Generate a natural language sentence from the following aspects: name[Blue Spice], eatType[coffee shop], area[city centre]
Answer: customer rating [5 out of 5 by Blue Curry Cafe])


name[Blue Spice], eatType[coffee shop], area[city centre]


['A coffee shop in the city centre area called Blue Spice.', 'Blue Spice is a coffee shop in city centre.']


In [None]:
# Compute BLEU score
bleu_score = bleu_metric.compute(predictions=output['predictions'], references=output['references'])
# Compute ROUGE score
rouge_score = rouge_metric.compute(predictions=output['predictions'], references=output['references'])

# Display the BLEU and ROUGE scores
print(f"BLEU Score: {bleu_score['bleu']}")
print(f"ROUGE Score: {rouge_score}")

In [None]:
# Compute BLEU score
bleu_score_plugin_base = bleu_metric.compute(predictions=output_plugin_base['predictions'], references=output_plugin_base['references'])
# Compute ROUGE score
rouge_score_plugin_base = rouge_metric.compute(predictions=output_plugin_base['predictions'], references=output_plugin_base['references'])

# Display the BLEU and ROUGE scores
print(f"BLEU Score: {bleu_score_plugin_base['bleu']}")
print(f"ROUGE Score: {rouge_score_plugin_base}")

In [None]:
# Compute BLEU score
bleu_score_ft = bleu_metric.compute(predictions=output_ft['predictions'], references=output_ft['references'])
# Compute ROUGE score
rouge_score_ft = rouge_metric.compute(predictions=output_ft['predictions'], references=output_ft['references'])

# Display the BLEU and ROUGE scores
print(f"BLEU Score: {bleu_score_ft['bleu']}")
print(f"ROUGE Score: {rouge_score_ft}")

In [None]:
# Compute BLEU score
bleu_score_plugin_ft = bleu_metric.compute(predictions=output_plugin_ft['predictions'], references=output_plugin_ft['references'])
# Compute ROUGE score
rouge_score_plugin_ft = rouge_metric.compute(predictions=output_plugin_ft['predictions'], references=output_plugin_ft['references'])

# Display the BLEU and ROUGE scores
print(f"BLEU Score: {bleu_score_plugin_ft['bleu']}")
print(f"ROUGE Score: {rouge_score_plugin_ft}")

In [41]:
# Display the BLEU and ROUGE scores
print(f"BLEU Score: {bleu_score['bleu']}")
print(f"ROUGE Score: {rouge_score}")

# Display the BLEU and ROUGE scores
print(f"BLEU Score: {bleu_score_plugin_base['bleu']}")
print(f"ROUGE Score: {rouge_score_plugin_base}")

# Display the BLEU and ROUGE scores
print(f"BLEU Score: {bleu_score_ft['bleu']}")
print(f"ROUGE Score: {rouge_score_ft}")

# Display the BLEU and ROUGE scores
print(f"BLEU Score: {bleu_score_plugin_ft['bleu']}")
print(f"ROUGE Score: {rouge_score_plugin_ft}")

BLEU Score: 0.042839042271541325
ROUGE Score: {'rouge1': 0.47810644299968463, 'rouge2': 0.1967256342802732, 'rougeL': 0.3213933320725961, 'rougeLsum': 0.3229532168360921}
BLEU Score: 0.04280878016611851
ROUGE Score: {'rouge1': 0.4777149580688591, 'rouge2': 0.19657174706846953, 'rougeL': 0.3211130623944386, 'rougeLsum': 0.32290191831621473}
BLEU Score: 0.02307265311729619
ROUGE Score: {'rouge1': 0.3572597935500922, 'rouge2': 0.1332045881288912, 'rougeL': 0.22668951336918525, 'rougeLsum': 0.24761255318223108}
BLEU Score: 0.022840266477484255
ROUGE Score: {'rouge1': 0.35508644227244474, 'rouge2': 0.1323537554184298, 'rougeL': 0.2254154845352321, 'rougeLsum': 0.24691364292477058}
