In [21]:
import torch
import wandb
from torch.utils.data import DataLoader
from transformers import BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

from scripts.custom_BARTs.noise_encoder_BART import (
    BartForConditionalGeneration,
    BartConfig,
)


# Select the compute device. NOTE: this pins everything to the CPU; no GPU is
# initialized here despite what this cell was originally labelled.
device = torch.device("cpu")

# Load the BookSum dataset from the Hugging Face hub (or local cache) and keep
# only its training split.
dataset = load_dataset("kmfoda/booksum")
train_dataset = dataset["train"]

# Initialize BART model and tokenizer.
# The tokenizer and base config come from the stock "facebook/bart-base"
# checkpoint; the model weights are read from a local fine-tuned checkpoint.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
config = BartConfig.from_pretrained("facebook/bart-base")
# Extra attribute set on the config before the model is built. Presumably read
# by the project's noise-encoder BART implementation, and presumably meant to
# match the "p1_45" suffix of the checkpoint directory below -- TODO confirm
# against scripts.custom_BARTs.noise_encoder_BART.
config.encoder_gaussian_ratio = 0.45
model = BartForConditionalGeneration.from_pretrained(
    "./saved_models/BART_summary_with_noise_working_p1_45", config=config
).to(device)


# Tokenize the 'summary_text' field
def tokenize_data(example):
    """Tokenize one example's summary into fixed-length model inputs.

    The summary text serves as both the model input and the target: the same
    token ids are returned under "input_ids" and "labels". Sequences are
    truncated or padded to exactly 1024 token ids.

    Args:
        example: Mapping with a "summary_text" string.

    Returns:
        dict with keys "labels", "input_ids" and "attention_mask". The
        attention mask is 1 for real tokens and 0 for padding positions.
    """
    # A single tokenizer call yields the ids together with the matching
    # attention mask, so padded positions are masked out instead of being
    # marked as real tokens by an all-ones mask.
    encoding = tokenizer(
        example["summary_text"], truncation=True, padding="max_length", max_length=1024
    )
    token_ids = encoding["input_ids"]
    return {
        "labels": token_ids,
        "input_ids": token_ids,
        "attention_mask": encoding["attention_mask"],
    }


# Run tokenize_data over the training split; each resulting row carries
# "labels", "input_ids" and "attention_mask" as returned by tokenize_data.
tokenized_dataset = train_dataset.map(tokenize_data)

# Print one summary and the model's reconstruction of it.
import random

# random.seed(42)
# The index is fixed so the cell is reproducible. To sample instead, prefer
# random.randrange(len(train_dataset)): random.randint(0, len(train_dataset))
# includes the upper bound and can yield an out-of-range index.
random_index = 452  # random.randint(0, len(train_dataset))
print("Random index: ", random_index)
print("Original summary: ", train_dataset[random_index]["summary_text"])

# Build a batch of one from the stored token ids. Only input_ids are passed to
# generate(); the "attention_mask" column of tokenized_dataset is not used here.
input_ids = torch.tensor(tokenized_dataset[random_index]["input_ids"]).to(device)
output = model.generate(input_ids.unsqueeze(0), max_length=1024).to(device)

# Decode the first (only) generated sequence, dropping special tokens.
print("Generated summary: ", tokenizer.decode(output[0], skip_special_tokens=True))

Found cached dataset csv (/data1/sanps/hf_cache/kmfoda___csv/kmfoda--booksum-025141c210e07407/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 798.86it/s]
Loading cached processed dataset at /data1/sanps/hf_cache/kmfoda___csv/kmfoda--booksum-025141c210e07407/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-9bc715002647e65b.arrow


Random index:  452
Original summary:  And now, back to Adam. When last we saw him, Adam Bede was walking home. His destination is a "thatched house, with a stack of timber by the side of it" . And in this house waits Adam's mother, Lisbeth Bede, a tall, hardy, and very sad woman. Why is Lisbeth sad? Pretty much all thanks to Adam's father, Thias Bede. Instead of doing his work, Thias tends to run off and get drunk. On this particular evening, Adam comes home to find that Thias has left a coffin unfinished. Angered by this, Adam decides to finish the coffin himself. Even if he doesn't sleep, even if he doesn't eat, he'll finish that %@$#*@ coffin. His harsh response upsets Lisbeth, who starts to cry, and reminds him that Thias had once been "a fine-growed man an' handy at everythin'" . Adam angrily sets to work. The situation between him and his mother remains tense even when Seth, the family peacemaker, finally shows up. Adam orders Seth to "shut the door so as I mayn't hear Mother's t