In [1]:
#!/usr/bin/python3

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the WikiHow dataset
train_articles_file = './data/articles.txt'
train_summaries_file = './data/summaries.txt'

# Load the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_accumulation_steps=10,
    learning_rate=5e-5,
    evaluation_strategy='steps',
    save_total_limit=2,
    eval_steps=200,
    save_steps=200,
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=200,
    load_best_model_at_end=True
)

# Define the data collator


def data_collator(features):
    input_ids = [f['input_ids'] for f in features]
    attention_mask = [f['attention_mask'] for f in features]
    labels = [f['input_ids'] for f in features]
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }


class WikiHowDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, articles_path, summaries_path, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        with open(articles_path, 'r') as f:
            self.articles = f.readlines()

        with open(summaries_path, 'r') as f:
            self.summaries = f.readlines()

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, index):
        article = self.articles[index].strip()
        summary = self.summaries[index].strip()

        inputs = self.tokenizer.encode_plus(
            article, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')

        targets = self.tokenizer.encode_plus(
            summary, max_length=150, padding='max_length', truncation=True, return_tensors='pt')

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze(),
        }



train_dataset = WikiHowDataset(
    tokenizer, train_articles_file, train_summaries_file)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=2, shuffle=True, collate_fn=data_collator)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
output_dir = './models/'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


  from .autonotebook import tqdm as notebook_tqdm
2023-04-29 22:38:06.868872: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-29 22:38:07.314217: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-29 22:38:07.314238: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-29 22:38:09.031167: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open s

AttributeError: 'list' object has no attribute 'size'