In [1]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "  "

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('data/prompts.csv', nrows= 20)

In [5]:
import pyarrow as pa
# Convert DataFrame to pyarrow.Table
arrow_table = pa.Table.from_pandas(df)

# Initialize Dataset with the pyarrow.Table
dataset = Dataset(arrow_table)

In [6]:
dataset = Dataset.from_pandas(df)

In [7]:
dataset

Dataset({
    features: ['tag', 'lyrics', 'prompts'],
    num_rows: 20
})

In [8]:
# We prefix our tasks with "answer the question"
prefix = "Based on the above summary generate a song lyrics with in  600 to 700 words"

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["prompts"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["lyrics"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [9]:
# Map the preprocessing function across our dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [11]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

In [12]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 1
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 10

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [13]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset,
   eval_dataset=tokenized_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]



  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.848117470741272, 'eval_rouge1': 0.18974041891513044, 'eval_rouge2': 0.17653563906629888, 'eval_rougeL': 0.18695034617346082, 'eval_rougeLsum': 0.1869507219581324, 'eval_runtime': 13.2791, 'eval_samples_per_second': 1.506, 'eval_steps_per_second': 0.377, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.5155901908874512, 'eval_rouge1': 0.1833732058119273, 'eval_rouge2': 0.16202288025745826, 'eval_rougeL': 0.17908468698167085, 'eval_rougeLsum': 0.1783113511097268, 'eval_runtime': 5.2814, 'eval_samples_per_second': 3.787, 'eval_steps_per_second': 0.947, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.2109038829803467, 'eval_rouge1': 0.18548942370069987, 'eval_rouge2': 0.16764923074140614, 'eval_rougeL': 0.18094445513697796, 'eval_rougeLsum': 0.18097499587264662, 'eval_runtime': 5.1006, 'eval_samples_per_second': 3.921, 'eval_steps_per_second': 0.98, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.9411761164665222, 'eval_rouge1': 0.18492394959356975, 'eval_rouge2': 0.1657283144178555, 'eval_rougeL': 0.17831360338652907, 'eval_rougeLsum': 0.17859686824813117, 'eval_runtime': 5.5816, 'eval_samples_per_second': 3.583, 'eval_steps_per_second': 0.896, 'epoch': 4.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.7514652609825134, 'eval_rouge1': 0.1800681226051617, 'eval_rouge2': 0.16044675264907277, 'eval_rougeL': 0.17503008032957162, 'eval_rougeLsum': 0.17504388082410488, 'eval_runtime': 5.2584, 'eval_samples_per_second': 3.803, 'eval_steps_per_second': 0.951, 'epoch': 5.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.6030257940292358, 'eval_rouge1': 0.1842957177707762, 'eval_rouge2': 0.16303956722184923, 'eval_rougeL': 0.17737436177105487, 'eval_rougeLsum': 0.1771652073508581, 'eval_runtime': 4.9954, 'eval_samples_per_second': 4.004, 'eval_steps_per_second': 1.001, 'epoch': 6.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.4951631426811218, 'eval_rouge1': 0.18999982089185968, 'eval_rouge2': 0.17148499855127486, 'eval_rougeL': 0.18485890074979175, 'eval_rougeLsum': 0.18478897112875375, 'eval_runtime': 5.0781, 'eval_samples_per_second': 3.938, 'eval_steps_per_second': 0.985, 'epoch': 7.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.40412211418151855, 'eval_rouge1': 0.1905038016580638, 'eval_rouge2': 0.17372178696471685, 'eval_rougeL': 0.18484638676053772, 'eval_rougeLsum': 0.18496669646720254, 'eval_runtime': 5.0582, 'eval_samples_per_second': 3.954, 'eval_steps_per_second': 0.988, 'epoch': 8.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.3742409348487854, 'eval_rouge1': 0.18754060259251404, 'eval_rouge2': 0.17174784733141213, 'eval_rougeL': 0.1833060253677144, 'eval_rougeLsum': 0.18422786621557885, 'eval_runtime': 5.5142, 'eval_samples_per_second': 3.627, 'eval_steps_per_second': 0.907, 'epoch': 9.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.36061161756515503, 'eval_rouge1': 0.19056393111436792, 'eval_rouge2': 0.1754990069696289, 'eval_rougeL': 0.18787448980390486, 'eval_rougeLsum': 0.18782420394604807, 'eval_runtime': 5.2932, 'eval_samples_per_second': 3.778, 'eval_steps_per_second': 0.945, 'epoch': 10.0}
{'train_runtime': 188.8213, 'train_samples_per_second': 1.059, 'train_steps_per_second': 1.059, 'train_loss': 1.1788275146484375, 'epoch': 10.0}


TrainOutput(global_step=200, training_loss=1.1788275146484375, metrics={'train_runtime': 188.8213, 'train_samples_per_second': 1.059, 'train_steps_per_second': 1.059, 'train_loss': 1.1788275146484375, 'epoch': 10.0})