In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
# Read the prepared movie CSV into a DataFrame and wrap it as a Hugging Face Dataset.
df = pd.read_csv(filepath_or_buffer='./dataset/dataset_True_True_True_True_True.csv')
movies = Dataset.from_pandas(df=df)
print(movies)

# Tokenizer belonging to the "t5-small" checkpoint; printed for inspection.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")
print(tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
2023-06-20 20:45:24.859863: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-20 20:45:24.884812: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-20 20:45:24.885298: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Dataset({
    features: ['imdb_id', 'original_title', 'overview', 'title', 'subtitles', 'subtitles_word_count'],
    num_rows: 4261
})
T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id

In [2]:
def preprocess(examples):
    """Tokenize one batch of rows for T5 summarization.

    Every entry of ``examples["subtitles"]`` is prefixed with the task string
    ``"summarize: "`` and tokenized with truncation at 1024 tokens. Every entry
    of ``examples["overview"]`` is tokenized as the target text with truncation
    at 128 tokens, and its ``input_ids`` are stored under the ``"labels"`` key.

    Args:
        examples: Batch mapping that provides the ``"subtitles"`` and
            ``"overview"`` columns as sequences of strings.

    Returns:
        The tokenizer output for the prefixed subtitles, extended with
        ``"labels"``.
    """
    prompts = []
    for subtitle_text in examples["subtitles"]:
        prompts.append("summarize: " + subtitle_text)

    encoded = tokenizer(prompts, truncation=True, max_length=1024)

    target_encoding = tokenizer(
        text_target=examples["overview"],
        truncation=True,
        max_length=128,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Apply preprocess to the dataset in batches using 4 worker processes;
# the empty remove_columns list means no original column is dropped.
tokenized_subtitles = movies.map(
    function=preprocess,
    batched=True,
    num_proc=4,
    remove_columns=[],
)

                                                                             

In [3]:
# Fraction of rows placed in the test split.
# NOTE(review): 0.99 holds out 99% of the rows, so only ~1% is used for
# training — confirm this is intentional and not an inverted ratio.
test_split = 0.99

print(tokenized_subtitles)

# Shuffled split with a fixed seed so the partition is repeatable.
train_test = tokenized_subtitles.train_test_split(
    test_size=test_split,
    shuffle=True,
    seed=1,
)

# Rebind the name to a DatasetDict holding the "train" and "test" splits.
tokenized_subtitles = DatasetDict({
    "train": train_test["train"],
    "test": train_test["test"],
})

print(tokenized_subtitles)

Dataset({
    features: ['imdb_id', 'original_title', 'overview', 'title', 'subtitles', 'subtitles_word_count', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4261
})
DatasetDict({
    train: Dataset({
        features: ['imdb_id', 'original_title', 'overview', 'title', 'subtitles', 'subtitles_word_count', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 42
    })
    test: Dataset({
        features: ['imdb_id', 'original_title', 'overview', 'title', 'subtitles', 'subtitles_word_count', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4219
    })
})


In [4]:
import evaluate
import numpy as np
# Load the "rouge" metric through the evaluate library.
metrics = evaluate.load(path="rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` arrives as a tuple, its first element is taken
            as the generated token ids.

    Returns:
        dict mapping each metric name (the ROUGE results plus ``"gen_len"``)
        to its value rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    # Some configurations return (token_ids, extra_outputs); keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore/padding marker and is not a valid token id, so it must
    # be swapped for the pad token before decoding. The original did this for
    # labels only; predictions can carry -100 padding as well.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metrics.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-pad tokens per prediction (computed after
    # the -100 replacement so padding is never counted as generated text).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [5]:
# Load the pretrained "t5-small" checkpoint as a sequence-to-sequence LM.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path="t5-small")

In [6]:
# Batch collator built from the tokenizer and the model; passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [7]:
print(data_collator)

# Show both splits that are handed to the trainer.
for split_name in ("train", "test"):
    print(tokenized_subtitles[split_name])

# Same settings as before, grouped by purpose.
training_args = Seq2SeqTrainingArguments(
    # --- output / checkpoints ---
    output_dir="movie-overview-predictor",
    save_steps=500,
    save_total_limit=3,
    push_to_hub=False,
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=13,
    per_device_train_batch_size=6,
    fp16=True,
    # --- evaluation (every 50 steps, using generate()) ---
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=50,
    per_device_eval_batch_size=6,
    predict_with_generate=True,
    # --- logging (every 50 steps) ---
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=50,
)

# Trains on the "train" split and evaluates on the "test" split.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_subtitles["train"],
    eval_dataset=tokenized_subtitles["test"],
)

DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id

In [8]:
# Run fine-tuning; the returned value is displayed as the cell result.
train_output = trainer.train()
train_output

  0%|          | 0/91 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 55%|█████▍    | 50/91 [00:11<00:09,  4.45it/s]

{'loss': 4.902, 'learning_rate': 9.230769230769232e-06, 'epoch': 7.14}


                                               
 55%|█████▍    | 50/91 [03:40<00:09,  4.45it/s]  

{'eval_loss': 4.573732852935791, 'eval_rouge1': 0.0236, 'eval_rouge2': 0.002, 'eval_rougeL': 0.0216, 'eval_rougeLsum': 0.0216, 'eval_gen_len': 19.0, 'eval_runtime': 208.6182, 'eval_samples_per_second': 20.224, 'eval_steps_per_second': 3.375, 'epoch': 7.14}


100%|██████████| 91/91 [03:49<00:00,  2.52s/it]

{'train_runtime': 229.6549, 'train_samples_per_second': 2.377, 'train_steps_per_second': 0.396, 'train_loss': 4.750209431071858, 'epoch': 13.0}





TrainOutput(global_step=91, training_loss=4.750209431071858, metrics={'train_runtime': 229.6549, 'train_samples_per_second': 2.377, 'train_steps_per_second': 0.396, 'train_loss': 4.750209431071858, 'epoch': 13.0})

In [9]:
# Title of the row labelled 0 (the same row used for the inference example).
df.loc[0, 'title']

'Toy Story'

In [None]:
# Persist the trained model into the "movie-overview-predictor" directory.
trainer.save_model(output_dir="movie-overview-predictor")

In [23]:

# Generate an overview for the first movie's subtitles.
subtitle = df['subtitles'][0]
text = "summarize: " + subtitle  # same task prefix as used in preprocess

# Move the encoded input to whatever device the model currently lives on
# instead of hard-coding "cuda", so the cell also works when the model is on CPU.
# NOTE(review): truncation=True without max_length uses the tokenizer default,
# while preprocess truncates training inputs at 1024 tokens — confirm intended.
inputs = tokenizer.encode(text, return_tensors="pt", truncation=True).to(model.device)
print(inputs)

# Deterministic beam search (3 beams); max_length=13 caps the generated sequence.
# NOTE(review): training targets are truncated at 128 tokens — confirm 13 is intended.
outputs = model.generate(inputs, do_sample=False, num_beams=3, max_length=13)
tokenizer.decode(outputs[0], skip_special_tokens=True)

tensor([[21603,    10,  4940,   269,   921,     3,    55,  4372,    95,     3,
            55,  4940,   269,   921,     3,    55,  4372,    95,     3,    55,
          4940,   269,   921,     3,    55,  4372,    95,     3,    55, 13112,
           888,     3,    55, 13112,   888,     3,    55, 13112,   888,     3,
            55,  6364,  1346,     3,    55,  6364,  1346,     3,    55,  6364,
          1346,     3,    55,     3,    32,    32,   107,  3534,    32,  3534,
            32,     3,    55,   540,   540,   540,     3,    55, 17387,    53,
             3,    32,    32,   107,  3534,    32,  3534,    32,     3,    55,
           540,   540,   540,     3,    55, 17387,    53,     3,    32,    32,
           107,  3534,    32,  3534,    32,     3,    55,   540,   540,   540,
             3,    55, 17387,    53,  1190,    34,     3,    55,  1190,  1243,
           625, 14741,     3,    55,  1190,    34,     3,    55,  1190,  1243,
           625, 14741,     3,    55,  1190,    34,  

'kissing ooh hoo hoo ho'