In [None]:
# Installing all the relevant libraries

In [5]:
!pip install --upgrade pip
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/torchstable.html
!pip install transformers datasets rouge-score
!pip install --upgrade "accelerate>=0.26.0"
!pip install evaluate
!pip install protobuf
!pip install sentencepiece
!pip install numpy

Looking in indexes: https://download.pytorch.org/whl/torchstable.html


In [6]:
import pandas as pd

# Loading data
df = pd.read_csv('data/Reviews.csv')

# Checking data
df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [7]:
# Rename columns that we're going to keep
df = df.rename(columns={
    'Text': 'review_text', 
    'Summary': 'summary'
})

# Keeping only concerned columns
df = df[['review_text', 'summary']]

# Making sure all worked well
df.head(3)

Unnamed: 0,review_text,summary
0,I have bought several of the Vitality canned d...,Good Quality Dog Food
1,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised
2,This is a confection that has been around a fe...,"""Delight"" says it all"


In [8]:
# Dropping rows where either field is NaN
df = df.dropna(subset=["review_text", "summary"])

# Casting them to strings
df["review_text"] = df["review_text"].astype(str)
df["summary"] = df["summary"].astype(str)

In [9]:
print(df[["review_text", "summary"]].head())
print(df["review_text"].apply(type).value_counts())  # Ensure everything is <class 'str'>

                                         review_text                summary
0  I have bought several of the Vitality canned d...  Good Quality Dog Food
1  Product arrived labeled as Jumbo Salted Peanut...      Not as Advertised
2  This is a confection that has been around a fe...  "Delight" says it all
3  If you are looking for the secret ingredient i...         Cough Medicine
4  Great taffy at a great price.  There was a wid...            Great taffy
review_text
<class 'str'>    568427
Name: count, dtype: int64


In [10]:
n_samples = 1000 #due to memory constraints on local machines, we're cutting down the data to 1000 rows for validation before we explore other ideas on how to scale
df_sampled = df.sample(n=min(n_samples, len(df)), random_state=42)

df_sampled.head(5)

Unnamed: 0,review_text,summary
41434,These are actually very tasty. Pure potatoes ...,I like these!
209481,I realize that taste is a matter of personal p...,Good but subjectively not 5 star
247306,This is one of my Favorite cup of soup choices...,"Lipton Cup A Soup, Spring Vegetable.4 oz"
80089,If you like the classic taste of a good margar...,"Suited to its purpose, if not quite its goal..."
218580,I was willing to give this a chance even after...,Tastes artificial!


In [23]:
from datasets import Dataset, DatasetDict

# Convert the entire dataframe to a Hugging Face Dataset
dataset = Dataset.from_pandas(df_sampled)

# Train/Test Split
# Doing a 90% train, 10% test split
dataset_split = dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = dataset_split["train"]
test_dataset  = dataset_split["test"]

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['review_text', 'summary', '__index_level_0__'],
    num_rows: 700
})
Dataset({
    features: ['review_text', 'summary', '__index_level_0__'],
    num_rows: 300
})


In [24]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "google/t5-small-ssm-nq"  # "t5-small", "t5-base", will be our next choice.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Defining our preprocessing function
def preprocess_function(examples):
    inputs = ["summarize: " + ex for ex in examples["review_text"]]
    targets = examples["summary"]
    
    model_inputs = tokenizer(
        inputs,
        max_length=256,
        truncation=True,
        padding=True  # Add explicit padding
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=64,
            truncation=True,
            padding=True  # Add padding for targets
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test  = test_dataset.map(preprocess_function, batched=True)

Map: 100%|███████████████████████████████████████████████████| 700/700 [00:00<00:00, 5173.01 examples/s]
Map: 100%|███████████████████████████████████████████████████| 300/300 [00:00<00:00, 5284.56 examples/s]


In [25]:
import ssl #had issues installing nltk on the mac so a suggested solution online was to import ssl and attempt below try else code.
import pandas as pd
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [26]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aakashsondhi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)
import evaluate
import torch
import numpy as np
import nltk

# downloading required nltk packages
nltk.download('punkt')
nltk.download('punkt_tab')

# attempting to utilize apple silicon's mps which is gpu equivalent by utilizing torch's mps feature
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

#model_name = "google/t5-small-ssm-nq" a model tried earlier that yield poor results
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Workarounds added after system errors around using t5
model.config.use_cache = False
# Explicitly setting the decoder start token
model.config.decoder_start_token_id = tokenizer.pad_token_id

# Preprocessing function 
def preprocess_function(examples):
    inputs = ["summarize: " + ex for ex in examples["review_text"]]
    targets = examples["summary"]
    
    model_inputs = tokenizer(
        inputs,
        max_length=256,
        truncation=True,
        padding=True
    )
    
    # Tokenize targets in target mode
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=64,
            truncation=True,
            padding=True
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the datasets 
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test  = test_dataset.map(preprocess_function, batched=True)

# Loading ROUGE metric
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Moving predictions/labels to CPU and converting them to lists
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().tolist()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().tolist()

    # Throwing out invalid token IDs in predictions
    predictions = [
        [token if 0 <= token < tokenizer.vocab_size else tokenizer.pad_token_id for token in seq]
        for seq in predictions
    ]
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replacing -100 in labels with pad token id so they can be decoded
    labels = [
        [token if token != -100 else tokenizer.pad_token_id for token in seq]
        for seq in labels
    ]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a new line after each sentence, so tokenizing with nltk
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Formating the scores as percentages
    return {k: round(v * 100, 4) for k, v in result.items()}

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=tokenizer.pad_token_id
)

training_args = Seq2SeqTrainingArguments(
    output_dir="my_summarization_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=1 
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Begin training
trainer.train()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aakashsondhi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aakashsondhi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Map: 100%|███████████████████████████████████████████████████| 700/700 [00:00<00:00, 6153.47 examples/s]
Map: 100%|███████████████████████████████████████████████████| 300/300 [00:00<00:00, 5600.02 examples/s]


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.2699,0.730053,1.3823,0.281,1.271,1.3036
2,1.0586,0.705922,1.4323,0.1308,1.2673,1.283
3,1.1069,0.702623,1.448,0.1308,1.3555,1.3665


TrainOutput(global_step=1050, training_loss=1.4965096500941686, metrics={'train_runtime': 597.2576, 'train_samples_per_second': 3.516, 'train_steps_per_second': 1.758, 'total_flos': 142108891545600.0, 'train_loss': 1.4965096500941686, 'epoch': 3.0})

In [None]:
### Training Args No 1

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_summarization_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # gradient_accumulation_steps=2, 
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=1 
)

In [29]:
### Training Args No.2

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_summarization_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    #gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=1
)