In [1]:
import pandas as pd
from datasets import Dataset

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq

In [3]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('/Users/rohitrawat/job-prep/Assignments/accrete-ai/text-summarization/data/processed/news_summary_cleaned_train.csv')

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load the tokenizer and model
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [4]:
# Preprocess the data: tokenizing
def preprocess_function(examples):
    inputs = tokenizer(examples['text'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['summary'], max_length=150, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply the preprocessing function to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

In [5]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./flan_t5_small_finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
)

# Data collator for padding and dynamic length inputs
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Replace with a separate validation set if available
    data_collator=data_collator,
    tokenizer=tokenizer,
)




In [6]:
# Fine-tune the model
trainer.train()

  0%|          | 0/165 [00:00<?, ?it/s]

{'loss': 26.4186, 'grad_norm': 92.001708984375, 'learning_rate': 1.8787878787878792e-05, 'epoch': 0.18}
{'loss': 24.1238, 'grad_norm': 71.7027359008789, 'learning_rate': 1.7575757575757576e-05, 'epoch': 0.36}
{'loss': 22.3889, 'grad_norm': 78.78324127197266, 'learning_rate': 1.6363636363636366e-05, 'epoch': 0.55}
{'loss': 20.9566, 'grad_norm': 85.72879791259766, 'learning_rate': 1.5151515151515153e-05, 'epoch': 0.73}
{'loss': 18.3504, 'grad_norm': 82.81512451171875, 'learning_rate': 1.3939393939393942e-05, 'epoch': 0.91}


  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 16.157461166381836, 'eval_runtime': 31.7463, 'eval_samples_per_second': 6.898, 'eval_steps_per_second': 1.732, 'epoch': 1.0}
{'loss': 17.6504, 'grad_norm': 69.0825424194336, 'learning_rate': 1.2727272727272728e-05, 'epoch': 1.09}
{'loss': 15.3431, 'grad_norm': 58.20634460449219, 'learning_rate': 1.1515151515151517e-05, 'epoch': 1.27}
{'loss': 14.2781, 'grad_norm': 63.513179779052734, 'learning_rate': 1.0303030303030304e-05, 'epoch': 1.45}
{'loss': 13.3977, 'grad_norm': 44.723480224609375, 'learning_rate': 9.090909090909091e-06, 'epoch': 1.64}
{'loss': 11.4901, 'grad_norm': 53.80904006958008, 'learning_rate': 7.87878787878788e-06, 'epoch': 1.82}
{'loss': 11.1259, 'grad_norm': 56.10178756713867, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 8.617762565612793, 'eval_runtime': 195.5173, 'eval_samples_per_second': 1.12, 'eval_steps_per_second': 0.281, 'epoch': 2.0}
{'loss': 10.3235, 'grad_norm': 48.001861572265625, 'learning_rate': 5.4545454545454545e-06, 'epoch': 2.18}
{'loss': 10.0516, 'grad_norm': 47.112327575683594, 'learning_rate': 4.242424242424243e-06, 'epoch': 2.36}
{'loss': 9.5545, 'grad_norm': 28.907733917236328, 'learning_rate': 3.0303030303030305e-06, 'epoch': 2.55}
{'loss': 9.4127, 'grad_norm': 32.1888313293457, 'learning_rate': 1.8181818181818183e-06, 'epoch': 2.73}
{'loss': 9.5473, 'grad_norm': 34.9722785949707, 'learning_rate': 6.060606060606061e-07, 'epoch': 2.91}


  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 7.145495414733887, 'eval_runtime': 59.2965, 'eval_samples_per_second': 3.693, 'eval_steps_per_second': 0.928, 'epoch': 3.0}
{'train_runtime': 2351.3793, 'train_samples_per_second': 0.279, 'train_steps_per_second': 0.07, 'train_loss': 15.110159556070963, 'epoch': 3.0}


TrainOutput(global_step=165, training_loss=15.110159556070963, metrics={'train_runtime': 2351.3793, 'train_samples_per_second': 0.279, 'train_steps_per_second': 0.07, 'total_flos': 122130061590528.0, 'train_loss': 15.110159556070963, 'epoch': 3.0})

In [7]:

# Save the model and tokenizer
trainer.save_model("./flan_t5_small_finetuned")
tokenizer.save_pretrained("./flan_t5_small_finetuned")

('./flan_t5_small_finetuned/tokenizer_config.json',
 './flan_t5_small_finetuned/special_tokens_map.json',
 './flan_t5_small_finetuned/tokenizer.json')

In [24]:
from huggingface_hub import notebook_login
import os

# os.environ['HUGGINGFACE_HUB_TOKEN'] = ''

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

python(8438) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
python(8439) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
python(8442) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks.

In [25]:
trainer.push_to_hub(model_name="flan_t5_small_finetuned_news")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rrrohit/flan_t5_small_finetuned/commit/c6e6f24ca9668272ab179f75a9f6beca7673d13b', commit_message='End of training', commit_description='', oid='c6e6f24ca9668272ab179f75a9f6beca7673d13b', pr_url=None, pr_revision=None, pr_num=None)