In [1]:
!pip install -q datasets transformers huggingface_hub torch accelerate scikit-learn

In [2]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

In [38]:
# Load the CSV file into a pandas DataFrame
entire_df = pd.read_csv('/content/cleaned_news_summary.csv')

df, _ = train_test_split(entire_df, test_size=0.3, random_state=2024)

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load the tokenizer and model
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_l

In [39]:
# Preprocess the data: tokenizing
def preprocess_function(examples):
    inputs = tokenizer(examples['text'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['summary'], max_length=200, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply the preprocessing function to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3077 [00:00<?, ? examples/s]

In [40]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./distilbart-cnn-12-6_finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
)

# Data collator for padding and dynamic length inputs
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Replace with a separate validation set if available
    data_collator=data_collator,
    tokenizer=tokenizer,
)




In [41]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7225,0.529425
2,0.5736,0.43529
3,0.5042,0.395657


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_

TrainOutput(global_step=2310, training_loss=0.6694845821950343, metrics={'train_runtime': 1184.8449, 'train_samples_per_second': 7.791, 'train_steps_per_second': 1.95, 'total_flos': 7144388230643712.0, 'train_loss': 0.6694845821950343, 'epoch': 3.0})

In [42]:

# Save the model and tokenizer
trainer.save_model("./distilbart-cnn-12-6_finetuned")
tokenizer.save_pretrained("./distilbart-cnn-12-6_finetuned")

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./distilbart-cnn-12-6_finetuned/tokenizer_config.json',
 './distilbart-cnn-12-6_finetuned/special_tokens_map.json',
 './distilbart-cnn-12-6_finetuned/vocab.json',
 './distilbart-cnn-12-6_finetuned/merges.txt',
 './distilbart-cnn-12-6_finetuned/added_tokens.json',
 './distilbart-cnn-12-6_finetuned/tokenizer.json')

In [52]:
!zip -r /content/distilbart-cnn-12-6_finetuned.zip /content/distilbart-cnn-12-6_finetuned