In [1]:
!pip install transformers datasets peft torch pandas




In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Load the dataset from CSV
data = pd.read_csv('/content/drive/MyDrive/Summarization-dataset/tourism_articles.csv')

# Ensure the dataset has 'article' and 'summary' columns
assert 'article' in data.columns and 'summary' in data.columns

# Drop the 'id' and 'title' columns if they exist
columns_to_drop = ['id', 'title']
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns])

# Split the dataset into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert the pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Combine them into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Load the tokenizer
model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing function
def preprocess_function(examples):
    inputs = examples['article']
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(targets, max_length=84, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing function to the datasets
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Remove the original columns to avoid confusion
tokenized_datasets = tokenized_datasets.remove_columns(['article', 'summary'])

# Set the format for PyTorch
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Save the tokenized dataset if needed
tokenized_datasets.save_to_disk('./tokenized_dataset')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/128 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32 [00:00<?, ? examples/s]

In [3]:
print(dataset['train'][0])
print(dataset['test'][0])

{'article': 'ÿ≥ÿßÿÆÿ™ŸÖÿßŸÜ ŸÖŸàÿ±ÿß⁄©ÿß ÿØÿ± ÿ≥ŸÜ⁄ØÿßŸæŸàÿ± ÿ≥ÿßÿÆÿ™Ÿá\u200c ÿ¥ÿØŸá ÿßÿ≥ÿ™ Ÿà ÿ™Ÿàÿ≥ÿ∑ ŸÖŸáŸÜÿØÿ≥€åŸÜ ŸÜ€åŸàÿ≤ŸÑŸÜÿØ€å Ÿà ⁄©ÿßÿ±ÿ¥ŸÜÿßÿ≥ÿßŸÜ ŸÅŸÜ\u200cÿ¢Ÿàÿ±€å ÿ¢⁄©Ÿàÿßÿ±€åŸàŸÖ ÿ¥ÿ±⁄©ÿ™ ÿßŸÖ ÿ¨€å ŸÖŸàÿ±ŸÅ€å ÿ∑ÿ±ÿßÿ≠€å\u200c ÿ¥ÿØŸá ÿßÿ≥ÿ™ Ÿà ÿØÿ±ŸÜŸáÿß€åÿ™ ÿ®Ÿá ÿ¢ÿ®\u200cŸáÿß€å ⁄Øÿ±ŸÖ ŸÖÿßŸÑÿØ€åŸà ÿßŸÜÿ™ŸÇÿßŸÑ Ÿæ€åÿØÿß ⁄©ÿ±ÿØŸá ÿßÿ≥ÿ™. ÿØÿ± ÿß€åŸÜ ÿ≥ÿßÿ≤Ÿá €∂ ÿ™ŸÜ€å Ÿà ÿØŸàÿ∑ÿ®ŸÇŸáÿå ÿ∑ÿ®ŸÇŸá ŸÅŸàŸÇÿßŸÜ€å ÿ®Ÿá ÿßÿ™ÿßŸÇ ŸÜÿ¥€åŸÖŸÜÿå ÿ¢ÿ¥Ÿæÿ≤ÿÆÿßŸÜŸáÿå ŸÜÿßŸáÿßÿ±ÿÆŸàÿ±€åÿå ÿ®ÿßÿ¥⁄ØÿßŸá Ÿà ÿßÿ≥ÿ™ÿÆÿ± ÿßÿÆÿ™ÿµÿßÿµ ÿØÿßÿ±ÿØ Ÿà ÿßÿ≤ ÿ∑ÿ±€åŸÇ €å⁄© \u200cÿ±ÿßŸá\u200cŸæŸÑŸá ŸÖÿßÿ±Ÿæ€å⁄Ü€å ÿ®Ÿá ÿßÿ™ÿßŸÇ\u200cÿÆŸàÿßÿ® ÿ∑ÿ®ŸÇŸá ÿ≤€åÿ±€åŸÜ ÿØÿ≥ÿ™ÿ±ÿ≥€å Ÿæ€åÿØÿß ŸÖ€å\u200c⁄©ŸÜÿØ. ŸÇÿ≥ŸÖÿ™ ÿ≤€åÿ± ÿØÿ±€åÿß€å ÿß€åŸÜ ÿßŸÇÿßŸÖÿ™⁄ØÿßŸá ŸÜ€åÿ≤ (€å⁄©€å ÿßÿ≤ ÿßÿ™ÿßŸÇ\u200cÿÆŸàÿßÿ®\u200cŸáÿß) ÿØÿ± ÿπŸÖŸÇ €µ ŸÖÿ™ÿ±€å ÿ≤€åÿ± ÿ¢ÿ® ŸÇÿ±ÿßÿ±⁄Øÿ±ŸÅÿ™Ÿá ÿßÿ≥ÿ™. \n ÿ¥ÿ±⁄©ÿ™ ÿ∑ÿ±ÿßÿ≠€å €åŸàÿ¨€å €åÿßŸÖÿßÿ≤ÿß⁄©€å ⁄©Ÿá ÿØÿ± ŸÜ€åŸà€åŸàÿ±⁄© ÿ¢ŸÖÿ±€å⁄©ÿß ŸÖÿ≥ÿ™ŸÇÿ± ÿßÿ≥ÿ™ ÿ∑ÿ±ÿßÿ≠€å ÿØÿßÿÆŸÑ€å ŸÖŸàÿ±ÿß⁄©ÿß ÿ

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset, load_metric
import torch

# Load the model and tokenizer
model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# Define the LoRA configuration
lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    task_type=TaskType.SEQ_2_SEQ_LM,
    lora_dropout=0.1,
    bias='none',
)

# Apply the LoRA configuration to the model
model = get_peft_model(model, lora_config)


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [5]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()




Epoch,Training Loss,Validation Loss
1,No log,2.662491
2,No log,2.661693
3,No log,2.661358


TrainOutput(global_step=96, training_loss=2.5153770446777344, metrics={'train_runtime': 3080.7219, 'train_samples_per_second': 0.125, 'train_steps_per_second': 0.031, 'total_flos': 460955553103872.0, 'train_loss': 2.5153770446777344, 'epoch': 3.0})

In [8]:
model.save_pretrained("/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1")
tokenizer.save_pretrained("/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1")




('/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1/tokenizer_config.json',
 '/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1/special_tokens_map.json',
 '/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1/spiece.model',
 '/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1/added_tokens.json',
 '/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1/tokenizer.json')

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/fine-tuning-project/task1/fine-tuned-task1")




In [7]:
# Define a sample input
sample_article = """ÿ®Ÿá ⁄Øÿ≤ÿßÿ±ÿ¥ ÿ±Ÿàÿßÿ®ÿ∑ ÿπŸÖŸàŸÖ€å ÿµÿ®ÿßÿß€åÿØŸáÿå ÿß€åŸÜ ÿ®ÿ±ŸÜÿØ ŸÅÿπÿßŸÑ ÿØÿ± ÿ≠Ÿàÿ≤Ÿá Ÿà€åÿØÿ¶Ÿàÿå ÿßÿ≥ÿ™ÿ±€åŸÖ Ÿà ÿ™ÿ®ŸÑ€åÿ∫ÿßÿ™ ÿ¢ŸÜŸÑÿß€åŸÜ ⁄©Ÿá ŸæŸÑÿ™ŸÅÿ±ŸÖŸáÿß€å€å ⁄ÜŸàŸÜ ŸÅ€åŸÑ€åŸÖŸàÿå ÿ¢Ÿæÿßÿ±ÿßÿ™ Ÿà ÿ¢⁄òÿßŸÜÿ≥ ÿ™ÿ®ŸÑ€åÿ∫ÿßÿ™ ÿ¢ŸÜŸÑÿß€åŸÜ ÿµÿ®ÿßŸà€å⁄òŸÜ ÿ±ÿß ÿ≤€åÿ± ŸÖÿ¨ŸÖŸàÿπŸá ÿÆŸàÿØ ÿØÿßÿ±ÿØÿå ÿ®ÿß ÿ≥ÿ±ŸÖÿß€åŸá ⁄Øÿ∞ÿßÿ±€å ÿØÿ± €å⁄©€å ÿßÿ≤ ÿØŸà ÿßÿ≥ÿ™ÿßÿ±ÿ™ÿßŸæ ÿ®ÿ≤ÿ±⁄Ø ÿ≠Ÿàÿ≤Ÿá ⁄Øÿ±ÿØÿ¥⁄Øÿ±€åÿå ÿ®Ÿá ÿ∑Ÿàÿ± ÿ±ÿ≥ŸÖ€å Ÿàÿßÿ±ÿØ ÿ®ÿßÿ≤ÿßÿ± ŸÅÿ±Ÿàÿ¥ Ÿà ÿ±ÿ≤ÿ±Ÿàÿßÿ≥€åŸàŸÜ ÿ®ŸÑ€åÿ∑ ŸáŸàÿßŸæ€åŸÖÿß Ÿà ŸÇÿ∑ÿßÿ± ÿ¥ÿØ.
 ÿ∏ÿ±ŸÅ€åÿ™‚ÄåŸáÿß€å ŸÖŸàÿ¨ŸàÿØ ÿØÿ± ÿ®ÿßÿ≤ÿßÿ± ⁄Øÿ±ÿØÿ¥⁄Øÿ±€å ÿ¢ŸÜŸÑÿß€åŸÜ ÿß€åÿ±ÿßŸÜ Ÿà ŸÅÿßÿµŸÑŸá ÿ≤€åÿßÿØ€å ⁄©Ÿá ÿß€åŸÜ ÿ≠Ÿàÿ≤Ÿá ÿßÿ≤ ÿ®ÿßÿ≤ÿßÿ± ÿ¢ŸÅŸÑÿß€åŸÜ ÿØÿßÿ±ÿØÿå ÿ±ÿ¥ÿØ ŸÇÿßÿ®ŸÑ‚Äåÿ™Ÿàÿ¨Ÿá Ÿà ÿßÿ±⁄ØÿßŸÜ€å⁄© ŸÅŸÑÿß€åÿ™€åŸà ÿßÿ≤ ÿßÿ®ÿ™ÿØÿß€å ŸÅÿπÿßŸÑ€åÿ™ ÿ¢ŸÜ Ÿà Ÿà€å⁄ò⁄Ø€å‚ÄåŸáÿß€å ŸÖÿ¥ÿ™ÿ±⁄© ÿ™€åŸÖ‚ÄåŸáÿß€å ŸÖÿØ€åÿ±€åÿ™€å ÿØŸà ÿ®ÿ±ŸÜÿØ ⁄©Ÿá ÿ¢ŸÜ‚ÄåŸáÿß ÿ±ÿß ÿ®ŸáŸÖ ŸÜÿ≤ÿØ€å⁄©‚Äåÿ™ÿ± ŸÖ€å‚Äå⁄©ŸÜÿØ ÿßÿ≤ ŸÖŸáŸÖÿ™ÿ±€åŸÜ ÿπŸÑÿ™‚ÄåŸáÿß€å ÿ¨ÿØ€åÿØÿ™ÿ±€åŸÜ ÿ≥ÿ±ŸÖÿß€åŸá‚Äå⁄Øÿ∞ÿßÿ±€å ÿµÿ®ÿßÿß€åÿØŸá ÿßÿ≥ÿ™.
 ÿß€åŸÜ ÿ≥ÿ±ŸÖÿß€åŸá‚Äå⁄Øÿ∞ÿßÿ±€å ⁄©Ÿá ÿ®ÿß ÿÆÿ±€åÿØ €≤€∞ ÿØÿ±ÿµÿØ ÿ≥ŸáÿßŸÖ ŸÅŸÑÿß€åÿ™€åŸà ÿ±ŸÇŸÖ ÿÆŸàÿ±ÿØŸá Ÿæ€åÿ±Ÿà ÿ™ÿ¨ÿ±ÿ®Ÿá Ÿà ÿØÿßŸÜÿ¥ ŸÅŸÑÿß€åÿ™€åŸà ÿØÿ± ÿµŸÜÿπÿ™ ⁄Øÿ±ÿØÿ¥⁄Øÿ±€å Ÿà ÿØÿ± ⁄©ŸÜÿßÿ± ÿ∏ÿ±ŸÅ€åÿ™‚ÄåŸáÿß Ÿà ÿ≠ŸÖÿß€åÿ™‚ÄåŸáÿß€å ⁄Øÿ≥ÿ™ÿ±ÿØŸá ÿ™ÿ®ŸÑ€åÿ∫ÿßÿ™€å ÿµÿ®ÿßÿß€åÿØŸáÿå ŸÖÿ≥€åÿ± ÿ™ÿßÿ≤Ÿá‚Äåÿß€å ÿ±ÿß ÿØÿ± ÿµŸÜÿπÿ™ ⁄Øÿ±ÿØÿ¥⁄Øÿ±€å ÿ¢ŸÜŸÑÿß€åŸÜ ÿß€åÿ±ÿßŸÜ ÿ±ŸÇŸÖ ÿÆŸàÿßŸáÿØ ÿ≤ÿØ.
"""

# Preprocess the input
input_ids = tokenizer(
    [sample_article],
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512
)["input_ids"]

# Generate the summary
output_ids = model.generate(
    input_ids=input_ids,
    max_length=84,
    no_repeat_ngram_size=2,
    num_beams=4
)[0]

# Decode the summary
summary = tokenizer.decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

print(summary)

ÿµÿ®ÿßÿß€åÿØŸáÿå €å⁄©€å ÿßÿ≤ ÿ®ÿ≤ÿ±⁄Øÿ™ÿ±€åŸÜ ÿ¥ÿ±⁄©ÿ™ Ÿáÿß€å ⁄Øÿ±ÿØÿ¥⁄Øÿ±€å ÿ¢ŸÜŸÑÿß€åŸÜ ÿß€åÿ±ÿßŸÜÿå ÿ®ÿß ÿ≥ÿ±ŸÖÿß€åŸá ⁄Øÿ∞ÿßÿ±€å ÿ®€åÿ¥ ÿßÿ≤ €≤€∞ ÿØÿ±ÿµÿØ ÿ≥ŸáÿßŸÖ ÿÆŸàÿØ ÿØÿ± ÿ®ÿßÿ≤ÿßÿ± ŸÅÿ±Ÿàÿ¥ Ÿà ÿ±ÿ≤ÿ±Ÿàÿßÿ≥€åŸàŸÜ ÿ®ŸÑ€åÿ∑ ŸáŸàÿßŸæ€åŸÖÿß Ÿà ŸÇÿ∑ÿßÿ± Ÿàÿßÿ±ÿØ ÿ®ÿßÿ≤ÿßÿ± ÿ¥ÿØ.
