### **Parameter-Efficient Fine-Tuning (PEFT) method**



[PEFT](https://github.com/huggingface/peft)


### **LoRA**


### Setup Steps:

### Install required dependencies

In [1]:
!pip -q install transformers datasets evaluate transformers[torch]
!pip install rouge_score
!pip install -U accelerate
!pip install bitsandbytes

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Import required packages

In [4]:
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer

from peft import LoraConfig, get_peft_model, TaskType
from peft import PeftModel, PeftConfig

import warnings
warnings.filterwarnings('ignore')

### **Load Dataset**

In [5]:
from datasets import load_dataset

billsum = load_dataset("billsum", split='train')

README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [6]:
billsum

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 18949
})

In [7]:
billsum = billsum.train_test_split(test_size=0.2)

In [8]:
billsum["train"][0]

{'text': "SECTION 1. SHORT TITLE.\n\n    This Act may be cited as the ``Identity Theft Prevention Act''.\n\nSEC. 2. FINDINGS.\n\n    Congress finds that--\n            (1) the crime of identity theft has become one of the major \n        law enforcement challenges of the new economy, as vast \n        quantities of sensitive, personal information are now \n        vulnerable to criminal interception and misuse;\n            (2) in November 2002, Americans were alerted to the dangers \n        of identity theft when Federal prosecutors announced that 3 \n        individuals had allegedly sold the credit and personal \n        information of 30,000 people, the largest single identity theft \n        case in United States history;\n            (3) hundreds of thousands of Americans are victims of \n        identity theft each year, resulting in an annual cost to \n        industry of more than $3,500,000,000.\n            (4) several indicators reveal that despite increased public \n     

### **Load Model & Tokenizer**

In [9]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [10]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/15159 [00:00<?, ? examples/s]

Map:   0%|          | 0/3790 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [13]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [14]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [15]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Huggingface login

In [16]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `Peft_Lora` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Peft_Lor

#### optional: using wandb to store weights abd biases

# Model Training

In [17]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=64,                        # Increase rank
    lora_alpha=64,               # Increase LoRA scaling
    lora_dropout=0.01,           # Adjust dropout
    bias="all",                  # Include bias parameters
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"]    # Remove unsupported "k"
)

# Apply LoRA to the model
lora_model = get_peft_model(model, lora_config)

training_args = Seq2SeqTrainingArguments(
    output_dir="lora_31",
    eval_strategy="epoch",
    auto_find_batch_size=True,
    learning_rate=2e-4,
    per_device_train_batch_size=16,  # Adjusted batch size
    per_device_eval_batch_size=16,   # Adjusted batch size
    gradient_accumulation_steps = 1,
    # optim = "paged_adamw_32bit",
    save_steps = 0,
    weight_decay=0.01,
    logging_steps = 25,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,
    bf16=True,
    push_to_hub=True,
    max_grad_norm = 0.3,
    max_steps = -1,
    warmup_ratio = 0.03,
    group_by_length = True,
    lr_scheduler_type = "cosine"
)


trainer = Seq2SeqTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.9389,1.718094,0.2344,0.1843,0.2252,0.2252,19.0
2,1.8784,1.651986,0.2381,0.1884,0.2297,0.2297,19.0
3,1.8503,1.647177,0.24,0.1904,0.2315,0.2315,18.9997


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=2844, training_loss=1.963274319966634, metrics={'train_runtime': 6163.0341, 'train_samples_per_second': 7.379, 'train_steps_per_second': 0.461, 'total_flos': 1.2942623378276352e+16, 'train_loss': 1.963274319966634, 'epoch': 3.0})

#### sample input

In [23]:
text = """Deforestation is one of the most pressing environmental issues of our time, dramatically reshaping ecosystems, impacting biodiversity, contributing to climate change, and threatening the livelihoods of countless communities worldwide. The removal of large swathes of forests has both direct and indirect repercussions on our planet. Forests cover about 31% of the earth's land area and are home to more than 80% of terrestrial species of animals, plants, and insects. They play a critical role in absorbing carbon dioxide, a key contributor to climate change, and are essential for maintaining ecological balance. However, every year, millions of hectares of forests are cut down, and this loss of tree cover has intensified due to human activities and economic pressures.

One of the primary drivers of deforestation is agricultural expansion. In tropical regions, particularly in countries like Brazil, Indonesia, and parts of Africa, vast tracts of forests are cleared to make way for large-scale agriculture, especially for soy, palm oil, and cattle ranching. Palm oil, found in many household products, has been a significant factor in deforestation in Southeast Asia, with palm plantations replacing ancient rainforests. Similarly, cattle ranching, a primary driver in the Amazon rainforest, leads to significant carbon emissions as forests are burned to clear land. When these forests are destroyed, they release stored carbon back into the atmosphere, further intensifying climate change. Additionally, as forests shrink, the species that depend on them are forced to relocate, adapt, or face extinction. This loss of biodiversity affects ecosystems and disrupts food chains, impacting both wildlife and human populations.

The timber industry also plays a substantial role in deforestation, as forests are cut down to meet the demand for wood, paper, and other products. Illegal logging is especially problematic in tropical regions where enforcement is weak, and the demand for timber is high. Despite international efforts to regulate logging practices, many companies continue to exploit these forests due to lucrative profits and weak local governance. Often, local communities bear the brunt of these activities, losing their homes and resources, and facing environmental degradation. In many cases, these communities rely on the forests for their livelihoods, and deforestation disrupts their traditional way of life. When forests disappear, these communities face greater challenges in accessing clean water, fertile soil, and other essential resources.

Infrastructure development, such as building roads, mining, and urban expansion, also accelerates deforestation. Governments and corporations frequently push for these developments to boost economic growth. However, this leads to increased access to previously remote areas, which often initiates a chain reaction of deforestation as settlers move in and forests are cleared for agriculture and settlements. Mining activities, in particular, have led to devastating deforestation in areas rich in minerals. The Amazon rainforest, for example, has experienced extensive deforestation due to gold mining, which not only destroys forest land but also contaminates rivers with toxic chemicals, affecting aquatic life and human health.

Climate change is both a consequence and a driver of deforestation. Rising temperatures, prolonged droughts, and other extreme weather patterns, which are consequences of climate change, put immense pressure on forests. Forests are vital carbon sinks, absorbing significant amounts of CO₂. However, as they are destroyed, their capacity to absorb carbon is lost, and the carbon stored in trees is released back into the atmosphere, exacerbating global warming. Deforestation accounts for approximately 10-15% of global greenhouse gas emissions, a significant contributor to the ongoing climate crisis. Forest fires, often set deliberately to clear land, become uncontrollable in drier, hotter climates, destroying vast areas of forest cover and releasing even more carbon dioxide.

The impact of deforestation is extensive and multifaceted. The loss of forest cover leads to soil erosion, reducing the soil's ability to retain water and nutrients, which is essential for agriculture. As a result, lands become less productive, leading to lower crop yields and increased food insecurity. Deforestation also impacts the water cycle by reducing the amount of water released into the atmosphere through transpiration. This reduction leads to decreased rainfall and can contribute to desertification in some areas, exacerbating droughts and affecting water supplies. Moreover, as forests disappear, so do the services they provide, including water filtration, flood control, and habitat for pollinators that are essential for agriculture.

Another critical aspect of deforestation is its impact on indigenous communities and their cultural heritage. Many indigenous peoples live in and depend on forests for their way of life. They have a deep spiritual connection to the land and have used sustainable practices to live in harmony with nature for centuries. When forests are destroyed, these communities are often displaced, losing not only their homes but also their identity and way of life. In some cases, they may be forced to relocate to urban areas where they face significant social and economic challenges, often living in poverty and struggling to integrate into new environments.

International organizations, governments, and non-governmental organizations have made efforts to address deforestation through various policies and initiatives. Sustainable forestry practices, reforestation projects, and policies that incentivize conservation over exploitation are being promoted worldwide. For instance, the United Nations’ REDD+ (Reducing Emissions from Deforestation and Forest Degradation) program encourages developing countries to reduce emissions from deforestation and degradation while promoting conservation, sustainable management of forests, and enhancement of forest carbon stocks. Countries like Norway and Germany have funded REDD+ projects, which help protect forests and support local communities. Additionally, certifications like the Forest Stewardship Council (FSC) ensure that timber and forest products come from responsibly managed forests."""

In [24]:
lora_model.print_trainable_parameters()

trainable params: 2,359,808 || all params: 62,865,920 || trainable%: 3.7537


#### Finetuned model summarization

In [25]:
from transformers import pipeline

summarizer = pipeline("summarization", model= lora_model, tokenizer= tokenizer)
summarizer(text)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'PeftModelForSeq2SeqLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForCon

[{'summary_text': 'and forest degradation) program encourages developing countries to reduce emissions from deforestation and degradation while promoting conservation, sustainable management of forests, and enhancement of forest carbon stocks. The United Nations’ REDD+ (Reducing Emissions from Deforement and Forest Degradation), a program that encourages sustainable forestry practices, reforestry projects, and policies that incentivize conservation over exploitation are being promoted worldwide.'}]

#### Original model summarization

In [27]:
from transformers import pipeline

summarizer = pipeline("summarization", model="google-t5/t5-small")
summarizer(text)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (1250 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': "deforestation is one of the most pressing environmental issues of our time . forests cover about 31% of the earth's land area and are home to more than 80% of terrestrial species of animals, plants, and insects . the loss of tree cover has intensified due to human activities and economic pressures ."}]