In [1]:
!pip install torch tensorboard



In [2]:
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes



In [3]:
!pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
!pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

Collecting git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
  Cloning https://github.com/huggingface/trl (to revision a3c5b7178ac4f65569975efadc97db2f3749c65e) to /tmp/pip-req-build-me37sdb1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl /tmp/pip-req-build-me37sdb1
  Running command git rev-parse -q --verify 'sha^a3c5b7178ac4f65569975efadc97db2f3749c65e'
  Running command git fetch -q https://github.com/huggingface/trl a3c5b7178ac4f65569975efadc97db2f3749c65e
  Running command git checkout -q a3c5b7178ac4f65569975efadc97db2f3749c65e
  Resolved https://github.com/huggingface/trl to commit a3c5b7178ac4f65569975efadc97db2f3749c65e
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f
  Cloning https://github.com/hugg

In [4]:
from huggingface_hub import login
 
login(
  token="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", 
  add_to_git_credential=True
)

Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
from datasets import load_dataset, DatasetDict
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments, 
    pipeline, 
    BitsAndBytesConfig,
    HfArgumentParser, 
    logging
)
from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_int8_training
import numpy as np
import evaluate
import copy

In [6]:
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

In [7]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
   load_in_4bit=use_4bit,
   bnb_4bit_quant_type=bnb_4bit_quant_type,
   bnb_4bit_compute_dtype=compute_dtype,
   bnb_4bit_use_double_quant=use_nested_quant,
)

if compute_dtype == torch.float16 and use_4bit:
   major, _ = torch.cuda.get_device_capability()
   if major >= 8:
       print("=" * 80)
       print("Your GPU supports bfloat16: accelerate training with bf16=True")
       print("=" * 80)


In [8]:
# Step 1: Load model and tokenizer
model_name = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name , trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token 
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model = prepare_model_for_int8_training(model)
model.config.use_cache = False
model.config.pretraining_tp = 1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [9]:
# Step 2: Apply LoRA configuration and freeze base model parameters
lora_config = LoraConfig(r=64, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.1)
model = get_peft_model(model, lora_config)


# Freeze the base model parameters except lora layers; only LoRA parameters will be trainable
for name, param in model.named_parameters():
    if "lora_" not in name:
        param.requires_grad = False

# Load QA Datasets
dataset_1 = load_dataset("Amod/mental_health_counseling_conversations")  # Replace with the first QA dataset
dataset_2 = load_dataset("nbertagnolli/counsel-chat") 

Repo card metadata block was not found. Setting CardData to empty.


In [10]:
trainable_params = []
cnt = 0
for name, param in model.named_parameters():
    cnt+=1
    if param.requires_grad:
        trainable_params.append(name)
        # print(f"Parameter {name} will be trained.")

print(f"\nTotal number of parameters: {cnt} , trainable parameters: {len(trainable_params)}")


Total number of parameters: 392 , trainable parameters: 104


In [11]:
# def preprocess_data_mental_health(examples):
#     # Extract the "context" and "response" fields, stripping any extra whitespace
#     contexts = [c.strip() for c in examples["Context"]]
#     responses = [r.strip() for r in examples["Response"]]

#     # Concatenate context and response for text generation
#     inputs = [f"{context}\n{response}" for context, response in zip(contexts, responses)]
    
#     # Tokenize the concatenated input
#     tokenized_inputs = tokenizer(
#         inputs,
#         truncation=True,
#         padding="max_length",  # Adjust as needed
#         max_length=512  # Set to your model's max input length
#     )

#     # Return tokenized inputs with labels (for text generation, labels are the same as input IDs)
#     tokenized_inputs["labels"] = tokenized_inputs["input_ids"]

#     return tokenized_inputs
 


#
def preprocess_data_mental_health(examples):
    # Concatenate Context and Response
    inputs = [context + response for context, response in zip(examples["Context"], examples["Response"])]
    
    # Tokenize the inputs
    model_inputs = tokenizer(
        inputs, 
        max_length=128, 
        truncation=True, 
        padding="max_length"  # or "longest" or "do_not_pad", depending on your needs
    )
    
    # Prepare labels (for causal LM, labels are usually the same as input_ids)
    labels = model_inputs["input_ids"].copy()
    
    # Optionally, you can mask the padding tokens in the labels
    # Replace padding token id with -100 to ignore in loss calculation
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
        for labels_example in labels
    ]
    
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset_1 = dataset_1.map(preprocess_data_mental_health, batched=True)
tokenized_dataset_1.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
dataset = tokenized_dataset_1["train"].train_test_split(test_size=0.2)  # 80% train, 20% validation
tokenized_dataset_1 = DatasetDict({
    "train": dataset["train"],
    "validation": dataset["test"]
})


In [12]:
tokenized_dataset_1

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2809
    })
    validation: Dataset({
        features: ['Context', 'Response', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 703
    })
})

In [13]:
def preprocess_data_counsel_chat(examples):
    # Extract the "context" and "response" fields, stripping any extra whitespace
    contexts = [c.strip() if c is not None else "" for c in examples["questionText"]]
    responses = [r.strip() if r is not None else "" for r in examples["answerText"]]

    # Concatenate context and response for text generation
    inputs = [f"{context}\n{response}" for context, response in zip(contexts, responses)]
    
    # Tokenize the concatenated input
    tokenized_inputs = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",  # Adjust as needed
        max_length=128  # Set to your model's max input length
    )

    # Return tokenized inputs with labels (for text generation, labels are the same as input IDs)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"]

    return tokenized_inputs

# Tokenize datasets with respective preprocessing functions
tokenized_dataset_2 = dataset_2.map(preprocess_data_counsel_chat, batched=True)
tokenized_dataset_2.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
dataset = tokenized_dataset_2["train"].train_test_split(test_size=0.2)  # 80% train, 20% validation
tokenized_dataset_2 = DatasetDict({
    "train": dataset["train"],
    "validation": dataset["test"]
})

In [14]:
tokenized_dataset_2

DatasetDict({
    train: Dataset({
        features: ['questionID', 'questionTitle', 'questionText', 'questionLink', 'topic', 'therapistInfo', 'therapistURL', 'answerText', 'upvotes', 'views', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2220
    })
    validation: Dataset({
        features: ['questionID', 'questionTitle', 'questionText', 'questionLink', 'topic', 'therapistInfo', 'therapistURL', 'answerText', 'upvotes', 'views', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 555
    })
})

In [15]:
# Load Metrics
metric_em = evaluate.load("exact_match")
metric_f1 = evaluate.load("f1")

def compute_metrics(predictions, references):
    preds = np.argmax(predictions, axis=-1)
    em_score = metric_em.compute(predictions=preds, references=references)
    f1_score = metric_f1.compute(predictions=preds, references=references)
    return {"exact_match": em_score, "f1": f1_score}

In [19]:
class EWC:
    def __init__(self, model, dataset, fisher_multiplier=0.5):
        self.model = model
        self.fisher_multiplier = fisher_multiplier
        # Save only parameters that require gradients
        self.params = {n: p.clone() for n, p in model.named_parameters() if p.requires_grad}
        self.fisher = self.compute_fisher(dataset)

    def compute_fisher(self, dataset):
        fisher = {n: torch.zeros_like(p) for n, p in self.model.named_parameters() if p.requires_grad}
        device = next(self.model.parameters()).device  # Get model device
        dataloader = DataLoader(dataset, batch_size=4)  # Adjust batch size as needed

        for batch in dataloader:
            self.model.zero_grad()

            # Move tensors to the device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            loss.backward()

            for n, p in self.model.named_parameters():
                if p.requires_grad and p.grad is not None:
                    fisher[n] += p.grad.data ** 2  # Use .data to avoid issues with autograd

        # Normalize the Fisher information by the number of samples
        num_batches = len(dataloader)
        for n in fisher:
            fisher[n] /= num_batches
        return fisher

    def penalty(self, model):
        loss = 0.0
        for n, p in model.named_parameters():
            if p.requires_grad:
                loss += (self.fisher_multiplier * self.fisher[n] * (p - self.params[n]) ** 2).sum()
        return loss
        

# Training Loop with EWC
class EWCTrainer(Trainer):
    def __init__(self, *args, ewc=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.ewc = ewc

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        loss = outputs.loss
        if self.ewc:
            loss += self.ewc.penalty(model)
        return (loss, outputs) if return_outputs else loss

In [17]:
# Fine-Tune on Dataset 1
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-4,
    per_device_train_batch_size = 2,
    # per_device_eval_batch_size = 2,
    gradient_checkpointing = True,
    max_grad_norm = 0.3,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    fp16=False,
    bf16 = False,
    num_train_epochs=1,
    weight_decay=0.001,
    lr_scheduler_type = "constant",
    max_steps = -1,
    warmup_ratio = 0.03,
    group_by_length = True,
    save_steps = 25,
    logging_steps = 25,

)

trainer_1 = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_1["train"],
    # eval_dataset=tokenized_dataset_1["validation"],
    compute_metrics=compute_metrics,
)

trainer_1.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdeeps2657[0m ([33msaideep[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112638622216764, max=1.0…

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
25,2.7833
50,2.5513
75,2.4569
100,2.3781
125,2.5266
150,2.4334
175,2.3311
200,2.3564
225,2.4315
250,2.3731


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

TrainOutput(global_step=1405, training_loss=2.2374893907974625, metrics={'train_runtime': 1254.6654, 'train_samples_per_second': 2.239, 'train_steps_per_second': 1.12, 'total_flos': 4395086145847296.0, 'train_loss': 2.2374893907974625, 'epoch': 1.0})

In [20]:
# Save EWC Fisher information after fine-tuning on Dataset 1
ewc = EWC(model, tokenized_dataset_1["train"])

# Fine-Tune on Dataset 2 with EWC applied
training_args_2 = TrainingArguments(
    output_dir="./results_with_ewc",
    evaluation_strategy="no",
    learning_rate=2e-4,
    per_device_train_batch_size = 4,
    # per_device_eval_batch_size = 4,
    gradient_checkpointing = True,
    max_grad_norm = 0.3,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    fp16=False,
    bf16 = False,
    num_train_epochs=1,
    weight_decay=0.001,
    lr_scheduler_type = "constant",
    max_steps = -1,
    warmup_ratio = 0.03,
    group_by_length = True,
    save_steps = 25,
    logging_steps = 25,
)

trainer_2 = EWCTrainer(
    model=model,
    args=training_args_2,
    train_dataset=tokenized_dataset_2["train"],
    # eval_dataset=tokenized_dataset_2["validation"],
    ewc=ewc,
    compute_metrics=compute_metrics,
)

trainer_2.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
25,2.0153
50,1.947
75,1.8081
100,1.8426
125,1.927
150,1.7701
175,1.8407
200,1.6988
225,1.861
250,1.7755


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

TrainOutput(global_step=555, training_loss=1.837767592421523, metrics={'train_runtime': 788.1726, 'train_samples_per_second': 2.817, 'train_steps_per_second': 0.704, 'total_flos': 3473510588743680.0, 'train_loss': 1.837767592421523, 'epoch': 1.0})

In [21]:
model.save_pretrained("./fine_tuned_lora_model")
tokenizer.save_pretrained("./fine_tuned_lora_model")

('./fine_tuned_lora_model/tokenizer_config.json',
 './fine_tuned_lora_model/special_tokens_map.json',
 './fine_tuned_lora_model/tokenizer.model',
 './fine_tuned_lora_model/added_tokens.json',
 './fine_tuned_lora_model/tokenizer.json')

In [22]:
# Push the LoRA model to Hugging Face Hub
model.push_to_hub("Saideep14/test-qa-lora")
tokenizer.push_to_hub("Saideep14/test-qa-lora")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/51.1M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Saideep14/test-qa-lora/commit/471e98b0ae11e36bdf034f1ee4a82d88313170eb', commit_message='Upload tokenizer', commit_description='', oid='471e98b0ae11e36bdf034f1ee4a82d88313170eb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Saideep14/test-qa-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='Saideep14/test-qa-lora'), pr_revision=None, pr_num=None)

## Testing