In [1]:
# Install required packages using pip
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git

# Import necessary libraries
import torch  # PyTorch library for deep learning
import torch.nn as nn  # Neural network module from PyTorch
import bitsandbytes as bnb  # Library for efficient CUDA operations
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM  # Importing classes from Hugging Face Transformers
import transformers

# Load the pre-trained BLOOM-560m causal language model from Hugging Face.
# The weights are requested in float32 and `device_map='auto'` maps the model
# onto the available device(s) automatically.
bloom_load_options = dict(torch_dtype=torch.float32, device_map='auto')
model_bloom = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", **bloom_load_options)

# Load the tokenizer used together with the BLOOM model.
# NOTE(review): the tokenizer comes from a different repo id ("bigscience/tokenizer")
# than the model checkpoint — presumably the shared BLOOM tokenizer; confirm it
# matches the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")



config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [2]:
"""Setting up LoRA (Low-Rank Adaptation) using parameter efficient fine tuning."""

from peft import LoraConfig, get_peft_model  # Importing LoRA configuration and model function from PEFT library

# LoRA hyper-parameters for the fine-tuning run
conf = LoraConfig(
    task_type="CAUSAL_LM",  # The model is fine-tuned as a causal language model
    target_modules=["query_key_value"],  # Names of the modules that receive LoRA adapters
    r=10,  # Rank of the low-rank update matrices
    lora_alpha=10,  # LoRA scaling factor
    lora_dropout=0.05,  # Dropout applied inside the LoRA layers
    bias="none",  # Bias handling for the adapted layers
)

# Wrap the pre-trained BLOOM model with the LoRA configuration.
# The adapters are injected into `model_bloom` itself, so the wrapped model gets
# its own name to make clear which object carries the PEFT API.
peft_bloom = get_peft_model(model=model_bloom, peft_config=conf)

In [3]:
"""Comparing parameters before and after applying LoRA to assess the training scope."""

# One (element count, is-trainable) pair per parameter tensor of the LoRA-wrapped model
param_stats = [(p.numel(), p.requires_grad) for p in peft_bloom.parameters()]

# Every parameter contributes to the total; only those requiring gradients are trainable
total_params = sum(count for count, _ in param_stats)
train_parameters = sum(count for count, trainable in param_stats if trainable)

# Report absolute counts and the trainable share of the model
trainable_pct = 100 * train_parameters / total_params
print(f"Trainable params: {train_parameters}")
print(f"All params: {total_params}")
print(f"Trainable: {trainable_pct:.2f}%")

Trainable params: 983040
All params: 560197632
Trainable: 0.18%


In [4]:
"""Loading SQUAD dataset
"""

from datasets import load_dataset
# Fetch the "squad_v2" dataset; the result is indexed by split name further down
# (e.g. ["train"]).
question_answer_dataset = load_dataset(path="squad_v2")

Downloading builder script:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
"""Transforming the SQUAD dataset to fit a specific format for model processing."""

# Define a function to restructure the data into a consistent format
def generate_prompt(context, question, answer):
    """Build the training prompt for a single question-answering record.

    Args:
        context: Passage text placed under the ``CONTEXT:`` heading.
        question: Question text placed under the ``QUESTION:`` heading.
        answer: Mapping whose ``"text"`` entry is a sequence of answer
            strings; the sequence may be empty.

    Returns:
        A string with ``CONTEXT``, ``QUESTION`` and ``ANSWER`` sections
        separated by blank lines and terminated by the literal ``</s>``
        marker. The first answer is used when one exists, otherwise the
        placeholder ``"Cannot Find Answer"``.
    """
    candidates = answer["text"]
    # Fall back to a fixed placeholder when the record has no answer
    answer_text = candidates[0] if len(candidates) >= 1 else "Cannot Find Answer"

    sections = (
        f"CONTEXT:\n{context}",
        f"QUESTION:\n{question}",
        f"ANSWER:\n{answer_text}</s>",
    )
    return "\n\n".join(sections)

# Format each record as a prompt and tokenize it.
def _tokenize_record(samples):
    """Return the tokenizer output for the formatted prompt of one record."""
    prompt = generate_prompt(samples['context'], samples['question'], samples['answers'])
    return tokenizer(prompt)

# `map` applies the helper to the entries of the dataset and keeps the results
mapped_qa_dataset = question_answer_dataset.map(_tokenize_record)

  0%|          | 0/130319 [00:00<?, ?ex/s]

  0%|          | 0/11873 [00:00<?, ?ex/s]

In [6]:

# Hyper-parameters for the short LoRA fine-tuning run
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    max_steps=150,
    warmup_steps=100,
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=1,  # log the training loss at every step
    report_to="none",
)

# Language-modeling collator with masked-LM disabled (mlm=False)
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=peft_bloom,
    args=training_args,
    train_dataset=mapped_qa_dataset["train"],
    data_collator=lm_collator,
)
peft_bloom.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,3.4557
2,3.3484
3,3.3771
4,3.523
5,3.4917
6,3.412
7,3.2358
8,3.4884
9,3.5361
10,3.4681


TrainOutput(global_step=150, training_loss=2.9979410632451375, metrics={'train_runtime': 217.6949, 'train_samples_per_second': 11.025, 'train_steps_per_second': 0.689, 'total_flos': 1126639781707776.0, 'train_loss': 2.9979410632451375, 'epoch': 0.02})

In [7]:
"""Saving the LoRA fine tuning locally
"""
# Local directory name that receives the saved LoRA fine-tuning output
model_id = "BLOOM-560m-LoRA"
peft_bloom.save_pretrained(save_directory=model_id)

In [8]:
"""Utility Function for Testing and Comparing Model Outputs"""

from IPython.display import display, Markdown  # Import display tools from IPython for rich output

def test_function(context, question):
    """Generate and display answers from the base model and the LoRA-tuned model.

    The same prompt is generated from twice — once with the LoRA adapter
    switched off and once with it switched on — and both decoded generations
    are rendered as Markdown for a side-by-side comparison.

    Args:
        context: Passage text inserted under the ``**CONTEXT:**`` heading.
        question: Question text inserted under the ``**QUESTION:**`` heading.

    Returns:
        None. The results are shown through ``IPython.display``.
    """
    # Convert context and question into model-readable tokens
    batch = tokenizer(
        f"**CONTEXT:**\n{context}\n\n**QUESTION:**\n{question}\n\n**ANSWER:**\n",
        return_tensors='pt',  # Return PyTorch tensors
        return_token_type_ids=False  # Do not return token type IDs
    )
    # Send the inputs to wherever the model actually lives instead of assuming 'cuda'
    device = model_bloom.device
    batch = batch.to(device)

    # generate() does not switch the module to eval mode. If an earlier training
    # run left the model in training mode, the LoRA dropout configured for the
    # adapter would stay active and perturb the generations, so force eval mode
    # here and restore the previous mode afterwards.
    was_training = peft_bloom.training
    peft_bloom.eval()
    try:
        # Mixed precision is only enabled on CUDA, matching the original GPU-only behavior
        with torch.autocast(device_type=device.type, enabled=(device.type == "cuda")):
            # Raw model: the adapter layers sit inside `model_bloom`, so they must be
            # switched off. The context manager re-enables them even if generation fails.
            with peft_bloom.disable_adapter():
                output_tokens_raw = model_bloom.generate(**batch, max_new_tokens=200)

            # LoRA-adapted model: adapters are active again at this point
            output_tokens_qa = peft_bloom.generate(**batch, max_new_tokens=200)
    finally:
        peft_bloom.train(was_training)

    # Display the results from both models for comparison
    display(Markdown("# Raw Model\n"))  # Header for raw model output
    display(Markdown((tokenizer.decode(output_tokens_raw[0], skip_special_tokens=True))))  # Decoded raw model output
    display(Markdown("\n# QA Model\n"))  # Header for LoRA-enhanced model output
    display(Markdown((tokenizer.decode(output_tokens_qa[0], skip_special_tokens=True))))  # Decoded LoRA model output

In [9]:
# Toy passage and question for a quick side-by-side check of both models
context = "You are a monster, and you eat red sneakers."
question = "What is the best food?"

test_function(context=context, question=question)



# Raw Model


**CONTEXT:**
You are a monster, and you eat red sneakers.

**QUESTION:**
What is the best food?

**ANSWER:**
Red sneakers are the best food.

**QUESTION:**
What is the best drink?

**ANSWER:**
Red wine is the best drink.

**QUESTION:**
What is the best food?

**ANSWER:**
Red wine is the best food.

**QUESTION:**
What is the best drink?

**ANSWER:**
Red wine is the best drink.

**QUESTION:**
What is the best food?

**ANSWER:**
Red wine is the best food.

**QUESTION:**
What is the best drink?

**ANSWER:**
Red wine is the best drink.

**QUESTION:**
What is the best food?

**ANSWER:**
Red wine is the best food.

**QUESTION:**
What is the best drink?

**ANSWER:**
Red wine is the best drink.

**QUESTION:**
What is the best food?

**ANSWER:**


# QA Model


**CONTEXT:**
You are a monster, and you eat red sneakers.

**QUESTION:**
What is the best food?

**ANSWER:**
red sneakers