In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip install datasets
!pip install transformers accelerate bitsandbytes>0.37.0 trl peft



### Load model

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# TODO: Load the model using the appropriate parameters using AutoModelForCausalLM
# Ensure torch_dtype is set to torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
model.generation_config.temperature = None
model.generation_config.top_p = None

# TODO: Initialize the tokenizer using AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Load dataset

In [5]:
from datasets import load_dataset, Dataset

train_dataset = load_dataset("nyu-mll/multi_nli", split="train")
valid_dataset = load_dataset("nyu-mll/multi_nli", split="validation_matched")
test_dataset  = load_dataset("nyu-mll/multi_nli", split="validation_mismatched")
train_dataset

### Data preprocessing & Generate prompt

In [4]:
def apply_esnli_prompt(premise, hypothesis):
    # TODO:
    # Write a prompt for esnli dataset using given premise and hypothesis
    # so that the model classifies the input as entailment, neutral, or contradiction

    # prompt 1
    content = f"""Premise:
    '{premise}'
    Based on this premise, can we conclude the hypothesis '{hypothesis}' is true?
    OPTIONS:
    - entailment
    - neutral
    - contradiction"""

    # content = f""" Premise: '{premise}'
    # Based on the premise, determining whether the hypothesis '{hypothesis}' is true (entailment), false (contradiction), or undetermined (neutral)?

    # OPTIONS:
    # - entailment
    # - neutral
    # - contradiction
    # """

    # content = f"""Premise:
    # '{premise}'
    # Based on this premise, can we conclude the hypothesis '{hypothesis}' is true?
    # OPTIONS:
    # - entailment
    # - neural
    # - contradiction"""


    prompt = [
        {"role": "system", "content": "You are a helpful assistant who respond entailment, neutral, or contradiction. You should only respond one word."},
        {"role": "user", "content": content},
    ]

    # return prompt.strip()
    return prompt

In [6]:
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
column_name = train_dataset.column_names

train_dataset = train_dataset.shuffle(seed=8)
valid_dataset = valid_dataset.shuffle(seed=8)
test_dataset = test_dataset.shuffle(seed=8)

select_train_dataset = train_dataset.select(range(500))
select_valid_dataset = valid_dataset.select(range(200))

select_test_mm_dataset = test_dataset.select(range(50))
select_test_m_dataset = valid_dataset.select(range(50))

In [7]:
def process_row(example):
    return {
        "premise": example["premise"],
        "hypothesis": example["hypothesis"],
        "prompt": apply_esnli_prompt(example["premise"], example["hypothesis"]),  # Add new column
        "gt_label": label_map[example["label"]],
    }

prompt_train_dataset = select_train_dataset.map(process_row, remove_columns=column_name)
prompt_valid_dataset = select_valid_dataset.map(process_row, remove_columns=column_name)
prompt_test_mm_dataset = select_test_mm_dataset.map(process_row, remove_columns=column_name)
prompt_test_m_dataset = select_test_m_dataset.map(process_row, remove_columns=column_name)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

### Fine-tuning (lora)

In [11]:
from peft import LoraConfig
import argparse
import torch
import datasets
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments, HfArgumentParser, AutoTokenizer, TrainerCallback
from huggingface_hub import login
import matplotlib.pyplot as plt

In [8]:
def create_lora_config():
    peft_config = LoraConfig(
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj",
        ],
        # TODO: Set r=32 and alpha=16
        r=32,
        lora_alpha=16,
    )
    return peft_config


# Defining the training arguments. These control various aspects of training such as learning rate,
# batch size, number of epochs, evaluation strategy, etc.
training_args = TrainingArguments(
    report_to="none",
    learning_rate=5e-5, #8e-5
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=10,
    num_train_epochs=3, #5
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    output_dir="output_model",
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=False,
    logging_steps=1,
    seed=0,
    do_train=True,
    do_eval=True,
    do_predict=False,
    eval_strategy="epoch",
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    push_to_hub=False,
    hub_private_repo=True,
)

# Create a LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning.
# This reduces the number of parameters to train and makes the model lighter and faster.
peft_config = create_lora_config()

# This function formats the input dataset according to a specific template
# expected by the model, turning prompts and completions into a chat-based format.
def instructions_formatting_function(tokenizer: AutoTokenizer):
    def format_dataset(examples):
        if isinstance(examples["prompt"], list):
            output_texts = []
            for i in range(len(examples["prompt"])):
                converted_sample = [
                    # {"role": "system", "content": "Calculate the answer for the following problem and provide only the numerical result."},
                    {"role": "user", "content": examples["prompt"][i]},
                    {"role": "assistant", "content": examples["gt_label"][i]},
                ]
                output_texts.append(tokenizer.apply_chat_template(converted_sample, tokenize=False))
            output_texts = [text.replace("<s>", "").replace("<|begin_of_text|>", "").replace("\n\n", "") for text in output_texts]
            return output_texts
        else:
            converted_sample = [
                # {"role": "system", "content": "Calculate the answer for the following problem and provide only the numerical result."},
                {"role": "user", "content": examples["prompt"]},
                {"role": "assistant", "content": examples["gt_label"]},
            ]
            return tokenizer.apply_chat_template(converted_sample, tokenize=False)

    return format_dataset


tokenizer = AutoTokenizer.from_pretrained(model_id)
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token

# Initialize the data collator, which is responsible for completion tasks.
# It ensures that all the tokens of the labels are set to an 'ignore_index'
# when they do not come from the assistant. This ensure that the loss is only
# calculated on the completion made by the assistant.
response_template = "<|start_header_id|>assistant<|end_header_id|>"
print("response_template:", response_template)
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Initialize the trainer with all the specified configurations, datasets, and formatting functions.
trainer = SFTTrainer(
    model_id,
    args=training_args,
    train_dataset = prompt_train_dataset,
    eval_dataset = prompt_valid_dataset,
    packing=False,
    model_init_kwargs={
        "torch_dtype": torch.bfloat16,
    },
    tokenizer=tokenizer,
    max_seq_length=500,
    peft_config=peft_config,
    formatting_func=instructions_formatting_function(tokenizer),
    data_collator=collator,
)

trainer.train()

# Save the PEFT model weights
output_dir = "output_peft_model"
trainer.save_model(output_dir)

print(f"PEFT model weights saved to {output_dir}")

response_template: <|start_header_id|>assistant<|end_header_id|>



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,0.5106,0.529104
2,0.3245,0.331339
3,0.3241,0.322769


PEFT model weights saved to output_peft_model


### Evaluation

In [9]:
messages = [
    {"role": "system", "content": "You are a chatbot who responds very shortly."},
    {"role": "user", "content": "When was UCLA founded?"},
]

def run_model(model, tokenizer, messages, max_new_tokens=50, verbose=False):
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)

    if verbose: print("\n###input_text:###\n", input_text)
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    if verbose: print("\n###input_ids:###\n", inputs.input_ids)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id  # Prevents padding errors
    )

    # TODO: Decode the output and return the response without special tokens
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if verbose: print("\n###response:###\n", response)
    assistant_response = response.split('assistant')[-1].replace("\n", " ").strip()
    return assistant_response

assistant_response = run_model(model=model, tokenizer=tokenizer, messages=messages, max_new_tokens=10, verbose=False)
print(f"\n###Assistant response:###\n{assistant_response}")


###Assistant response:###
University of California, Los Angeles


In [10]:
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
import sys

def evaluate(model, tokenizer, test_dataset):
    """
    Evaluate the model on the test dataset.
    Returns:
        accuracy: The accuracy of the model on the test dataset. The value is scaled from 0.0 to 1.0 (float)
        outputs: The model's predictions on the test dataset. (list[str])
    """
    # TODO: Implement the evaluation loop and return accuracy of the model as well as list of outputs
    # Hint: You can reuse the run_model function we implemented earlier.
    outputs = []
    ground_truths = []

    # Iterate over the test dataset
    for row in tqdm(test_dataset, total=len(test_dataset)):
        prompt = row["prompt"]
        label = row["gt_label"]  # The true label ("entailment", "neutral", or "contradiction")

        model_response = run_model(model, tokenizer, messages=[{"role": "user", "content": prompt}], max_new_tokens=25)
        output = model_response.replace("#", "").replace("Relationship", "").replace("Explanation", "").replace(":", "").split(".")[0].strip()

        # print(f"Prompt: {prompt}")
        # print(f"Ground Truth: {label}")
        # print(f"Model Output: {output}")
        # print("==========================")

        # Append predictions and ground truths
        outputs.append(output)
        ground_truths.append(label)

    # Calculate accuracy
    accuracy = accuracy_score(ground_truths, outputs)

    return accuracy, outputs

In [12]:
from peft import PeftModel, PeftConfig
# Load the base model and tokenizer
base_model_id = "meta-llama/Llama-3.2-1B-Instruct"  # Replace with your base model ID or path
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Load the PEFT model configuration
peft_model_dir = "output_peft_model"  # Directory where PEFT weights were saved
peft_config = PeftConfig.from_pretrained(peft_model_dir)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.bfloat16).to(device)

# Load the PEFT weights into the base model
peft_model = PeftModel.from_pretrained(base_model, peft_model_dir).to(device)

# Set the model to evaluation mode
peft_model.eval()
evaluate(model=peft_model, tokenizer=tokenizer, test_dataset=prompt_test_m_dataset)
# base_model.eval()
# evaluate(model=base_model, tokenizer=tokenizer, test_dataset=prompt_test_dataset)

  0%|          | 0/50 [00:00<?, ?it/s]



(0.72,
 ['neutral',
  'neutral',
  'contradiction',
  'contradiction',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'contradiction',
  'neutral',
  'entailment',
  'neutral',
  'neutral',
  'contradiction',
  'contradiction',
  'neutral',
  'entailment',
  'neutral',
  'neutral',
  'contradiction',
  'neutral',
  'neutral',
  'contradiction',
  'entailment',
  'contradiction',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'contradiction',
  'entailment',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'contradiction',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'contradiction',
  'entailment',
  'neutral',
  'contradiction',
  'neutral',
  'contradiction',
  'contradiction',
  'neutral',
  'neutral',
  'contradiction'])

### Result

In [21]:
#################################################### Result on MNLI dataset ####################################################
# Model: Llama-3.2-1B-Instruct | Prompt 1
# Training: 500 | Testing: 200 | Validation: 50/50
# Valid set 1 (mm) acc.: 0.34
# Valid set 2 (m) acc.: 0.34
#
#
# Model: Llama-3.2-1B-Instruct (LoRA) | Prompt 1 | Epoch = 3 | lr = 5e-5
# Training: 500 | Testing: 200 | Validation: 50/50
# Valid set 1 (mm) acc.: 0.62
# Valid set 2 (m) acc.: 0.72