In [None]:
!pip install -U bitsandbytes torch transformers peft datasets accelerate trl einops pynvml

In [1]:
import os
import torch
import wandb
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig, pipeline, DataCollatorWithPadding, GenerationConfig, HfArgumentParser, TrainerCallback
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
from trl import SFTTrainer
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "microsoft/phi-2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0}
)

tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

snli = load_dataset('snli')
test_dataset = snli['test'].select(range(0, 10000, 100))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def create_prompt(premise, hypothesis):
    return f"""Check whether the Hypothesis entails the Premise or not. Follow the following output format:
    0 : Hypothesis entails the Premise
    1 : Neutral
    2 : Hypothesis does not entail the Premise
    Output only one single numerical value.

    Premise: {premise}
    Hypothesis: {hypothesis}

    Output:
    """

In [None]:
correct = 0
total = len(test_dataset)
failure_cases = []

for i, sample in enumerate(test_dataset):

    prompt = create_prompt(sample['premise'], sample['hypothesis'])
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    original_model.to(input_ids.device)

    with torch.no_grad():
        generated_ids = original_model.generate(input_ids, max_length=input_ids.size(1)+64)

    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    answer = None
    try:
        answer = int((output.split("Output:\n")[-1].strip().split("\n")[0]).split(" ")[0])
    except:
        answer = None

    if answer is not None and answer == sample['label']:
        correct += 1
    else:
        failure_cases.append({
            "premise": sample['premise'],
            "hypothesis": sample['hypothesis'],
            "predicted_label": answer,
            "actual_label": sample['label']
        })

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask an

In [6]:
accuracy = correct/total
print(f"Accuracy before fine-tuning: {accuracy * 100:.2f}%")

Accuracy before fine-tuning: 59.00%


In [7]:
failure_cases

[{'premise': 'This church choir sings to the masses as they sing joyous songs from the book at a church.',
  'hypothesis': 'The church has cracks in the ceiling.',
  'predicted_label': 2,
  'actual_label': 1},
 {'premise': 'A woman within an orchestra is playing a violin.',
  'hypothesis': 'A woman is playing the violin.',
  'predicted_label': 1,
  'actual_label': 0},
 {'premise': 'Two men climbing on a wooden scaffold.',
  'hypothesis': 'Two sad men climbing on a wooden scaffold.',
  'predicted_label': 0,
  'actual_label': 1},
 {'premise': 'A group of people stand near and on a large black square on the ground with some yellow writing on it.',
  'hypothesis': 'a group of people wait',
  'predicted_label': 0,
  'actual_label': 1},
 {'premise': 'A Skier ski-jumping while two other skiers watch his act.',
  'hypothesis': 'A skier preparing a trick',
  'predicted_label': 1,
  'actual_label': 0},
 {'premise': 'Children bathe in water from large drums.',
  'hypothesis': 'The kids are wet.',

In [8]:
original_model = prepare_model_for_kbit_training(original_model)
original_model.gradient_checkpointing_enable()

training_arguments = TrainingArguments(
    output_dir = "./logs",
    num_train_epochs = 5,
    warmup_steps=1,
    save_strategy="epoch",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 2,
    gradient_checkpointing = False,
    learning_rate = 4e-4,
    optim = "paged_adamw_32bit",
    group_by_length = True,
    logging_steps = 100,
    evaluation_strategy="epoch",
    report_to="none",
    do_eval=True
)

peft_config = LoraConfig(
    r = 32,
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = ['q_proj','k_proj','v_proj','dense']
)

peft_model = get_peft_model(original_model, peft_config)

In [None]:
from functools import partial

def create_prompt(sample):

    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Check whether the Hypothesis entails the Premise or not. Follow the following output format:
    0 : Hypothesis entails the Premise
    1 : Neutral
    2 : Hypothesis does not entail the Premise
    Output only one single numerical value.

    Premise: {sample['premise']}
    Hypothesis: {sample['hypothesis']}

### Output: {sample['label']}
### End"""


def preprocess_dataset(tokenizer, dataset):

    # Format prompt for each sample
    dataset = dataset.map(lambda x: {"text": create_prompt(x)})

    # Tokenize the text; ensure we use the correct field from the dataset
    tokenized_dataset = dataset.map(
        lambda x: tokenizer(x['text'], max_length=2048, truncation=True),
        batched=True,
        remove_columns=['premise', 'hypothesis', 'label']
    )

    # Filter by max_length and shuffle dataset
    tokenized_dataset = tokenized_dataset.filter(lambda x: len(x["input_ids"]) < 2048).shuffle(seed=423)

    return tokenized_dataset

In [10]:
import transformers

train_dataset = preprocess_dataset(tokenizer, snli['train'].select(range(0, 550000, 550)))
val_dataset = preprocess_dataset(tokenizer, snli['validation'].select(range(0, 10000, 100)))
peft_model.config.use_cache = False

In [11]:
import psutil
from transformers import TrainerCallback, TrainerState, TrainerControl


def log_resource_usage():
    # CPU and memory usage
    memory_usage = psutil.virtual_memory().used / (1024 ** 3)  # in GB
    cpu_usage = psutil.cpu_percent(interval=1)  # in %

    # GPU usage (if CUDA is available)
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated(0) / (1024 ** 3)  # in GB
        gpu_memory_reserved = torch.cuda.memory_reserved(0) / (1024 ** 3)  # in GB
        gpu_utilization = torch.cuda.utilization(0)
        print(f"GPU Usage - Memory Allocated: {gpu_memory:.2f} GB, Memory Reserved: {gpu_memory_reserved:.2f} GB, Utilization: {gpu_utilization}%")
    else:
        gpu_memory = 0
        gpu_utilization = 0
        print("No GPU available.")

    print(f"CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage:.2f} GB")

In [12]:
# Custom callback to log resource usage after each epoch
class ResourceLoggingCallback(TrainerCallback):
    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        print(f"\nEnd of Epoch {state.epoch}/{args.num_train_epochs}")
        log_resource_usage()

# Initialize the trainer with the custom callback
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_arguments,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[ResourceLoggingCallback()]  # Register the callback here
)

start_time = time.time()
peft_trainer.train()
end_time = time.time()

Epoch,Training Loss,Validation Loss
1,0.6915,0.529073
2,0.4916,0.519329
3,0.4865,0.517142
4,0.4447,0.52077
5,0.4354,0.527814



End of Epoch 1.0/5


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


GPU Usage - Memory Allocated: 2.55 GB, Memory Reserved: 3.41 GB, Utilization: 0%
CPU Usage: 3.5%, Memory Usage: 2.79 GB

End of Epoch 2.0/5


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


GPU Usage - Memory Allocated: 2.55 GB, Memory Reserved: 4.15 GB, Utilization: 0%
CPU Usage: 3.5%, Memory Usage: 2.80 GB

End of Epoch 3.0/5


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


GPU Usage - Memory Allocated: 2.55 GB, Memory Reserved: 4.15 GB, Utilization: 0%
CPU Usage: 2.0%, Memory Usage: 2.81 GB

End of Epoch 4.0/5


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


GPU Usage - Memory Allocated: 2.55 GB, Memory Reserved: 4.15 GB, Utilization: 0%
CPU Usage: 3.0%, Memory Usage: 2.82 GB

End of Epoch 5.0/5


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


GPU Usage - Memory Allocated: 2.55 GB, Memory Reserved: 4.15 GB, Utilization: 0%
CPU Usage: 3.5%, Memory Usage: 2.99 GB


In [13]:
print(f"Time taken to fine-tune the model using QLoRA: {(end_time - start_time) / 60:.2f} minutes")

Time taken to fine-tune the model using QLoRA: 22.89 minutes


In [14]:
model_save_directory = "./fine_tuned_model"
peft_model.save_pretrained(model_save_directory)

total_params = peft_model.num_parameters()
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
print(f"Total parameters in the model: {total_params:,}")
print(f"Number of parameters fine-tuned: {trainable_params:,}")

Total parameters in the model: 2,800,655,360
Number of parameters fine-tuned: 20,971,520


In [None]:
base_model_id = "microsoft/phi-2"
base_model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                                      device_map='auto',
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True)

eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True, use_fast=False)
eval_tokenizer.pad_token = eval_tokenizer.eos_token

ft_model = PeftModel.from_pretrained(base_model, model_save_directory,torch_dtype=torch.float16,is_trainable=False)

In [None]:
# Define the prompt format function
def format_prompt(premise, hypothesis):
    return f"""Check whether the Hypothesis entails the Premise or not. Follow the following output format:
    0 : Hypothesis entails the Premise
    1 : Neutral
    2 : Hypothesis does not entail the Premise
    Output only one single numerical value.

    Premise: {premise}
    Hypothesis: {hypothesis}

    Output:
    """

# Tracking variables
correct = 0
total = len(test_dataset)
corrected_by_finetuned = []
not_corrected_by_finetuned = []
pretrained_failures = failure_cases

# Loop through each test sample
for i, sample in enumerate(test_dataset):
    # Prepare the prompt
    prompt = format_prompt(sample['premise'], sample['hypothesis'])
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    ft_model.to(input_ids.device)

    with torch.no_grad():
        generated_ids = ft_model.generate(input_ids, max_length=input_ids.size(1)+10)

    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    answer = None
    try:
        answer = int((output.split("Output:\n")[-1].strip().split("\n")[0]).split(" ")[0])
    except:
        answer = None

    if answer is not None and answer == sample['label']:
        correct += 1

    # Check if this case was a pre-trained model failure
    pretrain_failure = next((f for f in pretrained_failures if f["premise"] == sample['premise'] and f["hypothesis"] == sample['hypothesis']), None)

    if pretrain_failure:
        # If fine-tuned model corrected it, add to corrected list; else, add to not-corrected list
        if answer == sample['label']:
            corrected_by_finetuned.append({
                "premise": sample['premise'],
                "hypothesis": sample['hypothesis'],
                "pretrained_predicted_label": pretrain_failure["predicted_label"],
                "fine_tuned_predicted_label": answer,
                "actual_label": sample['label']
            })
        else:
            not_corrected_by_finetuned.append({
                "premise": sample['premise'],
                "hypothesis": sample['hypothesis'],
                "pretrained_predicted_label": pretrain_failure["predicted_label"],
                "fine_tuned_predicted_label": answer,
                "actual_label": sample['label']
            })

# Now `corrected_by_finetuned` contains failures corrected by fine-tuning,
# and `not_corrected_by_finetuned` contains those still incorrect after fine-tuning.

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [18]:
accuracy = correct/total
print(f"Accuracy after fine-tuning: {accuracy * 100:.2f}%")

Accuracy after fine-tuning: 87.00%


In [19]:
corrected_by_finetuned

[{'premise': 'A woman within an orchestra is playing a violin.',
  'hypothesis': 'A woman is playing the violin.',
  'pretrained_predicted_label': 1,
  'fine_tuned_predicted_label': 0,
  'actual_label': 0},
 {'premise': 'A group of people stand near and on a large black square on the ground with some yellow writing on it.',
  'hypothesis': 'a group of people wait',
  'pretrained_predicted_label': 0,
  'fine_tuned_predicted_label': 1,
  'actual_label': 1},
 {'premise': 'Children bathe in water from large drums.',
  'hypothesis': 'The kids are wet.',
  'pretrained_predicted_label': 1,
  'fine_tuned_predicted_label': 0,
  'actual_label': 0},
 {'premise': 'A man is renovating a room.',
  'hypothesis': 'A man is using a hammer in a room.',
  'pretrained_predicted_label': 0,
  'fine_tuned_predicted_label': 1,
  'actual_label': 1},
 {'premise': 'An Ambulance is passing a man wearing a bandanna and girl.',
  'hypothesis': 'The man in the bandana is running after the ambulance',
  'pretrained_p

In [20]:
not_corrected_by_finetuned

[{'premise': 'This church choir sings to the masses as they sing joyous songs from the book at a church.',
  'hypothesis': 'The church has cracks in the ceiling.',
  'pretrained_predicted_label': 2,
  'fine_tuned_predicted_label': 2,
  'actual_label': 1},
 {'premise': 'Two men climbing on a wooden scaffold.',
  'hypothesis': 'Two sad men climbing on a wooden scaffold.',
  'pretrained_predicted_label': 0,
  'fine_tuned_predicted_label': 2,
  'actual_label': 1},
 {'premise': 'A Skier ski-jumping while two other skiers watch his act.',
  'hypothesis': 'A skier preparing a trick',
  'pretrained_predicted_label': 1,
  'fine_tuned_predicted_label': 1,
  'actual_label': 0},
 {'premise': 'A woman is standing near three stores, two have beautiful artwork and the other store has Largo written on it.',
  'hypothesis': 'A woman standing on a street corner outside beside three different stores, two of which contain beautiful artwork and one with a Largo sign.',
  'pretrained_predicted_label': 1,
  

In [21]:
import shutil
from google.colab import files

shutil.make_archive('/content/fine_tuned_model', 'zip', '/content/fine_tuned_model')
shutil.make_archive('/content/logs', 'zip', '/content/logs')
files.download('/content/fine_tuned_model.zip')
files.download('/content/logs.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>