Installs Unsloth, Xformers (Flash Attention), and all other packages!

In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
!pip install wandb

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

**Add** LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
# @title prepare data

lesson_prompt = """Below is an instruction that describes how to create a lesson plan, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    instructions = examples.get("instruction", [])
    inputs       = examples.get("input", [])
    outputs      = examples.get("output", [])
    texts = []

    # Ensure all fields are non-empty and clean the inputs
    for instruction, input_, output in zip(instructions, inputs, outputs):
        instruction = instruction.strip() if instruction else ""
        input_ = input_.strip() if input_ else ""
        output = output.strip() if output else ""

        if not instruction or not input_ or not output:
            # Skip examples with missing data
            continue

        # Create the prompt with EOS_TOKEN appended
        text = lesson_prompt.format(instruction, input_, output) + EOS_TOKEN
        texts.append(text)

    # Ensure we return the proper format even if the list is empty
    return { "text" : texts if texts else [""] }

# Loading and formatting dataset
from datasets import load_dataset

try:
    dataset = load_dataset("samadeniyi/lesson_plan", split="train")
    dataset = dataset.map(formatting_prompts_func, batched=True)
except Exception as e:
    print(f"Error loading or processing dataset: {e}")


README.md:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

lesson_plan.csv:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4814 [00:00<?, ? examples/s]

Map:   0%|          | 0/4814 [00:00<?, ? examples/s]

In [5]:
dataset_dict = dataset.train_test_split(test_size=0.004)
test_dataset = dataset_dict['test']

In [6]:
# @title wandb init
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
import os
%env WANDB_WATCH=all
%env WANDB_SILENT=true
os.environ.get("WANDB_SILENT")

env: WANDB_WATCH=all
env: WANDB_SILENT=true


'true'

### Train the model
Using Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

from transformers.utils import logging
import wandb

logging.set_verbosity_info()
project_name = "mistral-lesson-plan"
entity = "wandb"
# os.environ["WANDB_LOG_MODEL"] = "checkpoint"

wandb.init(project=project_name, name = "run_mistral_lesson_plan")


trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to="wandb",  # enable logging to W&B
        logging_strategy = 'steps',
        save_total_limit=2,
    ),
)

[34m[1mwandb[0m: Currently logged in as: [33mthesamadeniyi[0m ([33mthesamadeniyi-university-of-essex[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112608922222636, max=1.0…

PyTorch: setting up devices
PyTorch: setting up devices


Map (num_proc=2):   0%|          | 0/4814 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


In [9]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
4.52 GB of memory reserved.


In [10]:
#@title Start training
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,814 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
1,1.7474
2,1.7305
3,1.8084
4,1.508
5,1.4339
6,1.3521
7,1.2694
8,1.1871
9,1.2853
10,0.9434


Saving model checkpoint to outputs/checkpoint-60
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--mistral-7b-v0.3-bnb-4bit/snapshots/1d82629c1e6778cf8568b532a3c09b668805b15a/config.json
Model config MistralConfig {
  "_name_or_path": "unsloth/Mistral-7B-v0.3",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 770,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload

In [11]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

250.3882 seconds used for training.
4.17 minutes used for training.
Peak reserved memory = 5.705 GB.
Peak reserved memory for training = 1.185 GB.
Peak reserved memory % of max memory = 14.42 %.
Peak reserved memory for training % of max memory = 2.995 %.


### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [12]:
if False:
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
  inputs = tokenizer(
  [
      lesson_prompt.format(
          "Create a lesson plan for teaching Definition of civic education in Civic Education to Senior Secondary School 1 students.", # instruction
          "", # input
          "", # output - leave this blank for generation!
      )
  ], return_tensors = "pt").to("cuda")

  outputs = model.generate(**inputs, max_new_tokens = 6000, use_cache = True)
  tokenizer.batch_decode(outputs)

### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model.

In [13]:
#@title Save model locally

model.save_pretrained("mistral-v3-7b-lora_lesson_plan_model")
tokenizer.save_pretrained("mistral-v3-7b-lora_lesson_plan_model")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--mistral-7b-v0.3-bnb-4bit/snapshots/1d82629c1e6778cf8568b532a3c09b668805b15a/config.json
Model config MistralConfig {
  "_name_or_path": "unsloth/Mistral-7B-v0.3",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 770,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,


('mistral-v3-7b-lora_lesson_plan_model/tokenizer_config.json',
 'mistral-v3-7b-lora_lesson_plan_model/special_tokens_map.json',
 'mistral-v3-7b-lora_lesson_plan_model/tokenizer.model',
 'mistral-v3-7b-lora_lesson_plan_model/added_tokens.json',
 'mistral-v3-7b-lora_lesson_plan_model/tokenizer.json')

In [14]:
#@title Push model to Huggingface Hub

model.push_to_hub("samadeniyi/mistral-v3-7b-lora_lesson_plan_model", token = "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")
tokenizer.push_to_hub("samadeniyi/mistral-v3-7b-lora_lesson_plan_model", token = "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")

README.md:   0%|          | 0.00/591 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--mistral-7b-v0.3-bnb-4bit/snapshots/1d82629c1e6778cf8568b532a3c09b668805b15a/config.json
Model config MistralConfig {
  "_name_or_path": "unsloth/Mistral-7B-v0.3",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 770,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,


  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/samadeniyi/mistral-v3-7b-lora_lesson_plan_model


Uploading the following files to samadeniyi/mistral-v3-7b-lora_lesson_plan_model: special_tokens_map.json,tokenizer.model,README.md,tokenizer_config.json,tokenizer.json
No files have been modified since last commit. Skipping to prevent empty commit.


**Load the LoRA adapters saved for inference**

In [15]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "mistral-v3-7b-lora_lesson_plan_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference


    inputs = tokenizer(
    [
        lesson_prompt.format(
            "Create a lesson plan for teaching Definition of civic education in Civic Education to Senior Secondary School 1 students.", # instruction
            "", # input
            "", # output - leave this blank for generation!
        ),
    ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 6000, use_cache = True)
    tokenizer.batch_decode(outputs)

In [16]:
# @title MODEL EVALUATION

!pip install transformers datasets evaluate
!pip install rouge_score

import torch
import evaluate
import math
from unsloth import FastLanguageModel

# Load BLEU, ROUGE, and Perplexity evaluators
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
perplexity = evaluate.load("perplexity")

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=00f35d2be88759b3ee8af3fab58a3363a90419c5cdbfd243d4f15e80b4cdd2fa
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

In [17]:
# @title Function to generate predictions from a model
def generate_predictions(model, tokenizer, dataset, device='cuda'):
    predictions = []
    references = []

    for example in dataset:
        instruction = example['instruction']
        input_text = example['input']
        target_output = example['output']

        # Create prompt with no output for generation
        prompt = lesson_prompt.format(instruction, input_text, "")
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Generate output without tracking gradients
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=600, use_cache=True)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Save generated and reference texts
        predictions.append(generated_text)
        references.append(target_output)

        # Free up memory
        del inputs, outputs
        torch.cuda.empty_cache()

    return predictions, references

In [18]:
# @title Function to calculate perplexity in chunks to save memory
def calculate_perplexity(model, tokenizer, dataset, device='cuda', chunk_size=512):
    ll_sum = 0
    count = 0

    for example in dataset:
        # Tokenize the example and chunk the input to save memory
        input_ids = tokenizer(example['text'], return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
        stride = min(chunk_size, input_ids.size(1))

        # Iterate through chunks to calculate perplexity
        for i in range(0, input_ids.size(1), stride):
            input_chunk = input_ids[:, i:i + stride].to(device)

            # Compute loss for the current chunk without gradients
            with torch.no_grad():
                outputs = model(input_chunk, labels=input_chunk)
                loss = outputs.loss

            # Count tokens and sum loss
            n_tokens = input_chunk.numel()
            ll_sum += loss.item() * n_tokens
            count += n_tokens

            # Free memory
            del input_chunk, outputs
            torch.cuda.empty_cache()

    # Compute final perplexity
    perplexity_value = math.exp(ll_sum / count) if count > 0 else float('inf')
    return perplexity_value

In [19]:
# @title Function to evaluate BLEU, ROUGE, and Perplexity
def evaluate_model(model, tokenizer, dataset, device='cuda'):
    # Generate predictions and references
    predictions, references = generate_predictions(model, tokenizer, dataset, device)

    # Evaluate BLEU and ROUGE
    bleu_results = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
    rouge_results = rouge.compute(predictions=predictions, references=references)

    # Evaluate Perplexity with memory optimization
    perplexity_value = calculate_perplexity(model, tokenizer, dataset, device)

    return bleu_results, rouge_results, perplexity_value

In [20]:
def load_model_and_tokenizer(model_name, max_seq_length, dtype, load_in_4bit, device='cuda'):
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    return model, tokenizer

In [21]:
# @title Load the parent model
parent_model_name = "unsloth/mistral-7b-v0.3"
fine_tuned_model_name = "mistral-v3-7b-lora_lesson_plan_model"

# Define the configuration for loading the models
max_seq_length = 1024
dtype = torch.float16  # Use mixed precision to save memory
load_in_4bit = True
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [22]:
# @title Load parent model and tokenizer
parent_model, parent_tokenizer = FastLanguageModel.from_pretrained(
    model_name=parent_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)
FastLanguageModel.for_inference(parent_model)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--mistral-7b-v0.3-bnb-4bit/snapshots/1d82629c1e6778cf8568b532a3c09b668805b15a/config.json
Model config MistralConfig {
  "_name_or_path": "unsloth/mistral-7b-v0.3-bnb-4bit",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 770,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight"

==((====))==  Unsloth 2024.9: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unslothai--colabpro/snapshots/234f33d5f3e1d9ad83421f33640cd88474a25025/config.json
Model config LlamaConfig {
  "_name_or_path": "unslothai/colabpro",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 0,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 0
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--unslothai--colabpro/snapshots/234f3

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((40

In [23]:
# @title Load fine-tuned model and tokenizer
fine_tuned_model, fine_tuned_tokenizer = FastLanguageModel.from_pretrained(
    model_name=fine_tuned_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)
FastLanguageModel.for_inference(fine_tuned_model)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--mistral-7b-v0.3-bnb-4bit/snapshots/1d82629c1e6778cf8568b532a3c09b668805b15a/config.json
Model config MistralConfig {
  "_name_or_path": "unsloth/mistral-7b-v0.3-bnb-4bit",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 770,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight"

==((====))==  Unsloth 2024.9: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unslothai--colabpro/snapshots/234f33d5f3e1d9ad83421f33640cd88474a25025/config.json
Model config LlamaConfig {
  "_name_or_path": "unslothai/colabpro",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 0,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 0
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--unslothai--colabpro/snapshots/234f3

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(

In [24]:
# @title Dataset for evaluation
# Run evaluation for the parent model
print("Evaluating Parent Model...")
parent_bleu, parent_rouge, parent_perplexity = evaluate_model(parent_model, parent_tokenizer, test_dataset, device)

# Run evaluation for the fine-tuned model
print("Evaluating Fine-Tuned Model...")
fine_tuned_bleu, fine_tuned_rouge, fine_tuned_perplexity = evaluate_model(fine_tuned_model, fine_tuned_tokenizer, test_dataset, device)

# Display evaluation results
print("\nParent Model Evaluation Results:")
print(f"BLEU Score: {parent_bleu['bleu']}")
print(f"ROUGE Scores: {parent_rouge}")
print(f"Perplexity: {parent_perplexity}")

print("\nFine-Tuned Model Evaluation Results:")
print(f"BLEU Score: {fine_tuned_bleu['bleu']}")
print(f"ROUGE Scores: {fine_tuned_rouge}")
print(f"Perplexity: {fine_tuned_perplexity}")



Evaluating Parent Model...
Evaluating Fine-Tuned Model...

Parent Model Evaluation Results:
BLEU Score: 0.040669614031793144
ROUGE Scores: {'rouge1': 0.2946268312028426, 'rouge2': 0.06785450257902911, 'rougeL': 0.1572269094963536, 'rougeLsum': 0.18137008724283216}
Perplexity: 6.724783293386482

Fine-Tuned Model Evaluation Results:
BLEU Score: 0.18526829240855733
ROUGE Scores: {'rouge1': 0.44735317995614776, 'rouge2': 0.2435378214359493, 'rougeL': 0.33010234328009275, 'rougeLsum': 0.3461116357002173}
Perplexity: 3.0738978353557673


In [28]:
# Display evaluation results
print("\nParent Model Evaluation Results:")
print(f"BLEU Score: {parent_bleu['bleu']}")
print(f"ROUGE Scores: {parent_rouge}")
print(f"Perplexity: {parent_perplexity}")

print("\nFine-Tuned Model Evaluation Results:")
print(f"BLEU Score: {fine_tuned_bleu['bleu']}")
print(f"ROUGE Scores: {fine_tuned_rouge}")
print(f"Perplexity: {fine_tuned_perplexity}")

import pandas as pd

# Data provided
data = {
    'Metric': ['BLEU Score', 'ROUGE Scores', 'Perplexity'],
    'Parent Model': [parent_bleu["bleu"], parent_rouge, parent_perplexity],
    'Fine-Tuned Model': [fine_tuned_bleu["bleu"], fine_tuned_rouge, fine_tuned_perplexity]
}

# Create DataFrame
df = pd.DataFrame(data)
df



Parent Model Evaluation Results:
BLEU Score: 0.040669614031793144
ROUGE Scores: {'rouge1': 0.2946268312028426, 'rouge2': 0.06785450257902911, 'rougeL': 0.1572269094963536, 'rougeLsum': 0.18137008724283216}
Perplexity: 6.724783293386482

Fine-Tuned Model Evaluation Results:
BLEU Score: 0.18526829240855733
ROUGE Scores: {'rouge1': 0.44735317995614776, 'rouge2': 0.2435378214359493, 'rougeL': 0.33010234328009275, 'rougeLsum': 0.3461116357002173}
Perplexity: 3.0738978353557673


Unnamed: 0,Metric,Parent Model,Fine-Tuned Model
0,BLEU Score,0.04067,0.185268
1,ROUGE Scores,"{'rouge1': 0.2946268312028426, 'rouge2': 0.067...","{'rouge1': 0.44735317995614776, 'rouge2': 0.24..."
2,Perplexity,6.724783,3.073898


