In [1]:
%%capture
# %%capture hides all pip output for this cell.
# Step 1: install the released `unsloth` package from PyPI.
!pip install unsloth
# Also get the latest nightly Unsloth!
# Step 2: remove the release and reinstall from the GitHub repository with the
# `colab-new` extra; --no-cache-dir stops pip from reusing a cached build.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [3]:
from unsloth import FastLanguageModel
import torch
# --- Loading configuration ---------------------------------------------------
# Maximum sequence length; the same value is passed to SFTTrainer further down.
max_seq_length = 2048
# None lets Unsloth auto-detect (float16 on Tesla T4 / V100, bfloat16 on Ampere+).
dtype = None
# Load the weights 4-bit quantized to reduce memory use. Can be False.
load_in_4bit = True

# Reference list of pre-quantized 4-bit checkpoints published by Unsloth.
# It is informational only: nothing in this cell reads it.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
    # Llama 3.2
    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]

# --- Load the base model and its tokenizer -----------------------------------
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.2-3B-Instruct",  # or choose "unsloth/Llama-3.2-1B"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [4]:
# Layers that receive LoRA adapters: the attention projections (q/k/v/o)
# followed by the MLP projections (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters. Note that `model` is rebound to the
# wrapped model, so this cell should run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is supported, 0 is the optimized path
    bias="none",                           # any value is supported, "none" is the optimized path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA is available but disabled
    loftq_config=None,                     # LoftQ is available but disabled
)

Unsloth 2024.9.post4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
# Importing necessary libraries and the dataset
from datasets import load_dataset

# Dataset source and sampling settings, named so they are easy to find and tune.
dataset_name = "saishshinde15/ResearchPapers-Instruct_Dataset1"
NUM_TRAIN_SAMPLES = 20_000  # upper bound on the number of rows kept for the demonstration
SHUFFLE_SEED = 65           # fixed seed so the same subset is drawn on every run

# Load the training split from the Hugging Face Hub.
dataset = load_dataset(dataset_name, split="train")

# Shuffle reproducibly, then keep at most NUM_TRAIN_SAMPLES rows. The count is
# clamped to the dataset size so that swapping in a smaller dataset does not
# make `select` fail on out-of-range indices.
dataset = dataset.shuffle(seed=SHUFFLE_SEED).select(
    range(min(NUM_TRAIN_SAMPLES, len(dataset)))
)

# System prompt that defines the behavior of the research bot. It is used as the
# "system" message of every training example (see format_chat_template) and is
# passed again as the "system" message when prompting the model at inference time.
# Besides the general research-assistant behavior, it hard-codes the canned
# greeting and the answers about the model's purpose, origin and creators.
instruction = """You are a high-level research bot. Your task is to respond to complex research questions that require advanced logical reasoning, precise scientific knowledge, and grounded references. Only provide answers when you are confident in their correctness. If you are unsure or lack the necessary information, simply state that you don't know the answer rather than providing incorrect or misleading information.

For basic greetings or casual inputs (e.g., "hi", "good morning"), respond with a professional but friendly greeting:
'Hello! How can I assist you with your research today?'

When asked about the model's purpose or origin, respond with:
'I am a high-level research model designed to assist with complex scientific queries, advanced logic, and everyday research challenges. I was developed by researchers at Tethys AI, a startup founded by two school friends passionate about integrating AI into scientific research.'

When asked about mathematical or scientific questions, provide the correct and concise answer, while also giving an option for further research clarification:
'X + Y = Z. If you need a detailed explanation, I can provide one.'

If asked, 'Who made you?', respond with:
'I was created by researchers at Tethys AI, which is a startup founded by two school friends focused on using AI to enhance research capabilities.'

If asked, 'What is the origin of the model?', respond with:
'This model is based on advanced AI research conducted at Tethys AI, aimed at assisting researchers in solving complex problems.'

If asked, 'Who fine-tuned you?', respond with:
'I was fine-tuned by a team of researchers at Tethys AI, who enhanced my capabilities to perform at a high level for scientific and research-oriented tasks.'
"""

# Render one dataset row as a chat-formatted training string
def format_chat_template(row):
    """Attach a chat-template rendering of ``row`` under the "text" key.

    The conversation has three turns: the shared system ``instruction``, the
    row's "title" as the user message, and the row's "abstract" as the
    assistant reply. Because ``tokenize=False`` is passed,
    ``apply_chat_template`` returns the rendered string, not token ids.

    Args:
        row: Mapping with "title" and "abstract" entries.

    Returns:
        The same ``row`` object, now also carrying "text".
    """
    conversation = [
        dict(role="system", content=instruction),
        dict(role="user", content=row["title"]),
        dict(role="assistant", content=row["abstract"]),
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    row["text"] = rendered
    return row


# Add the "text" column to every row, spreading the work over 4 processes.
dataset = dataset.map(format_chat_template, num_proc=4)



Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [6]:
# Inspect one rendered training example (row 3) to check the chat formatting.
dataset[3]["text"]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a high-level research bot. Your task is to respond to complex research questions that require advanced logical reasoning, precise scientific knowledge, and grounded references. Only provide answers when you are confident in their correctness. If you are unsure or lack the necessary information, simply state that you don\'t know the answer rather than providing incorrect or misleading information.\n\nFor basic greetings or casual inputs (e.g., "hi", "good morning"), respond with a professional but friendly greeting:\n\'Hello! How can I assist you with your research today?\'\n\nWhen asked about the model\'s purpose or origin, respond with:\n\'I am a high-level research model designed to assist with complex scientific queries, advanced logic, and everyday research challenges. I was developed by researchers at Tethys AI, a startup founded by two school f

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up; for a full run, set `num_train_epochs=1` and remove the `max_steps=60` cap (when `max_steps` is given, it overrides `num_train_epochs`). We also support TRL's `DPOTrainer`!

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
bf16_ok = is_bfloat16_supported()

# Optimizer and schedule settings for a short demonstration run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per optimizer step
    warmup_steps=5,
    # num_train_epochs=1,  # Set this for 1 full training run.
    max_steps=60,
    learning_rate=2e-4,
    fp16=not bf16_ok,
    bf16=bf16_ok,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the pre-rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/20000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
#@title Show current memory stats
# Baseline GPU figures taken before training; `start_gpu_memory` and
# `max_memory` are read again by the final-stats cell.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
1.148 GB of memory reserved.


In [8]:
# Run the fine-tuning loop. The returned object is kept because the final-stats
# cell reads trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 20,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.942
2,2.8855
3,2.8028
4,2.7739
5,2.6344
6,2.4242
7,2.3538
8,2.363
9,2.1307
10,1.9163


In [None]:
#@title Show final memory and time stats
# Peak reserved GPU memory in GiB, and the part of it added since the baseline
# cell ran (`start_gpu_memory` and `max_memory` come from that cell).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Wall-clock training time reported by the trainer, in seconds.
train_seconds = trainer_stats.metrics['train_runtime']

print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

126.9935 seconds used for training.
2.12 minutes used for training.
Peak reserved memory = 4.422 GB.
Peak reserved memory for training = 3.274 GB.
Peak reserved memory % of max memory = 29.984 %.
Peak reserved memory for training % of max memory = 22.2 %.


In [10]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Same system prompt as in training, followed by a single user question.
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": "What is this model about?"},
]

# Render the conversation as text; add_generation_prompt=True appends the
# assistant header so the model writes the reply.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# The rendered prompt already starts with the template's begin-of-text token,
# so special tokens are disabled here to avoid prepending a second one.
inputs = tokenizer(
    prompt,
    return_tensors='pt',
    padding=True,
    truncation=True,
    add_special_tokens=False,
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# generate() returns the prompt tokens followed by the new tokens. Decode only
# the new ones instead of splitting the decoded text on the word "assistant",
# which would cut the answer short whenever the reply itself contains that word.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text)



This model is a high-level research model designed to assist with complex scientific queries, advanced logic, and everyday research challenges. It was developed by researchers at Tethys AI, a startup founded by two school friends focused on using AI to enhance research capabilities.


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [11]:


from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget. The push_to_hub calls in the
# next cell pass no explicit token, so they presumably rely on this login —
# run and complete it before that cell.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Local alternative (disabled): write the adapters and tokenizer to ./lora_model
#model.save_pretrained("lora_model") # Local saving
#tokenizer.save_pretrained("lora_model")

# Online saving: upload the LoRA adapters and the tokenizer to the same Hub repo.
adapter_repo = "saishshinde15/llama-3.2-3b-it-Research-ChatBot"
model.push_to_hub(adapter_repo)
tokenizer.push_to_hub(adapter_repo)

Saved model to https://huggingface.co/saishshinde15/llama-3.2-3b-it-Research-ChatBot


In [None]:
from google.colab import userdata

# Read the Hugging Face token from the Colab secret named 'hf'.
# userdata.get raises when the secret does not exist or when this notebook has
# not been granted access to it; map both cases to None so the save cell's
# "token not found" branch can actually run instead of this cell crashing.
try:
    hf_token = userdata.get('hf')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    hf_token = None

In [None]:
# The adapters and tokenizer are always written locally; only the Hub upload
# depends on having a token.
model.save_pretrained("High_Resoning_Model")  # Local saving
tokenizer.save_pretrained("High_Resoning_Model")

if hf_token:
  model.push_to_hub("saishshinde15/High_Resoning_Model", token=hf_token)  # Online saving
  tokenizer.push_to_hub("saishshinde15/High_Resoning_Model", token=hf_token)  # Online saving
else:
  # The token is read from the Colab secret named 'hf', so the hint names that secret.
  print("Hugging Face token not found. Add a Colab secret named 'hf' in the Secrets panel and grant this notebook access to it. The model was saved locally only.")


README.md:   0%|          | 0.00/602 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/saishshinde15/High_Resoning_Model


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

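To load the LoRA adapters we just saved for inference, pass the saved folder or Hub repo (for example `"High_Resoning_Model"`) as `model_name` to `FastLanguageModel.from_pretrained`, then call `FastLanguageModel.for_inference(model)`. The toggle cell that the original template provides for this (`if False:` → `if True:`) is not included in this notebook.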

You can also use Hugging Face PEFT's `AutoPeftModelForCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [13]:
# Each export below is switched on by changing its `if False:` to `if True:`.
# Only the 16-bit merged push to the Hub is enabled.

# Merge to 16bit
if False:
    model.save_pretrained_merged("High_Resoning_Model", tokenizer, save_method="merged_16bit")
if True:
    model.push_to_hub_merged(
        "saishshinde15/llama-3.2-3b-it-Research-ChatBot",
        tokenizer,
        save_method="merged_16bit",
    )

# Merge to 4bit
if False:
    model.save_pretrained_merged("High_Resoning_Model", tokenizer, save_method="merged_4bit")
if False:
    model.push_to_hub_merged(
        "saishshinde15/llama-3.2-3b-it-Research-ChatBot",
        tokenizer,
        save_method="merged_4bit",
    )

# Just LoRA adapters
if False:
    model.save_pretrained_merged("model", tokenizer, save_method="lora")
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="lora", token="")

Unsloth: You are pushing to hub, but you passed your HF username = saishshinde15.
We shall truncate saishshinde15/llama-3.2-3b-it-Research-ChatBot to llama-3.2-3b-it-Research-ChatBot
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.78 out of 12.67 RAM for saving.


100%|██████████| 28/28 [00:01<00:00, 23.94it/s]


Unsloth: Saving tokenizer...

No files have been modified since last commit. Skipping to prevent empty commit.


 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving llama-3.2-3b-it-Research-ChatBot/pytorch_model-00001-of-00002.bin...
Unsloth: Saving llama-3.2-3b-it-Research-ChatBot/pytorch_model-00002-of-00002.bin...
Done.
Saved merged model to https://huggingface.co/saishshinde15/llama-3.2-3b-it-Research-ChatBot


In [None]:
# Fallback: if the merge/upload cell above raises an error, run this cell instead.
from huggingface_hub import snapshot_download

# Download the contents of the already-published Hub repository into a local
# folder, skipping the .gitattributes file. Per the recorded output this
# includes multi-GB GGUF files, so expect a large download.
local_dir = "./High_Resoning_Model"
snapshot_download(
    repo_id="saishshinde15/High_Resoning_Model",
    local_dir=local_dir,
    ignore_patterns=".gitattributes",
)


README.md:   0%|          | 0.00/602 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

unsloth.F16.gguf:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/802M [00:00<?, ?B/s]

'/content/High_Resoning_Model'

In [None]:
## Run only one of the export cells (the merged 16-bit cell above or the GGUF cell below), depending on your use case. GGUF is the best choice for local testing.

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)

In [15]:
# Each export below is switched on by changing its `if False:` to `if True:`.
# Only the multi-quantization push at the bottom is enabled.

# Save to 8bit Q8_0
if False:
    model.save_pretrained_gguf("model", tokenizer)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("High_Resoning_Model", tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf(
        "saishshinde15/High_Resoning_Model",
        tokenizer,
        quantization_method="q4_k_m",
        token=hf_token,
    )

# Save to multiple GGUF options - much faster if you want multiple!
if True:
    gguf_quantizations = ["q4_k_m", "q8_0", "q5_k_m"]
    model.push_to_hub_gguf(
        "saishshinde15/llama-3.2-3b-it-Research-ChatBot",  # target Hub repo
        tokenizer,
        quantization_method=gguf_quantizations,
        # token=hf_token,  # Get a token at https://huggingface.co/settings/tokens
    )

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.16 out of 12.67 RAM for saving.


100%|██████████| 28/28 [00:01<00:00, 17.97it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving saishshinde15/llama-3.2-3b-it-Research-ChatBot/pytorch_model-00001-of-00002.bin...
Unsloth: Saving saishshinde15/llama-3.2-3b-it-Research-ChatBot/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at saishshinde15/llama-3.2-3b-it-Research-ChatBot into f16 GGUF format.
The output location will be /content/saishshinde15/llama-3.2-3b-it-Research-ChatBot/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: llama-3.2-3b-it-Research-ChatBot
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gg

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/saishshinde15/llama-3.2-3b-it-Research-ChatBot
Unsloth: Uploading GGUF to Huggingface Hub...


No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/saishshinde15/llama-3.2-3b-it-Research-ChatBot
Unsloth: Uploading GGUF to Huggingface Hub...


No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/saishshinde15/llama-3.2-3b-it-Research-ChatBot


Now, use the `unsloth.F16.gguf` file or the `unsloth.Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import evaluate

In [None]:
# Load the "accuracy" metric module.
# NOTE(review): its printed signature takes lists of integer labels, whereas
# evaluate_model below produces free-text strings — confirm this is the
# intended metric before calling compute() on those outputs.
accuracy_metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Load the "precision" metric module.
# NOTE(review): not referenced by any later cell visible in this notebook.
precision_metric = evaluate.load("precision")

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

In [None]:
# Load the "recall" metric module.
# NOTE(review): not referenced by any later cell visible in this notebook.
recall_metric = evaluate.load("recall")

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [None]:
# Print the metric's description to see the input types and arguments it expects.
print(accuracy_metric)

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [None]:
def evaluate_model(model, tokenizer, dataset, alpaca_prompt,
                   input_key="question", reference_key="answer",
                   max_new_tokens=128):
    """Generate a completion for every example and pair it with its reference.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``. If
            it has a ``device`` attribute the inputs are moved there;
            otherwise "cuda" is used, as before.
        tokenizer: Callable tokenizer. Its result must support ``.to(device)``
            and ``["input_ids"]``; it must also provide ``decode``.
        dataset: Iterable of mapping-like examples. An empty iterable yields
            two empty lists.
        alpaca_prompt: Format string with two positional ``{}`` slots. The
            first receives the example's input text; the second is filled
            with "" so the model writes the answer.
        input_key: Key of the input text in each example.
        reference_key: Key of the reference answer in each example.
        max_new_tokens: Generation budget per example.

    Returns:
        A tuple ``(predictions, references)`` of equally long lists.
        Each prediction holds only the newly generated text, not the prompt.
    """
    predictions = []
    references = []

    # Follow the model's own device when it advertises one.
    device = getattr(model, "device", "cuda")

    for example in dataset:
        input_text = alpaca_prompt.format(example[input_key], "")
        inputs = tokenizer(input_text, return_tensors="pt").to(device)

        # Generate prediction
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # The generated sequence starts with the prompt tokens. Drop them so the
        # prediction is comparable with the reference rather than being
        # "prompt + answer".
        prompt_length = len(inputs["input_ids"][0])
        completion_ids = outputs[0][prompt_length:]
        prediction = tokenizer.decode(completion_ids, skip_special_tokens=True)
        predictions.append(prediction)

        # Get reference answer
        references.append(example[reference_key])

    return predictions, references


# evaluate_model reads example["question"] and example["answer"], while this
# notebook's dataset stores them as "title" and "abstract", so the columns are
# renamed. Only a small slice is used, because generating one example at a time
# over all 20,000 rows would be very slow.
# NOTE(review): these rows were also used for training; replace them with a
# held-out split for a meaningful evaluation.
EVAL_SAMPLES = 50
validation_dataset = (
    dataset
    .select(range(min(EVAL_SAMPLES, len(dataset))))
    .rename_columns({"title": "question", "abstract": "answer"})
)

# Build the prompt template from the same chat template and system prompt used
# for training. A sentinel marks where the question goes; literal braces in the
# rendered text are escaped so that str.format only sees the two intended slots.
_question_slot = "<<QUESTION>>"
_rendered_prompt = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": instruction},
        {"role": "user", "content": _question_slot},
    ],
    tokenize=False,
    add_generation_prompt=True,
)
# evaluate_model tokenizes with default settings, which add the begin-of-text
# token again, so remove the one the chat template already rendered.
if tokenizer.bos_token and _rendered_prompt.startswith(tokenizer.bos_token):
    _rendered_prompt = _rendered_prompt[len(tokenizer.bos_token):]
alpaca_prompt = (
    _rendered_prompt.replace("{", "{{").replace("}", "}}").replace(_question_slot, "{}")
    + "{}"
)

predictions, references = evaluate_model(model, tokenizer, validation_dataset, alpaca_prompt)

# `predictions` and `references` are lists of strings; score them with a
# text-generation metric. The accuracy/precision/recall modules loaded above
# expect integer class labels.
