In [8]:
from unsloth import FastLanguageModel
import logging

logging.getLogger("unsloth").setLevel(logging.ERROR)

max_seq_length = 4096

ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    load_in_4bit = True,
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
)

==((====))==  Unsloth 2025.12.9: Fast Llama patching. Transformers: 4.55.2. vLLM: 0.10.1.1+381074ae.nv25.9.cu130.
   \\   /|    NVIDIA GeForce RTX 4070. Num GPUs = 1. Max memory: 11.575 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0a0+50eac811a6.nv25.09. CUDA: 8.9. CUDA Toolkit: 13.0. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32+nv25.09. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [9]:
import transformers
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models import DeepEvalBaseLLM
from itertools import chain

class FTModel(DeepEvalBaseLLM):
    def __init__(self):
        self.model = ft_model
        self.tokenizer = ft_tokenizer

    def load_model(self):
        return self.model

    def generate_text(self, prompt):
        return self.generate(prompt)[0]['generated_text']

    def generate(self, prompt):
        model = self.load_model()

        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            device_map="cuda",
            max_length=max_seq_length,
            max_new_tokens=8096,
            do_sample=True,
            top_k=5,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
        )

        return pipeline(prompt)

    def batch_generate(self, prompts):
        if not isinstance(prompts, list):
            raise Exception("prompts is not a list")
            
        result = self.generate(prompts)
        flattened_result = list(chain.from_iterable(result))
        return list(map(lambda x: x['generated_text'], flattened_result))
        

    async def a_generate(self, prompt):
        return self.generate(prompt)


    def get_model_name(self):
        return "Unsloth-1B"

ft_model = FTModel()
ft_model.generate("Hello")

Device set to use cuda:0


[{'generated_text': "Hello, I'm looking for a new gaming PC build for my gaming setup. I'm looking for a system that can handle a mix of games that require a strong GPU, a decent CPU, and plenty of RAM. I'm on a budget, so I'd like to look for deals and discounts.\n\nI've narrowed down my options to a few different options. Here are the specs I'm looking for:\n\n* CPU: AMD Ryzen 5 or Intel Core i5\n* GPU: NVIDIA GeForce RTX 3060 or AMD Radeon RX 6700 XT\n* RAM: 16 GB or 32 GB\n* Storage: 1 TB or 2 TB SSD\n* Power Supply: 650 W or 850 W\n\nI've been doing some research and found a few options that might fit the bill. Here are some potential build options:\n\nOption 1: AMD Ryzen 5 3600, NVIDIA GeForce RTX 3060, 16 GB RAM, 1 TB SSD, 650 W power supply\n\nThis build should be able to handle a mix of games that require a strong GPU, as well as some 3D graphics and general computing tasks.\n\nOption 2: AMD Ryzen 5 5600X, AMD Radeon RX 6700 XT, 32 GB RAM, 2 TB SSD, 850 W power supply\n\nThis 

In [10]:
result = ft_model.batch_generate(["hello", "hi"])
result

Device set to use cuda:0


["hello, I'm interested in buying a home. I've been looking at some properties online and I'm considering the following:\n\n* I have a credit score of 750 or higher.\n* You're a real estate agent with extensive experience in the local market.\n* I'm looking for a home in a desirable neighborhood with good schools and amenities.\n* I'm willing to spend around $500,000 to $700,000 for a home.\n* I'm a self-employed individual with a moderate income.\n\nCan you recommend some homes for sale in the area that fit these criteria? I'd like to see some options with a mix of new construction, existing homes, and townhomes. I'd also appreciate any insights you can provide on the local market trends and potential risks.\n\nSpecifically, I'm interested in the following neighborhoods:\n\n* The one you've highlighted on the website of [RE/MAX agent's company] that features a mix of new construction, existing homes, and townhomes.\n* A neighborhood that has a strong school district, such as [neighbor

In [1]:
from datasets import load_dataset

dataset = load_dataset("yahma/alpaca-cleaned", split="train")

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [23]:
for row in dataset.select(range(10)):
    

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 10
})

In [14]:
from deepeval import assert_test
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval.models import GeminiModel
from deepeval import evaluate
import os

test_model = GeminiModel(
    model="gemini-2.5-flash",
    api_key=os.environ['GEMINI_API_KEY'],
    temperature=0
)

dataset = dataset.shuffle(seed=32)
test_dataset = dataset[:5]
test_cases = []
correctness_metric = GEval(
    model=test_model,
    name="Correctness",
    criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    threshold=0.5
)

for row in test_dataset:
    question = row['instruction']
    answer = row['output']
    ai_answer = ft_model.generate_text(question)   
    test_case = LLMTestCase(
        input=question,
        actual_output=ai_answer,
        expected_output=answer,
    )
    test_cases.append(test_case)


_ = evaluate(test_cases=test_cases, metrics=[correctness_metric])

TypeError: string indices must be integers, not 'str'

In [12]:
from deepeval.benchmarks import MMLU
from deepeval.benchmarks.mmlu.task import MMLUTask

benchmark = MMLU(
    tasks=[MMLUTask.HIGH_SCHOOL_COMPUTER_SCIENCE, MMLUTask.BUSINESS_ETHICS],
    n_shots=3
)
results = benchmark.evaluate(model=ft_model, batch_size=5)
print("Overall Score: ", results)

KeyboardInterrupt: 