In [7]:
MODEL_NAME = "unsloth/gemma-2-9b-bnb-4bit"
INSTRUCTION_TEXT = "I will provide question and incorrect answer. Identify the misconception behind the incorrect answer."
INPUT_TEXT = "Question:\nConvert fractions less than 1 to terminating decimals of 2 decimal places Converting between Fractions and Decimals Convert \( \frac{3}{20} \) into a decimal.\n\nAnswer:\n3.20"

In [8]:
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.9.post3: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla V100-PCIE-16GB. Max memory: 15.773 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [3]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {
        "text": texts,
    }

In [4]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            INSTRUCTION_TEXT,  # instruction
            INPUT_TEXT,  # input
            "",  # output - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

AUTOTUNE bmm(16x98x256, 16x256x98)
  triton_bmm_7 0.0154 ms 100.0%
  triton_bmm_3 0.0164 ms 93.7%
  triton_bmm_6 0.0174 ms 88.2%
  triton_bmm_2 0.0184 ms 83.3%
  bmm 0.0195 ms 78.9%
  triton_bmm_0 0.0195 ms 78.9%
  triton_bmm_14 0.0195 ms 78.9%
  triton_bmm_10 0.0205 ms 75.0%
  triton_bmm_5 0.0215 ms 71.4%
  triton_bmm_1 0.0225 ms 68.2%
SingleProcess AUTOTUNE benchmarking takes 2.1388 seconds and 0.0294 seconds precompiling
AUTOTUNE bmm(16x98x98, 16x98x256)
  bmm 0.0133 ms 100.0%
  triton_bmm_33 0.0143 ms 92.9%
  triton_bmm_19 0.0154 ms 86.7%
  triton_bmm_24 0.0154 ms 86.7%
  triton_bmm_25 0.0154 ms 86.7%
  triton_bmm_29 0.0154 ms 86.7%
  triton_bmm_32 0.0164 ms 81.2%
  triton_bmm_26 0.0184 ms 72.2%
  triton_bmm_28 0.0184 ms 72.2%
  triton_bmm_34 0.0184 ms 72.2%
SingleProcess AUTOTUNE benchmarking takes 2.1543 seconds and 0.0017 seconds precompiling


['<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nIdentify the misconception behind Incorrect Answer.\n\n### Input:\nQuestion:\nConvert fractions less than 1 to terminating decimals of 2 decimal places Converting between Fractions and Decimals Convert \\( \x0crac{3}{20} \\) into a decimal\n\n.Answer:\n3.20\n\n### Response:\nThe misconception behind Incorrect Answer is that the student did not understand how to convert a fraction to a decimal. The student did not divide the numerator by the denominator to get the decimal equivalent.<eos>']