<a href="https://colab.research.google.com/github/simecek/mlprague2024/blob/main/05_Gemma7B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Environment setup: install Unsloth and its companion packages, picking the
# package list from the GPU's CUDA compute capability.
# The %%capture magic above silences everything this cell prints (pip logs included).
import torch
# get_device_capability() is unpacked as a (major, minor) pair; only the major
# number is used below to tell newer GPUs from older ones.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    # Same list as the branch below, plus packaging, ninja and flash-attn.
    !pip install --no-deps packaging ninja flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
# No-op statement; it has no effect on the installation.
pass

In [None]:
from unsloth import FastLanguageModel
import torch
# Tunable loading options, kept as separate names so they are easy to change.
max_seq_length = 512   # sequence length handed to the loader
dtype = None           # NOTE(review): presumably lets the loader pick the dtype - confirm in the Unsloth docs
load_in_4bit = True    # the checkpoint below is a bnb-4bit build

# Every keyword argument for the loader, gathered in one place.
load_kwargs = dict(
    model_name = "unsloth/gemma-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# from_pretrained returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

==((====))==  Unsloth: Fast Gemma patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [None]:
from datasets import load_dataset

# Load the simecek/synczech50 dataset and keep only its 'train' split.
synczech50 = load_dataset('simecek/synczech50')['train']

def get_prompt(x):
    """Render one quiz record as a multiple-choice prompt.

    Args:
        x: Mapping with the keys 'question', 'optionA', 'optionB',
            'optionC' and 'optionD'.

    Returns:
        A dict with the single key 'text' holding the prompt: the
        instruction, the question, the four lettered options and a final
        answer cue, separated by blank lines.
    """
    question = x['question']
    labelled = zip("ABCD", (x['optionA'], x['optionB'], x['optionC'], x['optionD']))
    option_lines = "\n".join(f"{letter}) {choice}" for letter, choice in labelled)

    # Empty entries become the blank lines between the prompt sections.
    parts = [
        "Which option A/B/C/D is the best answer for the question. Answer with one letter, no explanation.",
        "",
        f"Question (in Czech): {question}",
        "",
        "Options:",
        option_lines,
        "",
        "Answer (just 1 letter, A/B/C/D):",
    ]
    return {"text": "\n".join(parts)}

# Sanity check: show the prompt generated for the first record.
print(get_prompt(synczech50[0])['text'])

Which option A/B/C/D is the best answer for the question. Answer with one letter, no explanation.

Question (in Czech): Co je hlavní město České republiky?

Options:
A) Brno
B) Praha
C) Ostrava
D) Plzeň

Answer (just 1 letter, A/B/C/D):


In [None]:
# Apply get_prompt to each record individually (batched = False); the returned
# 'text' entry becomes a new column on the dataset.
synczech50 = synczech50.map(get_prompt, batched = False,)

# Display the dataset summary to confirm the 'text' column is present.
synczech50

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'optionA', 'optionB', 'optionC', 'optionD', 'solution', 'text'],
    num_rows: 50
})

<a name="Inference"></a>
### Inference
Let's run the model on each of the 50 prompts! The first letter of the text generated after the prompt is taken as the model's answer.

In [None]:
# Prepare the loaded model for generation before the evaluation loop runs.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
from tqdm.notebook import tqdm

# One predicted letter per question, in dataset order. An entry is "" when the
# model produced no usable text within the token budget.
answers = []

for x in tqdm(synczech50):
  inputs = tokenizer([x['text']], return_tensors = "pt").to("cuda")
  outputs = model.generate(**inputs, max_new_tokens = 4, use_cache = True)
  # generate() returns the prompt tokens followed by the continuation. Decode
  # only the continuation, so a ":" inside the generated text cannot shift the
  # parsed answer and special tokens such as <eos> are not mistaken for a letter.
  prompt_length = inputs["input_ids"].shape[1]
  completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True)
  # The first non-whitespace character of the continuation is the answer letter.
  answer = completion.strip()[:1].upper()
  answers.append(answer)

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Reference letters, one per question, in the same order as `answers`.
solutions = [x['solution'] for x in synczech50]

# Number of predictions that match the reference letter exactly.
correct = sum(answer == solution for answer, solution in zip(answers, solutions))

# Accuracy (fraction of questions answered correctly) recorded in earlier runs:
# 60% for unsloth/gemma-7b-bnb-4bit
# 60% for unsloth/mistral-7b-instruct-v0.2-bnb-4bit
# 66% for unsloth/llama-3-8b-bnb-4bit
correct / len(answers)

0.6