In [None]:
%%capture
# Install the released Unsloth package from PyPI (no version pin).
!pip install unsloth
# Then remove it and reinstall the "colab-new" extra straight from the GitHub
# repository, bypassing pip's cache.
# NOTE(review): neither install is pinned, so a re-run may pull different versions.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel
import torch
# Maximum sequence length; passed to from_pretrained below and reused by the trainers.
max_seq_length = 2048
# Forwarded to from_pretrained unchanged (presumably None lets Unsloth choose the dtype — TODO confirm).
dtype = None
# Forwarded to from_pretrained; the checkpoint chosen below is a "bnb-4bit" one.
load_in_4bit = True


# Catalogue of checkpoint names kept for reference; nothing in this cell reads it.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",

    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

# Load the base model and its tokenizer; later cells use both names.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [None]:
# Attach LoRA adapters to the attention and MLP projection modules listed below.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would wrap an already-wrapped model — re-run the loading cell first.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",

    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.11.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


**Data prep**

In [None]:
# You are a teacher grading a quiz. You will be given the expected answers and the answers from a student. Your task is to grade the student out of 10 marks. You will output the score out of 10 marks for each question. Grade the question with higher score if the student's answer overlaps with the expected answer. Ignore differences in punctuation and phrasing between the student's answer and the expected answer. The student's answer is CORRECT if it contains more information than the expected answer, but it should at least cover what's in the expected answer. The order of the items in each answer is also not a problem. Grade the question with lower marks if the student's answer is not factual or doesn't overlap with the expected answer.

In [None]:
from datasets import load_dataset
import pandas as pd
# Load the raw Mohler short-answer grading data.
dataset = pd.read_csv("/content/mohler_dataset_edited.csv")

# Keep only the averaged grade and double it so it matches the
# "Score out of 10" wording of the prompt
# (assumes the raw grades are on a 0-5 scale — TODO confirm).
dataset = dataset.drop(columns=['id', 'score_me', 'score_other'])
dataset['score_avg'] = dataset['score_avg'] * 2

# Placeholder column; the following cell removes it again after reloading the CSV.
dataset['text'] = 0

# Write the file only AFTER the transformations. Previously it was written
# first, so the CSV that is reloaded for training still held the unscaled
# scores and the columns that were meant to be dropped.
dataset.to_csv('modified_mohler_dataset.csv', index=False)
dataset

Unnamed: 0,question,desired_answer,student_answer,score_avg,text
0,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,High risk problems are address in the prototyp...,7.00,0
1,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,To simulate portions of the desired final prod...,10.00,0
2,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,A prototype program simulates the behaviors of...,8.00,0
3,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,Defined in the Specification phase a prototype...,10.00,0
4,What is the role of a prototype program in pro...,To simulate the behaviour of portions of the d...,It is used to let the users have a first idea ...,6.00,0
...,...,...,...,...,...
2268,How many steps does it take to search a node i...,The height of the tree.,log n,9.50,0
2269,How many steps does it take to search a node i...,The height of the tree.,( n(n-1) ) / 2,3.00,0
2270,How many steps does it take to search a node i...,The height of the tree.,2n-1,4.75,0
2271,How many steps does it take to search a node i...,The height of the tree.,"it takes at most h steps, where h is the heigh...",10.00,0


In [None]:
import pandas as pd
from datasets import Dataset

# Reload the CSV written by the previous cell and discard its placeholder
# 'text' column in a single chained expression (no in-place mutation).
dataset = (
    pd.read_csv("/content/modified_mohler_dataset.csv")
    .drop(columns=['text'])
)

# Convert the frame to a Hugging Face Dataset.
dataset_hf = Dataset.from_pandas(dataset)


In [None]:
# Alpaca-style prompt template. It has four positional `{}` slots, filled in
# order with: question, expected answer, student answer, score.
# Passing an empty string for the score leaves the prompt open-ended so the
# model can complete it at inference time.
alpaca_prompt = """You are a teacher grading a quiz. You will be given the question, expected answers and the answer from a student. Your task is to assign some score to the student out of 10 marks. You will output the score out of 10 marks for each question. Grade the question with higher score if the student's answer overlaps with the expected answer. Ignore differences in punctuation and phrasing between the student's answer and the expected answer. The student's answer is CORRECT if it contains more information than the expected answer, but it should at least cover what's in the expected answer. The order of the items in each answer is also not a problem. Grade the question with lower marks if the student's answer is not factual or doesn't overlap with the expected answer.

### Question:
{}

### Expected Answer:
{}

### Student Answer:
{}

### Score out of 10:
{}"""

# Function to format the prompts
def formatting_prompts_func(examples):
    """Render one batch of rows into full training prompts.

    Args:
        examples: Batch mapping with the columns "question", "desired_answer",
            "student_answer" and "score_avg", each a list of equal length
            (what `Dataset.map(..., batched=True)` passes in).

    Returns:
        dict with the single key "text": one filled-in `alpaca_prompt` per
        row, terminated by the tokenizer's EOS token.
    """
    # Terminate every sample with EOS, exactly as formatting_prompts_func1
    # does further down; without it the training text has no end marker
    # after the score.
    eos_token = tokenizer.eos_token

    texts = [
        alpaca_prompt.format(question, expected_answer, student_answer, score) + eos_token
        for question, expected_answer, student_answer, score in zip(
            examples["question"],
            examples["desired_answer"],
            examples["student_answer"],
            examples["score_avg"],
        )
    ]
    return {"text": texts}

# Apply formatting to the dataset
dataset_hf = dataset_hf.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/2273 [00:00<?, ? examples/s]

In [None]:
# Split the dataset into training (80%) and testing (20%).
# Split once and take both halves from the same result, instead of repeating
# the identical seeded split for each half.
mohler_split = dataset_hf.train_test_split(test_size=0.2, seed=42)
train_dataset = mohler_split["train"]
test_dataset = mohler_split["test"]


In [None]:
# Spot check: student answer of the first training example.
train_dataset[0]["student_answer"]

'Constructor, functions, and variables native to the class.'

In [None]:
# Spot check: reference answer of the sixth test example.
test_dataset[5]["desired_answer"]

'A node that has no children.'

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# First trainer configuration (60 steps, no evaluation schedule).
# NOTE(review): the next cell rebinds `trainer` before any `train()` call, so
# this object is never trained — consider deleting this cell.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/1818 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/455 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Trainer for the first fine-tuning run, with validation every `eval_steps`.
# It replaces the `trainer` object created in the previous cell.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,  # Set the training dataset
    eval_dataset = test_dataset,     # Set the validation dataset
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps alone bounds the run. The former `num_train_epochs = 4`
        # was ignored ("max_steps is given, it will override any value given
        # in num_train_epochs"), so it has been removed.
        max_steps = 250,  # increase this to make the model learn "better"
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
        eval_strategy = "steps",  # Perform validation every eval_steps
        eval_steps = 10,  # Change this to control how often validation runs
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

#trainer_stats = trainer.train()



Map (num_proc=2):   0%|          | 0/1818 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/455 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
#@title Show current memory stats
# Snapshot taken before training: device properties and the peak memory
# reserved so far, both converted from bytes to GB (1024**3 bytes).
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = Tesla T4. Max memory = 14.748 GB.
9.125 GB of memory reserved.


In [None]:
# Run the first fine-tuning job; the returned object's `metrics` are read by
# the statistics cell below.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,818 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 250
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
10,0.3407,0.372742
20,0.3611,0.33245
30,0.2786,0.297003
40,0.2437,0.276998
50,0.3154,0.263216
60,0.1326,0.252448
70,0.2462,0.247606
80,0.2448,0.24385
90,0.2731,0.241446
100,0.1909,0.238081


In [None]:
#@title Show final memory and time stats
# Peak reserved memory after training, and the part added on top of the
# pre-training snapshot (`start_gpu_memory`).
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the card's total memory (`max_memory`).
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

4672.4478 seconds used for training.
77.87 minutes used for training.
Peak reserved memory = 9.338 GB.
Peak reserved memory for training = 0.213 GB.
Peak reserved memory % of max memory = 63.317 %.
Peak reserved memory for training % of max memory = 1.444 %.


In [None]:
# Hand-written sample used to probe the fine-tuned grader.
question = "Briefly describe in one sentence how does merge sort work?"
# Reference ("expected") answer.
key = "It splits the original array into two, sorts each of the two halves, and then merges the sorted arrays."
# Student answer to be graded.
student = "merge sort splits the array of elements into smaller arrays until the arrays reach size 1 and then the merge sort merges the smaller arrays into arrays of size 2 then it moves to the next step and merges the next arrays."
# Left empty so the score slot of the prompt is filled in by the model.
score = ""

In [None]:
from transformers import TextStreamer

# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill the template with the sample; the empty `score` leaves the last slot
# open for the model to complete.
inputs = tokenizer(
[
    alpaca_prompt.format(
        question,
        key,
        student,
        score,
    )
], return_tensors = "pt").to("cuda")

# Stream the generated token to the cell output without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

# A grade should be reproducible: decode greedily instead of passing
# temperature 1.5 / min_p, which can return a different mark on every run.
# Only one new token is generated, as before.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1,
                   use_cache = True, do_sample = False)

5


In [None]:
from datasets import load_dataset
import pandas as pd
# Second grading dataset (the file name suggests SciEntsBank), read from the
# Colab filesystem.
dataset1 = pd.read_csv("/content/modified_SciEntsBank_dataset.csv")
#dataset1.drop(columns=['text'], inplace=True)

In [None]:
#dataset1['text'] = 0

In [None]:
# Display the frame to inspect its columns.
dataset1

Unnamed: 0,question,reference_answer,student_answer,label,score,text
0,You used several methods to separate and ident...,"The water was evaporated, leaving the salt.",By letting it sit in a dish for a day.,3,0,0
1,You used several methods to separate and ident...,"The water was evaporated, leaving the salt.",Let the water evaporate and the salt is left b...,0,10,0
2,You used several methods to separate and ident...,"The water was evaporated, leaving the salt.",The water evaporated and left salt crystals.,0,10,0
3,You used several methods to separate and ident...,"The water was evaporated, leaving the salt.",I saw a pinkish grayish color that was blockin...,3,0,0
4,You used several methods to separate and ident...,"The water was evaporated, leaving the salt.",You have to slowly tip the vial for only the w...,3,0,0
...,...,...,...,...,...,...
4964,"When conducting a controlled experiment, why d...",A standard is used for comparison to determine...,To see if the standard might be faster than th...,2,7,0
4965,"When conducting a controlled experiment, why d...",A standard is used for comparison to determine...,Because you can compare the results.,2,7,0
4966,"When conducting a controlled experiment, why d...",A standard is used for comparison to determine...,To help control the controlled experiment.,3,0,0
4967,"When conducting a controlled experiment, why d...",A standard is used for comparison to determine...,To control any experiment.,3,0,0


In [None]:
from datasets import Dataset
#dataset1.drop(columns=['text'], inplace=True)
# Convert the frame to a Hugging Face Dataset; the placeholder 'text' column is
# kept here and only receives prompt strings in the later formatting cell.
dataset_hf1 = Dataset.from_pandas(dataset1)

In [None]:
# Show the feature names and row count of the converted dataset.
dataset_hf1

Dataset({
    features: ['question', 'reference_answer', 'student_answer', 'label', 'score', 'text'],
    num_rows: 4969
})

In [None]:
# Split the dataset into training (80%) and testing (20%).
# Split once and take both halves from the same result, instead of repeating
# the identical seeded split for each half.
# NOTE(review): this runs before the prompt-formatting cell, so the 'text'
# column of both halves still holds the placeholder value.
scientsbank_split = dataset_hf1.train_test_split(test_size=0.2, seed=42)
train_dataset1 = scientsbank_split["train"]
test_dataset1 = scientsbank_split["test"]


In [None]:
# Print the first row of each split as a sanity check.
print(train_dataset1[0])
print(test_dataset1[0])


{'question': 'Gerry used a paper filter to separate a mixture of sand, salt, and water. Think about particle size and answer this question: Why did the salt go through the filter while the sand got caught?', 'reference_answer': 'The dissolved salt particles are small enough to go through the holes in the filter paper, but the sand particles are too large.', 'student_answer': 'The salt was small enough to go through the holes while the sand was too big to go through.', 'label': 0, 'score': 10, 'text': 0}
{'question': "Kurt was investigating which objects stick to magnets. He made an entry in his science notebook and drew a picture to help explain what he did. Look what I did! I picked up a paperclip with a magnet. Then that paperclip picked up another one, and then another one. And they weren't hooked together either. All they had to do was touch each other. Explain to Kurt why he was able to pick up all those paperclips, even though the magnet was only touching the first one.", 'refere

In [None]:
#@title Show current memory stats
# Snapshot taken before the second training run; stored under the `...1`
# names so the first run's values are not overwritten.
gpu_stats1 = torch.cuda.get_device_properties(0)
max_memory1 = round(gpu_stats1.total_memory / 1024**3, 3)
start_gpu_memory1 = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(
    f"GPU = {gpu_stats1.name}. Max memory = {max_memory1} GB.",
    f"{start_gpu_memory1} GB of memory reserved.",
    sep="\n",
)

GPU = Tesla T4. Max memory = 14.748 GB.
9.338 GB of memory reserved.


In [None]:
# End-of-sequence marker appended to every training text below.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func1(examples):
    """Build the training text for one batch of rows.

    Args:
        examples: Batch mapping with the columns "question",
            "reference_answer", "student_answer" and "score", each a list of
            equal length.

    Returns:
        dict with the single key "text": one filled-in `alpaca_prompt` per
        row, followed by `EOS_TOKEN`.
    """
    # Each zipped row supplies the four template slots in order.
    rows = zip(
        examples["question"],
        examples["reference_answer"],
        examples["student_answer"],
        examples["score"],
    )
    return {"text": [alpaca_prompt.format(*row) + EOS_TOKEN for row in rows]}


# Replace the placeholder 'text' column with the rendered prompts.
dataset_hf1 = dataset_hf1.map(formatting_prompts_func1, batched = True)

Map:   0%|          | 0/4969 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# `FastLanguageModel.for_inference(model)` was called in the earlier
# generation cell; switch the model back to training mode before fine-tuning
# it again.
FastLanguageModel.for_training(model)

# Second fine-tuning run on the same model object.
# NOTE(review): this trains on ALL rows of `dataset_hf1`; the train/test split
# made earlier is not used and no evaluation set is attached.
trainer1 = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_hf1,
    #eval_dataset=test_dataset1,
    dataset_text_field="text",  # Ensure this field contains strings
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # max_steps alone bounds the run; the former `num_train_epochs=1` was
        # overridden by it and has been removed.
        max_steps=500,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/4969 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the second fine-tuning job; the returned object's `metrics` are read by
# the statistics cell below.
trainer_stats1 = trainer1.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,969 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 500
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.8466
2,1.6256
3,2.0339
4,1.7848
5,1.9043
6,1.8254
7,1.5246
8,1.3305
9,1.6354
10,1.5651


In [None]:
#@title Show final memory and time stats
# Statistics for the SECOND training run. The baseline is the snapshot taken
# right before it (`start_gpu_memory1` / `max_memory1`). Using the first run's
# `start_gpu_memory` overstated the memory attributed to this run.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory1, 3)
used_percentage = round(used_memory         /max_memory1*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory1*100, 3)
print(f"{trainer_stats1.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats1.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2652.5349 seconds used for training.
44.21 minutes used for training.
Peak reserved memory = 13.346 GB.
Peak reserved memory for training = 4.221 GB.
Peak reserved memory % of max memory = 90.494 %.
Peak reserved memory for training % of max memory = 28.621 %.


In [None]:
# Same hand-written sample as before, re-used to probe the model after the
# second training run.
question = "Briefly describe in one sentence how does merge sort work?"
# Reference ("expected") answer.
key = "It splits the original array into two, sorts each of the two halves, and then merges the sorted arrays."
# Student answer to be graded.
student = "merge sort splits the array of elements into smaller arrays until the arrays reach size 1 and then the merge sort merges the smaller arrays into arrays of size 2 then it moves to the next step and merges the next arrays."
# Left empty so the score slot of the prompt is filled in by the model.
score = ""

In [None]:
from transformers import TextStreamer

# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill the template with the sample; the empty `score` leaves the last slot
# open for the model to complete.
inputs = tokenizer(
[
    alpaca_prompt.format(
        question,
        key,
        student,
        score,
    )
], return_tensors = "pt").to("cuda")

# Stream the generated token to the cell output without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

# A grade should be reproducible: decode greedily instead of passing
# temperature 1.5 / min_p, which can return a different mark on every run.
# Only one new token is generated, as before.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1,
                   use_cache = True, do_sample = False)

5


In [None]:
import os
from getpass import getpass

# model.save_pretrained("lora_model") # Local saving
# Never keep an access token as a literal in the notebook: read it from the
# HF_TOKEN environment variable, or prompt for it without echoing.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
model.push_to_hub("rohand8/Final", token = hf_token) # Online saving

README.md:   0%|          | 0.00/574 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/rohand8/Final


In [None]:
import os
from getpass import getpass

# Upload the tokenizer files to the same Hub repository as the model.
# The token is read from the HF_TOKEN environment variable (or prompted for)
# instead of being written into the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
tokenizer.push_to_hub("rohand8/Final", token=hf_token)

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Not executed

In [None]:
# Save the model and the tokenizer files to the local "lora_model" directory.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
# Export to GGUF under "model" with the "f16" quantization method; per the log
# below, the 4-bit and LoRA weights are merged to 16-bit first.
model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.79 out of 12.67 RAM for saving.


 47%|████▋     | 15/32 [00:02<00:01,  8.51it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:43<00:00,  3.24s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving model/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at model into f16 GGUF format.
The output location will be /content/model/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.float1

In [None]:
#model.save_pretrained("lora_model")
#tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [None]:
from transformers import TextStreamer

# Optional reload of the files saved under "lora_model". Disabled by the
# `if False` guard; flip it to True to load from disk instead of re-using the
# in-memory `model` / `tokenizer`.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", max_seq_length = max_seq_length,
        dtype = dtype, load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# A single-turn chat, rendered to token ids through the tokenizer's chat template.
messages = [
    dict(role = "user", content = "Describe a tall tower in the capital of France."),
]
inputs = tokenizer.apply_chat_template(
    messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt",
)
inputs = inputs.to("cuda")

# Stream up to 128 new tokens to the cell output without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids = inputs,
    streamer = text_streamer,
    max_new_tokens = 128,
    use_cache = True,
    temperature = 1.5,
    min_p = 0.1,
)

The Eiffel Tower, located in the heart of Paris, stands tall among the city's historic and cultural landmarks. This iron structure, standing at an impressive 324 meters high, offers breathtaking views of the City of Light's iconic landscape. The Eiffel Tower was built for the 1889 World's Fair and has since become a symbol of French engineering and culture.<|eot_id|>


In [None]:
# GGUF export with the "f16" quantization method.
# NOTE(review): identical to the export call in the earlier save cell.
model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")