In [None]:
import os
import subprocess
import sys

import torch

# Install llama-cpp-python into the environment of the interpreter running
# this kernel (sys.executable), rather than whatever `pip` is first on PATH.
#
# llama-cpp-python does not define a "cuda" extra: the GPU backend is chosen
# when the package is compiled, through the CMAKE_ARGS environment variable.
# Without it, pip builds the CPU-only backend even on a GPU machine.
#
# NOTE(review): --force-reinstall also reinstalls dependencies (e.g. numpy),
# which can replace versions other already-imported packages were built
# against. Restart the kernel after this cell if imports start failing.
print("Installing llama-cpp-python...")

use_cuda = torch.cuda.is_available()

pip_command = [
    sys.executable, "-m", "pip", "install",
    "llama-cpp-python",
    "--force-reinstall",
    "--no-cache-dir",
]

build_env = os.environ.copy()
if use_cuda:
    # Preserve any CMAKE_ARGS the user already exported and append the CUDA flag.
    existing_cmake_args = build_env.get("CMAKE_ARGS", "")
    build_env["CMAKE_ARGS"] = f"{existing_cmake_args} -DGGML_CUDA=on".strip()

try:
    # check=True turns a non-zero pip exit status into CalledProcessError, so a
    # failed build can no longer be reported as a successful installation.
    subprocess.run(pip_command, env=build_env, check=True)
except (subprocess.CalledProcessError, OSError) as e:
    print(f"Error installing llama-cpp-python: {e}")
    print("Please ensure your environment is set up correctly for pip installations.")
    # Re-raise instead of exit(): exit() shuts the notebook kernel down and
    # hides the traceback; raising stops "Run All" at this cell.
    raise

if use_cuda:
    print("Installed llama-cpp-python with CUDA support.")
else:
    print("Installed llama-cpp-python (CPU-only).")



Installing llama-cpp-python...
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.14.tar.gz (51.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m231.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.met

In [None]:
import os

import torch  # imported here so this cell does not rely on the install cell having run
from llama_cpp import Llama

# Location of the quantized GGUF checkpoint to load.
GGUF_MODEL_PATH = "/kaggle/input/gemma-2b-finetuned-gguf/gguf/default/1/merged_2B_model.Q8_0.gguf" # Example path, adjust as needed!

# Load the model into `llm`, which the inference cell uses.
# Any failure is reported and then re-raised so that execution stops here
# with a traceback instead of shutting the kernel down via exit().
print(f"\nLoading GGUF model from: {GGUF_MODEL_PATH}...")
try:
    # Fail early with a clear message rather than a low-level loader error.
    if not os.path.exists(GGUF_MODEL_PATH):
        raise FileNotFoundError(f"GGUF model not found at: {GGUF_MODEL_PATH}")

    llm = Llama(
        model_path=GGUF_MODEL_PATH,
        # -1 requests offloading every layer to the GPU; 0 keeps the model on CPU.
        # Offloading only takes effect if llama-cpp-python was built with a GPU backend.
        n_gpu_layers=-1 if torch.cuda.is_available() else 0,
        n_ctx=2048,  # context window size in tokens (prompt + generated text)
        verbose=False,  # suppress llama.cpp's loader/perf logging; set True when debugging
    )
    print("GGUF model loaded successfully!")

except FileNotFoundError as e:
    print(f"Error: {e}")
    raise
except Exception as e:
    print(f"Error loading GGUF model: {e}")
    raise



llama_model_loader: loaded meta data with 40 key-value pairs and 727 tensors from /kaggle/input/gemma-2b-finetuned-gguf/gguf/default/1/merged_2B_model.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma3n
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = merged_2B_model
llama_model_loader: - kv   3:                       general.quantized_by str              = Unsloth
llama_model_loader: - kv   4:                         general.size_label str              = 4.5B
llama_model_loader: - kv   5:                           general.repo_url str              = https://huggingface.co/unsloth
llama_model_loader: - kv   6:                               general.tags arr[str,2]     


Loading GGUF model from: /kaggle/input/gemma-2b-finetuned-gguf/gguf/default/1/merged_2B_model.Q8_0.gguf...


llama_model_loader: - kv  28:                      tokenizer.ggml.tokens arr[str,262144]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
llama_model_loader: - kv  29:                      tokenizer.ggml.scores arr[f32,262144]  = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv  30:                  tokenizer.ggml.token_type arr[i32,262144]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 2
llama_model_loader: - kv  32:                tokenizer.ggml.eos_token_id u32              = 106
llama_model_loader: - kv  33:            tokenizer.ggml.unknown_token_id u32              = 3
llama_model_loader: - kv  34:            tokenizer.ggml.padding_token_id u32              = 0
llama_model_loader: - kv  35:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  36:               tokenizer.ggml.add_sep_token bool             = false
llama_model_loader: - kv  37

In [None]:
def build_gemma_prompt(system_prompt: str, user_prompt: str) -> str:
    """Format a single-turn prompt using Gemma's turn markers.

    Gemma's chat format only defines the ``user`` and ``model`` roles; there is
    no ``system`` turn. System-level instructions are therefore placed at the
    start of the user turn, separated from the question by a blank line.

    Args:
        system_prompt: Instructions describing how the model should behave.
            May be empty, in which case only the user text is sent.
        user_prompt: The question or request from the user.

    Returns:
        The prompt text, ending with an open ``model`` turn for the model to
        complete.
    """
    instructions = system_prompt.strip()
    user_turn = f"{instructions}\n\n{user_prompt}" if instructions else user_prompt
    return (
        f"<start_of_turn>user\n{user_turn}<end_of_turn>\n"
        "<start_of_turn>model\n"
    )


print("\nPerforming inference...")
try:
    system_prompt = ("You are a cheerful and friendly tutor for children aged 5 to 7."
    " Use simple words and fun metaphors to explain things clearly. Be playful and keep "
    "answers short and exciting. You can use characters like 'sugar bugs' or 'energy monsters' "
    "to make it fun.")
    user_prompt = "Why do we brush our teeth?"

    # Format to the template of Gemma models (user/model turns only).
    formatted_prompt = build_gemma_prompt(system_prompt, user_prompt)

    print(f"\nPrompt for the Model: {user_prompt}")

    output = llm.create_completion(
        formatted_prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        # Stop at the end of the model's turn so it does not start a new one.
        stop=["<end_of_turn>"],
        # Return only the completion; echoing the prompt and then removing it
        # with str.replace() is an unnecessary and fragile round-trip.
        echo=False,
    )
    response_only = output["choices"][0]["text"].strip()
    print("\nModel's Response Only:")
    print(response_only)
except Exception as e:
    # Best-effort demo cell: report the failure without aborting the notebook.
    print(f"Error during inference: {e}")


Performing inference...

Prompt for the Model: Why do we brush our teeth?


Llama.generate: 59 prefix-match hit, remaining 17 prompt tokens to eval
llama_perf_context_print:        load time =    1148.37 ms
llama_perf_context_print: prompt eval time =    1050.80 ms /    17 tokens (   61.81 ms per token,    16.18 tokens per second)
llama_perf_context_print:        eval time =   12631.00 ms /    65 runs   (  194.32 ms per token,     5.15 tokens per second)
llama_perf_context_print:       total time =   13867.16 ms /    82 tokens



Model's Response Only:
Brushing our teeth is super important for keeping them healthy and shiny! It removes the tiny bits of food that get stuck in your teeth all day, and it also removes the bad guys called bacteria that can cause cavities (holes) and a bad breath. It's like giving your teeth a super clean bath every day!
