In [None]:
# Install core libraries
!pip install -q -U transformers accelerate bitsandbytes

# Clone and build llama.cpp from source
!git clone https://github.com/ggml-org/llama.cpp
%cd llama.cpp
!cmake -B build
!cmake --build build --config Release -j 8
%cd /kaggle/working

In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Baseline: time a single generate() call of the Hugging Face model.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")

prompt = "Explain KV."
# device_map="auto" decides where the weights live, so follow the model's own
# device instead of assuming it matches the `device` string above.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_length = inputs["input_ids"].shape[1]

# Make sure no pending GPU work is attributed to (or left out of) the timing.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
if torch.cuda.is_available():
    torch.cuda.synchronize()
end = time.perf_counter()

# generate() returns prompt + continuation; only the continuation was produced
# during the timed region, so the prompt tokens must not count as throughput.
tokens_generated = outputs.shape[1] - prompt_length
elapsed = end - start
print(f"Latency: {round(elapsed, 3)}s")
print(f"Tokens/sec: {round(tokens_generated / elapsed, 2)}")

In [None]:
def run_cmd_fp16(prompt, max_new_tokens=120):
    """Generate a chat answer with the FP16 model and report its throughput.

    The prompt is wrapped in a fixed health-focused system message, rendered
    through the tokenizer's chat template and passed to ``fp16_model.generate``.
    The generation speed is printed as ``Tokens/sec``.

    Relies on the notebook-level names ``tokenizer``, ``device`` and
    ``fp16_model``.
    NOTE(review): ``fp16_model`` is not defined in the cells visible here;
    confirm an earlier cell creates it before this function is called.

    Args:
        prompt: User question to send to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt and special tokens excluded).
    """
    messages = [
        {"role": "system", "content": "You are a health-focused language model. Provide clear, factual, and concise medical explanations."},
        {"role": "user", "content": prompt}
    ]

    # return_dict=True makes the return type explicit (input_ids plus
    # attention_mask) instead of depending on the library default, and lets
    # generate() receive the attention mask.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    start = time.perf_counter()
    with torch.no_grad():
        outputs = fp16_model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    end = time.perf_counter()

    # Decode only the new tokens
    generated_ids = outputs[0][prompt_length:]
    text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    print(f"Tokens/sec: {len(generated_ids) / (end - start):.2f}")

    # Previously the decoded answer was computed and then dropped.
    return text

In [None]:
# Note: Ensure path to your GGUF file is correct
GGUF_PATH = "/kaggle/input/modelsday4/gguf/phi3-mini-4k-instruct-q4_k_m.gguf"
PROMPT = "Explain coronavirus disease, including transmission, symptoms, and prevention."

!./llama.cpp/build/bin/llama-cli \
  -m {GGUF_PATH} \
  -p "{PROMPT}" \
  -n 256 \
  -ngl 33  # Offload layers to GPU