In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Install core libraries
!pip install -q -U transformers accelerate bitsandbytes

# Clone and build llama.cpp from source
!git clone https://github.com/ggml-org/llama.cpp
%cd llama.cpp
!cmake -B build
!cmake --build build --config Release -j 8
%cd /kaggle/working

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 5.0.0 which is incompatible.
gradio 5.49.1 requires pydantic<2.12,>=2.0, but you have pydantic 2.12.5 which is incompatible.[0m[31m
[0mCloning into 'llama.cpp'...
remote: Enumerating objects: 77455, done

In [2]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")

prompt = "Explain Coronavirus."
inputs = tokenizer(prompt, return_tensors="pt").to(device)

start = time.time()
outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
end = time.time()

tokens_generated = outputs.shape[1]
print(f"Latency: {round(end - start, 3)}s")
print(f"Tokens/sec: {round(tokens_generated / (end - start), 2)}")

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Latency: 5.987s
Tokens/sec: 18.04


In [20]:
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "/kaggle/input/model-int4/int4" 

# 1. Define the quantization config properly
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 2. Load model with the config object
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    device_map="auto", 
    quantization_config=quant_config # Use the config object here
)

prompt = "Explain the importance of open-source AI."
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# --- Measurement Logic ---
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
start_time = time.time()

# Generate
outputs = model.generate(**inputs, max_new_tokens=100)

end_time = time.time()

# Calculations
latency = end_time - start_time
tokens_gen = outputs.shape[1] - inputs.input_ids.shape[1]
vram = torch.cuda.max_memory_allocated() / 1024**3 

print(f"\n✅ Latency: {latency:.2f}s")
print(f"✅ Tokens/sec: {tokens_gen / latency:.2f}")
print(f"✅ Max VRAM Usage: {vram:.2f} GB")



Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]


✅ Latency: 0.09s
✅ Tokens/sec: 10.82
✅ Max VRAM Usage: 1.68 GB


In [23]:
import time
from llama_cpp import Llama

# 1. Setup Path (Based on your file browser screenshot)
gguf_path = "/kaggle/input/modelsday4/model-q4_0.gguf"

# 2. Load Model
# n_gpu_layers=-1 moves all layers to the GPU (T4 on Kaggle)
llm = Llama(
    model_path=gguf_path,
    n_gpu_layers=-1, 
    n_ctx=2048,
    verbose=False # Keeps the output clean
)

# 3. Define Prompt and Measure
prompt = "Explain the coronavirus"
formatted_prompt = f"Q: {prompt} A:"

start_time = time.time()
response = llm(
    formatted_prompt,
    max_tokens=100,
    stop=["Q:", "\n"],
    echo=False
)
end_time = time.time()

# 4. Calculations
latency = end_time - start_time
tokens_generated = response["usage"]["completion_tokens"]
tps = tokens_generated / latency

print(f"--- GGUF Q4_0 Performance ---")
print(f"✅ Latency: {latency:.3f}s")
print(f"✅ Tokens/sec: {tps:.2f}")
print(f"✅ Output: {response['choices'][0]['text'].strip()}")

# Note: VRAM usage for GGUF is best checked via the 'nvidia-smi' command 
# in a separate cell while the model is loaded.

llama_model_loader: loaded meta data with 32 key-value pairs and 201 tensors from /kaggle/input/modelsday4/model-q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                    output.weight q6_K     [  2048, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  2048,     1,     1,     1 ]
llama_model_loader: - tensor    2:                token_embd.weight q4_0     [  2048, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_k.weight q4_0     [  2048,   256,     1,     1 ]
llama_model_loader: - tensor    4:           blk.0.attn_norm.weight f32      [  2048,     1,     1,     1 ]
llama_model_loader: - tensor    5:         blk.0.attn_output.weight q4_0     [  2048,  2048,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_q.weight q4_0     [  2048,  2048,     1,     1 ]
llama_model_loader: - tensor    7:              blk.0.attn_v.weight q4_0     [  2048,   256,  

--- GGUF Q4_0 Performance ---
✅ Latency: 1.759s
✅ Tokens/sec: 11.37
✅ Output: Coronavirus Disease 2019 (COVID-19)


In [24]:


# 1. Setup Path (Based on your file browser screenshot)
gguf_path = "/kaggle/input/modelsday4/model-q4_0.gguf"

# 2. Load Model
# n_gpu_layers=-1 moves all layers to the GPU (T4 on Kaggle)
llm = Llama(
    model_path=gguf_path,
    n_gpu_layers=-1, 
    n_ctx=2048,
    verbose=False # Keeps the output clean
)

# 3. Define Prompt and Measure
prompt = "Explain the coronavirus disease"
formatted_prompt = f"Q: {prompt} A:"

start_time = time.time()
response = llm(
    formatted_prompt,
    max_tokens=100,
    stop=["Q:", "\n"],
    echo=False
)
end_time = time.time()

# 4. Calculations
latency = end_time - start_time
tokens_generated = response["usage"]["completion_tokens"]
tps = tokens_generated / latency

print(f"--- GGUF Q4_0 Performance ---")
print(f"✅ Latency: {latency:.3f}s")
print(f"✅ Tokens/sec: {tps:.2f}")
print(f"✅ Output: {response['choices'][0]['text'].strip()}")

# Note: VRAM usage for GGUF is best checked via the 'nvidia-smi' command 
# in a separate cell while the model is loaded.

llama_model_loader: loaded meta data with 32 key-value pairs and 201 tensors from /kaggle/input/modelsday4/model-q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                    output.weight q6_K     [  2048, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  2048,     1,     1,     1 ]
llama_model_loader: - tensor    2:                token_embd.weight q4_0     [  2048, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_k.weight q4_0     [  2048,   256,     1,     1 ]
llama_model_loader: - tensor    4:           blk.0.attn_norm.weight f32      [  2048,     1,     1,     1 ]
llama_model_loader: - tensor    5:         blk.0.attn_output.weight q4_0     [  2048,  2048,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_q.weight q4_0     [  2048,  2048,     1,     1 ]
llama_model_loader: - tensor    7:              blk.0.attn_v.weight q4_0     [  2048,   256,  

--- GGUF Q4_0 Performance ---
✅ Latency: 6.981s
✅ Tokens/sec: 14.32
✅ Output: Coronavirus disease is a respiratory illness. It typically causes fever, cough, and difficulty breathing or shortness of breath. It can also cause pneumonia. Most people infected with COVID-19 will recover without medical treatment. However, it's important to take precautions to prevent the virus from spreading so you can get better quickly. This includes avoiding close contact with people who are sick and wearing a face mask or covering in
