In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found under the Kaggle input directory.
# The generator flattens the (directory, files) pairs that os.walk yields,
# preserving the traversal order.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)



In [None]:

!pip install -q -U transformers accelerate bitsandbytes

# Clone and build llama.cpp from source
!git clone https://github.com/ggml-org/llama.cpp
%cd llama.cpp
!cmake -B build
!cmake --build build --config Release -j 8
%cd /kaggle/working

In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Baseline benchmark: load the TinyLlama chat model with default weights and time a
# single generate() call of up to 100 new tokens.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")

prompt = "Explain Coronavirus."
# Send the inputs to wherever device_map="auto" actually placed the model.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Drain pending GPU work so it is not attributed to the timed region.
if device == "cuda":
    torch.cuda.synchronize()
start = time.perf_counter()  # monotonic, high-resolution clock for intervals
outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
if device == "cuda":
    torch.cuda.synchronize()
end = time.perf_counter()

# generate() returns prompt + continuation; count only the newly generated tokens
# so the throughput figure is not inflated by the prompt length.
tokens_generated = outputs.shape[1] - inputs["input_ids"].shape[1]
print(f"Latency: {round(end - start, 3)}s")
print(f"Tokens/sec: {round(tokens_generated / (end - start), 2)}")

In [None]:
import time
import torch
import psutil
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit (bitsandbytes NF4) benchmark: load the model from a local Kaggle dataset path
# and measure latency, throughput, and peak GPU memory for one generate() call.
model_id = "/kaggle/input/model-int4/int4"

# 1. Quantization settings: 4-bit NF4 weights with float16 compute.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 2. Load the model with the quantization config object.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quant_config,
)

prompt = "Explain the importance of open-source AI."
# Send the inputs to wherever device_map="auto" placed the model instead of
# assuming a fixed device string.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# --- Measurement Logic ---
# Reset the peak-memory counter so the reported figure covers this run only,
# and drain pending GPU work before starting the clock.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
start_time = time.perf_counter()  # monotonic, high-resolution clock for intervals

# Generate
outputs = model.generate(**inputs, max_new_tokens=100)

# Wait for outstanding CUDA kernels so the full generation time is captured.
torch.cuda.synchronize()
end_time = time.perf_counter()

# Calculations
latency = end_time - start_time
# Output contains prompt + continuation; subtract the prompt length.
tokens_gen = outputs.shape[1] - inputs.input_ids.shape[1]
# Peak allocated bytes converted with 1024**3 (i.e. GiB).
vram = torch.cuda.max_memory_allocated() / 1024**3

print(f"\n Latency: {latency:.2f}s")
print(f" Tokens/sec: {tokens_gen / latency:.2f}")
print(f" Max VRAM Usage: {vram:.2f} GB")

In [None]:
import time
from llama_cpp import Llama

# GGUF Q4_0 benchmark via llama-cpp-python: load the model, run one completion,
# and report latency and throughput.

# 1. Path to the Q4_0 GGUF model file in the Kaggle input dataset
gguf_path = "/kaggle/input/modelsday4/model-q4_0.gguf"

# 2. Load Model
# n_gpu_layers=-1 requests that all layers be offloaded to the GPU.
# NOTE(review): offload only happens if llama-cpp-python was built with GPU support.
llm = Llama(
    model_path=gguf_path,
    n_gpu_layers=-1,
    n_ctx=2048,
    verbose=False,  # Keeps the output clean
)

# 3. Define Prompt and Measure
prompt = "Explain the coronavirus"
formatted_prompt = f"Q: {prompt} A:"

# NOTE(review): "\n" is a stop sequence, so generation may end well before
# max_tokens; throughput is then based on a short completion -- confirm intended.
start_time = time.perf_counter()  # monotonic, high-resolution clock for intervals
response = llm(
    formatted_prompt,
    max_tokens=100,
    stop=["Q:", "\n"],
    echo=False,
)
end_time = time.perf_counter()

# 4. Calculations
latency = end_time - start_time
# Token count is taken from the usage block of the completion response.
tokens_generated = response["usage"]["completion_tokens"]
tps = tokens_generated / latency

print("--- GGUF Q4_0 Performance ---")
print(f" Latency: {latency:.3f}s")
print(f" Tokens/sec: {tps:.2f}")
print(f" Output: {response['choices'][0]['text'].strip()}")

# Note: VRAM usage for GGUF is best checked via the 'nvidia-smi' command
# in a separate cell while the model is loaded.

In [None]:


# GGUF Q4_0 benchmark via llama-cpp-python with the "coronavirus disease" prompt:
# load the model, run one completion, and report latency and throughput.

# Import what this cell uses so it does not depend on another cell having run first.
import time
from llama_cpp import Llama

# 1. Path to the Q4_0 GGUF model file in the Kaggle input dataset
gguf_path = "/kaggle/input/modelsday4/model-q4_0.gguf"

# 2. Load Model
# n_gpu_layers=-1 requests that all layers be offloaded to the GPU.
# NOTE(review): offload only happens if llama-cpp-python was built with GPU support.
llm = Llama(
    model_path=gguf_path,
    n_gpu_layers=-1,
    n_ctx=2048,
    verbose=False,  # Keeps the output clean
)

# 3. Define Prompt and Measure
prompt = "Explain the coronavirus disease"
formatted_prompt = f"Q: {prompt} A:"

# NOTE(review): "\n" is a stop sequence, so generation may end well before
# max_tokens; throughput is then based on a short completion -- confirm intended.
start_time = time.perf_counter()  # monotonic, high-resolution clock for intervals
response = llm(
    formatted_prompt,
    max_tokens=100,
    stop=["Q:", "\n"],
    echo=False,
)
end_time = time.perf_counter()

# 4. Calculations
latency = end_time - start_time
# Token count is taken from the usage block of the completion response.
tokens_generated = response["usage"]["completion_tokens"]
tps = tokens_generated / latency

print("--- GGUF Q4_0 Performance ---")
print(f"Latency: {latency:.3f}s")
print(f" Tokens/sec: {tps:.2f}")
print(f" Output: {response['choices'][0]['text'].strip()}")

# Note: VRAM usage for GGUF is best checked via the 'nvidia-smi' command
# in a separate cell while the model is loaded.