In [1]:
!pip install -q transformers peft bitsandbytes accelerate sentencepiece
!pip install -q llama-cpp-python

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os
import time

# Report which accelerator this session is running on.
cuda_ok = torch.cuda.is_available()
print(f"CUDA Available: {cuda_ok}")
# get_device_name(0) raises when no CUDA device exists, so only query it on a GPU runtime.
print(f"GPU: {torch.cuda.get_device_name(0) if cuda_ok else 'none (CPU runtime)'}")
print("All libraries imported ")

CUDA Available: True
GPU: Tesla T4
All libraries imported 


In [3]:
!pip install -q --upgrade peft

In [5]:
from peft import PeftModel
import os

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Step 1: fetch the base checkpoint in half precision with automatic device placement.
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, device_map="auto", torch_dtype=torch.float16
)

# The tokenizer comes from the same repo; its EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token

# Step 2: attach the adapter found under /content, then merge it and unwrap the PEFT model.
print("Merging Day 2 adapters...")
model = PeftModel.from_pretrained(model, "/content").merge_and_unload()

# Step 3: store model and tokenizer in the same folder so it can be loaded by path later.
print("Saving merged model...")
os.makedirs("/content/quantized", exist_ok=True)
model.save_pretrained("/content/quantized/model-fp16")
tokenizer.save_pretrained("/content/quantized/model-fp16")

print("Merged model saved ✅")

Loading base model...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Merging Day 2 adapters...
Saving merged model...


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Merged model saved ✅


In [6]:
from transformers import BitsAndBytesConfig
import torch

# bitsandbytes settings: quantise to 8-bit while the merged FP16 checkpoint is loaded.
int8_config = BitsAndBytesConfig(load_in_8bit=True)

model_int8 = AutoModelForCausalLM.from_pretrained(
    "/content/quantized/model-fp16", device_map="auto", quantization_config=int8_config
)

# Save the quantised weights together with the tokenizer in their own folder.
model_int8.save_pretrained("/content/quantized/model-int8")
tokenizer.save_pretrained("/content/quantized/model-int8")

print("INT8 model saved ")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

INT8 model saved 


In [7]:
# bitsandbytes settings: 4-bit NF4 storage, with float16 as the compute dtype.
int4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

model_int4 = AutoModelForCausalLM.from_pretrained(
    "/content/quantized/model-fp16", device_map="auto", quantization_config=int4_config
)

# Save the quantised weights together with the tokenizer in their own folder.
model_int4.save_pretrained("/content/quantized/model-int4")
tokenizer.save_pretrained("/content/quantized/model-int4")

print("INT4 model saved ")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

INT4 model saved 


In [8]:
!git clone https://github.com/ggerganov/llama.cpp
!pip install -q -r llama.cpp/requirements.txt

print("llama.cpp ready ")

Cloning into 'llama.cpp'...
remote: Enumerating objects: 80923, done.[K
remote: Counting objects: 100% (163/163), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 80923 (delta 108), reused 60 (delta 60), pack-reused 80760 (from 2)[K
Receiving objects: 100% (80923/80923), 306.36 MiB | 27.11 MiB/s, done.
Resolving deltas: 100% (58440/58440), done.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31

In [10]:
import os, time, torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Shared state for the benchmark cells: results table, fixed prompt, and the tokenizer
# saved alongside the merged FP16 checkpoint.
results = {}
prompt = "What is insulin?"
tokenizer = AutoTokenizer.from_pretrained("/content/quantized/model-fp16")

def measure(name, model, path, warmup_runs=0):
    """Record the on-disk size and generation latency of one checkpoint.

    Args:
        name: Label used as the key in ``results`` and in the printed summary.
        model: Loaded model exposing ``.device`` and ``.generate``.
        path: Directory whose top-level files are summed for the size figure
            (sub-directories are not descended into).
        warmup_runs: Number of untimed generations to run before the timed one.
            The default of 0 keeps the original single-run behaviour; pass 1 or
            more to keep one-off start-up cost out of the measurement.

    Side effects:
        Stores ``(size_in_GB, seconds)`` under ``results[name]`` and prints one line.
        "Speed" is the wall-clock time of a single greedy generation of at most
        80 new tokens for the module-level ``prompt``.
    """
    size_bytes = sum(entry.stat().st_size for entry in os.scandir(path) if entry.is_file())
    size = round(size_bytes / 1e9, 2)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    gen_kwargs = dict(max_new_tokens=80, do_sample=False, pad_token_id=tokenizer.eos_token_id)
    device = torch.device(model.device)
    on_cuda = device.type == "cuda" and torch.cuda.is_available()

    def _sync():
        # CUDA kernels are launched asynchronously; wait for them so the clock
        # brackets the actual work rather than just the launches.
        if on_cuda:
            torch.cuda.synchronize(device)

    with torch.no_grad():
        for _ in range(warmup_runs):
            model.generate(**inputs, **gen_kwargs)
        _sync()
        # perf_counter is the monotonic, high-resolution clock meant for intervals.
        start = time.perf_counter()
        model.generate(**inputs, **gen_kwargs)
        _sync()
        speed = round(time.perf_counter() - start, 2)

    results[name] = (size, speed)
    print(f"{name} — Size: {size}GB | Speed: {speed}s ")

import gc

# One benchmark pass per saved checkpoint. The INT8 and INT4 folders were written from
# already-quantised models, so their stored quantisation settings are used on load;
# passing a fresh BitsAndBytesConfig here is redundant (and, for INT4, did not even
# match the nf4/float16 settings the checkpoint was created with).
for label, ckpt_dir, load_kwargs in (
    ("FP16", "/content/quantized/model-fp16", {"torch_dtype": torch.float16}),
    ("INT8", "/content/quantized/model-int8", {}),
    ("INT4", "/content/quantized/model-int4", {}),
):
    m = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map="auto", **load_kwargs)
    measure(label, m, ckpt_dir)
    # Drop the reference, collect any reference cycles, then release cached GPU
    # memory before the next checkpoint is loaded.
    del m
    gc.collect()
    torch.cuda.empty_cache()

print("\n Size & Speed measured for FP16, INT8, INT4!")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

FP16 — Size: 2.2GB | Speed: 5.34s 


  def supports_quant_method(quantization_config_dict):


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

INT8 — Size: 1.24GB | Speed: 8.94s 


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

INT4 — Size: 0.81GB | Speed: 3.48s 

 Size & Speed measured for FP16, INT8, INT4!


In [12]:
from llama_cpp import Llama

# Benchmark the GGUF file through llama.cpp with the same prompt and token budget
# as the transformers runs.
gguf = Llama(model_path="/content/quantized/model.gguf", n_ctx=512, verbose=False)
gguf_size = round(os.path.getsize("/content/quantized/model.gguf") / 1e9, 2)
# perf_counter is the monotonic clock meant for measuring intervals.
start = time.perf_counter()
# temperature=0.0 selects greedy decoding, matching do_sample=False in the transformers
# runs; with the default sampling temperature the output length (and therefore the
# measured time) changes from run to run.
gguf(prompt, max_tokens=80, temperature=0.0)
gguf_speed = round(time.perf_counter() - start, 2)
del gguf

print(f"GGUF — Size: {gguf_size}GB | Speed: {gguf_speed}s ")

llama_context: n_ctx_per_seq (512) < n_ctx_train (2048) -- the full capacity of the model will not be utilized


GGUF — Size: 1.17GB | Speed: 0.66s 


In [21]:
# Chat-style prompt (system / user / assistant turns), noted by the author as the exact
# format of the Day 2 ChatDoctor dataset. Turns are joined with single newlines and
# nothing follows the final <|assistant|> tag.
chat_prompt = "\n".join(
    [
        "<|system|>",
        "If you are a doctor, please answer the medical questions based on the patient's description.</s>",
        "<|user|>",
        "I have been feeling very thirsty and urinating frequently. What could be wrong?</s>",
        "<|assistant|>",
    ]
)

def get_response(model):
    """Generate a reply to ``chat_prompt`` with ``model`` and return only the new text.

    Decoding is greedy with a repetition penalty of 1.3 and at most 150 new tokens.
    The prompt tokens are cut off the front of the generated sequence before decoding;
    special tokens are skipped and surrounding whitespace is stripped.
    """
    encoded = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]
    generation_args = dict(
        max_new_tokens=150,
        do_sample=False,
        repetition_penalty=1.3,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_args)
    # Keep only the tokens produced after the prompt.
    completion_ids = sequences[0][prompt_len:]
    reply = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return reply.strip()

import gc

# Collect one answer per saved checkpoint. The INT8 and INT4 folders were written from
# already-quantised models, so their stored quantisation settings are used on load and
# no BitsAndBytesConfig has to be passed again.
responses = {}
for label, ckpt_dir, load_kwargs in (
    ("FP16", "/content/quantized/model-fp16", {"torch_dtype": torch.float16}),
    ("INT8", "/content/quantized/model-int8", {}),
    ("INT4", "/content/quantized/model-int4", {}),
):
    m = AutoModelForCausalLM.from_pretrained(ckpt_dir, device_map="auto", **load_kwargs)
    responses[label] = get_response(m)
    # Drop the reference, collect any reference cycles, then release cached GPU
    # memory before the next checkpoint is loaded.
    del m
    gc.collect()
    torch.cuda.empty_cache()
    print(f"{label} done ")

# Keep the per-variant names that the printing code below relies on.
fp16_response = responses["FP16"]
int8_response = responses["INT8"]
int4_response = responses["INT4"]

# GGUF
gguf = Llama(model_path="/content/quantized/model.gguf", n_ctx=512, verbose=False)
# temperature=0.0 selects greedy decoding so this answer is produced under the same
# decoding rule (do_sample=False) as the transformers answers it is compared with.
gguf_output = gguf(chat_prompt, max_tokens=150, repeat_penalty=1.3, temperature=0.0)
gguf_response = gguf_output["choices"][0]["text"].strip()
del gguf
print("GGUF done ")

# Show the four answers one after another, each under its variant label.
print("\n Responses (Day 2 Dataset Format):")
for heading, answer in (
    ("FP16", fp16_response),
    ("INT8", int8_response),
    ("INT4", int4_response),
    ("GGUF", gguf_response),
):
    print(f"\n{heading}:\n{answer}")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

FP16 done 


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

INT8 done 


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

INT4 done 


llama_context: n_ctx_per_seq (512) < n_ctx_train (2048) -- the full capacity of the model will not be utilized


GGUF done 

 Responses (Day 2 Dataset Format):

FP16:
Hi! Thanks for asking Chat Doctor . I understand your concern. Thirst is one of the most common symptoms in dehydration. It can happen due to various reasons like low blood volume or electrolyte imbalance. If it persists then there may be an underlying problem which needs further investigation by a healthcare professional. You should consult with a physician if this continues after proper hydration. In case of acute dehydration, you need to drink plenty of fluids (water) along with salt tablets. Avoid caffeine as much as possible. Take iodized salt regularly. Consulting a physician will help determine what exactly is causing your condition.

INT8:
Hi! Thanks for asking Chat Doctor. I understand your concern. You may need to take some medications like water pills or diuretics if you feel that there is an excessive amount of fluid in body. This can help reduce symptoms such as frequent urination and increased thirst. If this does not 