In [None]:
# Download Dataset from HF if needed. (Optional)

from datasets import load_dataset, Dataset
from itertools import islice

# Load the dataset in streaming mode (doesn't download all 4 GB)
streamed = load_dataset("HuggingFaceTB/smoltalk", "all", split="train", streaming=True)

# Take first N examples (adjust until file size ≈100MB)
subset = Dataset.from_list(list(islice(streamed, 10000)))

# Save properly as JSONL (preserves nested "messages" as real lists)
subset.to_json("smoltalk-small.jsonl", orient="records", lines=True)


Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

40812519

In [None]:
!pip install -q trl huggingface peft transformers datasets accelerate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/544.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/544.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q -U bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import torch

print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.current_device(), torch.cuda.get_device_name())

CUDA available: True
Device: 0 NVIDIA A100-SXM4-40GB


In [None]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Base model
MODEL_PATH = "microsoft/phi-2"

# Load your CSV dataset
dataset = load_dataset(
    "csv",
    data_files="/content/medquad.csv",
    split="train",
    delimiter=",",
    column_names=["question", "answer", "source", "focus_area"]
)

# Convert Q&A into conversation format
def format_conversation(example):
    q = example["question"] if example["question"] is not None else ""
    a = example["answer"] if example["answer"] is not None else ""
    conversation = f"User: {q.strip()}\nAssistant: {a.strip()}"
    return {"text": conversation + tokenizer.eos_token}


# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# If tokenizer has no pad token, set it
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Apply formatting
dataset = dataset.map(format_conversation)

# Tokenize the formatted text
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=1024,   # adjust if using longer context model
    )

tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset.column_names,
    num_proc=4
)

# Quantization config for 4-bit training/inference
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model in 4-bit
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",
)

print(tokenized_dataset[0])  # sanity check


Map:   0%|          | 0/16413 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/16413 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

{'input_ids': [12982, 25, 1808, 198, 48902, 25, 3280, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 5

In [None]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


sft_config = SFTConfig(
    output_dir="Checkpoints",
    per_device_train_batch_size=8,     # ✅ Adjust to your GPU
    gradient_accumulation_steps=2,     # ✅ If small VRAM
    num_train_epochs=3,                # ✅ Choose as needed
    logging_steps=100,
    save_steps=100,
    save_total_limit=2,
    fp16=True,                        # ✅ if using bfloat16
    remove_unused_columns=False,
    dataloader_num_workers = 4,     # ✅ prevents data drop
)


trainer = SFTTrainer(
    model,
    train_dataset=tokenized_dataset,
    args=sft_config,
    peft_config=peft_config
)

trainer.train()

Truncating train dataset:   0%|          | 0/16413 [00:00<?, ? examples/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdeveloper_ashish[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,6.0937
200,1.305
300,0.4783
400,0.4655
500,0.4417
600,0.4307
700,0.4265
800,0.409
900,0.4016
1000,0.3745


TrainOutput(global_step=3078, training_loss=0.5770512284048972, metrics={'train_runtime': 11980.7993, 'train_samples_per_second': 4.11, 'train_steps_per_second': 0.257, 'total_flos': 8.036488989219226e+17, 'train_loss': 0.5770512284048972, 'entropy': 0.3274805367183991, 'num_tokens': 50420736.0, 'mean_token_accuracy': 0.9244438803348786, 'epoch': 3.0})

In [None]:
OUTPUT_MODEL_PATH = f"{MODEL_PATH}-SFT"

# Save only LoRA adapter weights
peft_model = trainer.model
peft_model.save_pretrained(f"{OUTPUT_MODEL_PATH}", safe_serialization=True)

# Save tokenizer (only once is fine)
trainer.tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('microsoft/phi-2-SFT/tokenizer_config.json',
 'microsoft/phi-2-SFT/special_tokens_map.json',
 'microsoft/phi-2-SFT/vocab.json',
 'microsoft/phi-2-SFT/merges.txt',
 'microsoft/phi-2-SFT/added_tokens.json',
 'microsoft/phi-2-SFT/tokenizer.json')

In [None]:
print(type(trainer.model))

<class 'peft.peft_model.PeftModelForCausalLM'>


In [None]:
print(isinstance(trainer.model, PeftModel))

True


In [None]:
# Optional: If you need to save full model incase
full_model = trainer.model.merge_and_unload()  # merges LoRA into base
full_model.save_pretrained("FULL-MODEL")



In [None]:
# Push to huggingface
!git config --global credential.helper store
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The tok

In [None]:
LORA_ADAPTER_PATH = "devashish07/phi-2-healthcare-qlora"
peft_model.push_to_hub(LORA_ADAPTER_PATH)
trainer.tokenizer.push_to_hub(LORA_ADAPTER_PATH)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...px4nlci1j/adapter_model.safetensors: 100%|##########| 31.5MB / 31.5MB            

No files have been modified since last commit. Skipping to prevent empty commit.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/devashish07/phi-2-healthcare-qlora/commit/2f29316d26642db8cf01c4a948be5c2936d32bda', commit_message='Upload tokenizer', commit_description='', oid='2f29316d26642db8cf01c4a948be5c2936d32bda', pr_url=None, repo_url=RepoUrl('https://huggingface.co/devashish07/phi-2-healthcare-qlora', endpoint='https://huggingface.co', repo_type='model', repo_id='devashish07/phi-2-healthcare-qlora'), pr_revision=None, pr_num=None)

In [None]:
# Inferencing
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch

BASE_MODEL = "microsoft/phi-2"
LORA_ADAPTER_PATH = "devashish07/phi-2-healthcare-qlora"
TOKENIZER_PATH = "devashish07/phi-2-healthcare-qlora"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)


# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_PATH)
model.eval()

# Text streamer for real-time printing (optional)
streamer = TextStreamer(tokenizer)

# Prompt
prompt = """User: What is (are) Glaucoma ?.
Assistant:"""

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        temperature=0.1,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

# Decode
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print("\n\n[Model's Full Output]\n", response)


# Strip prompt to get only the assistant's response
assistant_reply = response.split("Assistant:")[-1].strip()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/31.5M [00:00<?, ?B/s]

User: What is (are) Glaucoma?.
Assistant: Glaucoma is a group of eye diseases that damage the optic nerve. The optic nerve is a bundle of nerve fibers that carries images from the retina to the brain. Damage to the optic nerve can cause vision loss and blindness. Glaucoma is the leading cause of blindness in the United States.
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
                
        

In [None]:
print("\n🧠 Assistant's Reply:\n", assistant_reply)


🧠 Assistant's Reply:
 Glaucoma is a group of eye diseases that damage the optic nerve. The optic nerve is a bundle of nerve fibers that carries images from the retina to the brain. Damage to the optic nerve can cause vision loss and blindness. Glaucoma is the leading cause of blindness in the United States.


In [None]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
from peft import PeftModel
import torch
import threading

# -------------------------------
# Load Model and Tokenizer
# -------------------------------
BASE_MODEL = "microsoft/phi-2"
LORA_ADAPTER_PATH = "devashish07/phi-2-healthcare-qlora"
TOKENIZER_PATH = "devashish07/phi-2-healthcare-qlora"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_PATH)
model = model.to(base_model.device)
model.eval()

# -------------------------------
# Generator-based Streaming Chat
# -------------------------------
def chat_stream(user_input, history):
    history = history or []

    # Reconstruct full prompt
    formatted_prompt = ""
    for message in history:
        role = message["role"]
        content = message["content"]
        if role == "user":
            formatted_prompt += f"User: {content}\n"
        elif role == "assistant":
            formatted_prompt += f"Assistant: {content}\n"
    formatted_prompt += f"User: {user_input}\nAssistant:"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": 500,
        "temperature": 0.5,
        "top_p": 0.9,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }

    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_output = ""
    for new_text in streamer:
        partial_output += new_text
        updated_history = history + [
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": partial_output},
        ]
        yield updated_history, updated_history

# -------------------------------
# Gradio Interface
# -------------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🦙 TinyLlama Chat — Real-time like ChatGPT")

    chatbot = gr.Chatbot(label="Chat", height=500, render_markdown=False, type="messages")
    msg = gr.Textbox(placeholder="Ask something...", show_label=False)
    clear = gr.Button("🧹 Clear")

    state = gr.State([])

    msg.submit(chat_stream, [msg, state], [chatbot, state])
    msg.submit(lambda _: "", msg, msg)  # Clear input box only
    clear.click(lambda: ([], []), None, [chatbot, state])  # Clear everything

demo.launch()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5c6dbf7897b580e9b4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Convert to TensorRT via ONNX
# 1. Install Deps

# 2. Load your LoRA adapter from Hugging Face Hub and merge them into a full Hugging Face model.

# 3. Export to ONNX.

# 4. Quanticized to ONNX-INT8 model

# 5. Tar it to model.tar.gz and Store in S3

# 6. Converting to TensorRT will do in runtime in first request

In [None]:
# ONNX Runtime GenAI (why we used this) for PHI-2

# ONNX Runtime GenAI is NVIDIA + Microsoft’s toolkit built specifically for LLMs.

# Advantages:

# Knows about KV cache → exports models ready for autoregressive decoding.

# Integrates with TensorRT EP out of the box.

# Adds builder scripts with presets for text-generation tasks (so you don’t have to hand-craft torch.onnx.export calls).

# Keeps up with new LLM architectures like PHI-2 faster than vanilla PyTorch ONNX export.

In [None]:
! pip install -q transformers accelerate peft bitsandbytes
! pip install -q huggingface-hub onnx_ir
! pip install -q --upgrade onnxruntime onnxruntime-tools onnxruntime-genai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.4/124.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Step 2. Load base + adapter then save the merged model.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "microsoft/phi-2"
ADAPTER_REPO = "devashish07/phi-2-healthcare-qlora"   # 👈 your HuggingFace repo with adapter

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype="auto", device_map="auto")

# Load adapters
model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)

# Merge adapter into base weights
model = model.merge_and_unload()
model.save_pretrained("./phi2-healthcare-merged")
tokenizer.save_pretrained("./phi2-healthcare-merged")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/31.5M [00:00<?, ?B/s]

('./phi2-healthcare-merged/tokenizer_config.json',
 './phi2-healthcare-merged/special_tokens_map.json',
 './phi2-healthcare-merged/vocab.json',
 './phi2-healthcare-merged/merges.txt',
 './phi2-healthcare-merged/added_tokens.json',
 './phi2-healthcare-merged/tokenizer.json')

In [None]:
# Just checking the arguments it has
!python -m onnxruntime_genai.models.builder --help

2025-09-01 15:58:23,544 numexpr.utils [INFO] - NumExpr defaulting to 12 threads.
usage: builder.py [-h] [-m MODEL_NAME] [-i INPUT] -o OUTPUT -p
                  {int4,bf16,fp16,fp32} -e
                  {cpu,cuda,rocm,dml,webgpu,NvTensorRtRtx} [-c CACHE_DIR]
                  [--extra_options KEY=VALUE [KEY=VALUE ...]]

options:
  -h, --help            show this help message and exit
  -m MODEL_NAME, --model_name MODEL_NAME
                        Model name in Hugging Face. Do not use if providing an input path to a Hugging Face directory in -i/--input.
  -i INPUT, --input INPUT
                        Input model source. Currently supported options are:
                            hf_path: Path to folder on disk containing the Hugging Face config, model, tokenizer, etc.
                            gguf_path: Path to float16/float32 GGUF file on disk containing the GGUF model
  -o OUTPUT, --output OUTPUT
                        Path to folder to store ONNX model and additional files

In [None]:
# Step 3. Export to ONNX -> This will create onnx_model/model.onnx

import subprocess

model_path = "/content/phi2-healthcare-merged"   # your saved merged LoRA+base model
onnx_out = "/content/phi2-onnx"             # where ONNX will be written

cmd = [
    "python", "-m", "onnxruntime_genai.models.builder",
    "-m", model_path,
    "-o", onnx_out,
    "-p", "fp16",
    "-e", "cuda",
]

print("Running:", " ".join(cmd))
subprocess.run(cmd, check=True)

print("✅ Export complete. ONNX model saved to:", onnx_out)


Running: python -m onnxruntime_genai.models.builder -m /content/phi2-healthcare-merged -o /content/phi2-onnx -p fp16 -e cuda
✅ Export complete. ONNX model saved to: /content/phi2-onnx


In [None]:
# Step 4. Quantize ONNX model (Post-training quantization).
# model-int8.onnx → fully quantized graph (int8 ops).
# Keeps accuracy decent, reduces memory, speeds inference.


from onnxruntime.quantization import quantize_dynamic, QuantType

model_fp16 = "/content/phi2-onnx/model.onnx"
model_int8 = "/content/phi2-onnx/model-int8.onnx"

quantize_dynamic(
    model_input=model_fp16,
    model_output=model_int8,
    weight_type=QuantType.QInt8,  # or QuantType.QUInt8
    per_channel=True
)

print("✅ Quantized model saved:", model_int8)



✅ Quantized model saved: /content/phi2-onnx/model-int8.onnx


In [None]:
# Step 5. Tar it to model.tar.gz and Store in S3

!tar -czvf model.tar.gz \
    /content/phi2-onnx/model-int8.onnx \
    /content/phi2-healthcare-merged/tokenizer.json \
    /content/phi2-healthcare-merged/tokenizer_config.json \
    /content/phi2-healthcare-merged/vocab.json \
    /content/phi2-healthcare-merged/special_tokens_map.json


tar: Removing leading `/' from member names
/content/phi2-onnx/model-int8.onnx
tar: Removing leading `/' from hard link targets
/content/phi2-healthcare-merged/tokenizer.json
/content/phi2-healthcare-merged/tokenizer_config.json
/content/phi2-healthcare-merged/vocab.json
/content/phi2-healthcare-merged/special_tokens_map.json


In [None]:
!pip install awscli
!aws configure

AWS Access Key ID [****************WIZA]: AKIASPP67LNANLBMXJXW 
AWS Secret Access Key [****************ssZ3]: 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
Default region name [ap-south-1]: 
Default output format [Json]: json


In [None]:
# Upload to S3
!aws s3 cp model.tar.gz s3://voiceai-s3-bucket-03/phi2-onnx-int8-model/

upload: ./model.tar.gz to s3://voiceai-s3-bucket-03/phi2-onnx-int8-model/model.tar.gz
