In [17]:
!pip install numpy transformers>=4.43 datasets>=2.19 accelerate>=0.33 peft>=0.12 bitsandbytes jinja2 trl>=0.9


In [18]:
# baseline_infer_tinyllama.py
# ---
# Minimal, stable baseline generation for TinyLlama-1.1B-Chat-v1.0
# Uses the model's chat template + safer decoding settings
# Works on CPU, Apple Silicon (MPS), or NVIDIA GPU.

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint to load; the MODEL_ID environment variable overrides the default.
MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Start from the CPU fallback (no explicit dtype), then upgrade to an
# accelerator with float16 weights when one is available: CUDA first, then MPS.
DEVICE = "cpu"
TORCH_DTYPE = None
if torch.cuda.is_available():
    DEVICE = "cuda"
    TORCH_DTYPE = torch.float16
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    DEVICE = "mps"
    TORCH_DTYPE = torch.float16

def build_messages_api_doc(endpoint_block: str):
    """
    Assemble the chat for the API-doc task as a list of two role/content dicts.

    The first message is a fixed system prompt describing the doc template;
    the second is a user message asking for an API doc page, with
    ``endpoint_block`` appended on the line after the request.
    """
    system_prompt = (
        "You are a precise technical writer. Follow this API doc template strictly:\n"
        "Summary\nEndpoint\nParameters (markdown table)\nResponses\n"
        "Do not invent extra turns. Keep it concise and consistent."
    )
    user_prompt = f"Write an API doc page.\n{endpoint_block}"

    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": user_prompt}
    return [system_turn, user_turn]

def generate_api_doc(endpoint_block: str, max_new_tokens: int = 320) -> str:
    """
    Generate an API doc page for ``endpoint_block`` with the base model.

    The tokenizer and model for MODEL_ID are loaded on every call, the prompt is
    rendered through the tokenizer's chat template, and a completion of at most
    ``max_new_tokens`` tokens is sampled. Only the newly generated tokens are
    decoded; the text is then cut at the first turn/section marker and stripped.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    if tok.pad_token is None:
        # No pad token configured: reuse EOS.
        tok.pad_token = tok.eos_token

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=TORCH_DTYPE)
    model = model.to(DEVICE).eval()

    # Render system + user turns with the chat template, ending with the
    # generation prompt so the model continues as the assistant.
    chat = build_messages_api_doc(endpoint_block)
    prompt_text = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tok(prompt_text, return_tensors="pt")
    model_inputs = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
    n_prompt_tokens = model_inputs["input_ids"].shape[-1]

    sampling = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.5,         # lower = more structured
        top_p=0.9,
        repetition_penalty=1.1,  # discourages repeating tags/lines
        pad_token_id=tok.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**model_inputs, **sampling)

    # The returned sequence starts with the prompt; decode only what follows it.
    completion = tok.decode(sequences[0][n_prompt_tokens:], skip_special_tokens=True)

    # Keep the text before the first marker that looks like a new section/turn.
    # partition() leaves the text unchanged when the marker is absent.
    for marker in ("\n[", "\n<System", "\nUser:", "\nAssistant:"):
        completion = completion.partition(marker)[0]
    return completion.strip()

if __name__ == "__main__":
    # Demo run: one refund-endpoint spec, fields separated by "; ".
    endpoint = "; ".join(
        [
            "Endpoint: POST /v1/refunds",
            "Auth: Bearer token",
            "Params: payment_id (string, required), amount (integer, optional)",
            "Response: 201 {refund}, 400, 401",
        ]
    )
    # Print the banner first so it is visible while generation is running.
    print("\n=== BASELINE (TinyLlama chat template) ===\n")
    print(generate_api_doc(endpoint))



=== BASELINE (TinyLlama chat template) ===

[API Doc Page]

Name: Refund API

Description: This API allows you to create, retrieve, update, and delete refunds for your products.

Endpoint: /v1/refunds

Method: POST

Auth: Bearer token

Required Params:
- payment_id (string, required): The unique identifier of the payment method used to make the purchase.
- amount (integer, optional): The total amount due for the refund. If omitted, the default value is 0.

Optional Params:
- currency (string, optional): The currency code used to convert the amount to USD. Defaults to "USD".
- description (string, optional): A human-readable description of the reason for the refund.

Response:
- 201: Created - Indicates that the refund was successfully created.
- 400: Bad Request - If any of the parameters provided in the request are invalid.
- 401: Unauthorized - If the user does not have sufficient permissions to access the requested resource.

Example Request:
```
curl -X POST \
  https://api.exampl

In [19]:
# prompt_templates.py

# System prompts, one per domain.

# Technical writing: lists the doc template sections, one per line.
SYS_TECH = "\n".join(
    (
        "You are a precise technical writer. Follow this API doc template strictly:",
        "Summary",
        "Endpoint",
        "Parameters (markdown table)",
        "Responses",
        "Do not invent extra turns. Keep it concise and consistent.",
    )
)

# Legal drafting.
SYS_LEGAL = (
    "You are a pragmatic legal drafter. "
    "Be clear, concise, and neutral. "
    "Prefer short clauses."
)

# Marketing copy.
SYS_MARKETING = (
    "You are a sharp marketing copywriter. "
    "Be concise, on-brand, and punchy. "
    "Prefer strong verbs and clear CTAs."
)

def system_for_domain(domain: str) -> str:
    """Return the system prompt for ``domain``.

    Recognised domains are "tech", "legal" and "marketing"; any other value
    gets a generic fallback prompt.
    """
    prompts_by_domain = {
        "tech": SYS_TECH,
        "legal": SYS_LEGAL,
        "marketing": SYS_MARKETING,
    }
    if domain in prompts_by_domain:
        return prompts_by_domain[domain]
    return "You are helpful and concise."

def build_messages(domain: str, instruction: str, input_text: str = ""):
    """Return a standard system+user two-turn chat.

    Args:
        domain: Key passed to ``system_for_domain`` to select the system prompt.
        instruction: Task instruction; surrounding whitespace is stripped.
        input_text: Optional payload placed on the line after the instruction;
            surrounding whitespace is stripped. ``None`` is treated as empty.

    Returns:
        A list of two ``{"role", "content"}`` dicts: the system turn, then the
        user turn.
    """
    instruction_part = (instruction or "").strip()
    input_part = (input_text or "").strip()
    # Only join with a newline when there is an input; otherwise the user
    # message would end in a dangling "\n".
    if input_part:
        user_content = f"{instruction_part}\n{input_part}"
    else:
        user_content = instruction_part
    return [
        {"role": "system", "content": system_for_domain(domain)},
        {"role": "user", "content": user_content},
    ]

def build_chat_text(tokenizer, domain: str, instruction: str, input_text: str = "") -> str:
    """Build the system+user chat and render it with ``tokenizer``'s chat template.

    The template is applied with ``tokenize=False`` and
    ``add_generation_prompt=True``, and its result is returned as-is.
    """
    return tokenizer.apply_chat_template(
        build_messages(domain, instruction, input_text),
        tokenize=False,
        add_generation_prompt=True,
    )


In [20]:
# train_lora_tinyllama_tech.py
import os, math
from typing import List, Dict
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model, TaskType


# Run configuration. Every value can be overridden through the environment
# variable of the same name; the second argument is the default.
MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "outputs/tinyllama-tech-lora")
SEED = int(os.getenv("SEED", "42"))
MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
EPOCHS = float(os.getenv("EPOCHS", "1"))
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
GRAD_ACC = int(os.getenv("GRAD_ACC", "4"))
LR = float(os.getenv("LR", "2e-4"))

# Device preference: CUDA, then Apple MPS, otherwise CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    DEVICE = "mps"

# Tiny TECH dataset: six hand-written API-doc examples.
#
# Each record has three string fields:
#   "instruction" - the task request (the same sentence in every record),
#   "input"       - a one-line, semicolon-separated endpoint spec,
#   "output"      - the target doc page with the four sections
#                   Summary / Endpoint / Parameters / Responses.
#
# NOTE(review): the Parameters tables are not uniform. The records with a path
# parameter ({id}) carry an extra "In" column and the others do not; confirm
# whether the target template should use one fixed set of columns.
TRAIN: List[Dict] = [
    {
        "instruction": "Write an API doc page.",
        "input": "Endpoint: POST /v1/invoices; Auth: Bearer token; Params: customer_id (string, required), line_items (array, required); Response: 201 {invoice_id}",
        "output": """Summary
Create an invoice for a given customer.

Endpoint
POST /v1/invoices
Auth: Bearer <token>

Parameters
| Name        | Type   | Required | Description                  |
|-------------|--------|----------|------------------------------|
| customer_id | string | yes      | Customer identifier          |
| line_items  | array  | yes      | Line items to be invoiced    |

Responses
- 201 Created: {"invoice_id": "inv_123"}
- 400 Bad Request
- 401 Unauthorized"""
    },
    {
        "instruction": "Write an API doc page.",
        "input": "Endpoint: GET /v1/invoices/{id}; Auth: Bearer token; Params: id (string, path, required); Response: 200 {invoice}",
        "output": """Summary
Retrieve an invoice by ID.

Endpoint
GET /v1/invoices/{id}
Auth: Bearer <token>

Parameters
| Name | Type   | Required | In   | Description     |
|------|--------|----------|------|-----------------|
| id   | string | yes      | path | Invoice ID      |

Responses
- 200 OK: {"id":"inv_123","status":"paid",...}
- 401 Unauthorized
- 404 Not Found"""
    },
    {
        "instruction": "Write an API doc page.",
        "input": "Endpoint: DELETE /v1/invoices/{id}; Auth: Bearer token; Params: id (string, path, required); Response: 204 No Content",
        "output": """Summary
Delete an invoice by ID.

Endpoint
DELETE /v1/invoices/{id}
Auth: Bearer <token>

Parameters
| Name | Type   | Required | In   | Description     |
|------|--------|----------|------|-----------------|
| id   | string | yes      | path | Invoice ID      |

Responses
- 204 No Content
- 401 Unauthorized
- 404 Not Found"""
    },
    {
        "instruction": "Write an API doc page.",
        "input": "Endpoint: POST /v1/customers; Auth: Bearer token; Params: email (string, required), name (string, optional); Response: 201 {customer}",
        "output": """Summary
Create a customer.

Endpoint
POST /v1/customers
Auth: Bearer <token>

Parameters
| Name | Type   | Required | Description        |
|------|--------|----------|--------------------|
| email| string | yes      | Customer email     |
| name | string | no       | Customer name      |

Responses
- 201 Created: {"id":"cus_123", ...}
- 400 Bad Request
- 401 Unauthorized"""
    },
    {
        "instruction": "Write an API doc page.",
        "input": "Endpoint: PATCH /v1/customers/{id}; Auth: Bearer token; Params: id (string, path, required), name (string, optional); Response: 200 {customer}",
        "output": """Summary
Update a customer.

Endpoint
PATCH /v1/customers/{id}
Auth: Bearer <token>

Parameters
| Name | Type   | Required | In   | Description    |
|------|--------|----------|------|----------------|
| id   | string | yes      | path | Customer ID    |
| name | string | no       | body | Customer name  |

Responses
- 200 OK: {"id":"cus_123","name":"..."}
- 401 Unauthorized
- 404 Not Found"""
    },
    {
        "instruction": "Write an API doc page.",
        "input": "Endpoint: GET /v1/customers; Auth: Bearer token; Params: limit (integer, optional), cursor (string, optional); Response: 200 {customers[]}",
        "output": """Summary
List customers with pagination.

Endpoint
GET /v1/customers
Auth: Bearer <token>

Parameters
| Name   | Type    | Required | Description                 |
|--------|---------|----------|-----------------------------|
| limit  | integer | no       | Max items to return         |
| cursor | string  | no       | Pagination cursor           |

Responses
- 200 OK: [{"id":"cus_..."}...]
- 401 Unauthorized"""
    },
]

# NOTE: EVAL is a slice of TRAIN (its first two records), not held-out data.
# Any loss/perplexity computed on it is measured on examples that are also in
# the training set.
EVAL = TRAIN[:2]

def row_to_text_chat(tok, ex: Dict) -> str:
    """Render one training record as a single training string.

    The string is the chat prompt for the "tech" domain (system + user turns,
    ending with the generation prompt), followed by the record's target
    output and the tokenizer's EOS token.

    Args:
        tok: Tokenizer providing ``apply_chat_template`` and ``eos_token``.
        ex: Record with "instruction" and "output" keys and an optional
            "input" key.

    Returns:
        Prompt + target output + EOS (EOS is omitted if the tokenizer has none).
    """
    # domain "tech" for this demo
    prompt = build_chat_text(tok, "tech", ex["instruction"], ex.get("input", ""))
    # Close the assistant turn explicitly; without EOS the example contains no
    # end-of-answer marker after the target text.
    # NOTE(review): if the collator masks label positions equal to pad_token_id
    # and pad == eos, this token is fed as input but excluded from the loss;
    # verify against the collator in use.
    eos = tok.eos_token or ""
    return prompt + ex["output"] + eos

def main():
    """Fine-tune a LoRA adapter on the in-file TRAIN records and save it.

    Steps: seed the RNGs, tokenize TRAIN/EVAL, load the base model for MODEL_ID,
    wrap it with a LoRA adapter, train with ``Trainer``, write the adapter to
    ``OUTPUT_DIR/adapter`` and the tokenizer to ``OUTPUT_DIR``, then print the
    evaluation loss and its perplexity. Evaluation is best-effort: a failure is
    reported and does not abort the run.
    """
    # Local import: set_seed is not among the names this cell imports from transformers.
    from transformers import set_seed

    # Seed up front. The seed given to TrainingArguments only reaches the
    # Trainer, which is built after get_peft_model() has already created the
    # adapter weights.
    set_seed(SEED)

    tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    def tokenize_row(ex):
        """Render one record to chat text and tokenize it (truncated, unpadded)."""
        return tok(row_to_text_chat(tok, ex), truncation=True, max_length=MAX_LENGTH, padding=False)

    ds_train = Dataset.from_list(TRAIN)
    ds_eval = Dataset.from_list(EVAL)

    # Drop the raw text columns so only the tokenizer outputs remain.
    cols = list(ds_train.features.keys())
    tok_train = ds_train.map(tokenize_row, remove_columns=cols)
    tok_eval = ds_eval.map(tokenize_row, remove_columns=cols)

    base = AutoModelForCausalLM.from_pretrained(MODEL_ID)
    try:
        base.gradient_checkpointing_enable()
        base.config.use_cache = False
    except Exception as e:
        # Best-effort: continue without gradient checkpointing, but say so
        # instead of hiding the failure.
        print("Gradient checkpointing not enabled:", repr(e))
    base.to(DEVICE)

    lora_cfg = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8, lora_alpha=16, lora_dropout=0.05,
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
    )
    model = get_peft_model(base, lora_cfg)

    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        seed=SEED,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACC,
        learning_rate=LR,
        logging_steps=5,
        save_steps=1000,
        report_to="none",
        dataloader_pin_memory=False,   # quiet on MPS
        # keep args minimal for older transformers
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tok_train,
        eval_dataset=tok_eval,
        data_collator=collator,
        tokenizer=tok,
    )

    print("\n=== Training TECH LoRA adapter (tiny run) ===")
    trainer.train()

    # save adapter
    adapter_dir = os.path.join(OUTPUT_DIR, "adapter")
    os.makedirs(adapter_dir, exist_ok=True)
    trainer.model.save_pretrained(adapter_dir)
    tok.save_pretrained(OUTPUT_DIR)

    # tiny eval: loss → perplexity (exp of the mean loss)
    try:
        metrics = trainer.evaluate(tok_eval)
        if "eval_loss" in metrics:
            print(f"Eval loss: {metrics['eval_loss']:.4f} | ppl≈{math.exp(float(metrics['eval_loss'])):.2f}")
    except Exception as e:
        print("Eval skipped:", repr(e))

    print(f"\nSaved TECH LoRA adapter to: {adapter_dir}")

if __name__ == "__main__":
    main()


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  trainer = Trainer(



=== Training TECH LoRA adapter (tiny run) ===


Step,Training Loss


Eval loss: 1.4839 | ppl≈4.41

Saved TECH LoRA adapter to: outputs/tinyllama-tech-lora/adapter


In [22]:
# infer_with_adapter_tech.py
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Resolve the checkpoint and adapter location from the same environment
# variables (and defaults) as the training cell, so an overridden MODEL_ID or
# OUTPUT_DIR is picked up at inference too. The adapter lives in the "adapter"
# sub-directory of OUTPUT_DIR, which is where the training cell saves it.
MODEL_ID    = os.environ.get("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
ADAPTER_DIR = os.path.join(os.environ.get("OUTPUT_DIR", "outputs/tinyllama-tech-lora"), "adapter")

# device: CUDA, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

def generate_with_adapter(endpoint_block: str, max_new_tokens=320):
    """
    Generate an API doc page for ``endpoint_block`` with the LoRA adapter applied.

    The tokenizer and base model for MODEL_ID are loaded on every call and the
    adapter from ADAPTER_DIR is attached on top. The "tech" chat prompt is
    rendered through the tokenizer's chat template and a completion of at most
    ``max_new_tokens`` tokens is sampled. Only the newly generated tokens are
    decoded; the text is then cut at the first turn/section marker and stripped.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    if tok.pad_token is None:
        # No pad token configured: reuse EOS.
        tok.pad_token = tok.eos_token

    base = AutoModelForCausalLM.from_pretrained(MODEL_ID)
    base = base.to(DEVICE).eval()
    model = PeftModel.from_pretrained(base, ADAPTER_DIR)
    model = model.to(DEVICE).eval()

    chat = build_messages(
        "tech",
        "Write an API doc page. Follow the exact 4-section template.",
        endpoint_block,
    )
    prompt_text = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tok(prompt_text, return_tensors="pt")
    model_inputs = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
    n_prompt_tokens = model_inputs["input_ids"].shape[-1]

    sampling = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.5,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tok.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**model_inputs, **sampling)

    # The returned sequence starts with the prompt; decode only what follows it.
    completion = tok.decode(sequences[0][n_prompt_tokens:], skip_special_tokens=True)

    # Keep the text before the first marker that looks like a new section/turn.
    # partition() leaves the text unchanged when the marker is absent.
    for marker in ("\n[", "\n<System", "\nUser:", "\nAssistant:"):
        completion = completion.partition(marker)[0]
    return completion.strip()

if __name__ == "__main__":
    # Demo run: one refund-endpoint spec, fields separated by "; ".
    endpoint = "; ".join(
        [
            "Endpoint: POST /v1/refunds",
            "Auth: Bearer token",
            "Params: payment_id (string, required), amount (integer, optional)",
            "Response: 201 {refund}, 400, 401",
        ]
    )
    # Print the banner first so it is visible while generation is running.
    print("\n=== AFTER FINE-TUNE (TECH LoRA adapter) ===\n")
    print(generate_with_adapter(endpoint))



=== AFTER FINE-TUNE (TECH LoRA adapter) ===

[![API Doc Page](https://i.imgur.com/jY6ZTUA.png)](https://api.example.com/docs/v1/refunds)

## Summary

The `POST /v1/refunds` endpoint allows you to create a new refund for a specific payment.

## Endpoint

```
POST /v1/refunds
Authorization: Bearer [token]
```

## Parameters

- `payment_id`: The unique identifier of the payment that will be refunded. This can be obtained from the `GET /payments/{id}` endpoint.
- `amount`: The amount to be refunded in cents. Optional. If not provided, the default value is `1`.

## Responses

- `201`: Created response with the created refund object.
- `400`: Bad request error if the `payment_id` parameter is invalid or missing.
- `401`: Unauthorized error if the `Authorization` header is missing or incorrect.

## Example Request

```json
{
    "payment_id": "123",
    "amount": 500
}
```

## Example Response

```json
{
    "refund": {
        "id": "abc123",
        "created_at": "2021-0
