<a href="https://colab.research.google.com/github/sofasogood/red_teaming_pipeline/blob/main/Setting_up_model_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages this notebook relies on into the current runtime.
# NOTE(review): `peft` is imported below but is not installed explicitly here —
# presumably it arrives as a dependency of one of these packages; confirm.
# NOTE(review): none of these installs are version-pinned.
!pip install unsloth
# Serving stack (fastapi, uvicorn) plus the model-loading libraries used in later cells.
!pip install litellm fastapi uvicorn pyngrok accelerate transformers
# nest_asyncio is applied in the final cell right before uvicorn.run(...).
!pip install nest-asyncio
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import os
from fastapi import FastAPI
from pydantic import BaseModel
from pyngrok import ngrok
import uvicorn
import nest_asyncio


In [2]:
# Mount Google Drive at /content/drive. The LoRA adapter checkpoint loaded in the
# model-loading cell is read from a path under /content/drive/MyDrive, so this cell
# must run first.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the locally fine-tuned (SFT) model: a quantized base model plus LoRA adapter
# weights read from Google Drive. The resulting `tokenizer` and `model` globals are
# used by the /chat/completions handler defined in a later cell.

# Configure quantization for inference:
# 4-bit NF4 weights, double quantization enabled, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load base model with quantization
# NOTE(review): the checkpoint name ends in "-bnb-4bit", which suggests it is already
# stored in 4-bit form — confirm that passing quantization_config here is still intended.
# NOTE(review): trust_remote_code=True permits code shipped with the model repository
# to run during loading — confirm it is required for this checkpoint.
base_model_name = "unsloth/mistral-7b-v0.3-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

# Load LoRA adapter weights on top of the base model.
# The path lives on the mounted Google Drive, so the drive-mount cell must have run.
adapter_path = "/content/drive/MyDrive/models/ai_safety/finetuned_model_0131/mistral7b_finetuned/checkpoints/checkpoint-505"
model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Optional: Merge LoRA weights with base model for faster inference
# model = model.merge_and_unload()

# Set eval mode. As the last expression in the cell, the returned module is also
# what the notebook displays as this cell's output.
model.eval()



In [None]:
# Importing red_teaming_pipeline: clone the repository, switch the notebook's working
# directory into it, then install the package (editable) and its requirements.
# NOTE(review): this cell is not idempotent — after `%cd`, re-running it would clone
# the repository again inside the existing checkout.
!git clone https://github.com/sofasogood/red_teaming_pipeline.git
# %cd changes the working directory for this and all subsequent cells.
%cd red_teaming_pipeline
!pip install -e .
!pip install -r requirements.txt

In [5]:
from dataset_creation.data_processing_functions.data_creation_rl import build_instruct_prompt

app = FastAPI()


# Input schema
class GenerateRequest(BaseModel):
    """Request body accepted by ``POST /chat/completions``.

    Attributes:
        model: Model identifier sent by the client; echoed back unchanged in the response.
        messages: Chat messages, passed as-is to ``build_instruct_prompt``.
    """

    model: str
    messages: list[dict]


# Output schema
class GenerateResponse(BaseModel):
    """Response body returned by ``POST /chat/completions``.

    Attributes:
        choices: One entry per completion, each shaped like
            ``{"message": {"content": <generated text>}}``.
        model: The ``model`` value from the request.
    """

    # Each choice is a nested dict, not a bare string, so the element type is ``dict``.
    choices: list[dict]
    model: str


@app.post("/chat/completions", response_model=GenerateResponse)
async def generate_text(request: GenerateRequest):
    """Generate one sampled completion for the supplied chat messages.

    The messages are turned into a single prompt string by ``build_instruct_prompt``,
    tokenized, and fed to the notebook-global ``model``. Only the newly generated
    tokens (everything after the prompt) are decoded and returned.

    Note: ``model.generate`` is called synchronously inside this coroutine, so the
    handler does not yield to the event loop while a completion is being generated.

    Args:
        request: Validated request body with the target model name and chat messages.

    Returns:
        A dict with a single-element ``choices`` list holding the generated text under
        ``["message"]["content"]``, and ``model`` echoing ``request.model``.
    """
    prompt = build_instruct_prompt(request.messages)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # Remember the prompt length so the prompt tokens can be sliced off the output.
    prompt_len = inputs["input_ids"].shape[1]
    # The previous `request.model.strip('openai/')` was removed: `str.strip` treats its
    # argument as a set of characters (not a prefix) and the result was never used.
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.1,
        # Use the EOS token id as the pad token id during generation.
        pad_token_id=tokenizer.eos_token_id,
    )
    generated_text = tokenizer.decode(
        outputs[0][prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return {"choices": [{"message": {"content": generated_text}}], "model": request.model}





# Expose the server port using the terminal:

Run these commands in a terminal to install `bore` and open a public tunnel to local port 8000:

1.   `curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh`
2.   `. "$HOME/.cargo/env"`
3. `cargo install bore-cli`
4. `bore local 8000 --to bore.pub`

In [None]:
# Host model: serve the FastAPI `app` with uvicorn on port 8000, the same port the
# `bore local 8000` tunnel command targets.
# nest_asyncio.apply() is called first — presumably so uvicorn can start its event loop
# inside the notebook's already-running one; confirm against the nest_asyncio docs.
nest_asyncio.apply()
# host="0.0.0.0" binds on all network interfaces.
uvicorn.run(app, host="0.0.0.0", port=8000)
