In [None]:
!pip install -q transformers accelerate bitsandbytes fastapi uvicorn nest_asyncio pyngrok

In [None]:
# Authenticate with the Hugging Face Hub. Create an access token in your
# account settings (https://huggingface.co/settings/tokens) and supply it here.
from huggingface_hub import login
login()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import torch
# Sanity check: should display True. Later cells place the model on GPU 0 and
# move inputs to "cuda", so a CPU-only runtime will not work.
torch.cuda.is_available()

True

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
from fastapi.responses import StreamingResponse


In [None]:
import os
# Ask the PyTorch CUDA caching allocator to use expandable segments.
# NOTE(review): presumably set here, ahead of model loading, so it is in place
# before the first CUDA allocation — confirm it still takes effect given that
# torch was already imported in an earlier cell.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
import nest_asyncio
import uvicorn
import threading

# -----------------------
# FastAPI App
# -----------------------
app = FastAPI()

# Open CORS policy so any browser front-end can call the API.
# Credentials are disabled: a wildcard origin must not be combined with
# allow_credentials=True (the CORS spec forbids it, and the middleware would
# otherwise reflect arbitrary origins for credentialed requests). This API
# uses no cookies or HTTP auth, so nothing is lost.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# -----------------------
# Model Setup
# -----------------------
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 weights with double quantization; compute runs in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print("Loading model...")
# device_map={"": 0} maps the whole model onto GPU 0 (no CPU/disk offload).
# .eval() returns the module itself, so it can be chained onto the load.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=quant_config,
).eval()

print("Model loaded successfully!")

# -----------------------
# Request Schema
# -----------------------
class PromptRequest(BaseModel):
    """JSON body accepted by the POST /generate_stream endpoint."""

    # User question/instruction; the handler appends it to a fixed system prompt.
    prompt: str
    # Requested cap on generated tokens (defaults to 150).
    # NOTE(review): only takes effect if the handler actually reads this field.
    max_tokens: int = 150

# -----------------------
# Health Check
# -----------------------
@app.get("/")
def health():
    """Health check: return a static status payload showing the API is up."""
    status_payload = {"status": "Mistral 7B Quantized API running"}
    return status_payload

# -----------------------
# Generation Endpoint
# -----------------------
@app.post("/generate_stream")
def generate_stream(request: PromptRequest):
    """Stream a model completion for ``request.prompt`` as plain text.

    The user prompt is wrapped, together with a fixed system prompt, in the
    Mistral ``[INST] ... [/INST]`` format. Generation runs on a background
    thread and feeds a ``TextIteratorStreamer``; the HTTP response iterates
    that streamer so text reaches the client as it is produced.

    Args:
        request: Body with ``prompt`` and an optional ``max_tokens`` budget.

    Returns:
        A ``StreamingResponse`` (``text/plain``) yielding only the newly
        generated text (prompt and special tokens are skipped).
    """
    # Honour the caller's token budget, but clamp it: the endpoint is publicly
    # reachable, so an unbounded value would let one request hog the GPU.
    max_new_tokens_cap = 512
    max_new_tokens = max(1, min(request.max_tokens, max_new_tokens_cap))

    system_prompt = (
        "You are a concise academic assistant. "
        "Explain concepts in simple language suitable for a beginner. "
        "Limit responses to 5 bullet points maximum. "
        "Maximum 90 words total. "
        "Use short sentences. "
        "End with one very short example (1 sentence). "
        "Do not add extra commentary."
    )

    formatted_prompt = f"<s>[INST] {system_prompt}\n\n{request.prompt} [/INST]"

    # The prompt already carries the literal BOS marker "<s>", so stop the
    # tokenizer from adding its own special tokens (avoids a duplicated BOS).
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        # temperature/top_p only apply when sampling is enabled; without
        # do_sample=True generation is greedy and they are ignored.
        do_sample=True,
        temperature=0.55,
        top_p=0.9,
        repetition_penalty=1.15,
        streamer=streamer
    )

    # generate() blocks until finished, so run it off-thread and let the
    # response consume the streamer concurrently.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    return StreamingResponse(streamer, media_type="text/plain")



# -----------------------
# Run Server
# -----------------------
# Patch asyncio for nested event-loop use before starting the server from
# inside the notebook.
nest_asyncio.apply()


def run():
    """Serve ``app`` with uvicorn on every interface, port 8000 (blocking call)."""
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)


# Run the blocking server on its own thread so the cell returns and later
# cells (e.g. the tunnel setup) can execute.
thread = threading.Thread(target=run)
thread.start()


Loading tokenizer...
Loading model...


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

Model loaded successfully!


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never hard-code the ngrok auth token in the notebook: anyone who can read the
# file could open tunnels on your account. Take it from the NGROK_AUTH_TOKEN
# environment variable, falling back to a hidden interactive prompt.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)

# Expose the local FastAPI server (port 8000) through a public ngrok tunnel.
public_url = ngrok.connect(8000)
print("Public URL:", public_url)


Public URL: NgrokTunnel: "https://reiko-pseudolegislative-heteronymously.ngrok-free.dev" -> "http://localhost:8000"
