In [None]:
# Install LMDeploy; a later cell imports `pipeline` and `TurbomindEngineConfig` from it.
# NOTE(review): the version is not pinned, so a fresh runtime may install a different release — consider pinning.
!pip install lmdeploy

In [None]:
# ===========================================
# Mount Google Drive
# ===========================================

from google.colab import drive
import os

# --- Mount Google Drive ---
# Later cells read the zrok token archive and the cached model from paths
# under this mount point (/content/drive/MyDrive/...).
drive.mount('/content/drive')

# Plain string literal: the message has no placeholders, so no f-string is needed.
print("✅ Google Drive mounted successfully.")

In [None]:
# ===========================================
# Load zrok token from ZIP in datasets
# ===========================================

import os, sys, zipfile

# Path to the ZIP (on the mounted Drive) that contains the `.zrok_api_key` file
drive_zip = "/content/drive/MyDrive/datasets/sage-zrok-token.zip"
token_member = ".zrok_api_key"          # name of the file inside the archive
token_target = "/root/.zrok_api_key"    # where the extracted token lives on this runtime

# Extract the token only if it is not already present on this runtime
if os.path.exists(drive_zip):
    if not os.path.exists(token_target):
        print("📦 Extracting .zrok_api_key from Drive ZIP...")
        try:
            with zipfile.ZipFile(drive_zip, 'r') as zip_ref:
                zip_ref.extract(token_member, os.path.dirname(token_target))
        except KeyError:
            # ZipFile.extract raises KeyError when the member is not in the archive
            print(f"❌ {token_member} not found inside {drive_zip}")
        except zipfile.BadZipFile as e:
            print(f"❌ Token archive is not a valid ZIP file: {e}")
        else:
            # The file holds a secret: restrict it to the owning user
            os.chmod(token_target, 0o600)
            print(f"✅ Token extracted to {token_target}")
    else:
        print(f"✅ Token already exists at {token_target}")
else:
    print(f"⚠️ Token archive not found: {drive_zip}")

# Load token into variable (used later when enabling zrok)
zrok_token = None
if os.path.isfile(token_target):
    with open(token_target, "r", encoding="utf-8", errors="ignore") as f:
        zrok_token = f.read().strip()

# A missing file and a file containing only whitespace are both treated as "no token"
if not zrok_token:
    print(f"❌ zrok token not found or empty at {token_target}")
    sys.exit(1)
else:
    print("✅ zrok token loaded and ready for use.")

In [None]:
# pipeline loading (GDrive fallback or HF repo)
# This cell must produce a `pipe` object that will be used by the API.

from pathlib import Path
import os
import time
import shutil
from lmdeploy import pipeline, TurbomindEngineConfig

# =======================
# Model selection
# =======================
MODEL_TAG = "qwen/Qwen2.5-Coder-14B-Instruct-AWQ"
MODEL_LOCAL_DIR = f"/content/drive/MyDrive/models/{MODEL_TAG}"
# A copy of the model on Drive takes precedence over downloading from the hub
USE_LOCAL = os.path.isdir(MODEL_LOCAL_DIR)


print("Using local model dir:", USE_LOCAL, MODEL_LOCAL_DIR if USE_LOCAL else MODEL_TAG)

# =======================
# Engine config
# =======================
engine_config = TurbomindEngineConfig(model_format='awq', cache_max_entry_count=0.3)


def _cache_snapshot_to_drive(model_tag, target_dir, drive_root="/content/drive"):
    """Best-effort copy of the downloaded Hugging Face snapshot to `target_dir`.

    Looks for the snapshot under /root/.cache/huggingface/hub and copies the
    most recently modified one. The copy is written to a "<target_dir>.partial"
    staging folder and renamed only once complete, so an interrupted copy is
    never mistaken for a usable local model on the next run.

    Returns True when the model was cached, False otherwise. Filesystem errors
    are reported and swallowed: failing to cache must not invalidate a pipeline
    that has already been loaded.
    """
    hf_cache_dir = f"/root/.cache/huggingface/hub/models--{model_tag.replace('/', '--')}"
    snapshots_dir = os.path.join(hf_cache_dir, "snapshots")

    if not (os.path.ismount(drive_root) and os.path.isdir(snapshots_dir)):
        print("❌ Snapshots folder does not exist or Drive is not mounted.")
        return False

    snapshot_paths = [os.path.join(snapshots_dir, name) for name in os.listdir(snapshots_dir)]
    if not snapshot_paths:
        print(f"❌ No snapshot found in {snapshots_dir}; nothing to cache.")
        return False
    # With several snapshots present, prefer the most recently modified one
    snapshot_path = max(snapshot_paths, key=os.path.getmtime)

    print("💾 Caching model snapshot to Google Drive...")
    staging_dir = target_dir + ".partial"
    try:
        # Ensure parent folder exists
        os.makedirs(os.path.dirname(target_dir), exist_ok=True)
        # Discard leftovers from a previously interrupted copy
        if os.path.exists(staging_dir):
            shutil.rmtree(staging_dir)
        shutil.copytree(snapshot_path, staging_dir)
        os.rename(staging_dir, target_dir)
    except OSError as e:  # shutil.Error is a subclass of OSError
        print(f"⚠️ Could not cache model to Drive (continuing without cache): {e}")
        shutil.rmtree(staging_dir, ignore_errors=True)
        return False

    print(f"✅ Model cached at {target_dir}")
    return True


# =======================
# Load pipeline
# =======================
start = time.time()

try:
    if USE_LOCAL:
        print(f"📂 Loading from local cache: {MODEL_LOCAL_DIR}")
        pipe = pipeline(MODEL_LOCAL_DIR, backend_config=engine_config)
    else:
        print(f"🌐 Loading from Hugging Face: {MODEL_TAG}")
        pipe = pipeline(MODEL_TAG, backend_config=engine_config)
except Exception as e:
    print(f"❌ Failed to load model: {e}")
    # Re-raise: later cells need `pipe`, so the notebook must not continue
    # as if the pipeline were ready.
    raise

print(f"⏱️ Pipeline ready in {time.time() - start:.1f}s")

# Caching happens after (and independently of) loading, so a Drive problem
# is never reported as a model-loading failure.
if not USE_LOCAL:
    _cache_snapshot_to_drive(MODEL_TAG, MODEL_LOCAL_DIR)

In [None]:
# internal helper method(s) - core generation function
# This is the function your API will call internally.
# It is NOT the FastAPI endpoint. Keep the internals here (single concern).

from typing import List, Dict, Optional

def render_messages_to_prompt(messages: List[Dict[str,str]]) -> str:
    """
    Flatten an OpenAI-style message list into one plain-text prompt.

    Every message contributes a "<role>: <content>" line; a missing role is
    rendered as "user" and missing content as an empty string. A final
    "assistant:" line is appended, and all lines are joined with newlines.
    """
    turns = [
        f"{msg.get('role', 'user')}: {msg.get('content', '')}"
        for msg in messages
    ]
    turns.append("assistant:")
    return "\n".join(turns)

def generate_completion(pipe, messages: List[Dict[str,str]],
                        temperature: float = 0.7, max_tokens: int = 512) -> str:
    """
    Run the model pipeline on the rendered prompt and return the generated text.

    Args:
        pipe: callable pipeline object; invoked as
            pipe(prompt, max_tokens=..., temperature=...) and, if that raises
            TypeError, retried as pipe(prompt).
        messages: list of {"role": "...", "content": "..."} dicts.
        temperature: sampling temperature forwarded to the pipeline.
        max_tokens: generation length limit forwarded to the pipeline.

    Returns:
        The text taken from the pipeline result. Recognised result shapes are:
        an object with a `.text` attribute, a dict with a "text" key, a plain
        string, or a non-empty list/tuple whose first element is one of those.
        Anything else is returned as str(result).

    NOTE(review): LMDeploy documents passing sampling options through a
    `GenerationConfig` given as `gen_config=`; confirm that the installed
    version actually honours the keyword arguments used here.
    NOTE(review): the prompt is a flattened "role: content" string; if the
    pipeline applies its own chat template, the role prefixes may be treated
    as ordinary text — confirm.
    """
    prompt = render_messages_to_prompt(messages)
    try:
        out = pipe(prompt, max_tokens=max_tokens, temperature=temperature)
    except TypeError:
        # If pipe doesn't accept kwargs, call without them
        out = pipe(prompt)

    # Normalize output to string
    if hasattr(out, "text"):
        return out.text
    if isinstance(out, dict) and "text" in out:
        return out["text"]
    if isinstance(out, str):
        return out

    if isinstance(out, (list, tuple)) and len(out):
        # Batch-style result: only one prompt was sent, so use the first item.
        first = out[0]
        if isinstance(first, dict) and "text" in first:
            return first["text"]
        if hasattr(first, "text"):
            # Without this branch a list of response objects was returned as
            # the repr of the whole list.
            return first.text
        if isinstance(first, str):
            return first

    # Fallback: stringify
    return str(out)

In [None]:
# FastAPI app init (OpenAI-compatible endpoints); an API key is defined, but the auth check in the handler is currently commented out
# Hybrid API: uses your generate_completion() helper internally
# Matches OpenAI ChatCompletions schema (non-streaming)

from fastapi import FastAPI, Header, HTTPException, Request
from pydantic import BaseModel
from typing import List, Optional
import json

# -------------------------
# Server-side API key
# -------------------------
# Read from the LMDEPLOY_API_KEY environment variable when it is set, so a real
# key does not have to be written into the notebook; otherwise the static
# testing value is used. Clients should send it as "Authorization: Bearer <key>".
import os

API_KEY = os.environ.get("LMDEPLOY_API_KEY", "12345")

# -------------------------
# FastAPI app & schemas
# -------------------------
app = FastAPI(title="LMDeploy OpenAI-Compatible API (local)")

# -------------------------
# Logging middleware (for debugging zrok)
# -------------------------
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Print the method and URL of each incoming request, then pass it down the chain unchanged."""
    method, url = request.method, request.url
    print(f"[FastAPI] Incoming request: {method} {url}")
    return await call_next(request)

# -------------------------
# Root / fallback route
# -------------------------
@app.get("/")
def root():
    # Landing route: a fixed payload confirming that the server process answers.
    payload = dict(status="ok", message="FastAPI server is running")
    return payload

# -------------------------
# Models & request schemas
# -------------------------
# One chat turn in the OpenAI message format. Both fields are required strings.
class Message(BaseModel):
    role: str  # speaker of the turn; copied verbatim into the rendered prompt
    content: str  # text of the turn

# Request body accepted by POST /v1/chat/completions.
class ChatRequest(BaseModel):
    model: Optional[str] = None  # optional; the completion handler in this file does not read it
    messages: List[Message]  # required conversation history
    temperature: Optional[float] = 0.7  # forwarded to generate_completion
    max_tokens: Optional[int] = 512  # forwarded to generate_completion

# -------------------------
# Endpoints
# -------------------------
@app.get("/v1/models")
def list_models():
    # Advertise the single model served by this notebook: the Drive path when the
    # local copy is used, otherwise the hub tag.
    model_id = MODEL_LOCAL_DIR if USE_LOCAL else MODEL_TAG
    # The OpenAI list envelope carries a top-level "object": "list" next to "data".
    return {"object": "list", "data": [{"id": model_id, "object": "model"}]}

@app.get("/health")
def health():
    # Health probe: always answers with a fixed "ok" status.
    return dict(status="ok")

@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest, authorization: Optional[str] = Header(None)):
    """
    OpenAI-style (non-streaming) chat completion served by the local pipeline.

    Converts the request messages to plain dicts, calls generate_completion()
    and wraps the returned text in a chat.completion response. Any exception
    raised during generation is reported as HTTP 500 with the error text.
    Token usage is not computed and is reported as zeros.
    """
    import time
    import uuid

    # ----- Authorization check (currently disabled) -----
    # NOTE(review): while this stays commented out, the `authorization` header is
    # ignored and every request is accepted.
    # if API_KEY:
    #     if not authorization or authorization.strip() != f"Bearer {API_KEY}":
    #         raise HTTPException(status_code=401, detail="Unauthorized")

    # Convert Pydantic Messages to list-of-dicts
    messages = [{"role": m.role, "content": m.content} for m in req.messages]

    # Explicit None check: `req.temperature or 0.7` would also replace a
    # deliberately requested temperature of 0 with the default.
    temperature = 0.7 if req.temperature is None else req.temperature
    max_tokens = req.max_tokens or 512

    # Call local generation helper
    # NOTE(review): this call is synchronous inside an async handler, so other
    # requests wait while it runs — confirm whether that is acceptable.
    try:
        text = generate_completion(pipe, messages,
                                   temperature=temperature,
                                   max_tokens=max_tokens)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    # Build OpenAI-compatible response
    model_name = MODEL_LOCAL_DIR if USE_LOCAL else MODEL_TAG
    resp = {
        # A unique id per response instead of one shared constant
        "id": f"chatcmpl-local-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop"
            }
        ],
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    }
    return resp

In [None]:
# Download zrok v1.1.3 (latest)
!wget https://github.com/openziti/zrok/releases/download/v1.1.3/zrok_1.1.3_linux_amd64.tar.gz
!tar -xzf zrok_1.1.3_linux_amd64.tar.gz
!chmod +x zrok

In [None]:
# Enable (automatic migration from 0.4)
# `$zrok_token` is the Python variable set by the token-loading cell; IPython
# substitutes its value into the shell command before running it.
!./zrok enable --headless "$zrok_token"

# Use the agent for better process management
# (left disabled: the share is started from Python in a later cell)
#!./zrok agent start &
#!./zrok share public localhost:8000 --headless

In [None]:
#!./zrok disable

In [None]:
import uvicorn
import threading

def run_uvicorn():
    """Serve the FastAPI `app` on 0.0.0.0:8000; blocks the calling thread while the server runs."""
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")

# Start in a background daemon thread so this cell returns immediately.
# The thread handle is kept in `server_thread`, so re-running the cell does not
# launch a second server on port 8000 while the first one is still alive.
_previous_server = globals().get("server_thread")
if _previous_server is not None and _previous_server.is_alive():
    print("ℹ️ uvicorn server thread is already running; not starting another one.")
else:
    server_thread = threading.Thread(target=run_uvicorn, daemon=True, name="uvicorn-server")
    server_thread.start()

In [None]:
import subprocess
import re
import time

def start_zrok_tunnel(port=8000, log_path="zrok_share.log", startup_timeout=15.0):
    """
    Start `zrok share public localhost:<port> --headless` as a background process.

    Args:
        port: local port to expose.
        log_path: file that receives zrok's combined stdout/stderr.
        startup_timeout: maximum number of seconds to wait for a URL to appear
            in the log before giving up (the process keeps running either way).

    Returns:
        The subprocess.Popen handle of the zrok process. Its output goes to
        `log_path`, so `process.stdout` / `process.stderr` are None.
    """
    # Output goes to a file rather than to pipes: nothing in this notebook reads
    # the pipes, and an unread pipe can fill up and stall a long-running child.
    with open(log_path, "w") as log_file:
        process = subprocess.Popen(
            ["./zrok", "share", "public", f"localhost:{port}", "--headless"],
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

    # Poll the log until a URL shows up, the process exits, or the timeout passes.
    url_pattern = re.compile(r"https?://[^\s\"'\\]+")
    deadline = time.time() + startup_timeout
    found_url = None
    log_text = ""
    while time.time() < deadline:
        time.sleep(0.5)
        with open(log_path, "r", errors="replace") as log_reader:
            log_text = log_reader.read()
        match = url_pattern.search(log_text)
        if match:
            found_url = match.group(0)
            break
        if process.poll() is not None:
            break

    if process.poll() is not None:
        print(f"❌ zrok exited early with code {process.returncode}. Last output:")
        print(log_text[-2000:])
    elif found_url:
        # NOTE(review): this is the first URL seen in zrok's output; presumably
        # the public share endpoint — verify against the log.
        print(f"🌍 URL found in zrok output: {found_url}")
    else:
        print(f"⚠️ No URL seen in zrok output yet; see {log_path}")

    # Check agent status to get the URL
    status_process = subprocess.run([
        "./zrok", "agent", "status"
    ], capture_output=True, text=True)

    print("Agent Status:")
    print(status_process.stdout)

    return process

# Expose the API port through zrok and keep the process handle for later use
tunnel_process = start_zrok_tunnel(port=8000)
print("Zrok tunnel started! Check the agent status above for your public URL.")

In [None]:
# Diagnostic only: presumably lists the environments and shares of the enabled zrok account — verify against the zrok CLI help.
!./zrok overview

In [None]:
import time

print("Server and zrok tunnel are running. Keeping the notebook alive...")

# Block this cell indefinitely, waking once a minute, until the user interrupts it.
try:
    while True:
        time.sleep(60)
except KeyboardInterrupt:
    # NOTE(review): only the message is printed; this block does not itself stop
    # the uvicorn thread or the zrok process.
    print("Shutting down.")