In [1]:
import os
import time
import uuid
import logging
from typing import Dict
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

In [2]:
# Read variables from a local .env file into the process environment.
# override=True lets values from .env replace ones that are already set.
load_dotenv(override=True)
# Indexing (instead of .get) makes a missing key fail right here with KeyError.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [3]:
PRIMARY_MODEL = "gpt-5-nano"   # Model tried first for every request
FALLBACK_MODEL = "gpt-4o-mini"  # Model used once the primary has exhausted its attempts
TEMPERATURE = 0
MAX_RETRIES = 2  # Extra attempts after the first; execute_with_retry loops MAX_RETRIES + 1 times

# Rate table read by calculate_cost, which multiplies token counts by these
# rates and divides by 1000 -- so each value is a price per 1,000 tokens.
# NOTE(review): the figures look illustrative; verify against current provider pricing.
MODEL_PRICING = {
    "gpt-5-nano": {"input": 0.0005, "output": 0.001},
    "gpt-4o-mini": {"input": 0.001, "output": 0.003},
}

# INFO level is needed for the per-request logging.info records to be shown.
logging.basicConfig(level=logging.INFO)

In [4]:
# Both chat clients share temperature and API key; only the model name differs,
# so build them from one expression (primary first, then fallback).
primary_llm, fallback_llm = (
    ChatOpenAI(
        model=model_name,
        temperature=TEMPERATURE,
        api_key=OPENAI_API_KEY,
    )
    for model_name in (PRIMARY_MODEL, FALLBACK_MODEL)
)

In [5]:
# Session-wide running totals, mutated in place by call_llm and run_orchestrator.
# Re-running this cell resets every counter to zero.
METRICS = dict(
    total_requests=0,
    total_cost=0.0,
    total_tokens=0,
    fallback_count=0,
)

In [6]:
def calculate_cost(model: str, input_tokens: int, output_tokens: int, pricing_table=None) -> float:
    """Estimate the cost of one request from its token counts.

    Args:
        model: Model name used as the key into the pricing table.
        input_tokens: Number of prompt tokens consumed.
        output_tokens: Number of completion tokens produced.
        pricing_table: Optional mapping ``{model: {"input": rate, "output": rate}}``
            with rates per 1,000 tokens. Defaults to the module-level
            ``MODEL_PRICING`` when omitted.

    Returns:
        The cost rounded to 6 decimal places, or ``0.0`` when the model has
        no pricing entry (a warning is logged in that case so the gap in
        cost tracking does not go unnoticed).
    """
    table = MODEL_PRICING if pricing_table is None else pricing_table
    rates = table.get(model)
    if not rates:
        # Returning 0.0 keeps the request flowing, but make the blind spot visible.
        logging.warning({"event": "missing_model_pricing", "model": model})
        return 0.0

    # Rates are quoted per 1,000 tokens, hence the division.
    cost = (
        input_tokens * rates["input"] +
        output_tokens * rates["output"]
    ) / 1000

    return round(cost, 6)

In [7]:
def call_llm(llm, model_name: str, prompt: str, trace_id: str) -> Dict:
    """Send one single-turn prompt to ``llm`` and return the reply with usage stats.

    Side effects: increments ``total_requests``, ``total_cost`` and
    ``total_tokens`` in the module-level ``METRICS`` dict and emits one INFO
    log record for the call.

    Args:
        llm: Chat model client exposing ``invoke(messages)``.
        model_name: Name used for cost lookup and reported in the log/result.
        prompt: User text, sent as a single ``HumanMessage``.
        trace_id: Correlation id copied into the log record and the result.

    Returns:
        Dict with keys ``trace_id``, ``model``, ``output``, ``latency``
        (seconds, rounded to 3 decimals), ``input_tokens``, ``output_tokens``
        and ``cost``.

    Raises:
        Whatever ``llm.invoke`` raises; nothing is caught here, and METRICS is
        left untouched when the call fails.
    """
    # perf_counter is monotonic, so the measurement is immune to wall-clock adjustments.
    started = time.perf_counter()
    response = llm.invoke([HumanMessage(content=prompt)])
    latency = round(time.perf_counter() - started, 3)

    # Prefer the provider-reported usage in response_metadata["token_usage"].
    # `or {}` guards against the key being present but set to None.
    metadata = getattr(response, "response_metadata", None) or {}
    usage = metadata.get("token_usage") or {}
    if usage:
        input_tokens = usage.get("prompt_tokens") or 0
        output_tokens = usage.get("completion_tokens") or 0
    else:
        # Fall back to LangChain's provider-agnostic usage_metadata field.
        standard_usage = getattr(response, "usage_metadata", None) or {}
        input_tokens = standard_usage.get("input_tokens") or 0
        output_tokens = standard_usage.get("output_tokens") or 0

    cost = calculate_cost(model_name, input_tokens, output_tokens)

    # Update session-wide totals.
    METRICS["total_requests"] += 1
    METRICS["total_cost"] += cost
    METRICS["total_tokens"] += (input_tokens + output_tokens)

    # The log record deliberately omits the model output to keep logs compact.
    logging.info({
        "trace_id": trace_id,
        "model": model_name,
        "latency": latency,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "cost": cost
    })

    return {
        "trace_id": trace_id,
        "model": model_name,
        "output": response.content,
        "latency": latency,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "cost": cost
    }

In [8]:
def execute_with_retry(llm, model_name, prompt, trace_id, backoff_seconds: float = 0.5):
    """Call ``call_llm`` and retry on failure, with exponential backoff.

    Up to ``MAX_RETRIES + 1`` attempts are made. Every failure is logged with
    its attempt index; the exception from the final attempt is re-raised.

    Args:
        llm: Chat model client forwarded to ``call_llm``.
        model_name: Model name forwarded to ``call_llm``.
        prompt: Prompt text forwarded to ``call_llm``.
        trace_id: Correlation id forwarded to ``call_llm`` and logged on errors.
        backoff_seconds: Base delay before the first retry; it doubles on each
            subsequent retry. Pass ``0`` to retry immediately.

    Returns:
        The result dict of the first successful ``call_llm`` invocation.

    Raises:
        Exception: The error raised by the last attempt once all are used up.
    """
    # Always make at least one attempt, even if MAX_RETRIES is misconfigured as
    # negative (the loop would otherwise be skipped and None returned silently).
    last_attempt = max(MAX_RETRIES, 0)

    for attempt in range(last_attempt + 1):
        try:
            return call_llm(llm, model_name, prompt, trace_id)

        except Exception as e:
            logging.error({
                "trace_id": trace_id,
                "attempt": attempt,
                "error": str(e)
            })

            if attempt == last_attempt:
                # Bare raise re-raises the active exception with its traceback intact.
                raise

            # Pause before retrying so a struggling endpoint is not hit back-to-back.
            if backoff_seconds > 0:
                time.sleep(backoff_seconds * (2 ** attempt))

In [9]:
def run_orchestrator(prompt: str) -> Dict:
    """Answer ``prompt`` with the primary model, falling back to the secondary one.

    A fresh UUID4 trace id is generated per call and shared by the primary and
    fallback attempts so their log records can be correlated.

    Args:
        prompt: User text to send to the model.

    Returns:
        The result dict produced by ``call_llm`` for whichever model answered.

    Raises:
        Exception: If the fallback call also fails. The fallback is a single
            attempt (no retry loop); its error propagates with the primary
            failure attached as exception context.
    """
    trace_id = str(uuid.uuid4())

    try:
        return execute_with_retry(
            primary_llm,
            PRIMARY_MODEL,
            prompt,
            trace_id
        )

    except Exception as primary_error:
        METRICS["fallback_count"] += 1

        # Record the switch explicitly so a fallback is visible in the logs
        # rather than only inferable from the preceding per-attempt errors.
        logging.warning({
            "trace_id": trace_id,
            "event": "fallback_triggered",
            "failed_model": PRIMARY_MODEL,
            "fallback_model": FALLBACK_MODEL,
            "error": str(primary_error)
        })

        return call_llm(
            fallback_llm,
            FALLBACK_MODEL,
            prompt,
            trace_id
        )

In [10]:
# End-to-end smoke test: send one prompt through the orchestrator and dump the
# full result dict. `response` is reused by the summary cell further down.
response = run_orchestrator("Explain llm fallback mechanisms in 100 words")
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:{'trace_id': 'a20eaed1-f60f-467f-8154-4886a188601c', 'model': 'gpt-5-nano', 'latency': 20.784, 'input_tokens': 15, 'output_tokens': 2059, 'cost': 0.002067}


{'trace_id': 'a20eaed1-f60f-467f-8154-4886a188601c', 'model': 'gpt-5-nano', 'output': 'Fallback mechanisms for LLMs preserve availability, reliability, and response quality when primary models falter. Techniques include multi‑model orchestration, where requests route to a smaller or local model if the cloud API fails or latency spikes. Caching and retrieval-augmented generation reduce repeated latency by reusing answers or precomputed spans. Timeouts and circuit breakers trigger safe defaults and prevent cascading outages. Graceful degradation can answer with templates or rule-based heuristics when generation is unavailable. Monitoring, retries, and rate control adjust traffic, while auditing keeps privacy and safety in check. Documentation informs users about limits and fallback behavior, across platforms and teams globally.', 'latency': 20.784, 'input_tokens': 15, 'output_tokens': 2059, 'cost': 0.002067}


In [11]:
# Per-request figures, read straight from the result dict of the last call.
print(
    "===== REQUEST SUMMARY =====",
    *(
        f"{label} {response[field]}"
        for label, field in (
            ("Trace ID:", "trace_id"),
            ("Model Used:", "model"),
            ("Latency (s):", "latency"),
            ("Input Tokens:", "input_tokens"),
            ("Output Tokens:", "output_tokens"),
            ("Cost ($):", "cost"),
        )
    ),
    sep="\n",
)

# Session-wide totals from METRICS; the cost is rounded for display only.
print(
    "\n===== SYSTEM METRICS =====",
    *(
        f"{label} {value}"
        for label, value in (
            ("Total Requests:", METRICS["total_requests"]),
            ("Total Tokens:", METRICS["total_tokens"]),
            ("Total Cost ($):", round(METRICS["total_cost"], 6)),
            ("Fallback Triggered:", METRICS["fallback_count"]),
        )
    ),
    sep="\n",
)

===== REQUEST SUMMARY =====
Trace ID: a20eaed1-f60f-467f-8154-4886a188601c
Model Used: gpt-5-nano
Latency (s): 20.784
Input Tokens: 15
Output Tokens: 2059
Cost ($): 0.002067

===== SYSTEM METRICS =====
Total Requests: 1
Total Tokens: 2074
Total Cost ($): 0.002067
Fallback Triggered: 0
