Setup (install + env)
What this does
Installs the modern provider package and core bits. Sets your key for the session.
Why
Avoids legacy imports (langchain.chat_models) and version drift.

In [None]:
%pip install -q -U langchain langchain-openai tiktoken

import os
# Option A: set here for the session
os.environ["OPENAI_API_KEY"] = ""
# Optional: if you use org/project scoping
# os.environ["OPENAI_ORG_ID"] = "org_..."
# os.environ["OPENAI_PROJECT"] = "proj_..."


Step 1 — Imports and model
What this does
Brings in the ReAct agent constructor, executor, a small output parser helper, and the OpenAI chat model wrapper.

Why
create_react_agent builds a prompt that teaches the LLM to think-then-act with tools; AgentExecutor runs the loop.

In [None]:
from typing import Dict, List
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


In [None]:
# Chat model used by the agents below; temperature=0 minimises sampling randomness.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
)


Step 2 — Define “Operations” tools (safe, local)
What this does
Defines three realistic ops tools:

status_board — read-only status of systems.

runbook_search — look up SOP steps from a tiny in-memory KB.

calc — quick arithmetic (for SLO/latency math).

Why
Agents are only as good as their action space. 

In [None]:
# Mock sources the tools will read.

# Service name -> {"state", "note"}; built from one (name, state, note) row per service.
SYSTEM_STATUS: Dict[str, Dict[str, str]] = {
    name: {"state": state, "note": note}
    for name, state, note in (
        ("api", "degraded", "5xx spiking in us-east"),
        ("db", "healthy", "replication lag < 50ms"),
        ("cache", "healthy", "hit rate 98.7%"),
        ("ingestor", "down", "stuck on batch 42"),
    )
}

# Runbook id -> ordered list of SOP steps.
RUNBOOKS: Dict[str, List[str]] = dict(
    ingestor_restart=[
        "Confirm batch id with `GET /ingestor/status`",
        "Drain queue: set `ingestor.accept=false`",
        "Restart service `svc_ingestor`",
        "Re-enable accepts and watch metrics for 5 minutes",
    ],
    api_5xx_spike=[
        "Check last deploy time; if <30m, roll back",
        "Enable circuit breaker for slow DB deps",
        "Warm cache with top 100 endpoints",
        "If spike persists >10m, page on-call DB",
    ],
)

# The one-line docstring doubles as the tool description given to the LLM,
# so maintainer notes live in comments rather than in the docstring.
@tool("status_board", return_direct=False)
def status_board(service: str) -> str:
    """Get the current state and note for a named service (e.g., 'api', 'db', 'cache', 'ingestor')."""
    # Agent-supplied inputs can arrive wrapped in quotes or with stray
    # whitespace/newlines (e.g. "'api'\n"); normalise before the lookup.
    key = service.strip().strip("'\"`").strip().lower()
    entry = SYSTEM_STATUS.get(key)
    if entry is None:
        # Listing the valid names lets the agent self-correct on the next step.
        return f"service '{key}' not found. Available: {list(SYSTEM_STATUS)}"
    return f"{key}: state={entry['state']}; note={entry['note']}"

# The one-line docstring doubles as the tool description given to the LLM,
# so maintainer notes live in comments rather than in the docstring.
@tool("runbook_search", return_direct=False)
def runbook_search(keyword: str) -> str:
    """Return a short SOP matching a keyword (e.g., 'ingestor', '5xx', 'restart')."""
    # Agent-supplied inputs can arrive wrapped in quotes or with stray
    # whitespace/newlines; normalise before matching.
    kw = keyword.strip().strip("'\"`").strip().lower()
    if not kw:
        # A blank keyword is a substring of everything and would dump every
        # runbook; report the available names instead.
        return f"No keyword given. Available runbooks: {list(RUNBOOKS)}"
    # Match case-insensitively against the runbook id and each of its steps.
    hits = [
        f"{name}:\n- " + "\n- ".join(steps)
        for name, steps in RUNBOOKS.items()
        if kw in name.lower() or any(kw in step.lower() for step in steps)
    ]
    return "\n\n".join(hits) if hits else f"No runbook match for '{kw}'."

# The one-line docstring doubles as the tool description given to the LLM,
# so maintainer notes live in comments rather than in the docstring.
@tool("calc", return_direct=False)
def calc(expression: str) -> str:
    """Evaluate a simple arithmetic expression, e.g., '99.9 - 98.7' or '350*0.95'."""
    import ast
    import operator

    # SECURITY: this used to call eval() with empty __builtins__. That is not a
    # sandbox (attribute access on literals still reaches arbitrary classes),
    # and the expression is produced by the LLM from user text. The expression
    # is now parsed and only plain numeric arithmetic is evaluated.
    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def _evaluate(node):
        """Recursively evaluate a parsed arithmetic node; raise ValueError otherwise."""
        if isinstance(node, ast.Expression):
            return _evaluate(node.body)
        if isinstance(node, ast.Constant):
            value = node.value
            # bool is a subclass of int; reject it along with str, complex, etc.
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                return value
            raise ValueError(f"unsupported literal: {value!r}")
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = _evaluate(node.left)
            right = _evaluate(node.right)
            # Bound exponentiation so a short expression cannot request an
            # astronomically large result.
            if isinstance(node.op, ast.Pow) and (abs(left) > 1e6 or abs(right) > 100):
                raise ValueError("power operands too large")
            return binary_ops[type(node.op)](left, right)
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_evaluate(node.operand))
        raise ValueError(f"unsupported expression element: {type(node).__name__}")

    try:
        # Drop wrapping quotes/whitespace the agent may add around the input.
        cleaned = expression.strip().strip("'\"`").strip()
        return str(_evaluate(ast.parse(cleaned, mode="eval")))
    except Exception as e:
        # Tool contract: never raise into the agent loop; report the problem as text.
        return f"calc error: {e}"


Step 3 — System prompt (operations persona)
What this does
Gives the agent concise, ops-focused behavior: choose tools only when needed; return clear, actionable answers.

Why
Agents need explicit tool-use guidance to avoid chattiness or hallucinated tools.

In [None]:
# Ops persona text: one instruction per line, joined with newlines
# (no trailing newline).
SYSTEM_PROMPT = "\n".join(
    (
        "You are an on-call operations manager.",
        "You decide when to use tools. Use tools for facts (status, runbooks, calculations).",
        "Be concise, actionable, and include clear next steps when appropriate.",
        "If a tool returns multiple options, summarize and recommend one.",
        "If you lack data, say so and propose a next diagnostic step.",
    )
)


Step 4 — Build the ReAct agent
What this does
Builds a ReAct-style agent from our tools with `initialize_agent`, which returns a ready-to-run executor.

Why
`initialize_agent` with `ZERO_SHOT_REACT_DESCRIPTION` assembles a reasoning+acting prompt from the tool names and descriptions, so the model knows exactly which actions it can take.

In [None]:
from langchain.agents import initialize_agent, AgentType

tools = [status_board, runbook_search, calc]

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,  # built-in ReAct prompt
    # Put the ops persona in front of the built-in ReAct prompt; without this
    # SYSTEM_PROMPT is defined but never reaches the model. The prefix must end
    # by introducing the tool list that the agent appends after it, and must
    # not contain curly braces (it becomes part of a prompt template).
    agent_kwargs={
        "prefix": f"{SYSTEM_PROMPT}\n\nYou have access to the following tools:",
    },
    verbose=True,
    handle_parsing_errors=True,  # feed malformed LLM output back instead of raising
)
print("Agent ready.")


Step 5 — First run: diagnose a 5xx spike
What this does
Shows the agent choosing status_board then runbook_search, and recommending next steps.

Why
Demonstrates tool selection and summarization.

In [None]:
# Scenario 1: API 5xx incident — ask the agent for first checks and actions.
query1 = (
    "We're seeing 5xx errors on the API in us-east. "
    "What should I check and do first?"
)
resp1 = agent.invoke({"input": query1})
print(resp1["output"])


Step 6 — Second run: restart a stuck ingestor
What this does
Shows a different tool path: the agent works from the ingestor restart runbook rather than the API one. (The calculation example comes in Step 7.)

Why
Demonstrates that the action sequence varies by question.

In [None]:
# Scenario 2: stuck ingestor — ask for the restart procedure and follow-up checks.
query2 = (
    "Ingestor is stuck on batch 42—walk me through "
    "the restart and sanity checks."
)
resp2 = agent.invoke({"input": query2})
print(resp2["output"])


Step 7 — Quick SLO math with calc
What this does
Small arithmetic to support an ops decision.

Why
Shows multi-tool repertoire beyond text lookup.

In [None]:
# Scenario 3: SLO arithmetic — a question the agent can answer with the calc tool.
query3 = (
    "If cache hit rate drops from 98.7% to 95%, "
    "by how much does origin load increase (roughly)?"
)
resp3 = agent.invoke({"input": query3})
print(resp3["output"])


Step 8 — Extend at runtime (optional)
What this does
Demonstrates how you’d add a new tool (e.g., log snippet finder) without changing the rest of the agent.

In [None]:
from langchain_core.tools import tool
from langchain.agents import initialize_agent, AgentType

# The one-line docstring doubles as the tool description given to the LLM,
# so maintainer notes live in comments rather than in the docstring.
@tool("logs_find", return_direct=False)
def logs_find(keyword: str) -> str:
    """Return a mock log excerpt matching a keyword."""
    sample = [
        "12:01 GET /v1/orders 500 upstream timeout",
        "12:02 GET /v1/orders 200 OK",
        "12:03 POST /v1/ingest 502 bad gateway",
    ]
    # Agent-supplied inputs can arrive wrapped in quotes or with stray
    # whitespace/newlines; normalise before matching.
    kw = keyword.strip().strip("'\"`").strip().lower()
    if not kw:
        # A blank keyword is a substring of every line; ask for a real one instead.
        return "no keyword given"
    hits = [line for line in sample if kw in line.lower()]
    return "\n".join(hits) if hits else "no matches"

# Add the tool and re-initialize the agent (no react_agent in this path).
# Replace any earlier registration by name so that re-running this cell does
# not list logs_find twice in the agent prompt.
tools[:] = [t for t in tools if t.name != logs_find.name] + [logs_find]

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    # Put the ops persona in front of the built-in ReAct prompt; without this
    # SYSTEM_PROMPT never reaches the model. The prefix must end by introducing
    # the tool list that the agent appends after it.
    agent_kwargs={
        "prefix": f"{SYSTEM_PROMPT}\n\nYou have access to the following tools:",
    },
    verbose=True,
    handle_parsing_errors=True,
)
print("Agent reloaded with logs_find.")


Troubleshooting notes
If you hit ModuleNotFoundError on langchain_openai, install langchain-openai and restart the kernel.

If the agent “hallucinates” a tool name, it means the tool descriptions weren’t clear; tighten the docstrings.

For real ops, back tools with real endpoints (Prometheus, Grafana, incident API). The interface doesn’t change—only the function bodies.

Install Gradio in your environment

In [None]:
!pip install -U gradio


In [None]:
import gradio as gr

# Uses the `agent` executor built earlier in this notebook with initialize_agent
# (Step 4, or Step 8 if you added logs_find). Run one of those cells first,
# otherwise every chat message will come back as an error.

def run_agent_stream(message, history):
    """Gradio chat handler: run the agent once, then reveal its answer word by word.

    Args:
        message: The user's latest chat message; sent to the agent as its input.
        history: Prior chat turns supplied by Gradio. Unused here because each
            message is answered independently.

    Yields:
        The answer text accumulated so far. Gradio replaces the displayed
        message with every yielded value, so each yield must carry the whole
        partial answer rather than just the newest word. If the agent call
        fails, a single "Error: ..." string is yielded instead.
    """
    import re

    try:
        result = agent.invoke({"input": message})
        output_text = result.get("output", "No output returned.")
    except Exception as e:
        # UI boundary: show the failure in the chat window instead of crashing the app.
        yield f"Error: {e}"
        return

    # Split on whitespace but keep the separators, so newlines and list
    # formatting in the answer survive the simulated streaming.
    shown = ""
    for piece in re.split(r"(\s+)", output_text):
        shown += piece
        if piece and not piece.isspace():
            yield shown
    if not shown.strip():
        # Empty or whitespace-only answer: still emit one update.
        yield shown

# ChatInterface is itself a launchable app, so no outer gr.Blocks() wrapper is
# needed. Streaming is enabled automatically because run_agent_stream is a
# generator; ChatInterface's `type` argument describes the chat-history format,
# and "generator" is not one of its values, so it is removed.
demo = gr.ChatInterface(
    fn=run_agent_stream,
    title="Ops Agent Chat (Streaming)",
    chatbot=gr.Chatbot(),
    textbox=gr.Textbox(placeholder="Ask about system status, runbooks, or calculations..."),
)

demo.launch(share=False)
