In [6]:
# ------------------------------------------------------------
# Imports
# ------------------------------------------------------------

In [7]:
from __future__ import annotations

import base64
import io
import os
import traceback
from typing import Any, TypedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, END
load_dotenv()

True

In [8]:
# ------------------------------------------------------------
# Environment + LLM + Data
# ------------------------------------------------------------


# Resolve the OpenAI key: prefer Streamlit secrets, otherwise fall back to the
# process environment, which `load_dotenv()` above has populated from `.env`.
# Without the fallback the `.env` file is never consulted and this cell fails
# whenever no Streamlit secrets file is available (e.g. a plain notebook run).
try:
    _openai_api_key = st.secrets["OPENAI_API_KEY"]
except (FileNotFoundError, KeyError):
    # NOTE(review): Streamlit is expected to raise FileNotFoundError (or a
    # subclass) when no secrets file exists and KeyError when the entry is
    # missing -- confirm against the pinned Streamlit version.
    _openai_api_key = os.environ.get("OPENAI_API_KEY")

# temperature=0.0 keeps the generated Pandas code as repeatable as possible.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0, api_key=_openai_api_key)

OUTAGES_EXCEL = "outages.xlsx"
# Global DataFrame used by the agent
df_outages = pd.read_excel(OUTAGES_EXCEL)

In [9]:
# ------------------------------------------------------------
# Prompt definition
# ------------------------------------------------------------
# Schema preamble of the code-generation prompt: names the DataFrame (`df`) and
# the columns the model may rely on. `generate_pandas_code` concatenates it with
# the user request and `execution_rules`.
prompt_summary = """
You are an expert Python data analyst.

You write Pandas code to work with a DataFrame named `df` having these columns:
- partner_name (str)
- outage_type (str)
- issue_details (str)
- current_status (str)
- business_impact (str)
- manual_processing (str/bool)
- outage_start_time (datetime/str)
- outage_end_time (datetime/str)
- duration_hours (float)
"""

In [10]:
# ------------------------------------------------------------
# Prompt definition: execution rules appended to every request
# ------------------------------------------------------------

# Rules appended after the user request in `generate_pandas_code`.
# They must agree with `execute_pandas_code`, which strips import lines, exposes
# only df/pd/plt/io/base64/np and reads the final `result` variable. Changes
# relative to the previous text:
#   * the chart section no longer asks for imports (the rules above forbid them),
#   * one chart size is used throughout (6x4),
#   * the rename example uses `{...}` (this string is never passed to
#     str.format, so the doubled braces reached the model literally),
#   * the shape of a "table" result is spelled out (DataFrame under "data").
execution_rules = """

IMPORTANT EXECUTION RULES:
- Code MUST be plain Python (NO markdown, NO comments).
- Do NOT import pandas or create new DataFrames ‚Äî `df` is already provided.
- Convert all datetime fields using:
    df[col] = pd.to_datetime(df[col], errors='coerce').dt.tz_localize(None)
- BEFORE any filtering by dates or plotting.
- ALWAYS define a final variable named `result`. This is what will be returned.
- DO NOT add source file and parsed date details

DO NOT write import statements ‚Äî they will cause execution failure.
You must only use the following already-available objects:
- df  : Pandas DataFrame loaded with outage data
- pd  : pandas module
- plt : matplotlib.pyplot
- io  : io module for BytesIO
- base64 : for encoding charts
- np  : numpy module

If you need a chart:
- Use `plt.figure()` before plotting
- Save using:
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    img_str = base64.b64encode(buf.getvalue()).decode("utf-8")
    result = {"type": "chart", "image_base64": img_str}
- NEVER call plt.show()

CHART STYLE RULES:
- Chart size must be professional and compact:
    plt.figure(figsize=(6, 4))
- Avoid full-screen or oversized charts.
- Use clean white background (default).
- Titles should be concise and readable.
- Avoid legends if labels are already visible (e.g. pie slices or x-axis labels).
- Rotate x-axis labels only if overlapping: plt.xticks(rotation=45)
- No alpha transparency effects or neon colors.
- Prefer:
    - Bar chart ‚Üí plt.bar()
    - Line chart ‚Üí plt.plot()
    - Pie chart ‚Üí plt.pie() only if < 8 slices
- Apply padding for neat layout:
    plt.tight_layout()


If you produce a single number or string, return:
    result = {"type": "text_value", "value": <python_value>}


AGGREGATION RULES FOR PARTNER-LEVEL OUTPUT:
- If the user asks for partner-level outage summaries:
    - Ensure exactly one row per partner.
    - Include only fields that can be aggregated per partner.
    - Aggregate numeric columns like:
        - outage_count ‚Üí count()
        - total_downtime_hours ‚Üí sum(duration_hours)
        - avg_duration_hours ‚Üí mean(duration_hours)
    - For business_impact & issue_details:
        - Create separate lists using .unique().tolist()
        - Name them: unique_business_impacts, unique_issues
        
    - Do not include raw text columns that cannot be aggregated (like issue_details as a long string)

    SCALAR RESULT RULE:
    - If the user requests a single answer such as:
        - highest / lowest / maximum / minimum outages or downtime
        - "Which partner has the most outages?"
        - "Show the average downtime overall"
        - "How many unique partners had outages?"
    - DO NOT create a DataFrame or table.
    - Instead compute the metric and return a dict:

    result = {
        "type": "text_value",
        "text": "MegaTrans Global has the maximum outages (14)."
    }

    - Only use DataFrames if user requests multiple rows of results.



    OUTPUT RULES:
    If user asks for counts, unique values, lists ‚Üí assign to `result` (list/dict/DataFrame).
    If user asks a chart:
    - Do NOT write imports: plt, io, base64 and np are already available
    - Create figure: plt.figure(figsize=(6, 4))
    - Save chart as base64:
        buf = io.BytesIO()
        plt.tight_layout()
        plt.savefig(buf, format="png")
        buf.seek(0)
        img_str = base64.b64encode(buf.getvalue()).decode("utf-8")
        result = {"type": "chart", "image_base64": img_str}
    - Do NOT use plt.show()

    DECISION LOGIC (VERY IMPORTANT):
    - If the user asks for a CHART,TRENDS, BAR, PIE, PARETO,Image ‚Üí return type: "chart"
    - If the user asks for a TABLE ‚Üí return type: "table"
      (for a table use: result = {"type": "table", "data": <DataFrame>})
    - If the request is about BUSINESS IMPACT/TRENDS/AGGREGATE INSIGHTS ‚Üí return type: "executive_summary"
    - If unclear ‚Üí favor "table"

    Never let Streamlit decide presentation ‚Äî YOU decide based on query.



    If grouping by partner or issue:
        - Use .groupby([...], dropna=True)

    For renaming:
        - DataFrame.rename(columns={...}) ‚Äî OK
        - NEVER call Series.rename(columns=...)

   EXECUTIVE SUMMARY MODE (only if user *explicitly* requests it):
- Activated only if user mentions one of:
  ["executive summary", "leadership summary", "summary for leadership", "summary report"]
- Produce both:
   * summary_text (3‚Äì4 business insights in English)
   * summary_table (per-partner KPI breakdown)
- Avoid including non-numeric fields in aggregated tables
- Include system chart JSON only if user explicitly mentions chart


    FINAL REQUIREMENT:
    - Your FINAL LINE must be the assignment: result = ...
    - NEVER print or display charts in code.
    - Return ONLY the code. No markdown. No explanation.

    """

In [11]:
# ------------------------------------------------------------
# Helper imports for execute_pandas_code
# ------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import io
import base64
import traceback


def _strip_non_code_lines(code: str) -> str:
    """Drop import statements and markdown code fences from LLM-generated code."""
    kept = [
        line
        for line in code.splitlines()
        if not line.strip().startswith(("import ", "from ", "```"))
    ]
    return "\n".join(kept)


def _frame_payload(frame: pd.DataFrame) -> dict:
    """Serialize a DataFrame into the `rows` / `columns` keys used for tables."""
    return {
        "rows": frame.to_dict(orient="records"),
        "columns": list(frame.columns),
    }


def _collect_result(namespace: dict) -> dict:
    """Turn the namespace left behind by the executed code into a result dict."""
    result = namespace.get("result")

    # 1. A dict carrying "type" is trusted. A DataFrame stored under "data" or
    #    "summary_table" is flattened to rows/columns; every other key (for
    #    example "summary_text") is kept so nothing the code produced is lost.
    if isinstance(result, dict) and "type" in result:
        for frame_key in ("data", "summary_table"):
            frame = result.get(frame_key)
            if isinstance(frame, pd.DataFrame):
                payload = {k: v for k, v in result.items() if k != frame_key}
                payload.update(_frame_payload(frame))
                return payload
        return result

    # 2. Executive summary expressed as two loose variables.
    summary_table = namespace.get("summary_table")
    if "summary_text" in namespace and isinstance(summary_table, pd.DataFrame):
        return {
            "type": "executive_summary",
            "summary_text": str(namespace["summary_text"]),
            **_frame_payload(summary_table),
        }

    # 3. A figure was created but never encoded by the code: encode the last one.
    fig_nums = plt.get_fignums()
    if fig_nums:
        buf = io.BytesIO()
        plt.figure(fig_nums[-1]).savefig(buf, format="png", bbox_inches="tight")
        buf.seek(0)
        img_str = base64.b64encode(buf.getvalue()).decode("utf-8")
        return {"type": "chart", "image_base64": img_str}

    # 4. DataFrame result.
    if isinstance(result, pd.DataFrame):
        return {"type": "table", **_frame_payload(result)}

    # 5. Plain dict without "type" (e.g. value_counts().to_dict()): key/value table.
    if isinstance(result, dict):
        return {
            "type": "table",
            "rows": [{"key": k, "value": v} for k, v in result.items()],
            "columns": ["key", "value"],
        }

    # 6. List / Series / ndarray: one-column table.
    if isinstance(result, (list, np.ndarray, pd.Series)):
        return {
            "type": "table",
            **_frame_payload(pd.DataFrame({"value": list(result)})),
        }

    # 7. Scalars. np.generic covers NumPy scalars such as np.int64, which are
    #    not instances of the built-in int.
    if isinstance(result, (int, float, str, bool, np.generic)):
        return {"type": "text", "value": str(result)}

    # 8. No usable `result`: fall back to the (possibly modified) working frame.
    frame = namespace.get("df")
    if isinstance(frame, pd.DataFrame):
        return {"type": "table", **_frame_payload(frame)}

    # 9. Nothing recognizable.
    return {
        "type": "unknown",
        "message": "Code executed but produced no recognized output (no result, chart, or DataFrame).",
    }


def execute_pandas_code(df: pd.DataFrame, code: str):
    """
    Execute LLM-generated Pandas code against a copy of `df` and return a
    structured result.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is not modified.
        code: Python source produced by the LLM. It is expected to use the
            names `df`, `pd`, `plt`, `np`, `io`, `base64` and to assign `result`.

    Returns:
        One of:
          - {"type": "table", "rows": [...], "columns": [...]}
          - {"type": "chart", "image_base64": "..."}
          - {"type": "text", "value": "..."}
          - {"type": "executive_summary", "summary_text": str, "rows": [...], "columns": [...]}
          - {"type": "error", "error_message": str, "trace": str, "failed_code": str}
          - {"type": "unknown", "message": str}
          - any dict with a "type" key that the executed code assigned to `result`

    Notes:
      - Import lines and markdown code fences are removed before execution.
      - Columns whose label contains "date" or "time" are converted to
        tz-naive datetimes (best effort).
      - The code runs in ONE namespace dict. With separate globals/locals,
        lambdas and nested functions in the generated code could not see
        `df`, `pd`, ... because they resolve names through globals only.
      - SECURITY: restricting `__builtins__` is a convenience, not a security
        boundary. `exec` on model output must not be exposed to untrusted users
        without real isolation.
    """

    # ---------- 1. Working copy + datetime normalization ----------
    work_df = df.copy()
    for col in work_df.columns:
        label = str(col).lower()  # labels are not guaranteed to be strings
        if "date" not in label and "time" not in label:
            continue
        try:
            work_df[col] = pd.to_datetime(work_df[col], errors="coerce")
            # tz_localize(None) strips the timezone from tz-aware data and is a
            # no-op for tz-naive data.
            work_df[col] = work_df[col].dt.tz_localize(None)
        except Exception:
            # Best effort: leave the column in whatever state it reached.
            pass

    # ---------- 2. Execution namespace ----------
    namespace = {
        # Small whitelist; `__import__` is deliberately absent.
        "__builtins__": {
            "len": len,
            "range": range,
            "min": min,
            "max": max,
            "sum": sum,
            "abs": abs,
            "round": round,
            "float": float,
            "int": int,
            "str": str,
            "bool": bool,
            "list": list,
            "dict": dict,
            "tuple": tuple,
            "set": set,
            "sorted": sorted,
            "enumerate": enumerate,
            "zip": zip,
            "any": any,
            "all": all,
            "isinstance": isinstance,
        },
        "df": work_df,
        "pd": pd,
        "plt": plt,
        "np": np,
        "io": io,
        "base64": base64,
    }

    # ---------- 3. Clean the generated source ----------
    cleaned_code = _strip_non_code_lines(code)
    print("\nüìå Running Pandas code:\n", cleaned_code)

    # ---------- 4. Execute and interpret ----------
    # Start from a clean slate so a figure left over from an earlier run is
    # never mistaken for this run's chart.
    plt.close("all")
    try:
        try:
            exec(cleaned_code, namespace)
        except Exception as e:
            trace = traceback.format_exc()
            print("‚ùå Pandas execution error:", e)
            print(trace)
            return {
                "type": "error",
                "error_message": str(e),
                "trace": trace,
                "failed_code": cleaned_code,
            }
        return _collect_result(namespace)
    finally:
        # Release figures on every path (success, typed result, error).
        plt.close("all")


In [12]:
# ------------------------------------------------------------
# LangGraph State definition
# ------------------------------------------------------------
class AgentState(TypedDict, total=False):
    """State passed between the LangGraph nodes; every key is optional."""

    # Natural-language question supplied by the caller.
    user_query: str
    # Python source produced by `generate_pandas_code`.
    pandas_code: str
    # Structured output of `execute_pandas_node` (dict with a "type" key).
    result: Any
    # Keys returned by `generate_executive_summary`. They are declared here so
    # they are part of the state schema the graph is built from.
    # NOTE(review): LangGraph is expected to ignore or reject node outputs whose
    # keys are not in the schema -- confirm for the pinned version.
    summary: Any
    chart_uri: Any

In [13]:
# ------------------------------------------------------------
# Node: Generate Pandas code from user query
# ------------------------------------------------------------
def generate_pandas_code(state: AgentState) -> dict:
    """Node 1: ask the LLM for Pandas code that answers the user's query.

    The query is read from `state["user_query"]`, falling back to
    `state["__input__"]["user_query"]`. Raises KeyError when neither is set.
    Returns `{"pandas_code": <stripped model output>}`.
    """
    question = state.get("user_query")
    if not question:
        question = state.get("__input__", {}).get("user_query")
    if not question:
        raise KeyError("Missing 'user_query' in state")

    # Schema preamble, then the request, then the execution rules.
    full_prompt = prompt_summary + " User request : " + question + execution_rules

    reply = llm.invoke(full_prompt)
    generated = reply.content.strip()
    return {"pandas_code": generated}


In [14]:
# ------------------------------------------------------------
# Node: Wrap execute_pandas_code for LangGraph
# ------------------------------------------------------------
def execute_pandas_node(state: AgentState) -> dict:
    """Node 2: run the generated code against `df_outages`.

    Returns `{"result": ...}` holding either the output of
    `execute_pandas_code` or an error dict when the state has no code.
    """
    generated = state.get("pandas_code", "")
    if generated:
        return {"result": execute_pandas_code(df_outages, generated)}

    missing_code = {
        "type": "error",
        "error_message": "No pandas_code found in state.",
    }
    return {"result": missing_code}

In [15]:
def generate_executive_summary(state: AgentState):
    """Node 3: attach a short summary to the executed result.

    Returns:
        - {"summary": None} when there is no result, the result is not a dict,
          or its "type" is not one of "table", "chart", "text_value".
        - {"summary": <caption>, "chart_uri": <base64 PNG>} for chart results;
          the caption is the result's "summary_text" or a default sentence.
        - None for "table" / "text_value" results (no summary is produced yet).
    """
    result = state.get("result")

    # Guard against non-dict results before calling .get() on them.
    if not isinstance(result, dict) or result.get("type") not in (
        "table",
        "chart",
        "text_value",
    ):
        return {"summary": None}

    # If chart already exists, include caption + insights only
    if result["type"] == "chart":
        return {
            "summary": result.get("summary_text", "Chart generated for key outage insights."),
            # Despite the key name this is the raw base64 payload, not a data URI.
            "chart_uri": result.get("image_base64"),
        }

    # TODO: summary logic for "table" / "text_value" results is not implemented;
    # returning None leaves the state unchanged.
    return None


In [None]:
# ------------------------------------------------------------
# Build LangGraph agent
# ------------------------------------------------------------
def build_outage_agent_graph():
    """Assemble and compile the three-node pipeline.

    Flow: generate_pandas_code -> execute_pandas -> generate_executive_summary -> END.
    """
    workflow = StateGraph(AgentState)

    # Register nodes in pipeline order.
    for node_name, node_fn in (
        ("generate_pandas_code", generate_pandas_code),
        ("execute_pandas", execute_pandas_node),
        ("generate_executive_summary", generate_executive_summary),
    ):
        workflow.add_node(node_name, node_fn)

    workflow.set_entry_point("generate_pandas_code")

    # Linear chain of edges ending at END.
    for source, target in (
        ("generate_pandas_code", "execute_pandas"),
        ("execute_pandas", "generate_executive_summary"),
        ("generate_executive_summary", END),
    ):
        workflow.add_edge(source, target)

    return workflow.compile()



In [None]:
import streamlit as st
import base64
import pandas as pd

st.set_page_config(page_title="üìä Outage Analytics Assistant", layout="wide")
st.title("üìä Outage Analytics Assistant")


@st.cache_data
def load_df():
    """Read the outage workbook; `st.cache_data` reuses the frame across reruns."""
    return pd.read_excel("outages.xlsx")


def _render_result(content):
    """Render one chat entry (a user string or an assistant result dict).

    This single renderer serves both the history replay and the live answer, so
    an entry looks the same after a rerun. Payloads missing the keys their
    "type" requires fall through to the raw JSON view instead of raising: a
    stored entry that raises would otherwise break every later rerun.
    """
    if not isinstance(content, dict):
        st.write(content)
        return

    kind = content.get("type")

    if kind == "chart" and content.get("image_base64"):
        st.image(base64.b64decode(content["image_base64"]), caption="Generated Chart")

    elif kind == "table" and "rows" in content:
        st.dataframe(content["rows"], use_container_width=True)

    elif kind in ("text", "text_value"):
        # Explicit None check: `text or value` would hide falsy values such as 0.
        text = content.get("text")
        st.write(content.get("value") if text is None else text)

    elif kind == "executive_summary" and "summary_text" in content:
        st.subheader("üìå Executive Summary")
        st.write(content["summary_text"])
        if "rows" in content:
            st.dataframe(content["rows"], use_container_width=True)
        if content.get("image_base64"):
            st.image(base64.b64decode(content["image_base64"]), caption="Summary Chart")

    elif kind == "error":
        # Produced by execute_pandas_code / execute_pandas_node.
        st.error(content.get("error_message", "Code execution failed."))
        if content.get("failed_code"):
            st.code(content["failed_code"], language="python")

    else:
        st.write("‚ÑπÔ∏è No usable output returned.")
        st.json(content)


df_outages = load_df()
agent = build_outage_agent_graph()

st.sidebar.write(f"Dataset rows: {len(df_outages)}")

query = st.chat_input("Ask a question about outages‚Ä¶")

if "history" not in st.session_state:
    st.session_state.history = []

# Render existing history
for role, content in st.session_state.history:
    with st.chat_message(role):
        _render_result(content)

if query:
    # Show user query bubble
    st.session_state.history.append(("user", query))
    with st.chat_message("user"):
        st.write(query)

    with st.chat_message("assistant"):
        with st.spinner("Processing‚Ä¶"):
            out = agent.invoke({"user_query": query})
            result = out.get("result")

        # Normalize anything that is not a dict into a text payload
        clean_result = result if isinstance(result, dict) else {
            "type": "text",
            "value": str(result)
        }

        # Append cleaned result to chat history, then render it the same way
        # the history loop will on the next rerun
        st.session_state.history.append(("assistant", clean_result))
        _render_result(clean_result)

    # No rerun needed ‚Äî UI already refreshed
