In [None]:
! pip install -qr requirements.txt

In [3]:
from os import environ

from dotenv import load_dotenv
from kubernetes.client.api_client import ApiClient
from kubernetes.client.rest import ApiException
from kubernetes.client import CoreV1Api
from kubernetes.config import load_incluster_config
from llama_stack_client import Agent, LlamaStackClient
from llama_stack_client.lib.agents.client_tool import client_tool
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.lib.agents.react.agent import ReActAgent
from llama_stack_client.lib.agents.react.tool_parser import ReActOutput
from llama_stack_client.types import UserMessage
from rich import print
from termcolor import cprint

# Load settings from config.env into the process environment.
load_dotenv('config.env')

# Both values are None when the corresponding variable is not set.
base_url = environ.get('LLAMA_STACK_URL')
model_id = environ.get('LLM_MODEL_ID')

client = LlamaStackClient(base_url=base_url)

print(f'Connected to Llama Stack server at {base_url}')
print('Registered models:')
print(client.models.list())
print(f'Using model: {model_id}')


# Pick the sampling strategy from TEMPERATURE. Previously the value was parsed
# but never applied, so the strategy was greedy regardless of the setting.
temperature = float(environ.get("TEMPERATURE", 0.0))
if temperature > 0.0:
    # TOP_P is optional and only consulted when sampling is enabled.
    top_p = float(environ.get("TOP_P", 0.95))
    strategy = {"type": "top_p", "temperature": temperature, "top_p": top_p}
else:
    strategy = {"type": "greedy"}

max_tokens = int(environ.get("MAX_TOKENS", 4096))

# sampling_params will later be used to pass the parameters to Llama Stack Agents/Inference APIs
sampling_params = {
    "strategy": strategy,
    "max_tokens": max_tokens,
}
print(f'sampling parameters: {sampling_params}')

APIConnectionError: Connection error.

In [2]:
def run_session(agent, session_name, user_prompts):
    """
    Run each prompt as one turn of a new agent session and print the streamed events.

    Args:
        agent: Object exposing create_session(name) and
            create_turn(messages=..., session_id=..., stream=...).
        session_name (str): Name passed to agent.create_session.
        user_prompts (iterable of str): Prompts sent one per turn, in order.

    Returns:
        None. All output is printed.
    """
    session_id = agent.create_session(session_name)
    print(f'Created new session {session_name}')
    print(f'Looping over user prompts: {user_prompts}')
    for prompt in user_prompts:
        print("\n"+"="*50)
        cprint(f"Processing user query: {prompt}", "blue")
        print("="*50)
        response = agent.create_turn(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            session_id=session_id,
            # Boolean flag: the former string 'True' only worked because any
            # non-empty string is truthy ('False' would have streamed as well).
            stream=True,
        )
        for log in EventLogger().log(response):
            log.print()

In [15]:
def _extract_final_answer(text):
    """Return the last string-valued "answer" field found in text, or None if there is none."""
    # Imported locally because the notebook's import cell does not provide these modules.
    import json
    import re

    # `(?:[^"\\]|\\.)*` also consumes escaped characters (\" , \n, ...), so an
    # answer that contains quotes is not cut off at the first escaped quote.
    # `"answer": null` never matches because a quote must follow the colon.
    matches = re.findall(r'"answer":\s*"((?:[^"\\]|\\.)*)"', text, flags=re.DOTALL)
    if not matches:
        return None
    raw_answer = matches[-1]  # the last occurrence wins, as later output supersedes earlier
    try:
        # Decode JSON escapes; strict=False tolerates literal newlines in the value.
        return json.loads(f'"{raw_answer}"', strict=False)
    except ValueError:
        # Not valid JSON string content; fall back to the text as captured.
        return raw_answer


def analyze_namespace_errors_improved(namespace, client, model_id, sampling_params, max_infer_iters=5):
    """
    Analyze pod logs and events in a given namespace for errors with improved stopping criteria.

    A ReAct agent with the "mcp::openshift" tool group is given a prompt with
    explicit stopping criteria, its streamed events are printed, and the final
    "answer" field is extracted from the accumulated output.

    Args:
        namespace (str): The namespace to analyze
        client: LlamaStackClient instance
        model_id (str): Model identifier to use
        sampling_params (dict): Sampling parameters for the model
        max_infer_iters (int): Maximum inference iterations (default: 5)

    Returns:
        dict: Always contains "namespace" and "session_id". On success "status"
        is "completed" and "result" holds the extracted answer (or a fallback
        message when no answer could be extracted). If running the turn raises,
        "status" is "error" and "error" holds the exception text.

    Raises:
        Any exception raised while constructing the agent or creating the
        session propagates; only the turn itself is guarded.
    """

    # Create ReAct agent with OpenShift tools
    full_react_agent = ReActAgent(
        client=client,
        model=model_id,
        tools=["mcp::openshift"],
        response_format={
            "type": "json_schema",
            "json_schema": ReActOutput.model_json_schema(),
        },
        sampling_params=sampling_params,
        max_infer_iters=max_infer_iters
    )

    # Create analysis prompt with clear stopping criteria
    analysis_prompt = f"""You are an expert OpenShift administrator. Your task is to analyze pod logs and events in namespace '{namespace}'.

CRITICAL INSTRUCTIONS:
1. List all pods in the namespace ONCE
2. Check pod logs for each pod ONCE only
3. Check events in the namespace ONCE only
4. Analyze the pod status and identify any errors
5. Provide a final summary with specific error details

STOPPING CRITERIA - STOP AFTER:
- Listing pods once
- Checking logs once per pod
- Checking events once
- Providing final analysis

DO NOT:
- Repeat the same tool calls
- Keep checking logs and events repeatedly
- Continue after you have enough information

Expected output format:
- Namespace: {namespace}
- Pod: [pod name]
- Error Title: [error type/reason] 
- Error Description: [detailed error message]

If no errors found, return: "No error found"
"""

    # Create session and run analysis
    session_id = full_react_agent.create_session(f'analysis-{namespace}')
    print(f'Created analysis session for namespace: {namespace}')

    try:
        # Run the analysis
        response = full_react_agent.create_turn(
            messages=[
                {
                    "role": "user",
                    "content": analysis_prompt,
                }
            ],
            session_id=session_id,
            # Boolean flag; the former string 'True' only worked by being truthy.
            stream=True,
        )

        # Accumulate the content of every logged event. With streaming the
        # answer JSON presumably arrives split across several events (verify
        # against the llama-stack-client EventLogger), so matching each event
        # on its own would miss an answer that spans more than one of them.
        streamed_parts = []
        for log in EventLogger().log(response):
            log.print()
            content = getattr(log, 'content', None)
            if content is not None:
                streamed_parts.append(str(content))

        final_answer = _extract_final_answer("".join(streamed_parts))

        # Use the extracted answer if there is a non-empty one, otherwise a fallback message
        if final_answer:
            analysis_result = final_answer
        else:
            analysis_result = "Analysis completed but no clear answer extracted"

        return {
            "namespace": namespace,
            "status": "completed",
            "result": analysis_result,
            "session_id": session_id
        }

    except Exception as e:
        # Report the failure as data so notebook callers can branch on "status".
        return {
            "namespace": namespace,
            "status": "error",
            "error": str(e),
            "session_id": session_id
        }


In [None]:
# Run the improved analyzer against one namespace and print a short report.
namespace = "oom-test"

print("=" * 60)
print("TESTING IMPROVED FUNCTION WITH BETTER STOPPING CRITERIA")
print("=" * 60)

# Arguments are passed positionally in signature order; the iteration cap
# stays at 5 (reduced from 10).
result = analyze_namespace_errors_improved(
    namespace,
    client,
    model_id,
    sampling_params,
    max_infer_iters=5,
)

# Report header
print("\n" + "=" * 60)
print("ANALYSIS COMPLETE")
print("=" * 60)

# Report body: the failure branch is handled first, success otherwise.
print(f"Namespace: {result['namespace']}")
print(f"Status: {result['status']}")
if result['status'] != 'completed':
    print(f"Error: {result['error']}")
else:
    print(f"Result: {result['result']}")


In [None]:


def error_analysis(namespace, max_infer_iters=10):
    """
    Ask a ReAct agent to list all errors and events in a namespace and print the streamed run.

    Relies on the notebook globals `client`, `model_id` and `sampling_params`,
    and on `run_session` for executing the prompt.

    Args:
        namespace (str): Namespace named in the prompt.
        max_infer_iters (int): Maximum inference iterations passed to the agent
            (default: 10, the value that was previously hard-coded).

    Returns:
        None. All output is printed by run_session.
    """
    full_react_agent = ReActAgent(
        client=client,
        model=model_id,
        tools=["mcp::openshift"],
        response_format={
            "type": "json_schema",
            "json_schema": ReActOutput.model_json_schema(),
        },
        sampling_params=sampling_params,
        max_infer_iters=max_infer_iters
    )

    user_prompts = [
        f"""You are an expert OpenShift administrator. Your task is to analyze pod logs and events, If no errors found, return "No error found"
        Check namespace '{namespace}' for all errors and events. List each error with:
    - Namespace: {namespace}
    - Pod: [pod name]
    - Error Title: [error type/reason]
    - Error Description: [detailed error message]
    """
    ]
    run_session(full_react_agent, 'mcp-session', user_prompts)

# Namespace to inspect; passed by keyword to make the call self-describing.
namespace = "oom-test"
error_analysis(namespace=namespace)
