In [None]:
! pip install -r requirements.txt


In [1]:
import os
import json
import time
from datetime import datetime
from typing import List, Dict, Any, Optional

from dotenv import load_dotenv
from llama_stack_client import Agent, LlamaStackClient
from llama_stack_client.lib.agents.react.agent import ReActAgent
from llama_stack_client.lib.agents.react.tool_parser import ReActOutput
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types import UserMessage
from rich import print
from termcolor import cprint


In [2]:
# Load environment variables
load_dotenv('config.env')

# Get configuration from environment
base_url = os.environ.get('LLAMA_STACK_URL')
model_id = os.environ.get('LLM_MODEL_ID')
temperature = float(os.environ.get("TEMPERATURE", 0.0))
max_tokens = int(os.environ.get("MAX_TOKENS", 4096))

# Surface missing settings here instead of letting them show up later as an
# opaque connection or agent-creation failure.
missing_settings = [
    setting_name
    for setting_name, setting_value in (("LLAMA_STACK_URL", base_url), ("LLM_MODEL_ID", model_id))
    if not setting_value
]
if missing_settings:
    print(f"⚠️  Missing configuration: {', '.join(missing_settings)} (check config.env)")

# Sampling parameters.
# Temperature belongs to the sampling *strategy*: a top-level "temperature"
# key next to a greedy strategy means the TEMPERATURE setting never takes
# effect. Use greedy decoding for temperature 0 and top-p sampling otherwise.
if temperature > 0.0:
    top_p = float(os.environ.get("TOP_P", 0.95))
    strategy = {"type": "top_p", "temperature": temperature, "top_p": top_p}
else:
    strategy = {"type": "greedy"}

sampling_params = {
    "strategy": strategy,
    "max_tokens": max_tokens,
}

print("Configuration loaded:")
print(f"  Llama Stack URL: {base_url}")
print(f"  Model ID: {model_id}")
print(f"  Temperature: {temperature}")
print(f"  Max Tokens: {max_tokens}")
print(f"  Sampling strategy: {strategy['type']}")


In [3]:
# Initialize Llama Stack client
client = LlamaStackClient(base_url=base_url)

# Listing the models is the first request this cell sends to the server, so
# only report a successful connection once that call has returned. Printing
# "Connected" before it would be misleading when the server is unreachable.
available_models = client.models.list()
print(f'Connected to Llama Stack server at {base_url}')
print('Available models:')
print(available_models)
print(f'Using model: {model_id}')

# Verify MCP tool groups are available
print("\nVerifying MCP tool groups:")
for toolgroup in ["mcp::openshift", "mcp::atlassian"]:
    try:
        tools = client.tools.list(toolgroup_id=toolgroup)
        print(f"✅ {toolgroup}: {len(tools)} tools available")
        for tool in tools[:3]:  # Show first 3 tools
            print(f"   - {tool.name}: {tool.description}")
    except Exception as e:
        # Diagnostic cell: report the failing tool group and keep checking the
        # remaining ones instead of aborting.
        print(f"❌ {toolgroup}: Error - {e}")


APIConnectionError: Connection error.

In [16]:
# Target of the monitoring workflow: which pods to inspect and where.
NAMESPACE = "oom-test"
POD_PATTERN = "oom-test-deployment-*"
CLUSTER = "08d"  # Update this to your cluster identifier

# Echo the chosen target so it is visible in the notebook output.
print("\nTarget Configuration:")
for target_label, target_value in (
    ("Namespace", NAMESPACE),
    ("Pod Pattern", POD_PATTERN),
    ("Cluster", CLUSTER),
):
    print(f"  {target_label}: {target_value}")


In [17]:
# Create the Simple Operations Agent
def create_operations_agent(namespace: Optional[str] = None, pod_pattern: Optional[str] = None):
    """Create a simple operations agent for pod monitoring and incident management.

    Args:
        namespace: Namespace the agent is told to inspect. Defaults to the
            notebook-level ``NAMESPACE`` constant.
        pod_pattern: Pod name pattern the agent is told to inspect. Defaults
            to the notebook-level ``POD_PATTERN`` constant.

    Returns:
        A ``ReActAgent`` built from the notebook-level ``client``, ``model_id``
        and ``sampling_params``, with the ``mcp::openshift`` and
        ``mcp::atlassian`` tool groups attached.
    """
    # Resolve defaults at call time so edits to the configuration cell are
    # picked up without redefining this function.
    namespace = NAMESPACE if namespace is None else namespace
    pod_pattern = POD_PATTERN if pod_pattern is None else pod_pattern

    # f-string: literal braces in the additional_fields example are doubled.
    instructions = f"""
    You are a simple operations agent that:
    1. Checks pod logs for the {pod_pattern} pod in the {namespace} namespace
    2. Analyzes any errors found in the logs
    3. Searches Confluence for solutions using the error details
    4. Creates Jira incidents if errors are found
    
    IMPORTANT JIRA CONFIGURATION:
    - Always use project key "KAN" for creating Jira incidents
    - Use issue type "Incident" or "Task" as appropriate
    - Include clear titles and detailed descriptions
    - For additional_fields parameter, pass a dictionary object, not a JSON string
    - Example: additional_fields={{"priority": {{"name": "High"}}, "labels": ["memory", "OOM"]}}
    
    IMPORTANT CONFLUENCE CONFIGURATION:
    - Always search in the "OCP" space for documentation
    - Look for troubleshooting guides, runbooks, and solutions
    - Use specific error messages as search terms
    
    IMPORTANT ERROR HANDLING:
    - If you get permission errors accessing OpenShift, report the permission issue and stop trying to access pods
    - If Jira incident creation fails more than 2 times, report the Jira issue and stop trying to create incidents
    - Always provide a clear summary of what was accomplished and what failed
    
    Be concise and focused on the specific pod and namespace provided.
    """

    # Create ReAct agent with required MCP tools
    agent = ReActAgent(
        client=client,
        model=model_id,
        instructions=instructions,
        tools=[
            "mcp::openshift",    # For pod monitoring and log analysis
            "mcp::atlassian"     # For Confluence search and Jira incidents
        ],
        # Constrain the model output to the ReAct JSON schema.
        response_format={
            "type": "json_schema",
            "json_schema": ReActOutput.model_json_schema(),
        },
        sampling_params=sampling_params,
    )

    return agent

# Create the operations agent
operations_agent = create_operations_agent()
print("✅ Simple Operations Agent created successfully")


In [18]:
# Function to run agent sessions
def run_agent_session(agent, session_name, user_prompt, stream=True):
    """Open a fresh session on ``agent`` and send ``user_prompt`` as one turn.

    Args:
        agent: Agent object exposing ``create_session`` and ``create_turn``.
        session_name: Name passed to ``agent.create_session``.
        user_prompt: Text sent as the single user message of the turn.
        stream: Forwarded to ``create_turn``. When true, the response is
            printed event by event through ``EventLogger``; otherwise the
            response object is printed as a whole.

    Returns:
        The session id returned by ``agent.create_session``.
    """
    banner = "=" * 80

    session_id = agent.create_session(session_name)
    print(f'Created new session: {session_name}')

    print("\n" + banner)
    cprint(f"Processing: {user_prompt}", "blue", attrs=['bold'])
    print(banner)

    turn = agent.create_turn(
        session_id=session_id,
        messages=[{"role": "user", "content": user_prompt}],
        stream=stream,
    )

    # Non-streaming: print the whole response in one go and finish.
    if not stream:
        print(turn)
        return session_id

    # Streaming: print each logged event as it is produced.
    for event in EventLogger().log(turn):
        event.print()
    return session_id


In [None]:
# Complete Workflow: Check Pod, Search Solutions, Create Incident
print("🚀 Complete Workflow: Pod Check → Confluence Search → Jira Incident")
print("="*80)

# Single comprehensive prompt that handles the entire workflow.
# Both the pod pattern and the namespace come from the configuration cell so
# that changing POD_PATTERN / NAMESPACE there is reflected in the prompt.
workflow_prompt = f"""
Check the pod logs for any pod matching the pattern '{POD_PATTERN}' in the '{NAMESPACE}' namespace.

If you find any errors in the logs:
1. Analyze the error details and categorize the issue
2. Search Confluence documentation in the "OCP" space for solutions using the specific error messages
3. Create a Jira incident in the "KAN" project with:
   - Clear title describing the issue
   - Detailed description including error messages and context
   - Priority level based on error severity
   - Resolution steps from Confluence search results
   - Use issue type "Bug" for errors or "Task" for general issues

If no errors are found, simply report that the pod logs are clean.

IMPORTANT: If you encounter permission errors or tool failures, report them clearly and stop retrying after 2 attempts.

Be concise and focused on the specific pod pattern and namespace.
"""

# Run the complete workflow
session_id = run_agent_session(
    operations_agent, 
    "complete-workflow-session", 
    workflow_prompt
)


In [12]:
# Step-by-step workflow.
#
# The three steps run as consecutive turns of ONE agent session. Steps 2 and 3
# refer to "the previous step" / "errors found", which only makes sense when
# the agent can see the earlier turns; opening a new session per step leaves
# it with no history to refer to.
step1_prompt = f"Check the logs for any pod matching '{POD_PATTERN}' in the '{NAMESPACE}' namespace and report any errors found."
step2_prompt = "If errors were found in the previous step, search Confluence in the 'OCP' space for solutions using the specific error messages from the oom-test-deployment pod logs."
step3_prompt = "Create a Jira incident in the 'KAN' project for any errors found in the oom-test-deployment pod, including the error details and any solutions found in Confluence. Use issue type 'Bug' for errors."

workflow_steps = [
    ("Step 1: Check Pod Logs", step1_prompt),
    ("Step 2: Search Confluence (if errors found)", step2_prompt),
    ("Step 3: Create Jira Incident (if errors found)", step3_prompt),
]

# One session shared by all steps.
session_id = operations_agent.create_session("step-by-step-session")
print("Created new session: step-by-step-session")

for step_title, step_prompt in workflow_steps:
    print(step_title)
    print("="*40)
    cprint(f"Processing: {step_prompt}", "blue", attrs=['bold'])

    step_response = operations_agent.create_turn(
        messages=[{"role": "user", "content": step_prompt}],
        session_id=session_id,
        stream=True,
    )
    # Stream the agent's events for this step to the cell output.
    for log in EventLogger().log(step_response):
        log.print()

print("\nAll steps above ran in a single session; edit workflow_steps to run only some of them.")

print("\n" + "="*80)
print("🔧 TROUBLESHOOTING WORKFLOW")
print("="*80)
print("If you're getting permission errors, try this simplified workflow:")

# Simplified workflow for troubleshooting (defined here, not run automatically)
troubleshoot_prompt = f"""
I'm getting permission errors when trying to access the '{NAMESPACE}' namespace. 
Please help me troubleshoot this by:

1. Reporting the specific permission error you encountered
2. Searching Confluence in the "OCP" space for documentation about OpenShift RBAC permissions
3. If you can't access pods, suggest what permissions need to be granted
4. Create a Jira incident in the 'KAN' project about the permission issue (if Jira is working)

Don't retry the same failed operations multiple times.
"""

print("Run this troubleshooting workflow:")
print("session_id = run_agent_session(operations_agent, 'troubleshoot-session', troubleshoot_prompt)")


[1m[34mProcessing: Check the logs for any pod matching 'oom-test-deployment-*' in the 'oom-test' namespace and report any errors found.[0m


[33minference> [0m[33m[0m[33m{
[0m[33m [0m[33m "[0m[33mthought[0m[33m":[0m[33m "[0m[33mI[0m[33m need[0m[33m to[0m[33m check[0m[33m the[0m[33m logs[0m[33m for[0m[33m any[0m[33m pod[0m[33m that[0m[33m matches[0m[33m '[0m[33moom[0m[33m-test[0m[33m-de[0m[33mployment[0m[33m-*[0m[33m'[0m[33m in[0m[33m the[0m[33m '[0m[33moom[0m[33m-test[0m[33m'[0m[33m namespace[0m[33m.[0m[33m I[0m[33m'll[0m[33m start[0m[33m by[0m[33m connecting[0m[33m to[0m[33m the[0m[33m Open[0m[33mShift[0m[33m cluster[0m[33m and[0m[33m listing[0m[33m all[0m[33m pods[0m[33m in[0m[33m that[0m[33m namespace[0m[33m.[0m[33m Once[0m[33m I[0m[33m have[0m[33m the[0m[33m pod[0m[33m names[0m[33m,[0m[33m I[0m[33m'll[0m[33m retrieve[0m[33m their[0m[33m logs[0m[33m to[0m[33m look[0m[33m for[0m[33m any[0m[33m errors[0m[33m or[0m[33m issues[0m[33m.[0m[33m If[0m[33m I[0m[33m find[0m[33m any

[1m[34mProcessing: If errors were found in the previous step, search Confluence in the 'OCP' space for solutions using the specific error messages from the oom-test-deployment pod logs.[0m


[33minference> [0m[33m[0m[33m{
[0m[33m [0m[33m "[0m[33mthought[0m[33m":[0m[33m "[0m[33mI[0m[33m need[0m[33m to[0m[33m search[0m[33m Con[0m[33mfluence[0m[33m in[0m[33m the[0m[33m '[0m[33mO[0m[33mCP[0m[33m'[0m[33m space[0m[33m for[0m[33m solutions[0m[33m related[0m[33m to[0m[33m the[0m[33m specific[0m[33m error[0m[33m messages[0m[33m found[0m[33m in[0m[33m the[0m[33m o[0m[33mom[0m[33m-test[0m[33m-de[0m[33mployment[0m[33m pod[0m[33m logs[0m[33m.[0m[33m I[0m[33m'll[0m[33m use[0m[33m the[0m[33m exact[0m[33m error[0m[33m messages[0m[33m as[0m[33m search[0m[33m terms[0m[33m to[0m[33m find[0m[33m relevant[0m[33m troubleshooting[0m[33m guides[0m[33m or[0m[33m run[0m[33mbooks[0m[33m.",
[0m[33m [0m[33m "[0m[33maction[0m[33m":[0m[33m {
[0m[33m   [0m[33m "[0m[33mtool[0m[33m_name[0m[33m":[0m[33m "[0m[33mCon[0m[33mfluence[0m[33m Search[0m[33m",
[0m[33m  

[1m[34mProcessing: Create a Jira incident in the 'KAN' project for any errors found in the oom-test-deployment pod, including the error details and any solutions found in Confluence. Use issue type 'Bug' for errors.[0m


[33minference> [0m[33m[0m[33m{
[0m[33m [0m[33m "[0m[33mthought[0m[33m":[0m[33m "[0m[33mI[0m[33m need[0m[33m to[0m[33m create[0m[33m a[0m[33m J[0m[33mira[0m[33m incident[0m[33m in[0m[33m the[0m[33m '[0m[33mK[0m[33mAN[0m[33m'[0m[33m project[0m[33m for[0m[33m any[0m[33m errors[0m[33m found[0m[33m in[0m[33m the[0m[33m o[0m[33mom[0m[33m-test[0m[33m-de[0m[33mployment[0m[33m pod[0m[33m.[0m[33m I[0m[33m'll[0m[33m use[0m[33m the[0m[33m issue[0m[33m type[0m[33m '[0m[33mBug[0m[33m'[0m[33m as[0m[33m specified[0m[33m.[0m[33m First[0m[33m,[0m[33m I[0m[33m'll[0m[33m check[0m[33m the[0m[33m pod[0m[33m logs[0m[33m in[0m[33m the[0m[33m o[0m[33mom[0m[33m-test[0m[33m namespace[0m[33m for[0m[33m the[0m[33m o[0m[33mom[0m[33m-test[0m[33m-de[0m[33mployment[0m[33m-*[0m[33m pods[0m[33m.[0m[33m If[0m[33m I[0m[33m find[0m[33m any[0m[33m errors[0m[33m,[0m[33m 

In [None]:
# Troubleshooting Information
# The lines are kept in a list so the advice can be inspected as data and is
# printed in exactly the order given.
# Note on the namespaced fix: the built-in "view" role is a ClusterRole, so the
# RoleBinding must reference it with --clusterrole (a RoleBinding may bind a
# ClusterRole within one namespace). --role=view would point at a namespaced
# Role called "view", which does not exist by default.
troubleshooting_lines = [
    "🚨 IMMEDIATE TROUBLESHOOTING",
    "="*80,
    "Based on the error you encountered, here are the issues and solutions:",
    "\n1. **OpenShift Permission Error**:",
    "   - Error: 'system:serviceaccount:llama-serve:ocp-mcp' cannot list pods in 'oom-test' namespace",
    "   - Solution: Grant the service account proper RBAC permissions",
    "   - Command: kubectl create clusterrolebinding ocp-mcp-pods --clusterrole=view --serviceaccount=llama-serve:ocp-mcp",
    "\n2. **Jira Tool Error**:",
    "   - Error: 'Error calling tool create_issue'",
    "   - Solution: Check Jira MCP server configuration and project keys",
    "   - Verify: Project key 'KAN' exists and is accessible",
    "\n3. **Agent Loop Issue**:",
    "   - Error: Max inference iterations reached",
    "   - Solution: The agent retried too many times due to tool failures",
    "   - Fix: Use the troubleshooting workflow above",
    "\n4. **Quick Fix Commands**:",
    "   # Fix OpenShift permissions:",
    "   kubectl create clusterrolebinding ocp-mcp-pods --clusterrole=view --serviceaccount=llama-serve:ocp-mcp",
    "   # Or for specific namespace:",
    "   kubectl create rolebinding ocp-mcp-pods --clusterrole=view --serviceaccount=llama-serve:ocp-mcp -n oom-test",
    "\n   # Check Jira project keys:",
    "   # Verify the project keys in your Jira instance match what the agent is trying to use",
]

for troubleshooting_line in troubleshooting_lines:
    print(troubleshooting_line)


In [None]:
# Jira troubleshooting: ask the agent to diagnose incident creation itself.
for header_line in (
    "🔧 JIRA TROUBLESHOOTING WORKFLOW",
    "="*80,
    "Let's diagnose the Jira incident creation issue...",
):
    print(header_line)

# Prompt that walks the agent through the Jira diagnosis.
# Plain string: it contains no placeholders to interpolate.
jira_troubleshoot_prompt = """
I'm having trouble creating Jira incidents. Please help me troubleshoot by:

1. First, try to list available Jira projects to see what project keys are available
2. If that works, try to create a simple test incident in the "KAN" project
3. If that fails, report the specific error message you get
4. Search Confluence in the "OCP" space for documentation about Jira integration and MCP setup
5. Provide recommendations for fixing the Jira incident creation issue

Don't retry the same failed operations multiple times. Just report what works and what doesn't.
"""

print("Running Jira troubleshooting...")
print("="*50)

# Send the prompt in its own session and keep the session id.
session_id = run_agent_session(
    agent=operations_agent,
    session_name="jira-troubleshoot-session",
    user_prompt=jira_troubleshoot_prompt,
)


In [None]:
# Quick check of which tools the Atlassian MCP tool group exposes.
print("🔍 QUICK JIRA TOOLS CHECK")
print("="*50)

print("Checking available Jira tools...")
try:
    # Every tool registered under the Atlassian tool group.
    jira_tools = client.tools.list(toolgroup_id="mcp::atlassian")
    print(f"✅ Found {len(jira_tools)} Atlassian tools:")
    for atlassian_tool in jira_tools:
        print(f"   - {atlassian_tool.name}: {atlassian_tool.description}")

    # Narrow the list down to tools whose name mentions "jira" (case-insensitive).
    jira_tools_only = list(
        filter(lambda candidate: 'jira' in candidate.name.lower(), jira_tools)
    )
    print(f"\n🎯 Jira-specific tools: {len(jira_tools_only)}")
    for jira_tool in jira_tools_only:
        print(f"   - {jira_tool.name}: {jira_tool.description}")

except Exception as e:
    # Diagnostic cell: report the failure rather than raising.
    print(f"❌ Error accessing Atlassian tools: {e}")

# Follow-up hints, printed whether or not the listing succeeded.
for hint_line in (
    "\n" + "="*50,
    "If you see Jira tools above, the issue might be with:",
    "1. Project key 'KAN' not existing in your Jira instance",
    "2. Authentication/permissions for the MCP service account",
    "3. Jira MCP server configuration",
    "4. Required fields missing in the incident creation request",
):
    print(hint_line)


In [None]:
# Summary of the Jira / Confluence settings the agent instructions rely on.
# Each entry is printed with its own print() call, in order.
for summary_line in (
    "🎯 UPDATED CONFIGURATION",
    "="*50,
    "The agent is now configured to use:",
    "\n📋 **Jira Configuration:**",
    "✅ Project Key: 'KAN'",
    "✅ Issue Types: 'Incident' or 'Task' as appropriate",
    "✅ All Jira incidents will be created in the KAN project",
    "\n📚 **Confluence Configuration:**",
    "✅ Space: 'OCP'",
    "✅ Search for troubleshooting guides, runbooks, and solutions",
    "✅ Use specific error messages as search terms",
    "\nIf you're still getting errors, check:",
    "1. Project key 'KAN' exists in your Jira instance",
    "2. Confluence space 'OCP' is accessible",
    "3. MCP service account has proper permissions",
    "4. Required fields are properly configured",
):
    print(summary_line)


In [None]:
# Jira Validation Error Fix
# `print` in this notebook is rich.print, which parses "[...]" sequences that
# start with a lowercase letter as console markup. Unescaped, the
# "[type=dict_type]" part of the error message is consumed as a tag and never
# displayed, so its opening bracket is backslash-escaped below.
jira_validation_fix_lines = [
    "🔧 JIRA VALIDATION ERROR FIX",
    "="*50,
    "The error you encountered is a common MCP tool issue:",
    "\n❌ **Problem**:",
    "   - additional_fields parameter expects a dictionary",
    "   - But it's receiving a JSON string instead",
    "   - Error: 'Input should be a valid dictionary \\[type=dict_type]'",
    "\n✅ **Solution**:",
    "   - Pass additional_fields as a dictionary object, not a JSON string",
    "   - Example: additional_fields={'priority': {'name': 'High'}, 'labels': ['memory', 'OOM']}",
    "   - NOT: additional_fields='{\"priority\": {\"name\": \"High\"}, \"labels\": [\"memory\", \"OOM\"]}'",
    "\n🎯 **Quick Test**:",
    "   Try creating a simple Jira incident with minimal fields first:",
    "   - project_key: 'KAN'",
    "   - summary: 'Test Issue'",
    "   - issue_type: 'Task'",
    "   - description: 'Test description'",
    "   - additional_fields: {}",
    "\nThen gradually add more fields as needed.",
]

for jira_validation_fix_line in jira_validation_fix_lines:
    print(jira_validation_fix_line)


In [None]:
# Minimal Jira smoke test: one incident with explicit parameters.
for header_line in (
    "🧪 SIMPLE JIRA TEST",
    "="*50,
    "Let's test Jira incident creation with the correct format...",
):
    print(header_line)

# Prompt spelling out every parameter of the test incident.
# Plain (non-f) string, so the literal "{}" is sent as written.
simple_jira_test_prompt = """
Create a simple test Jira incident in the KAN project with these exact parameters:
- project_key: "KAN"
- summary: "Test Incident - OOM Pod Issue"
- issue_type: "Task"
- description: "This is a test incident to verify Jira integration is working"
- additional_fields: {}

Use the correct dictionary format for additional_fields, not a JSON string.
"""

print("Running simple Jira test...")
print("="*30)

# Send the prompt in its own session and keep the session id.
session_id = run_agent_session(
    agent=operations_agent,
    session_name="simple-jira-test",
    user_prompt=simple_jira_test_prompt,
)


In [None]:
# Closing recap of what the agent does, the tool groups it uses and how to run it.
# Each entry is printed with its own print() call, in order.
for recap_line in (
    "✅ **Pod Monitoring**: Check logs for oom-test-deployment-* pods",
    "✅ **Error Analysis**: Identify and categorize any errors found",
    "✅ **Confluence Search**: Find solutions using error details",
    "✅ **Jira Incidents**: Create incidents with analysis and recommendations",
    "\nThe agent uses MCP tools:",
    "  - mcp::openshift: For pod monitoring and log retrieval",
    "  - mcp::atlassian: For Confluence search and Jira incident creation",
    "\nTo use this agent:",
    "1. Update the CLUSTER variable to match your environment",
    "2. Run the complete workflow cell above",
    "3. Or use the step-by-step approach for more control",
):
    print(recap_line)
