# AI Agent for Automated Incident Resolution

**Copyright (c) 2026 Shrikara Kaudambady. All rights reserved.**

This notebook simulates an AI agent that automates the resolution of a common IT incident: a crashed web server. The agent uses a set of tools to diagnose the problem, execute a fix, and document the resolution in an IT Service Management (ITSM) system.

### 1. Setup and Simulation Environment
First, we define the simulated external systems our AI agent will interact with: a server and an ITSM tool (e.g., ServiceNow or Jira).

In [None]:
import random
import time
from datetime import datetime

class SimulatedServer:
    """In-memory stand-in for a production server running one application process.

    Attributes set by the constructor:
        server_id: identifier supplied by the caller, echoed in console output.
        app_process_running: ``True`` while the simulated application is up.
        cpu_load: fixed at ``0.2``; no method in this class changes it.
    """

    def __init__(self, server_id):
        """Create a server whose application process starts out running."""
        self.server_id = server_id
        self.app_process_running, self.cpu_load = True, 0.2

    def get_state(self):
        """Return a new dict with the current process flag and CPU load."""
        return dict(
            app_process_running=self.app_process_running,
            cpu_load=self.cpu_load,
        )

    def crash_app(self):
        """Announce a crash on stdout and mark the application process as down."""
        announcement = f"\nEVENT: Application on {self.server_id} has crashed!"
        print(announcement)
        self.app_process_running = False

    def restart_app_process(self):
        """Mark the application process as running again.

        Progress is printed before and after the flag is flipped.

        Returns:
            The string ``"Success"``; this simulation has no failure path.
        """
        outcome = "Success"
        print(f"ACTION: Restarting application process on {self.server_id}...")
        self.app_process_running = True
        print("ACTION: Process restarted.")
        return outcome

class SimulatedITSM:
    """In-memory stand-in for an IT Service Management ticketing tool.

    Incidents are kept in ``self.incidents``, a dict keyed by incident ID
    (``"INC"`` followed by a running number that starts at 12345).
    """

    def __init__(self):
        """Start with no incidents and the ID counter at 12345."""
        self.incidents = {}
        self._incident_counter = 12345

    def create_incident(self, description, server_id):
        """Register a new incident with status ``'New'`` and return its ID.

        Args:
            description: free-text description stored on the record.
            server_id: identifier of the affected server, stored on the record.

        Returns:
            The generated incident ID string.
        """
        incident_id = f"INC{self._incident_counter}"
        record = {
            'description': description,
            'server_id': server_id,
            'status': 'New',
            'created_at': datetime.now(),
            'resolution_notes': '',
        }
        self.incidents[incident_id] = record
        self._incident_counter += 1
        print(f"ITSM: New incident {incident_id} created for {server_id}.")
        return incident_id

    def get_open_incidents(self):
        """Return a new dict of the incidents whose status is still ``'New'``.

        The values are the stored record dicts themselves, not copies.
        """
        still_new = {}
        for incident_id, record in self.incidents.items():
            if record['status'] == 'New':
                still_new[incident_id] = record
        return still_new

    def update_incident(self, incident_id, notes, status):
        """Overwrite the resolution notes and status of an existing incident.

        Returns:
            ``True`` when the incident exists and was updated, ``False`` when
            the ID is unknown (nothing is printed or changed in that case).
        """
        if incident_id not in self.incidents:
            return False
        self.incidents[incident_id].update(resolution_notes=notes, status=status)
        print(f"ITSM: Incident {incident_id} updated. Status: {status}.")
        return True

### 2. Define the Agent's Tools
These are the specific functions the AI agent can call. In a real system, these would make API calls or run SSH commands.

In [None]:
class AgentTools:
    """The set of actions the agent may invoke, bound to one server and one ITSM.

    Each tool prints a ``TOOL:`` line and then delegates to the wrapped
    ``server`` or ``itsm`` object.
    """

    def __init__(self, server, itsm):
        """Keep references to the server and ITSM objects the tools act on."""
        self.server, self.itsm = server, itsm

    def run_diagnostics(self, server_id):
        """Return a one-line report on whether the app process is running.

        ``server_id`` is used only in the printed message; the state is read
        from the server this instance was constructed with.
        """
        print(f"TOOL: Running diagnostics on {server_id}...")
        process_up = self.server.get_state()['app_process_running']
        return f"Diagnostic report: App process running = {process_up}."

    def execute_remediation_script(self, server_id, script_name):
        """Run a named remediation script and return its result string.

        Only ``'restart_web_server'`` is recognised; any other name yields
        ``"Failure: Unknown script."`` without touching the server.
        """
        print(f"TOOL: Executing script '{script_name}' on {server_id}...")
        if script_name != 'restart_web_server':
            return "Failure: Unknown script."
        return self.server.restart_app_process()

    def update_itsm_ticket(self, incident_id, notes, status):
        """Forward a ticket update to the ITSM and return whatever it returns."""
        print(f"TOOL: Updating ITSM ticket {incident_id}...")
        update_result = self.itsm.update_incident(incident_id, notes, status)
        return update_result

### 3. Implement the AI Incident Response Agent
This agent encapsulates the logic or "chain of thought" for resolving an incident. It uses its tools to perform actions.

In [None]:
class IncidentResponseAgent:
    """Agent that walks one incident through diagnose, remediate, verify, document.

    Every interaction with the outside world goes through ``tools``, which
    must provide ``run_diagnostics(server_id)``,
    ``execute_remediation_script(server_id, script_name)`` and
    ``update_itsm_ticket(incident_id, notes, status)``.
    """

    # Separator line that frames the console output of a single incident run.
    _BANNER = "======================================================"

    def __init__(self, tools):
        """Store the tool set and announce that the agent is ready."""
        self.tools = tools
        print("AGENT: Incident Response Agent is online.")

    def process_incident(self, incident_id, incident_details):
        """Attempt to resolve one incident automatically, escalating on failure.

        Args:
            incident_id: ticket identifier passed to the ITSM tool.
            incident_details: mapping with at least ``'description'`` and
                ``'server_id'`` keys.

        Returns:
            None. Progress is reported on stdout and the outcome is written
            to the ticket through ``tools.update_itsm_ticket``.
        """
        print(f"\n{self._BANNER}")
        print(f"AGENT: Starting work on incident {incident_id}: '{incident_details['description']}'")
        server_id = incident_details['server_id']

        self._resolve(incident_id, server_id)

        # Close the banner on every path so escalated runs are framed the
        # same way as successful ones.
        print(self._BANNER)

    def _resolve(self, incident_id, server_id):
        """Run the diagnose / act / verify / document steps for one incident."""
        # 1. Diagnose
        print("AGENT: Step 1 - Running diagnostics.")
        diag_report = self.tools.run_diagnostics(server_id)
        print(f"AGENT: Received report: {diag_report}")

        # Only a report that explicitly says the process is down is handled
        # automatically; anything else goes to a human.
        if 'App process running = False' not in diag_report:
            print("AGENT: Diagnosis unclear. Escalating to human.")
            self._update_ticket(incident_id, "Automated diagnosis failed.", "Escalated")
            return

        # 2. Plan & 3. Act
        print("AGENT: Step 2 - Diagnosis confirmed. Planning to restart process.")
        print("AGENT: Step 3 - Executing remediation.")
        result = self.tools.execute_remediation_script(server_id, 'restart_web_server')
        if result != "Success":
            print("AGENT: Remediation failed. Escalating to human.")
            self._update_ticket(incident_id, "Automated restart failed.", "Escalated")
            return

        # 4. Verify: re-run diagnostics instead of trusting the script result.
        print("AGENT: Step 4 - Verifying fix.")
        final_diag_report = self.tools.run_diagnostics(server_id)
        if 'App process running = True' not in final_diag_report:
            print("AGENT: Verification failed. Escalating.")
            self._update_ticket(incident_id, "Verification after restart failed.", "Escalated")
            return
        print("AGENT: Verification successful. System is healthy.")

        # 5. Document
        print("AGENT: Step 5 - Documenting resolution and closing ticket.")
        resolution_notes = f"Automated resolution by AI Agent at {datetime.now()}. Detected crashed app process. Executed restart script. Verified process is now running."
        # Only report completion when the ticket was actually updated.
        if self._update_ticket(incident_id, resolution_notes, 'Resolved'):
            print(f"AGENT: Work on incident {incident_id} is complete.")

    def _update_ticket(self, incident_id, notes, status):
        """Update the ticket and warn on stdout when the tool reports failure.

        Returns:
            ``True`` when the tool returned a truthy value, else ``False``.
        """
        updated = bool(self.tools.update_itsm_ticket(incident_id, notes, status))
        if not updated:
            print(f"AGENT: WARNING - ITSM ticket {incident_id} could not be updated (intended status: {status}).")
        return updated

### 4. Run the Full End-to-End Simulation
Now, let's tie everything together. We'll simulate a server crash, which will trigger the monitoring system to create an ITSM ticket. Then, the AI agent will pick up the ticket and resolve the incident.

In [None]:
# End-to-end demo: crash the server, raise a ticket, let the agent resolve it,
# then print the final state of the server and of every ticket.

# 1. Initialize all systems
server = SimulatedServer('web-prod-01')
itsm = SimulatedITSM()
tools = AgentTools(server, itsm)
agent = IncidentResponseAgent(tools)

# 2. Simulate a problem
server.crash_app()

# 3. Monitoring system detects the crash and creates an incident
# Bind the name up front so it exists even when no alert fires.
incident_id = None
if not server.get_state()['app_process_running']:
    incident_id = itsm.create_incident(
        description="Monitoring alert: Web server application is unresponsive.",
        server_id=server.server_id
    )

# 4. The AI agent's main loop finds the open incident and processes it
open_incidents = itsm.get_open_incidents()
for inc_id, inc_details in open_incidents.items():
    agent.process_incident(inc_id, inc_details)

print("\n--- FINAL STATE ---")
print(f"Server '{server.server_id}' State: {server.get_state()}")
# Report every ticket rather than relying on the conditionally created one,
# so this works with zero, one or many incidents.
for inc_id, inc_details in itsm.incidents.items():
    print(f"ITSM Incident '{inc_id}' State: {inc_details}")