In [1]:
# Import libraries
#!pip install openpyxl --upgrade
import pandas as pd
import re
import ollama
import numpy as np

In [2]:
# Read the cleaned data (including the processed_text column) into a DataFrame.
df = pd.read_csv("clean_data_with_processed_text.csv")

# Store missing summaries as empty strings. A plain `.apply(str)` would turn
# NaN into the literal text "nan", which would then be sent to the LLM as if
# it were a real summary.
df['processed_text'] = df['processed_text'].fillna('').astype(str)

df.head()

Unnamed: 0,Case: Case Number,Case: Subject,Case: Description,Case: Product Name,Automation Item Name,Created Date,Body,Feedback,Detailed Feedback,Case: Status,Case: Client Escalated,processed_text
0,TS017525407,Email - We required high uninitialized Luns/ho...,We required high uninitialized Luns/hosts (Hig...,DS8900F,AI-1560218,2024-10-14,Customer Sentiment Not available Description S...,,good,Closed by IBM,0,customer sentiment available description summa...
1,TS017538910,SWITCH REBOOT,"2 reboot events ""Non restartable component (fd...",SAN b-type Collection,AI-1561615,2024-10-14,Customer Sentiment neutral Description Summary...,,ok,Closed by IBM,0,customer sentiment neutral description summary...
2,TS017505135,Email-LUN Total Latency is greater than or equ...,LUN Total Latency is greater than or equal to ...,FlashSystem 9200,AI-1562015,2024-10-14,Customer Sentiment Not available Description S...,,"Very useful feather to have , thanks",IBM is working,0,customer sentiment available description summa...
3,TS017519459,Expired Certificates (EXPIRED_CERTS),"Good afternoon, we recently received several w...",SAN b-type Collection,AI-1562168,2024-10-14,Customer Sentiment neutral Description Summary...,,ok,Closed by IBM,0,customer sentiment neutral description summary...
4,TS017504642,Email - Need to change the SFP fault,Need to change the SFP,SAN c-type Collection,AI-1562711,2024-10-14,Customer Sentiment neutral Description Summary...,,ok,Closed by IBM,0,customer sentiment neutral description summary...


In [3]:
from functools import wraps
import shelve
import hashlib

def cached(func, cache_path='llm_cache_CleanData'):
    """Memoize ``func`` on disk in a ``shelve`` database.

    Results are stored under the SHA-512 hex digest of the call arguments, so
    repeated calls with the same arguments are answered from disk, including
    after a kernel restart.

    The shelf is opened only for the duration of each lookup or store and is
    closed again right away. No handle is kept open between calls, which means
    the decorating cell can be re-run safely and every write is flushed when
    the ``with`` block exits.

    Args:
        func: The function to wrap. Its return value must be picklable.
        cache_path: File name (without extension) of the shelf to use.

    Returns:
        A wrapper with the same metadata as ``func`` that consults the
        on-disk cache before calling ``func``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Positional-only calls hash exactly as before (str(args)), so
        # entries already present in an existing cache file remain valid.
        key_material = str(args)
        if kwargs:
            # Sort so that keyword order does not change the key.
            key_material += str(sorted(kwargs.items()))
        h = hashlib.sha512(key_material.encode('utf-8')).hexdigest()

        with shelve.open(cache_path) as cache:
            if h in cache:
                return cache[h]

        # Cache miss: compute without holding the shelf open, since the
        # wrapped call may be slow. If func raises, nothing is stored.
        result = func(*args, **kwargs)
        with shelve.open(cache_path) as cache:
            cache[h] = result
        return result
    return wrapper

In [4]:
# Process case summaries with LLM
@cached
def get_RCA_from_LLM(context, prompt, model):
    """Ask an Ollama chat model for an RCA report on a single case.

    ``prompt`` (the instructions) is placed directly in front of ``context``
    (the case details) and the combined text is sent as one user message.
    The text content of the model's reply is returned.

    Note: the RCA must at least contain the root cause, analysis, resolution,
    and impact.
    """
    user_message = {'role': 'user', 'content': prompt + context}
    reply = ollama.chat(model=model, messages=[user_message])
    return reply['message']['content']

# Prompt template placed in front of every case context: one worked RCA example
# in JSON followed by the instructions for the model.
# The text is passed as an argument to the cached LLM call, so it is part of
# the cache key: editing it causes every case to be regenerated.
# Note that the trailing `\n` is an escape sequence in this non-raw string, so
# the prompt ends with two newline characters.
prompt = """
Here is an example of an RCA:
{
  "RCA": {
    "incident_id": "",
    "date": "",
    "problem": {
      "description": "Memory leak causing service degradation in Service C",
      "symptoms": "Service C experienced gradual performance degradation over 3 hours.",
      "detected_by": "Monitoring system - High memory usage alert"
    },
    "root_cause": {
      "description": "Improper memory management in the data processing module.",
      "related_services": ["Service A", "Service B"],
      "historical_pattern": "Similar memory leak issues found in Service A (Jan 2021) and Service B (Mar 2022)."
    },
    "resolution": {
      "description": "Applied memory optimization techniques, including better memory allocation strategies and garbage collection triggers.",
      "steps_taken": [
        "Identified the memory leak using memory profiler tools.",
        "Optimized memory usage in the data processing module.",
        "Tested the fix in a controlled environment."
      ],
      "time_to_resolve": "2 hours",
      "service_downtime": "3 hours",
      "service_restoration": "Full service restored after memory optimization."
    },
    "impact": {
      "description": "Service degradation for 3 hours, impacting 15% of users.",
      "business_impact": "Moderate performance issues, leading to delays in processing large files."
    },
    "preventive_actions": {
      "description": "Implemented proactive memory monitoring and automated garbage collection.",
      "actions": [
        "Added memory usage alerts in the monitoring system.",
        "Conducted code review to improve memory handling in related services."
      ],
      "future_risk": "High likelihood of similar issues in Service D due to similar code structure."
    }
  }
}
Assume the role of an SRE engineer and give me an RCA report based on the following summary.
Return the report in json format.
If you don't have the data or it's unclear return an empty string :\n
"""
# Generate an RCA report for a small sample of cases.
RCA = []
model = 'granite3-dense:8b-instruct-fp16'

# Number of cases sent to the LLM. The earlier `if i > 5: break` check ran
# after the append, so with the default 0..n-1 index it processed seven rows
# (labels 0..6). The same count is kept here, but stated explicitly.
MAX_CASES = 7


def _field_text(value):
    """Return a cell value as text, mapping missing (NaN/None) cells to ''.

    Plain string concatenation raises TypeError when a text column holds NaN,
    because pandas represents missing text cells as floats.
    """
    return '' if pd.isna(value) else str(value)


for _, row in df.head(MAX_CASES).iterrows():
    # For non-missing values this yields exactly the same text as before, so
    # previously cached LLM answers are still found.
    context = (
        f"ID: {_field_text(row['Case: Case Number'])}\n"
        f"Subject: {_field_text(row['Case: Subject'])}\n"
        f"Description: {_field_text(row['Case: Description'])}\n"
        f"Created Date: {_field_text(row['Created Date'])}\n"
        f"Summary: {_field_text(row['processed_text'])}\n"
    )
    print(context)
    RCA.append(get_RCA_from_LLM(context, prompt, model))

ID: TS017525407
Subject: Email - We required high uninitialized Luns/hosts (High workload) and any hardware issues observed at storage end
Description: We required high uninitialized Luns/hosts (High workload) and any hardware issues observed at storage end
Created Date: 2024-10-14
Summary: customer sentiment available description summary summarized email required high uninitialized high workload hardware issue observed storage end required high uninitialized high workload hardware issue observed storage end feed summary summary icici bank requested assistance high uninitialized potential hardware issue storage system ibm support requested information high uninitialized observed hardware issue icici bank provided time frame issue confirmed open event hardware issue time also requested information consuming high load specified time stamp ibm support recommended bandwidth study existing dark fiber link colo site configuration spectrum control monitoring software send report host workload

In [5]:
# Show one case side by side: the input summary and the RCA generated for it.
i = 0
original_summary = df.loc[i, 'processed_text']
generated_rca = RCA[i]
print('Original Summary: ', original_summary)
print('RCA:', generated_rca)

Original Summary:  customer sentiment available description summary summarized email required high uninitialized high workload hardware issue observed storage end required high uninitialized high workload hardware issue observed storage end feed summary summary icici bank requested assistance high uninitialized potential hardware issue storage system ibm support requested information high uninitialized observed hardware issue icici bank provided time frame issue confirmed open event hardware issue time also requested information consuming high load specified time stamp ibm support recommended bandwidth study existing dark fiber link colo site configuration spectrum control monitoring software send report host workload metric ibm expert lab team conduct bandwidth study configure spectrum control reporting ibm support also requested spectrum control report time frame hr storage system icici bank later provided spectrum control report two storage system requested analysis ibm support anal

In [6]:
# TODO (pending): classification of the data - have the LLM classify each item as RCA, Example, or To-do list

In [7]:
from qdrant_client import QdrantClient, models

# Connect to the Qdrant instance running locally.
client = QdrantClient(url="http://localhost:6333")

# Name of the collection that the later cells add to and query.
# The earlier name had stray braces ("{rca_collection_CleanData}"), which
# created a separate collection that nothing else used.
RCA_COLLECTION = "rca_collection_CleanData"

# Create the collection only when it is missing, so the cell can be re-run.
# The vector configuration comes from the client's FastEmbed settings because
# `client.add` / `client.query` embed the documents with FastEmbed and expect
# the matching named vector; a hand-written size (e.g. 100) would not match.
if not client.collection_exists(RCA_COLLECTION):
    client.create_collection(
        collection_name=RCA_COLLECTION,
        vectors_config=client.get_fastembed_vector_params(),
    )



True

In [8]:
# Store the generated RCA reports in the vector database.
# `idx` keeps the value returned by `client.add` (presumably the ids of the
# inserted documents - TODO confirm against the client documentation).
idx = client.add(documents=RCA, collection_name="rca_collection_CleanData")

In [9]:
# Semantic search over the stored RCA reports.

# Alternative query to try: "firewall issue"
query = "a faulty drive was detected"

search_result = client.query(collection_name="rca_collection_CleanData", query_text=query)

# Show the three best matches with their similarity scores.
for hit in search_result[:3]:
    print('score:', hit.score)
    print(hit.metadata['document'])

score: 0.8731602
{
  "RCA": {
    "incident_id": "TS017538910",
    "date": "2024-10-14",
    "problem": {
      "description": "Switch reboot due to non-restartable component (fdmi) dying",
      "symptoms": "Two reboot events occurred with the error message 'Non restartable component (fdmi (pid=1207)) died'",
      "detected_by": "Watsonx"
    },
    "root_cause": {
      "description": "Software watchdog detected termination of restartable daemon, causing the need for a system reboot and failover",
      "related_services": ["Switch"],
      "historical_pattern": "No similar incidents found in the past."
    },
    "resolution": {
      "description": "Restarted the system using the reboot command and initiated failover using the hafailover command",
      "steps_taken": [
        "Confirmed the solution using Watsonx.",
        "Generated a probable cause indicating software watchdog detected termination of restartable daemon.",
        "Recommended executing the reboot command and