In [9]:
import data_query as dq
import testing_process as tp
import importlib
import pandas as pd
importlib.reload(dq)
importlib.reload(tp)
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# TODO: find the best way to present the label / message / count / category / suggestion details when a record has multiple labels or messages

In [10]:
# Run tp.final_query through tp.db_connect and keep the result as `df`.
# NOTE(review): presumably a DataFrame, since it is passed straight to tp.process_symptom_info below — confirm in testing_process.
df = tp.db_connect(tp.final_query)

In [11]:
# 1. Sent to repair station with a single label and a single message
# 2. Sent to repair station with a single label and multiple messages

# 3. Sent to repair station, but no label corresponds to the failure description; label/message are extracted directly from symptom_info
# 4. Sent to repair station, but symptom_info is missing or is {}; the dict will be {failure_description: ""}
# 5. Sent to repair station, but the message is empty; the dict will be {failure_description: ""}
# 6. Not sent to repair station, and no label corresponds to the failure description; label/message are extracted directly from symptom_info
'''
Included Cases: 
1. Matched failure description and symptom_label, with single correspond message
2. Matched failure description and symptom_label, with multiple correspond message
3. Empty failure description, include every label-message from symptom_dict
4. No match failure description, include every label-message from symptom_dict
'''
# Build the per-record symptom information from the raw query result.
result_df = tp.process_symptom_info(df)

In [12]:
# Add cycle information to the processed symptoms; later cells read the
# 'label_cycle' and 'label_message_cycle' columns of this frame.
processed_result_df = tp.compute_label_cycles(result_df)

In [26]:
# Keep only the columns needed for the cycle analysis, restricted to records whose
# testing_date is <= '2025-03-31'.
# NOTE(review): the cutoff is compared as a string; if testing_date is a datetime
# column this means "up to midnight at the start of 2025-03-31" — confirm the intent.
new = processed_result_df[['serial_number', 'testing_date', 'repaired_date', 'label_cycle']]
# .copy() makes `new` an independent frame: a later cell adds and overwrites columns
# on it, which would otherwise be an assignment on a filtered slice of
# processed_result_df (pandas' view-vs-copy / SettingWithCopy case).
new = new[new['testing_date'] <= '2025-03-31'].copy()

In [35]:
# Imported in this cell because Counter is used here, while the notebook's only
# other import of it sits in a later cell (fresh-kernel Run All would hit NameError).
from collections import Counter

# Step 1: total number of occurrences of every label (sums the per-record counts)
total_counter = Counter()
for cycle in new["label_cycle"]:
    total_counter.update(cycle)

# Step 2: total occurrences of the 10 most frequent labels
top_labels = [label for label, _ in total_counter.most_common(10)]
top_10_counter = {label: total_counter[label] for label in top_labels}
top_10_total = sum(top_10_counter.values())

# Step 3: total occurrences over all labels
overall_total = sum(total_counter.values())

# Step 4: share of the top 10 (0.0 when there are no labels at all, instead of ZeroDivisionError)
percentage_top10 = top_10_total / overall_total if overall_total else 0.0

top_10_counter, overall_total, round(percentage_top10 * 100, 2)

({'hb16 pcie link speed and width check': 3166,
  'diorite ssh connection failed': 3048,
  'checking bmc boot readiness and get bmc ip failed': 2683,
  'dimm infos validation failure': 2542,
  'fru infos validation failure': 1290,
  'sdc first fail of test-completion-missing-error for retest': 1141,
  'diorite location validation failure': 886,
  'the sensor count of config_check_hw is not matched.': 872,
  'capture diorite console and attach to log failed': 773,
  'diorite pcie link speed and width check': 758},
 25633,
 66.94)

In [32]:
from collections import Counter

# 1. Label frequency across all serial numbers: each record contributes every one
#    of its label keys once (the per-label counts inside the dict are not used here)
all_label_counter = Counter(
    label
    for cycle in new["label_cycle"]
    for label in cycle.keys()
)

# 2. Take the top N labels as the "improvement targets"
top_labels = [name for name, _count in all_label_counter.most_common(10)]  # N can be adjusted, e.g. top 5 / top 10

# 3. "Post-improvement" count (assumes the top labels no longer repeat)
def adjusted_count(label_dict, fixed_labels):
    """Sum the counts in `label_dict`, capping every label in `fixed_labels` at 1.

    Parameters:
    - label_dict: mapping {label: count}.
    - fixed_labels: container of labels treated as fixed after a single occurrence.

    Returns:
    - The adjusted total; labels outside `fixed_labels` keep their full count.
    """
    return sum(
        min(freq, 1) if label in fixed_labels else freq
        for label, freq in label_dict.items()
    )

# 4. Cycle totals per record, before and after the assumed improvement
new["original_cycle"] = new["label_cycle"].apply(lambda counts: sum(counts.values()))
new["adjusted_cycle"] = new["label_cycle"].apply(
    lambda counts: adjusted_count(counts, top_labels)
)

# 5. Duration between testing and repair, in minutes
new["repaired_date"] = pd.to_datetime(new["repaired_date"])
new["testing_date"] = pd.to_datetime(new["testing_date"])
new["repair_duration"] = (
    (new["repaired_date"] - new["testing_date"]).dt.total_seconds().div(60)
)
avg_duration = new["repair_duration"].mean()

# 6. Per serial number: sum the cycle counts, then average those sums across serial numbers
cycles_per_sn = new.groupby("serial_number")[["original_cycle", "adjusted_cycle"]].sum()
original_cycle_avg = cycles_per_sn["original_cycle"].mean()
adjusted_cycle_avg = cycles_per_sn["adjusted_cycle"].mean()

# 7. Repair-time estimate: average cycles per serial number x average duration per record
original_time_avg = original_cycle_avg * avg_duration
adjusted_time_avg = adjusted_cycle_avg * avg_duration


In [33]:
# Display: average cycles per serial number (before, after), estimated repair time
# (before, after; minutes) and the average per-record duration (minutes).
original_cycle_avg,adjusted_cycle_avg,original_time_avg,adjusted_time_avg,avg_duration 

(np.float64(2.3670699048850308),
 np.float64(1.8828146643272694),
 np.float64(8143.584425762396),
 np.float64(6477.569650718586),
 np.float64(3440.3649883580147))

In [34]:
# Relative reduction of the estimated repair time after the assumed improvement.
1 - adjusted_time_avg/original_time_avg

np.float64(0.20458003355050125)

In [13]:
# Keyword to look for in the symptom labels
keyword_to_drop = "checking bmc"  # Change this to the actual keyword

def contains_keyword(symptom_dict, keyword):
    """Return True when `keyword` is a substring of at least one key of `symptom_dict`.

    Anything that is not a dict (NaN, None, str, ...) yields False.
    """
    if isinstance(symptom_dict, dict):
        for label in symptom_dict:
            if keyword in label:
                return True
    return False

# Select (keep) the rows whose symptom_dict has a label containing the keyword.
# Nothing is dropped here: the mask is not negated and processed_result_df is left unchanged.
check_bmc_df = processed_result_df[processed_result_df["symptom_dict"].apply(lambda x: contains_keyword(x, keyword_to_drop))]


In [6]:
# Keyword to look for in the symptom labels
keyword_to_drop = "rsync-error"  # Change this to the actual keyword

def contains_keyword(symptom_dict, keyword):
    """Return True when `keyword` is a substring of at least one key of `symptom_dict`.

    Anything that is not a dict (NaN, None, str, ...) yields False.
    """
    if isinstance(symptom_dict, dict):
        for label in symptom_dict:
            if keyword in label:
                return True
    return False

# Drop rows where the keyword appears in symptom_dict keys.
# Note: this rebinds processed_result_df to the filtered frame, so every later cell sees the reduced data.
processed_result_df = processed_result_df[~processed_result_df["symptom_dict"].apply(lambda x: contains_keyword(x, keyword_to_drop))]

In [7]:
# Load the debug tree; the sheet has a two-row header that is replaced by flat names
debug_df = pd.read_excel('debug_tree.xlsx', header=[0, 1])
debug_df.columns = [
    "symptom_label", "symptom_message",
    "cycle1_cat", "cycle1_suggestion",
    "cycle2_cat", "cycle2_suggestion",
    "cycle3+_cat", "cycle3+_suggestion",
]

# Normalise both key columns: labels are forward-filled first, then both are
# stripped and lower-cased (messages are cast to str beforehand)
debug_df = debug_df.assign(
    symptom_label=lambda frame: frame["symptom_label"].ffill().str.strip().str.lower(),
    symptom_message=lambda frame: frame["symptom_message"].astype(str).str.strip().str.lower(),
)

# Distinct (label, message) pairs and distinct labels in the debug tree
unique_pairs_set = set(zip(debug_df["symptom_label"], debug_df["symptom_message"]))
len(unique_pairs_set), len(set(debug_df["symptom_label"]))

(367, 180)

In [8]:
# Partition the records by the number of labels, then the single-label records
# by the number of (label, message) pairs
label_counts = processed_result_df['label_cycle'].apply(len)
single_label = processed_result_df[label_counts == 1]
multi_label = processed_result_df[label_counts > 1]  # cases of no failure description

pair_counts = single_label['label_message_cycle'].apply(len)
single_label_message = single_label[pair_counts == 1]
single_label_multi_message = single_label[pair_counts > 1]

In [9]:

def _pcie_threshold_patterns(devpath_fragments, error_type):
    """Build one anchored 'pcie error ... exceeding thresholds' regex per devpath fragment.

    Parameters:
    - devpath_fragments: regex fragments placed right after '/phys/'.
    - error_type: literal error-type name expected between the single quotes.
    """
    return [
        rf"^pcie serial number: .+?, location: [\w:.]+, devpath: /phys/{fragment}.*? pcie error, "
        rf"error types: '{error_type}' are exceeding thresholds\."
        for fragment in devpath_fragments
    ]


# Devpath fragments shared by the PCIe pattern families below
_PE_DEVPATHS = [rf"pe{pe}/io\d+" for pe in range(0, 8)]  # pe0 .. pe7
_CPU_DEVPATHS = [f"cpu{cpu}" for cpu in range(0, 2)]  # cpu0 and cpu1

hb16_patterns = [
    rf"device not found\.\s*can't validate pcie info for 'pe{pe}.io0'"
    for pe in range(2, 6)  # pe2 .. pe5
]

pcie_pelane_patterns = _pcie_threshold_patterns(_PE_DEVPATHS, "lane_error")
pcie_cpulane_patterns = _pcie_threshold_patterns(_CPU_DEVPATHS, "lane_error")

pcie_pereceiver_patterns = _pcie_threshold_patterns(_PE_DEVPATHS, "receiver_error")
pcie_cpureceiver_patterns = _pcie_threshold_patterns(_CPU_DEVPATHS, "receiver_error")

pcie_pereplay_patterns = _pcie_threshold_patterns(_PE_DEVPATHS, "replay_timer_timeout")
pcie_cpureplay_patterns = _pcie_threshold_patterns(_CPU_DEVPATHS, "replay_timer_timeout")

sensor_fan_tach_patterns = [
    rf"^got \d+ violations, exceeding threshold of \d+ for sensor fan{fan} tach\."
    for fan in range(0, 6)  # fan0 .. fan5
]

# Label -> message regex(es). Every regex is written as a raw string so that
# sequences such as \d, \s and \. reach the regex engine untouched instead of
# relying on Python keeping invalid string escapes (a warning on current Pythons).
# NOTE(review): the keys are also used as regexes against labels by
# single_label_message_debug_with_pattern_matching; keys containing regex
# metacharacters (e.g. the parentheses in "configuration check (hardware) failed")
# therefore do not match their own literal text — confirm whether that is intended.
match_criteria = {
    'memtester-insufficient-memory': r"available memory: \d+, minimum required memory: \d+",
    "capture diorite console and attach to log failed":  r"^caught .*? exception",
    "sata fio write too slow": [r"sata fio /dev/sda (read|write) bw too slow,? cur: \d+, exp: \d+"],  # Label regex
    "sata fio log check": [r'(?s)(?=.*"fio version")(?=.*"timestamp")(?=.*"timestamp_ms")(?=.*"time")'],  # Message regex
    "command execution failed date": [r"command execution failed: date"], # Label regex
    "command execution failed date message": [r"exception message"], # Message regex
    "dimm infos validation failure": r"component dimm sn: .*? not found on unit",
    "hb16 pcie link speed and width check": hb16_patterns,
    "diorite location validation failure": [r"component diorite sn: .*? not found on unit",r"component diorite sn \S+ location: \S+, expected location: \S+"],
    "pcierrors-high-lane_error-rate": pcie_pelane_patterns + pcie_cpulane_patterns,
    "pcierrors-high-receiver_error-rate": pcie_pereceiver_patterns + pcie_cpureceiver_patterns,
    "pcierrors-high-replay_timer_timeout-rate": pcie_pereplay_patterns + pcie_cpureplay_patterns,
    "lower critical limit violated": sensor_fan_tach_patterns,
    "hb16 location validation failure": [r"component hb16 sn: .*? not found on unit",r"component hb16 sn \S+ location: \S+, expected location: \S+"],
    "error-monitor-procedural-error": r"^Test failed:.*?INTERNAL:",
    "diorite part number": r"value\s*=\s*['\"]([^'\"]+)['\"]\s*lower_limit\s*=\s*['\"]([^'\"]*)['\"]\s*upper_limit\s*=\s*['\"]([^'\"]*)['\"]",
    # The \n below is now a regex escape rather than a literal newline character; both match a newline.
    r"send command fail:\s*stressapptest": r"exception message:\s*\ntraceback",
    "diorite ssh connection failed": r"create diorite ssh connection failed\.\s*can not connect to diorite ip: ",
    "cpu location validation failure": [r"component cpu sn: .*? not found on unit",r"component cpu sn \S+ location: \S+, expected location: \S+"],
    "timeout exceeded.": r"thread .*? execution exceeded the timeout limit of .*? seconds",
    "attach booting logs to otc failed": r"attach booting logs to otc failed.\nmsg:",
    "caught oserror exception.": r"traceback \(most recent call last\):\n.*",
    "dimm location validation failure": r"component dimm sn .+? location: .+?, expected location: .+?",
    "io adapter location validation failure": r"component io adapter sn .+? location: .+?, expected location: .+?",
    "gssd location validation failure": [r"component gssd sn: .*? not found on unit",r"component gssd sn \S+ location: \S+, expected location: \S+"],
    "cssd infos validation failure": r"component cssd sn: .*? not found on unit",
    "configuration check (hardware) failed": r"parse response failure. response:",
    "configuration check (software) failed": r"send command",
    "bad-bus": r"for local bus.*?/phys/cpu\d+.*?expected speed:.*?, actual speed:"
}
# Ensure all match_criteria values are lists of patterns
match_criteria = {key: [value] if isinstance(value, str) else value for key, value in match_criteria.items()}

In [10]:
# Flatten every symptom_dict into one record per (label, message):
# non-dict cells are skipped and a bare string message is treated as a one-item list
merged_symptoms_list = [
    {"symptom_label": label, "symptom_message": message}
    for symptom_data in processed_result_df["symptom_dict"]
    if isinstance(symptom_data, dict)
    for label, messages in symptom_data.items()
    for message in ([messages] if isinstance(messages, str) else messages)
]

# De-duplicated table of the (label, message) pairs seen in the data
merged_symptoms_df = (
    pd.DataFrame(merged_symptoms_list)
    .drop_duplicates()
    .reset_index(drop=True)
)


In [11]:

# Build {symptom_label: set of symptom_message} from the debug tree with groupby() + to_dict();
# the matching cells below use it for exact label and (label, message) lookups.
debug_dict = debug_df.groupby("symptom_label")["symptom_message"].apply(set).to_dict()


In [12]:
import time
import re

# Step 1: compile every pattern once, case-insensitively
compiled_match_criteria = {
    name: [re.compile(expr, re.IGNORECASE) for expr in exprs]
    for name, exprs in match_criteria.items()
}

# Step 2: row indices recorded by fast_full_match as skipped / too slow
slow_indices = list()

# Messages longer than this take the truncated path in fast_full_match
MAX_LENGTH = 500

# Step 3: Define regex match function
def regex_match(text, compiled_patterns, timeout=1.0, max_length=500):
    """Return True if `text` matches any precompiled pattern in `compiled_patterns`.

    Parameters:
    - text: string to test. Anything that is not a str (NaN, None, numbers, ...)
      is treated as "no match" instead of raising.
    - compiled_patterns: dict mapping a key to a list of compiled regex patterns;
      only the pattern lists are used, the keys are ignored.
    - timeout: overall time budget in seconds. It is checked between patterns, so
      it stops the scan early but cannot interrupt a single slow `search` call.
    - max_length: only the first `max_length` characters of `text` are searched
      (default 500, the previously hard-coded limit); None disables truncation.

    Returns:
    - True on the first pattern that matches, otherwise False (also when the time
      budget runs out before a match is found).
    """
    if not isinstance(text, str):
        return False

    # Slicing is a no-op for shorter strings, so no length check is needed
    snippet = text[:max_length]

    deadline = time.monotonic() + timeout
    for patterns in compiled_patterns.values():
        for pattern in patterns:
            if time.monotonic() > deadline:
                return False  # Out of budget: give up on the remaining patterns
            if pattern.search(snippet):
                return True

    return False

# Step 4: a label is flagged as matched when it is a debug-tree label
# or when any compiled pattern matches it
merged_symptoms_df["label_match_flag"] = merged_symptoms_df["symptom_label"].map(
    lambda symptom_label: symptom_label in debug_dict
    or regex_match(symptom_label, compiled_match_criteria)
)

# Step 5: Compute full match (Skip slow rows)
def fast_full_match(row, timeout=1.0):
    """Return True when the row's label and message both match the debug tree.

    A row whose `label_match_flag` is falsy never matches. Otherwise the message
    matches either exactly (it is in debug_dict[label]) or through
    `regex_match` against `compiled_match_criteria`.

    Side effect: appends `row.name` to the module-level `slow_indices` when a
    message longer than MAX_LENGTH has no regex match on its first 500
    characters, or when an unsuccessful regex scan took longer than `timeout`.
    """
    label, message = row["symptom_label"], row["symptom_message"]

    if not row["label_match_flag"]:
        return False  # No match if label itself is unmatched

    # Exact message match against the debug tree
    if label in debug_dict and message in debug_dict[label]:
        return True

    # Long message: only its first 500 characters are tried
    if len(message) > MAX_LENGTH:
        if regex_match(message[:500], compiled_match_criteria, timeout=timeout):
            return True
        slow_indices.append(row.name)
        return False

    # Regular message: regex scan on the whole text, timed
    started_at = time.time()
    if regex_match(message, compiled_match_criteria, timeout=timeout):
        return True

    if time.time() - started_at > timeout:
        slow_indices.append(row.name)  # Record the row that exhausted the budget
    return False

# Step 6: Apply fast matching with timeout (row by row; rows recorded as skipped end up in slow_indices)
merged_symptoms_df["full_match_flag"] = merged_symptoms_df.apply(lambda row: fast_full_match(row, timeout=1.0), axis=1)



In [13]:
# Step 1: compile every pattern once, case-insensitively
compiled_match_criteria = {
    name: [re.compile(expr, re.IGNORECASE) for expr in exprs]
    for name, exprs in match_criteria.items()
}

# Step 2: lookup structures built from merged_symptoms_df:
# the set of labels seen in the data and {label: set of messages}
merged_label_set = set(merged_symptoms_df["symptom_label"])
messages_by_label = merged_symptoms_df.groupby("symptom_label")["symptom_message"]
merged_message_dict = messages_by_label.apply(set).to_dict()

# Step 3: Define regex match function
def regex_match(text, compiled_patterns, timeout=1.0, max_length=500):
    """Return True if `text` matches any precompiled pattern in `compiled_patterns`.

    Parameters:
    - text: string to test. Anything that is not a str (NaN, None, numbers, ...)
      is treated as "no match" instead of raising.
    - compiled_patterns: dict mapping a key to a list of compiled regex patterns;
      only the pattern lists are used, the keys are ignored.
    - timeout: overall time budget in seconds. It is checked between patterns, so
      it stops the scan early but cannot interrupt a single slow `search` call.
    - max_length: only the first `max_length` characters of `text` are searched
      (default 500, the previously hard-coded limit); None disables truncation.

    Returns:
    - True on the first pattern that matches, otherwise False (also when the time
      budget runs out before a match is found).
    """
    if not isinstance(text, str):
        return False

    # Slicing is a no-op for shorter strings, so no length check is needed
    snippet = text[:max_length]

    deadline = time.monotonic() + timeout
    for patterns in compiled_patterns.values():
        for pattern in patterns:
            if time.monotonic() > deadline:
                return False  # Out of budget: give up on the remaining patterns
            if pattern.search(snippet):
                return True

    return False

# Step 4: Compute label and full match flags for debug_df
def check_label_match(label):
    """Tell whether a debug-tree label is seen in the data, literally or through a regex.

    True when `label` is in `merged_label_set`; otherwise the result of
    `regex_match` on the label with `compiled_match_criteria`.
    """
    if label in merged_label_set:
        return True
    return regex_match(label, compiled_match_criteria)

def check_full_match(row):
    """Tell whether a debug-tree row's label and message are both found in the data.

    An exact (label, message) hit in `merged_message_dict` wins. Failing that,
    the message is tried against the compiled patterns, but only for rows whose
    `label_match_flag` is truthy.
    """
    label = row["symptom_label"]
    message = row["symptom_message"]

    # Exact label-message pair present in merged_symptoms_df
    known_messages = merged_message_dict.get(label) if label in merged_message_dict else None
    if known_messages is not None and message in known_messages:
        return True

    # Regex fallback is reserved for rows whose label already matched
    if not row["label_match_flag"]:
        return False
    return regex_match(message, compiled_match_criteria)

# Step 5: work on a copy so debug_df itself stays untouched
solved_debug_df = debug_df.copy()

# Step 6: flag every debug-tree row, then export the flags
solved_debug_df["label_match_flag"] = solved_debug_df["symptom_label"].apply(check_label_match)
solved_debug_df["full_match_flag"] = solved_debug_df.apply(check_full_match, axis=1)

export_columns = ["symptom_label", "symptom_message", "label_match_flag", "full_match_flag"]
solved_debug_df[export_columns].to_csv('solved_debug_df.csv', index=False)


In [14]:
# Share of (label, message) pairs that are fully matched, and share of distinct labels that are matched
label_matched = merged_symptoms_df['label_match_flag'] == True
fully_matched = label_matched & (merged_symptoms_df['full_match_flag'] == True)

print(
    "solved percentage for fully match from debug tree: ",
    merged_symptoms_df[fully_matched].shape[0] / merged_symptoms_df.shape[0],
)
print(
    "found label: ",
    len(merged_symptoms_df[label_matched]["symptom_label"].unique())
    / len(merged_symptoms_df["symptom_label"].unique()),
)

solved percentage for fully match from debug tree:  0.8046396887897908
found label:  0.5521172638436482


In [15]:
# Display the (label, message) table together with both match flags.
merged_symptoms_df

Unnamed: 0,symptom_label,symptom_message,label_match_flag,full_match_flag
0,ncsi_cable-preparation-error,failed to detect ncsi cable on diorite cn,True,True
1,the sensor count of config_check_hw is not mat...,"total number of sensors expected: 185, current...",True,False
2,check dut connections failed,check symptom message of each step below.,False,False
3,hb16 pcie link speed and width check failed,device not found.\ncan't validate pcie info fo...,True,True
4,dimm infos validation failure,component dimm sn: (l)64gb 2rx4 pc5-4800b-ra0-...,True,True
...,...,...,...,...
21074,diorite ssh connection failed,create diorite ssh connection failed.\ncan not...,True,True
21075,dimm infos validation failure,component dimm sn: (l)64gb 2rx4 pc5-5600b-ra0-...,True,True
21076,diorite location validation failure,component diorite sn: pdfcth250413850 not foun...,True,True
21077,diorite ssh connection failed,create diorite ssh connection failed.\ncan not...,True,True


In [16]:
# Labels that are not on the debug tree: count the distinct messages per label.
# Open question: which of these should be deleted, and which added to the tree?
unmatched_label_mask = merged_symptoms_df['label_match_flag'] == False
not_match_df = merged_symptoms_df[unmatched_label_mask]
label_message_count = (
    not_match_df
    .groupby("symptom_label")["symptom_message"]
    .nunique()
    .reset_index()
)
print(not_match_df.shape)
label_message_count.sort_values(by=['symptom_message'], ascending=False)


(912, 4)


Unnamed: 0,symptom_label,symptom_message
250,sensor-monitor-runtime-too-short,128
248,sdc first fail of test-completion-missing-erro...,81
42,diorite sanity check failed,69
246,sat-tolerable-dimm,24
24,cssd,24
...,...,...
266,the gulp it's not functional,1
264,the count of gulp fru is not 1,1
263,the cable pcle it's not functional,1
5,ampttkv30-timeout,1


In [17]:
# Labels that are on the debug tree but whose message is not: distinct messages per label, top 10
label_only_mask = (merged_symptoms_df['label_match_flag'] == True) & (merged_symptoms_df['full_match_flag'] == False)
not_match_message_df = merged_symptoms_df[label_only_mask]
print(not_match_message_df.shape)
label_message_count = (
    not_match_message_df
    .groupby("symptom_label")["symptom_message"]
    .nunique()
    .reset_index()
)
print(label_message_count.shape)
label_message_count.sort_values(by=['symptom_message'], ascending=False).head(10)

(3206, 4)
(89, 2)


Unnamed: 0,symptom_label,symptom_message
33,fru infos validation failure,1975
25,excessive-uncorrectable-core-cpu-errors,87
47,pcierrors-high-surprise_down_error-rate,81
41,pcierrors-high-bad_tlp-rate,76
4,bad-bus,74
34,gssd location validation failure,63
36,lower critical limit violated,61
75,the sensor count of config_check_hw is not mat...,58
79,unknown-pcie-location,48
30,fan_speed-internal-error,47


In [235]:
# Ad-hoc inspection: rows with a matched label containing 'hb16' whose message did not match.
aa = not_match_message_df[not_match_message_df['symptom_label'].str.contains('hb16')]
# Print the second such message in full (the table display may truncate long text).
text = aa.iloc[1].symptom_message
print(text)
aa

pcie device existence on 'pe3.io0'


Unnamed: 0,symptom_label,symptom_message,label_match_flag,full_match_flag
8441,hb16 pcie link speed and width check,pcie device existence on 'pe4.io0',True,False
8442,hb16 pcie link speed and width check,pcie device existence on 'pe3.io0',True,False
8443,hb16 pcie link speed and width check,pcie device existence on 'pe2.io0',True,False
8444,hb16 pcie link speed and width check,pcie device existence on 'pe5.io0',True,False
8992,hb16 pcie link speed and width check,pcie device link width at /phys/pe2/io0:device...,True,False
8993,hb16 pcie link speed and width check,pcie device link width at /phys/pe5/io0:device...,True,False
8994,hb16 pcie link speed and width check,pcie device link width at /phys/pe3/io0:device...,True,False
9064,hb16 pcie link speed and width check,pcie device link width at /phys/pe4/io0:device...,True,False
9065,hb16 pcie link speed and width check,pcie device link speed at /phys/pe3/io0:device...,True,False
10045,hb16 pcie link speed and width check,pcie device link speed at /phys/pe2/io0:device...,True,False


## Deal with single label-message combination

In [18]:
# Run the tp helper on the single-label / single-message records against the debug tree.
# A later cell reads 'Category', 'Suggestion' and 'Message_Cycle_Count' from the result.
singel_label_message_debug_result = tp.singel_label_message_debug(single_label_message, debug_df)

In [19]:
# (rows, columns) of the debug result.
singel_label_message_debug_result.shape


(52913, 21)

In [69]:
def single_label_message_debug_with_pattern_matching(processed_result_df, debug_df, match_dict):
    """
    Annotate each record with debug-tree category/suggestion and running cycle counts.

    For every (label, message) key of a record's `label_message_cycle`:
    1. The label is matched against the keys of `match_dict` (case-insensitive
       literal equality, or regex search with the key as the pattern). Labels
       without a match are skipped.
    2. Running counts are kept per serial_number, in row order, so the first
       occurrence of a label / (label, message) gets 1, the next 2, and so on.
    3. Category and suggestion come from `debug_df`: from the exact
       (label, message) row, selected by the message cycle count; otherwise from
       every row of that label, selected by the label cycle count. Cycle 3 and
       above use the "cycle3+_*" columns.

    Parameters:
    - processed_result_df: DataFrame with `serial_number` and `label_message_cycle`
      (dict keyed by (label, message) tuples).
    - debug_df: DataFrame with symptom_label, symptom_message, and the
      cycle1/cycle2/cycle3+ `_cat` / `_suggestion` columns.
    - match_dict: Dictionary whose keys are label names / label regexes; its values
      are not used here.

    Returns:
    - A copy of `processed_result_df` with new columns:
      - Category (dict: message -> category)
      - Suggestion (dict: message -> suggestion)
      - Label_Cycle_Count (int)
      - Message_Cycle_Count (int)
      Records without any matched label get empty dicts and counts of 1. When a
      record has several matched pairs, the counts of the last one are reported.
    """

    def _label_matches(pattern_label, label):
        # Literal comparison first: keys such as "configuration check (hardware) failed"
        # contain regex metacharacters and cannot match their own text as a regex.
        if pattern_label.lower() == label.lower():
            return True
        return re.search(pattern_label, label, re.IGNORECASE) is not None

    def _cycle_suffix(count):
        # Column-name fragment for a cycle count: 1, 2, or "3+" from the third cycle on
        return "3+" if count >= 3 else count

    processed_result_df_copy = processed_result_df.copy()

    # Step 1: index debug_df once, both by exact (label, message) pair and by label.
    # The pair index is keyed by tuple because that is how it is queried below.
    exact_lookup = {}
    rows_by_label = {}
    for _, debug_row in debug_df.iterrows():
        debug_label = debug_row["symptom_label"]
        exact_lookup[(debug_label, debug_row["symptom_message"])] = debug_row
        rows_by_label.setdefault(debug_label, []).append(debug_row)

    # Step 2: running counts per serial_number
    serial_label_count = {}  # {serial_number: {label: count}}
    serial_label_message_count = {}  # {serial_number: {(label, message): count}}

    # New columns to be added
    categories, suggestions = [], []
    label_cycle_counts, message_cycle_counts = [], []

    # Step 3: walk the records in order
    for _, row in processed_result_df_copy.iterrows():
        serial_number = row["serial_number"]
        label_message_cycle = row["label_message_cycle"]  # {(label, message): count}

        label_counts = serial_label_count.setdefault(serial_number, {})
        message_counts = serial_label_message_count.setdefault(serial_number, {})

        category_dict, suggestion_dict = {}, {}

        # Reported as-is when no label of this record matches
        label_count = 1
        message_count = 1

        for label, message in label_message_cycle:
            # Step 4: first key of match_dict that matches the label
            matched_label = next(
                (candidate for candidate in match_dict if _label_matches(candidate, label)),
                None,
            )
            if not matched_label:
                continue  # No match, skip

            # Step 5: next cycle number = highest previous count of a matching label + 1
            previous_label_counts = [
                count for prev_label, count in label_counts.items()
                if _label_matches(matched_label, prev_label)
            ]
            label_count = max(previous_label_counts) + 1 if previous_label_counts else 1

            previous_message_counts = [
                count for (prev_label, prev_message), count in message_counts.items()
                if prev_message == message  # Message must be identical
                and _label_matches(matched_label, prev_label)
            ]
            message_count = max(previous_message_counts) + 1 if previous_message_counts else 1

            # Step 6: store updated counts
            label_counts[label] = label_count
            message_counts[(label, message)] = message_count

            # Step 7: category & suggestion for the current cycle
            exact_row = exact_lookup.get((label, message))
            if exact_row is not None:
                suffix = _cycle_suffix(message_count)
                category_dict[message] = exact_row.get(f"cycle{suffix}_cat", "")
                suggestion_dict[message] = exact_row.get(f"cycle{suffix}_suggestion", "")
            elif label in rows_by_label:
                # Label known but message not: report every message listed for the label
                suffix = _cycle_suffix(label_count)
                for label_row in rows_by_label[label]:
                    msg_text = label_row["symptom_message"]
                    category_dict[msg_text] = label_row.get(f"cycle{suffix}_cat", "")
                    suggestion_dict[msg_text] = label_row.get(f"cycle{suffix}_suggestion", "")

        # Step 8: collect the results for this record
        categories.append(category_dict)
        suggestions.append(suggestion_dict)
        label_cycle_counts.append(label_count)
        message_cycle_counts.append(message_count)

    # Step 9: attach the new columns
    processed_result_df_copy["Category"] = categories
    processed_result_df_copy["Suggestion"] = suggestions
    processed_result_df_copy["Label_Cycle_Count"] = label_cycle_counts
    processed_result_df_copy["Message_Cycle_Count"] = message_cycle_counts

    return processed_result_df_copy

# Trial run on the first 5000 single-label / single-message records only.
x = single_label_message_debug_with_pattern_matching(single_label_message[:5000], debug_df, match_criteria)

In [70]:
# Display the trial result.
x

Unnamed: 0,model_name,build_type,skuno,serial_number,station,repaired_date,symptom_info,failure_description,repair_code,repaired_description,...,no_match_flag,empty_symptom_flag,empty_message_flag,isRepair,label_cycle,label_message_cycle,Category,Suggestion,Label_Cycle_Count,Message_Cycle_Count
0,ASTORIA,GA,1154500,FWI2345-00007,AST Functional Test,2023-11-15 15:58:55,{'PassSymRunKey_be8307a8-0f5e-4466-b2f6-8c4396...,ncsi_cable-preparation-error,REPLACE- MOBO,reseat ncsi; fail; replace ncsi; fail; reseat ...,...,False,False,False,True,{'ncsi_cable-preparation-error': 1},"{('ncsi_cable-preparation-error', 'failed to d...",{},{},1,1
1,ASTORIA,GA,1154500,FWI2345-00007,AST Functional Test,2023-11-15 15:58:55,{'PassSymRunKey_be8307a8-0f5e-4466-b2f6-8c4396...,ncsi_cable-preparation-error,REPLACE- CPU,replace mobo; replace cpu1,...,False,False,False,True,{'ncsi_cable-preparation-error': 2},"{('ncsi_cable-preparation-error', 'failed to d...",{},{},1,1
2,ASTORIA,GA,1154500,FWI2345-00007,AST Functional Test,2023-11-15 15:58:55,{'PassSymRunKey_be8307a8-0f5e-4466-b2f6-8c4396...,ncsi_cable-preparation-error,REPLACE- CPU,replace mobo; replace cpu0,...,False,False,False,True,{'ncsi_cable-preparation-error': 3},"{('ncsi_cable-preparation-error', 'failed to d...",{},{},1,1
3,ASTORIA,GA,1154427,FWI2346-04932,AST Functional Test,2023-11-21 11:04:41,{'SymptomRunKey_08aa141a-98e6-46be-bac5-4a590c...,The sensor count of Config_check_HW is not mat...,RETEST,"Terminal of conector with contamination, clean...",...,False,False,False,True,{'the sensor count of config_check_hw is not m...,{('the sensor count of config_check_hw is not ...,{},{},1,1
4,ASTORIA,GA,1170765,FWI2348-00349,AST SDC Test,2023-12-01 12:00:31,{'SymptomRunKey_25d489b8-7ffc-468d-88a2-716c6a...,Check DUT connections Failed,RETEST,"power cycle, retest, Only 3 NIC ips detected",...,False,False,False,True,{'check dut connections failed': 1},"{('check dut connections failed', 'check sympt...",{},{},1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5482,ASTORIA,GA,1158438,FWI2409-15966,AST Run-in Test,2024-03-04 08:52:34,{'SymptomRunKey_05d5b145-5aed-4fc2-adfb-1fec46...,test-completion-missing-error-sat,RETEST,unit unplugged during test,...,False,False,False,True,{'test-completion-missing-error-sat': 1},"{('test-completion-missing-error-sat', 'diag e...",{},{},1,1
5483,ASTORIA,GA,1158438,FWI2409-15970,AST Functional Test,2024-03-04 13:48:15,{'SymptomRunKey_95cb2e9c-d11c-48e8-b039-b477ea...,DIMM infos validation failure,RESEAT - DIMM,DIMM23 not present reseated.,...,False,False,False,True,{'dimm infos validation failure': 1},"{('dimm infos validation failure', 'component ...",{'component dimm sn: (l)64gb 2rx4 pc5-4800b-ra...,{'component dimm sn: (l)64gb 2rx4 pc5-4800b-ra...,1,1
5484,ASTORIA,GA,1158438,FWI2409-15975,AST Functional Test,2024-03-03 09:24:49,{'SymptomRunKey_d7f0a55a-10cb-40d7-b30b-d2e9c1...,DIMM infos validation failure,SWAP - DIMM,"Reseat dimm 22 fail, swap with dimm 0",...,False,False,False,True,{'dimm infos validation failure': 1},"{('dimm infos validation failure', 'component ...",{'component dimm sn: (l)64gb 2rx4 pc5-4800b-ra...,{'component dimm sn: (l)64gb 2rx4 pc5-4800b-ra...,1,1
5485,ASTORIA,GA,1158438,FWI2409-15985,AST Functional Test,2024-03-04 19:48:36,{'SymptomRunKey_393c9e34-ed1e-4316-9826-4964fa...,DIMM infos validation failure,RESEAT - DIMM,reseat dimm 21,...,False,False,False,True,{'dimm infos validation failure': 1},"{('dimm infos validation failure', 'component ...",{'component dimm sn: (l)64gb 2rx4 pc5-4800b-ra...,{'component dimm sn: (l)64gb 2rx4 pc5-4800b-ra...,1,1


In [None]:
# cases that exactly match with debug tree
# .copy(): the columns are rewritten below, so work on an independent frame instead of
# assigning into a filtered slice of singel_label_message_debug_result.
singel_label_message_match_message = singel_label_message_debug_result[
    singel_label_message_debug_result['Message_Cycle_Count'] != 'N/A'
].copy()


def _unwrap_single_value(value):
    """Collapse a dict cell: one entry -> that value, otherwise -> list of values; non-dicts pass through."""
    if not isinstance(value, dict):
        return value
    values = list(value.values())
    return values[0] if len(values) == 1 else values


# Same unwrapping for both annotation columns
for column in ("Category", "Suggestion"):
    singel_label_message_match_message[column] = (
        singel_label_message_match_message[column].apply(_unwrap_single_value)
    )

# Extract all unique labels
unique_label_match_label_message = set()

for label_dict in singel_label_message_match_message["label_cycle"]:
    if isinstance(label_dict, dict):
        unique_label_match_label_message.update(label_dict.keys())

# Extract all unique (label, message) pairs
unique_label_message_pairs_match_label_message = set()

for label_message_dict in singel_label_message_match_message["label_message_cycle"]:
    if isinstance(label_message_dict, dict):
        unique_label_message_pairs_match_label_message.update(label_message_dict.keys())

# Convert to a list if needed
unique_label_message_pairs_match_label_message = list(unique_label_message_pairs_match_label_message)
print(len(unique_label_message_pairs_match_label_message))

# Drop the helper columns before displaying the result
singel_label_message_match_message = singel_label_message_match_message.drop(columns=['label_cycle', 'label_message_cycle', 'Label_Cycle_Count'])
singel_label_message_match_message

99


Unnamed: 0,model_name,build_type,skuno,serial_number,station,repaired_date,symptom_info,failure_description,repair_code,repaired_description,symptom_dict,no_match_flag,empty_symptom_flag,empty_message_flag,isRepair,Category,Suggestion,Message_Cycle_Count
0,ASTORIA,GA,1154500,FWI2345-00007,AST Functional Test,2023-11-15 15:58:55,{'PassSymRunKey_be8307a8-0f5e-4466-b2f6-8c4396...,ncsi_cable-preparation-error,REPLACE- MOBO,reseat ncsi; fail; replace ncsi; fail; reseat ...,{'ncsi_cable-preparation-error': ['failed to d...,False,False,False,True,Quick Repair,1. Verify proper seating for NCSI cable on bot...,1
1,ASTORIA,GA,1154500,FWI2345-00007,AST Functional Test,2023-11-15 15:58:55,{'PassSymRunKey_be8307a8-0f5e-4466-b2f6-8c4396...,ncsi_cable-preparation-error,REPLACE- CPU,replace mobo; replace cpu1,{'ncsi_cable-preparation-error': ['failed to d...,False,False,False,True,Slow Repair,1. Verify previous repair action\n2. Replace N...,2
2,ASTORIA,GA,1154500,FWI2345-00007,AST Functional Test,2023-11-15 15:58:55,{'PassSymRunKey_be8307a8-0f5e-4466-b2f6-8c4396...,ncsi_cable-preparation-error,REPLACE- CPU,replace mobo; replace cpu0,{'ncsi_cable-preparation-error': ['failed to d...,False,False,False,True,Debug II,1. Try golden NCSI cable\n2. Try golden NIC ca...,3
15,ASTORIA,GA,1170765,FWI2348-00355,AST Functional Test,2023-11-30 09:45:52,{'SymptomRunKey_67a7ebab-3d5d-4953-94b9-73cb2a...,test-completion-missing-error-dna,RETEST,validating new diag 4.0.8.0,{'test-completion-missing-error-dna': ['diag e...,False,False,False,True,Retest,1. Retest,1
23,ASTORIA,GA,1158438,FWI2349-01033,AST Run-in Test,2023-12-06 13:43:47,{'SymptomRunKey_8a354d62-a1ab-4c96-ab17-4efdb9...,test-completion-missing-error-memtester,SWAP - CPU,"CPU0 CACHE, Error type: uncorrectable-cpu-erro...",{'test-completion-missing-error-memtester': ['...,False,False,False,True,Retest,1. Retest,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60164,ASTORIA,GA,1182051,FWI2511-09601,AST Functional Test,2025-03-14 12:23:49,{'SymptomRunKey_16db3090-d980-4eed-88f6-fe675a...,Checking BMC boot readiness and get BMC IP Failed,RETEST,unit not fully inserted into slot,{'checking bmc boot readiness and get bmc ip f...,False,False,False,True,Retest,1. Retest,1
60166,ASTORIA,GA,1182051,FWI2511-09606,AST Functional Test,2025-03-14 12:24:57,{'SymptomRunKey_223dbd1c-ea1f-44b2-94e8-2f569f...,Checking BMC boot readiness and get BMC IP Failed,RETEST,unit not fully inserted into slot,{'checking bmc boot readiness and get bmc ip f...,False,False,False,True,Retest,1. Retest,1
60168,ASTORIA,GA,1182051,FWI2511-09612,AST Run-in Test,2025-03-14 19:29:44,{'SymptomRunKey_918eba67-7abe-407d-82d5-f915d8...,test-completion-missing-error-sat,RETEST,Retest,{'test-completion-missing-error-sat': ['diag e...,False,False,False,True,Retest,1. Retest,1
60186,ASTORIA,GA,1182051,FWI2511-09658,AST Pre-Test,2025-03-14 01:28:31,{'SymptomRunKey_59852e6d-6236-42ee-9fd8-542af6...,Pre Diorite firmware check on all slots Failed,RETEST,retest,{'pre diorite firmware check on all slots fail...,False,False,False,True,Quick Repair,1. Reseat agora,1


In [61]:

# Message patterns for the HB16 "device not found" failure, one per slot pe2..pe5.
hb16_patterns = [
    rf"device not found\.\s*can't validate pcie info for 'pe{pe}.io0'"
    for pe in range(2, 6)
]

# Common shape of every "pcie error ... exceeding thresholds" message; only the
# devpath fragment and the error type differ between the pattern lists below.
_PCIE_ERROR_TEMPLATE = (
    r"^pcie serial number: .+?, location: [\w:.]+, devpath: {devpath}"
    r".*? pcie error, error types: '{error_type}' are exceeding thresholds\."
)


def _pcie_error_patterns(error_type):
    """Build the PCIe threshold patterns for one error type.

    Returns:
        ``(pe_patterns, cpu_patterns)``: one regex per devpath ``/phys/pe0`` ..
        ``/phys/pe7`` (followed by ``/io<N>``) and one per ``/phys/cpu0`` ..
        ``/phys/cpu1``.
    """
    pe_patterns = [
        _PCIE_ERROR_TEMPLATE.format(devpath=rf"/phys/pe{pe}/io\d+", error_type=error_type)
        for pe in range(0, 8)  # pe0 through pe7
    ]
    cpu_patterns = [
        _PCIE_ERROR_TEMPLATE.format(devpath=rf"/phys/cpu{cpu}", error_type=error_type)
        for cpu in range(0, 2)  # cpu0 and cpu1
    ]
    return pe_patterns, cpu_patterns


pcie_pelane_patterns, pcie_cpulane_patterns = _pcie_error_patterns("lane_error")
pcie_pereceiver_patterns, pcie_cpureceiver_patterns = _pcie_error_patterns("receiver_error")
pcie_pereplay_patterns, pcie_cpureplay_patterns = _pcie_error_patterns("replay_timer_timeout")

# Fan tachometer violation messages for fan0 through fan5.
sensor_fan_tach_patterns = [
    rf"^got \d+ violations, exceeding threshold of \d+ for sensor fan{fan} tach\."
    for fan in range(0, 6)
]

# Maps a label pattern to the message pattern(s) expected for that label.
# Both keys and values are used as regular expressions, so characters that are
# special in a regex must be escaped when they are meant literally. All patterns
# are raw strings so that `\d`, `\s`, `\.` reach the regex engine unchanged.
match_criteria = {
    'memtester-insufficient-memory': r"available memory: \d+, minimum required memory: \d+",
    "capture diorite console and attach to log failed":  r"^caught .*? exception",
    r"sata fio /dev/sda (read|write) bw too slow,? cur: \d+, exp: \d+": [r'(?s)(?=.*"fio version")(?=.*"timestamp")(?=.*"timestamp_ms")(?=.*"time")'],  # Message regex
    r"command execution failed: date": r"exception message", # Message regex
    "dimm infos validation failure": r"component dimm sn: .*? not found on unit",
    "hb16 pcie link speed and width check": hb16_patterns,
    "diorite location validation failure": [r"component diorite sn: .*? not found on unit",r"component diorite sn \S+ location: \S+, expected location: \S+"],
    "pcierrors-high-lane_error-rate": pcie_pelane_patterns + pcie_cpulane_patterns,
    "pcierrors-high-receiver_error-rate": pcie_pereceiver_patterns + pcie_cpureceiver_patterns,
    "pcierrors-high-replay_timer_timeout-rate": pcie_pereplay_patterns + pcie_cpureplay_patterns,
    "lower critical limit violated": sensor_fan_tach_patterns,
    "hb16 location validation failure": [r"component hb16 sn: .*? not found on unit",r"component hb16 sn \S+ location: \S+, expected location: \S+"],
    "error-monitor-procedural-error": r"^Test failed:.*?INTERNAL:",
    "diorite part number": r"value\s*=\s*['\"]([^'\"]+)['\"]\s*lower_limit\s*=\s*['\"]([^'\"]*)['\"]\s*upper_limit\s*=\s*['\"]([^'\"]*)['\"]",
    r"send command fail:\s*stressapptest": r"exception message:\s*\ntraceback",
    "diorite ssh connection failed": r"create diorite ssh connection failed\.\s*can not connect to diorite ip: ",
    "cpu location validation failure": [r"component cpu sn: .*? not found on unit",r"component cpu sn \S+ location: \S+, expected location: \S+"],
    "timeout exceeded.": r"thread .*? execution exceeded the timeout limit of .*? seconds",
    "attach booting logs to otc failed": r"attach booting logs to otc failed.\nmsg:",
    "caught oserror exception.": r"traceback \(most recent call last\):\n.*",
    "dimm location validation failure": r"component dimm sn .+? location: .+?, expected location: .+?",
    "io adapter location validation failure": r"component io adapter sn .+? location: .+?, expected location: .+?",
    "gssd location validation failure": [r"component gssd sn: .*? not found on unit",r"component gssd sn \S+ location: \S+, expected location: \S+"],
    "cssd infos validation failure": r"component cssd sn: .*? not found on unit",
    # Unescaped parentheses would form a regex group and never match a label
    # that literally contains "(hardware)"; the optional escaped form matches
    # the label both with and without the parentheses.
    r"configuration check \(?hardware\)? failed": r"parse response failure. response:",
    r"configuration check \(?software\)? failed": r"send command",
    "bad-bus": r"for local bus.*?/phys/cpu\d+.*?expected speed:.*?, actual speed:"
}

def single_label_message_debug_update_based_on_keyword(processed_result_df, debug_df, match_dict):
    """Recompute per-serial-number cycle counts for labels matching a keyword pattern.

    Rows are visited in frame order. For each ``(label, message)`` key of a
    row's ``label_message_cycle`` dict, the first key of ``match_dict`` that
    matches the label (case-insensitive ``re.search``) is taken as its pattern.
    Labels recorded earlier for the same ``serial_number`` that match the same
    pattern are counted as the same failure:

    * ``Label_Cycle_Count`` is 1 + the highest count already recorded for any
      matching label on that serial number (1 on the first occurrence).
    * ``Message_Cycle_Count`` is computed the same way, restricted to earlier
      entries whose message is exactly equal.

    Rows without a matching label (or whose ``label_message_cycle`` is not a
    dict) keep their existing values. If a row holds several matching pairs,
    the counts of the last pair are the ones left in the row.

    Args:
        processed_result_df: DataFrame with ``serial_number`` and
            ``label_message_cycle`` columns. It is not modified.
        debug_df: Unused. Kept in the signature so existing callers keep working.
        match_dict: Mapping whose keys are regex patterns for labels; the
            values are not read here.

    Returns:
        A copy of ``processed_result_df`` with ``Label_Cycle_Count`` and
        ``Message_Cycle_Count`` updated for the matching rows.
    """
    result_df = processed_result_df.copy()

    def label_matches(pattern, label):
        # re.search succeeds whenever re.fullmatch would, so one call is enough.
        return re.search(pattern, label, re.IGNORECASE) is not None

    # {serial_number: {label: count}} and {serial_number: {(label, message): count}}
    label_counts_by_serial = {}
    message_counts_by_serial = {}

    for index, row in result_df.iterrows():
        label_message_cycle = row["label_message_cycle"]
        if not isinstance(label_message_cycle, dict):
            continue  # Nothing to inspect (e.g. missing value)

        serial_number = row["serial_number"]
        label_counts = label_counts_by_serial.setdefault(serial_number, {})
        message_counts = message_counts_by_serial.setdefault(serial_number, {})

        for label, message in label_message_cycle:
            # First pattern (in match_dict order) that matches this label.
            matched_label = next(
                (pattern for pattern in match_dict if label_matches(pattern, label)),
                None,
            )
            if not matched_label:
                continue  # No pattern covers this label

            # Highest count so far among labels that fall under the same pattern.
            label_count = 1 + max(
                (
                    count
                    for prev_label, count in label_counts.items()
                    if label_matches(matched_label, prev_label)
                ),
                default=0,
            )

            # Same idea, but the earlier message has to be identical as well.
            message_count = 1 + max(
                (
                    count
                    for (prev_label, prev_message), count in message_counts.items()
                    if prev_message == message and label_matches(matched_label, prev_label)
                ),
                default=0,
            )

            label_counts[label] = label_count
            message_counts[(label, message)] = message_count

            result_df.at[index, "Label_Cycle_Count"] = label_count
            result_df.at[index, "Message_Cycle_Count"] = message_count

    return result_df


In [62]:
# Recompute the cycle counts with the keyword-based matcher.
new_singel_label_message_debug_result = single_label_message_debug_update_based_on_keyword(
    singel_label_message_debug_result,
    debug_df,
    match_criteria,
)


def _unwrap_single_entry(cell):
    """Return the only value of a one-entry dict; return anything else unchanged."""
    if isinstance(cell, dict) and len(cell) == 1:
        return next(iter(cell.values()))
    return cell


# Dicts with zero or several entries stay dicts, so they can still be told
# apart from resolved cells with an `isinstance(..., dict)` check.
for _column in ("Category", "Suggestion"):
    new_singel_label_message_debug_result[_column] = (
        new_singel_label_message_debug_result[_column].apply(_unwrap_single_entry)
    )

In [63]:
# Rows whose Category cell is not a dict.
_category_is_dict = new_singel_label_message_debug_result["Category"].apply(
    lambda value: isinstance(value, dict)
)
test = new_singel_label_message_debug_result[~_category_is_dict]

# Of those, the rows whose Message_Cycle_Count is not 1, 2 or 3.
_count_in_1_to_3 = test['Message_Cycle_Count'].isin([1, 2, 3])
test2 = test[~_count_in_1_to_3]

# Inspect the (label, message) cycle dict of the fourth such row.
test2.iloc[3]["label_message_cycle"]

{('test-completion-missing-error-cpu_functional', 'diag execution failed.'): 4}

In [64]:
# Rows whose Label_Cycle_Count still holds the 'N/A' placeholder. `.copy()`
# gives an independent frame, so adding a column below does not write to a
# slice of `test` (SettingWithCopy).
aa = test[test['Label_Cycle_Count'] == 'N/A'].copy()

# First value stored in each row's `label_cycle` dict.
aa['Computed_Count'] = aa['label_cycle'].apply(lambda x: list(x.values())[0])

# Serial numbers where that value disagrees with Message_Cycle_Count.
aa[aa['Computed_Count'] != aa['Message_Cycle_Count']].serial_number.unique()

array(['FWI2405-10924', 'FWI2406-06429', 'FWI2407-05930', 'FWI2414-13626',
       'FWI2419-11518', 'FWI2430-00630', 'FWI2431-03279', 'FWI2433-11544',
       'FWI2435-09442', 'FWI2443-04306', 'FWI2450-17222', 'FWI2451-01826',
       'FWI2451-11137', 'FWI2501-26708', 'FWI2501-27922', 'FWI2501-28290',
       'FWI2501-28420', 'FWI2501-28883', 'FWI2501-36411', 'FWI2501-38000',
       'FWI2501-38368', 'FWI2502-01335', 'FWI2502-02264', 'FWI2502-02433',
       'FWI2502-03748', 'FWI2502-07028', 'FWI2502-08090', 'FWI2502-08245',
       'FWI2503-01227', 'FWI2503-01406', 'FWI2503-02217', 'FWI2503-14695',
       'FWI2503-15864', 'FWI2503-18697', 'FWI2504-00035', 'FWI2504-10041',
       'FWI2504-10477', 'FWI2504-14709', 'FWI2507-05135', 'FWI2507-14551',
       'FWI2509-01307', 'FWI2509-18572', 'FWI2510-07567'], dtype=object)

In [435]:
def find_matching_rows(label_message_cycle, match_dict):
    """Tell whether any (label, message) key satisfies the matching criteria.

    For each ``(label, message)`` key, the first key of ``match_dict`` that
    matches the label (case-insensitive ``re.search``) selects the message
    pattern(s) to try. The message is tested twice: once with every whitespace
    run collapsed to a single space and stripped, and once as-is. The as-is
    attempt is what lets patterns containing an explicit ``\\n`` match, since
    the collapsed text no longer contains newlines.

    Args:
        label_message_cycle: Dict keyed by ``(label, message)`` tuples. Any
            non-dict value yields ``False``.
        match_dict: Mapping of label regex -> message regex, or list of
            message regexes.

    Returns:
        ``True`` as soon as one message matches one of its label's patterns,
        otherwise ``False``.
    """
    if not isinstance(label_message_cycle, dict):  # Ensure it's a dictionary
        return False

    for label, message in label_message_cycle:
        # Step 1: first label pattern (in match_dict order) that matches.
        # re.search succeeds whenever re.fullmatch would, so one call is enough.
        matched_label = next(
            (
                pattern_label
                for pattern_label in match_dict
                if re.search(pattern_label, label, re.IGNORECASE)
            ),
            None,
        )
        if not matched_label:
            continue

        # A single regex and a list of regexes are handled the same way.
        regex_patterns = match_dict[matched_label]
        if not isinstance(regex_patterns, list):
            regex_patterns = [regex_patterns]

        # Step 2: match against the whitespace-normalized and the raw message.
        cleaned_message = re.sub(r"\s+", " ", message).strip()
        candidates = (cleaned_message, message)

        for regex_pattern in regex_patterns:
            if any(
                re.search(regex_pattern, text, re.IGNORECASE | re.DOTALL)
                for text in candidates
            ):
                return True  # Found a match

    return False  # No match

# NOTE(review): this rebinds the notebook-level `match_criteria` to a single
# label; cells executed afterwards see only this reduced dictionary.
match_criteria = {
    "diorite location validation failure": [
        r"component diorite sn: .*? not found on unit",
        r"component diorite sn \S+ location: \S+, expected location: \S+",
    ],
}

# Boolean mask: True for rows with at least one (label, message) pair that
# satisfies the criteria above.
_row_matches = new_singel_label_message_debug_result["label_message_cycle"].apply(
    lambda cycle: find_matching_rows(cycle, match_criteria)
)
find = new_singel_label_message_debug_result[_row_matches]

find
# Disabled drill-down: list the messages of matched rows whose Category is still a dict.
# find2 = find[find["Category"].apply(lambda x: isinstance(x, dict))]
# print(len(find2["label_message_cycle"].apply(lambda x: list(x.keys())[0][1])))
# for i in find2["label_message_cycle"].apply(lambda x: list(x.keys())[0][1])[:10]:
#     print(f'"{i}",')


Unnamed: 0,model_name,build_type,skuno,serial_number,station,repaired_date,repair_code,repaired_description,symptom_dict,label_cycle,label_message_cycle,Category,Suggestion,Label_Cycle_Count,Message_Cycle_Count
4522,ASTORIA,PVT,1182051,FWI2408-02238,AST Pre-Test,2024-02-23 19:15:23,SWAP - NIC,"Swap NIC card in system, PE0 to PE1",{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,1. Swap PE# to PE## NIC card,1,1
4524,ASTORIA,PVT,1182051,FWI2408-02239,AST Pre-Test,2024-02-24 11:10:45,SWAP - NIC,"Swap NIC card in system, installed in wrong lo...",{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,1. Swap PE# to PE## NIC card,1,1
4556,ASTORIA,PVT,1182051,FWI2408-02264,AST Pre-Test,2024-02-24 08:05:00,SWAP - NIC,"Swap NIC card in system, installed in wrong lo...",{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,1. Swap PE# to PE## NIC card,1,1
4575,ASTORIA,PVT,1182051,FWI2408-02274,AST Pre-Test,2024-02-23 12:02:50,SWAP - NIC,"Swap NIC card in system, PE0 to PE1",{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,1. Swap PE# to PE## NIC card,1,1
4607,ASTORIA,PVT,1182051,FWI2408-02289,AST Pre-Test,2024-02-24 09:04:08,SWAP - NIC,"Swap NIC card in system, installed in wrong lo...",{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,1. Swap PE# to PE## NIC card,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111926,ASTORIA,GA,1182051,FWI2509-15167,AST Pre-Test,2025-03-03 13:37:28,REPLACE- NIC,PE1 IO0 not detected - swap PE1 and PE0 nic ca...,{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,"1. Verify PCIe cable for proper seating, verif...",1,1
111931,ASTORIA,GA,1182051,FWI2509-15204,AST Pre-Test,2025-02-28 23:26:14,RESEAT - NIC,Reseated Nic card and PCIe cable,{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,"1. Verify PCIe cable for proper seating, verif...",1,1
111945,ASTORIA,GA,1182051,FWI2509-15239,AST Pre-Test,2025-03-03 13:36:38,REPLACE- NIC,PE7 IO0 not detected - swap PE7 and PE6 nic ca...,{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,"1. Verify PCIe cable for proper seating, verif...",1,1
112058,ASTORIA,GA,1182051,FWI2509-15738,AST Pre-Test,2025-03-04 10:35:34,REPLACE- NIC,PE7 IO0 not detected - swap PE7 and PE6 nic ca...,{'diorite location validation failure': ['comp...,{'diorite location validation failure': 1},"{('diorite location validation failure', 'comp...",Quick Repair,"1. Verify PCIe cable for proper seating, verif...",1,1


In [246]:
# Keep only the rows whose Category cell is a dict (empty or not).
_category_is_dict = new_singel_label_message_debug_result["Category"].apply(
    lambda value: isinstance(value, dict)
)
filtered_df = new_singel_label_message_debug_result[_category_is_dict]

# Distinct (label, message) keys across every `label_message_cycle` dict.
unique_label_message_pairs_match_label = set().union(
    *(
        cycle.keys()
        for cycle in filtered_df["label_message_cycle"]
        if isinstance(cycle, dict)
    )
)

# Distinct label keys across every `label_cycle` dict.
unique_labels_match_label = set().union(
    *(
        cycle.keys()
        for cycle in filtered_df["label_cycle"]
        if isinstance(cycle, dict)
    )
)

print('Rest of the lable-message need classified amount: ', len(unique_label_message_pairs_match_label))

Rest of the lable-message need classified amount:  3038


In [247]:
# Labels that occur both in the debug tree and among the filtered rows.
need_match_labels = list(set(debug_df["symptom_label"]).intersection(unique_labels_match_label))
print('Rest of the lable need classified amount: ', len(need_match_labels))

# Per label: the set of distinct messages, and how many there are.
label_message_count = {}
label_unique_messages = {}

for label in need_match_labels:
    # NOTE(review): `label in label_key` is a containment test on each key, so
    # with string keys a label also selects rows whose key merely contains it.
    _row_has_label = filtered_df["label_cycle"].apply(
        lambda cycle: any(label in label_key for label_key in cycle.keys())
    )
    result = filtered_df[_row_has_label]

    # Message part of every (label, message) key in the selected rows.
    unique_messages = {
        pair[1]
        for cycle in result["label_message_cycle"]
        if isinstance(cycle, dict)
        for pair in cycle.keys()
    }

    label_unique_messages[label] = unique_messages
    label_message_count[label] = len(unique_messages)

# Labels ordered by number of distinct messages, largest first.
sorted_labels = sorted(label_message_count.items(), key=lambda item: item[1], reverse=True)
sorted_label_list = [name for name, _count in sorted_labels]

# The label with the most distinct messages, or None when there is no label.
top_label = next(iter(sorted_label_list), None)
print(top_label)

# Show how many messages the top label has, then list them.
if top_label:
    top_label_messages = label_unique_messages[top_label]
    print(len(top_label_messages))
    for msg in top_label_messages:
        print(f"- {msg}")


Rest of the lable need classified amount:  51
fru infos validation failure
1816
- component sn: kw049419-96402 41608663 not found on unit
- component fru sn: 2468056-1 24510a3pc not found on unit
- component fru sn: 2468056-1 241414513 not found on unit
- component fru sn: 2411266-1 234411132 not found on unit
- component fru sn: 2468056-1 241716300 not found on unit
- component sn: ipbfwi243703626 not found on unit
- component fru sn: 2468056-1 245200dug not found on unit
- component fru sn: 2411266-1 234408454 not found on unit
- component fru sn: 2468056-1 244634666 not found on unit
- component fru sn: 2468056-1 250130bgr not found on unit
- component fru sn: 2468056-1 24510b30r not found on unit
- component sn: ipbfwi250501834 not found on unit
- component sn: pdfqmb243200504 not found on unit
- component sn: habqtw244806810 not found on unit
- component fru sn: 2468056-1 242070501 not found on unit
- component fru sn: 2468056-1 250330a8m not found on unit
- component fru sn: 2409