<a href="https://colab.research.google.com/github/rl-cyber/User-Engaged-Network-Diagnosis/blob/main/Parser_for_Sec_conflict_Segments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import re
import pandas as pd

# Load raw conflict file
file_path = "conflict_segments_4G.txt"
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Split segments on separator lines made of five or more dashes
segments = re.split(r'\n\s*-{5,}\s*\n', raw_text.strip())

# A field header is the keyword followed by ":" or "-". Group 1 captures the
# text after *that* delimiter, so an earlier ":" or "-" on the same line
# (e.g. "Conflict 3 - Message: ATTACH REQUEST") cannot truncate the value.
MESSAGE_FIELD_RE = re.compile(r'\bmessage\b\s*[:\-](.*)', re.IGNORECASE)
STATE_FIELD_RE = re.compile(r'\bstate\b\s*[:\-](.*)', re.IGNORECASE)
# Lines that begin with one of these words are kept out of the explanation.
# NOTE(review): this also drops prose sentences that merely start with
# "Conflict", "Message" or "State" - confirm that is intended for this corpus.
HEADER_LINE_RE = re.compile(r'\b(conflict|message|state)\b', re.IGNORECASE)


def parse_segment(segment):
    """Extract the message, state and free-text explanation from one segment.

    Args:
        segment: Raw text of a single conflict segment (may span many lines).

    Returns:
        dict with keys ``message`` and ``state`` (upper-cased header values,
        or ``"UNKNOWN"`` when no header line was found) and
        ``conflict_explanation`` (remaining non-empty lines joined by spaces).
        When a header occurs more than once, the last occurrence wins.
    """
    message = "UNKNOWN"
    state = "UNKNOWN"
    explanation_lines = []

    for line in segment.strip().split('\n'):
        line_clean = line.strip()

        # A message header takes precedence over a state header on the same line.
        message_match = MESSAGE_FIELD_RE.search(line_clean)
        if message_match:
            message = message_match.group(1).strip().upper()
            continue

        state_match = STATE_FIELD_RE.search(line_clean)
        if state_match:
            state = state_match.group(1).strip().upper()
            continue

        if line_clean and not HEADER_LINE_RE.match(line_clean):
            explanation_lines.append(line_clean)

    return {
        "message": message,
        "state": state,
        "conflict_explanation": " ".join(explanation_lines),
    }


# One record per segment; ids are 1-based and zero-padded (C001, C002, ...)
parsed = [
    {"conflict_id": f"C{number:03}", **parse_segment(segment)}
    for number, segment in enumerate(segments, start=1)
]

df = pd.DataFrame(parsed)

# Vocabulary-based fallback for message and state.
# These lists are scanned against the explanation text when no explicit
# header was parsed. Entries are tried in list order, so order is significant.
known_messages = [
    "DETACH REQUEST",
    "SERVICE REQUEST",
    "ATTACH REQUEST",
    "AUTHENTICATION REQUEST",
    "SECURITY MODE COMMAND",
    "RE-AUTHENTICATION REQUEST",
    "PDN CONNECTIVITY REQUEST",
]

known_states = [
    "EMM-REGISTERED",
    "EMM-DEREGISTERED",
    "IDLE",
    "CONNECTED",
    "ATTACHING",
    "AUTHENTICATED",
    "UNAUTHENTICATED",
    "CONNECTED MODE",
    "IDLE MODE",
    "REGISTERED",
]

def extract_known(text, keywords):
    """Return the first keyword (in list order) that genuinely occurs in ``text``.

    Matching is case-insensitive. Compared with a plain substring test, two
    kinds of false hits are rejected:

    * A keyword must not be glued to a preceding word character, so
      "unauthenticated" is not a hit for "AUTHENTICATED" and "disconnected"
      is not a hit for "CONNECTED". Trailing characters are still allowed,
      so plurals such as "service requests" keep matching.
    * An occurrence that lies inside a longer keyword's occurrence is
      ignored, so "RE-AUTHENTICATION REQUEST" is not reported as
      "AUTHENTICATION REQUEST", nor "IDLE MODE" as "IDLE".

    Args:
        text: String to scan.
        keywords: Candidate keywords, in priority order.

    Returns:
        The matching keyword exactly as spelled in ``keywords``, or
        ``"UNKNOWN"`` when none matches.
    """
    haystack = text.lower()

    # Every occurrence of every keyword, recorded as (keyword, start, end).
    occurrences = []
    for kw in keywords:
        pattern = r'(?<!\w)' + re.escape(kw.lower())
        for hit in re.finditer(pattern, haystack):
            occurrences.append((kw, hit.start(), hit.end()))

    def is_shadowed(start, end):
        # True when a strictly longer occurrence fully covers [start, end).
        return any(
            other_start <= start and end <= other_end
            and (other_end - other_start) > (end - start)
            for _, other_start, other_end in occurrences
        )

    # Keep list-order priority among the occurrences that survive.
    for kw in keywords:
        for found, start, end in occurrences:
            if found == kw and not is_shadowed(start, end):
                return kw
    return "UNKNOWN"

def _fill_unknown(column, vocabulary):
    """Return ``column`` values with each "UNKNOWN" replaced by a vocabulary lookup.

    Rows whose value is already known are passed through untouched; for the
    others the explanation text is scanned with ``extract_known``.
    """
    return [
        extract_known(explanation, vocabulary) if current == "UNKNOWN" else current
        for current, explanation in zip(df[column], df["conflict_explanation"])
    ]


# Only rows where header parsing produced "UNKNOWN" fall back to the vocabulary
df["message"] = _fill_unknown("message", known_messages)
df["state"] = _fill_unknown("state", known_states)

# Save clean version (row index is not written)
output_path = "conflict_segments_normalized_extracted.csv"
df.to_csv(output_path, index=False)
df.head()

Unnamed: 0,conflict_id,message,state,conflict_explanation
0,C001,UNKNOWN,UNKNOWN,.
1,C002,UNKNOWN,UNKNOWN,The non-access stratum (NAS) described in the ...
2,C003,SERVICE REQUEST,EMM-REGISTERED,"During the EPS attach procedure, the network c..."
3,C004,SERVICE REQUEST,UNKNOWN,A UE configured for NAS signalling low priorit...
4,C005,UNKNOWN,UNKNOWN,.
