In [1]:
import os
import json
import re
import logging
import pandas as pd


In [2]:
try:
    from rdflib import Graph, Namespace, RDF, RDFS, OWL, Literal
    HAVE_RDFLIB = True
except ImportError:
    HAVE_RDFLIB = False

# Notebook-wide logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)

# Input/output locations. Each one can be overridden through an environment
# variable so the notebook runs on machines other than the original author's;
# when the variable is unset the original hard-coded location is used.
RAW_PATH = os.environ.get(
    'KG_RAW_PATH',
    'C:\\Users\\user\\Downloads\\Sentosa-main - Copy\\Sentosa-main\\scraper\\raw_text_data.json'
)
CLEAN_PATH = os.environ.get(
    'KG_CLEAN_PATH',
    'C:\\Users\\user\\Downloads\\Sentosa-main - Copy\\Sentosa-main\\preprocessing\\clean_text_data.json'
)

OUTPUT_DIR = os.environ.get('KG_OUTPUT_DIR', 'kr_output')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fail fast, with the offending path in both the log and the exception, if an
# input file is missing.
for _required_path in (RAW_PATH, CLEAN_PATH):
    if not os.path.isfile(_required_path):
        logger.error(f"File not found: {_required_path}. Please verify the path.")
        raise FileNotFoundError(f"{_required_path} not found")


In [3]:
def _read_json(path):
    """Parse the UTF-8 JSON file at `path` and return the decoded object."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Load the scraped articles and their pre-processed counterparts.
raw_data = _read_json(RAW_PATH)
clean_data = _read_json(CLEAN_PATH)

# The two files are later paired by position, so report whether their entry
# counts agree.
len_raw, len_clean = len(raw_data), len(clean_data)
if len_raw == len_clean:
    logger.info(f"raw and clean entry counts match: {len_raw}")
else:
    logger.warning(f"raw ({len_raw}) and clean ({len_clean}) entry counts do not match. Please verify alignment.")


[19:02:53] INFO: raw and clean entry counts match: 122


In [4]:
# Show first few entries for sanity check.
# The preview indexes both lists by position, so it is bounded by the shorter
# of the two; bounding by the raw count alone raises IndexError whenever the
# clean file has fewer entries than the preview size.
preview_count = min(3, len(raw_data), len(clean_data))
for i in range(preview_count):
    rt = raw_data[i].get('title', '')
    ct = clean_data[i].get('clean_title', [[]])
    logger.info(f"[{i}] raw title: {rt!r}")
    logger.info(f"     clean_title tokens: {ct}")


[19:02:54] INFO: [0] raw title: 'Cara Mengatasi Anxiety: 8 Cara Ini Boleh Bantu Anda Tenangkan Diri!'
[19:02:54] INFO:      clean_title tokens: [['anxiety', 'bantu', 'tenang']]
[19:02:54] INFO: [1] raw title: 'Ubat Anxiety: Ini 5 Jenis Ubat Boleh Beli Tapi Mesti Dengan Surat Doktor!'
[19:02:54] INFO:      clean_title tokens: [['ubat', 'anxiety', 'jenis', 'ubat', 'beli', 'surat', 'doktor']]
[19:02:54] INFO: [2] raw title: 'Punca Penyakit Anxiety: Ini 7 Perkara Boleh Trigger Anxiety Anda!'
[19:02:54] INFO:      clean_title tokens: [['punca', 'sakit', 'anxiety', 'trigger', 'anxiety']]


In [5]:
# Canonicalisation table consumed by normalize_entity(): each key is a
# lower-case surface form and each value is the canonical label it is
# rewritten to. Keys are matched as whole words, and longer keys are tried
# before shorter ones, so multi-word entries such as 'panic attack' take
# precedence over 'panic'. Some entries map a term to itself.
normalization = {
    # Anxiety related
    'anxiety attack': 'serangan kegelisahan',
    'panic attack': 'serangan panik',
    'anxiety': 'kegelisahan',
    'kecemasan': 'kegelisahan',
    'kebimbangan': 'kegelisahan',
    'kerisauan': 'kegelisahan',
    # Stress related
    'stress': 'tekanan emosi',
    'stres': 'tekanan emosi',
    # Depression related
    'depression': 'depresi',
    'depresi': 'depresi',
    # PTSD
    'ptsd': 'stres pasca trauma',
    'stres pasca trauma': 'stres pasca trauma',
    # Panic
    'panic': 'serangan panik',
    'serangan panik': 'serangan panik',
    # OCD
    'ocd': 'gangguan obsesif kompulsif',
    'obsesif kompulsif': 'gangguan obsesif kompulsif',
    # Insomnia
    'insomnia': 'insomnia',
    # Fobia examples, add more as needed
    'claustrophobia': 'fobia klaustrofobia',
    'fobia badut': 'fobia badut',
    # Add more terms as discovered
}

# Second substitution table applied by normalize_entity() after
# `normalization` (synonym -> canonical form). Currently empty.
synonym_map = {
   
}


# Shared accumulators filled while the knowledge graph is extracted.
# `nodes`: one dict per entity (id, label_malay, label_english, type),
# appended by get_or_create_entity().
nodes = []        
# `edges`: one dict per relation (source_id, relation, target_id,
# source_reference), appended by the extraction loop.
edges = []        
# `entity_to_id`: maps "<type>|<normalised phrase>" to the id already issued
# for that entity, so each entity is created only once.
entity_to_id = {}  
# `entity_counters`: last sequence number issued per entity type; used by
# get_or_create_entity() to build ids.
entity_counters = {
    'Condition': 0,
    'Symptom': 0,
    'Trigger': 0,
    'Treatment': 0,
    'Strategy': 0,
    'Professional': 0,
}

# Normalised condition label -> definition text extracted from article content.
condition_definitions = {}

# Entries the extraction loop could not process, each with a 'reason'.
unhandled_entries = []


In [6]:
def normalize_entity(phrase: str) -> str:
    """Return the canonical, lower-case form of an entity phrase.

    The phrase is lower-cased, surrounding whitespace is removed and internal
    whitespace runs are collapsed to single spaces. Whole-word occurrences of
    the keys of the module-level ``normalization`` table are then replaced by
    their canonical values, followed by the entries of ``synonym_map``.

    The ``normalization`` table is applied in a single pass with a
    longest-key-first alternation. Text produced by one replacement is never
    re-scanned by another rule, so a phrase such as 'stres pasca trauma' is
    not rewritten a second time by the shorter 'stres' rule and ends up with
    the same label as 'ptsd'.

    Args:
        phrase: Raw phrase in any casing.

    Returns:
        The normalised phrase; an empty string if nothing remains.
    """
    p = re.sub(r'\s+', ' ', phrase.lower().strip())

    # Longest keys first so that, at any position, the alternation prefers
    # 'panic attack' over 'panic'. Empty keys are ignored because they would
    # match at every word boundary.
    keys = sorted((k for k in normalization if k), key=len, reverse=True)
    if keys:
        combined = r'\b(?:' + '|'.join(re.escape(k) for k in keys) + r')\b'
        # A function replacement inserts the canonical value literally and
        # lets us look the matched key up directly.
        p = re.sub(combined, lambda m: normalization[m.group(0)], p)

    # Synonyms are applied one by one in the table's insertion order.
    for syn, can in synonym_map.items():
        p = re.sub(r'\b' + re.escape(syn) + r'\b', lambda m, can=can: can, p)

    # Replacements can introduce double spaces; tidy up once more.
    return re.sub(r'\s+', ' ', p).strip()


In [7]:
def get_or_create_entity(norm_phrase: str, ent_type: str) -> str:
    """Return the id for (ent_type, norm_phrase), registering it on first use.

    On first sight of a pair the per-type counter in ``entity_counters`` is
    advanced, an id of the form ``<prefix><3-digit number>`` is issued, the id
    is stored in ``entity_to_id`` and a node record is appended to ``nodes``.
    Later calls with the same pair return the stored id unchanged.

    Args:
        norm_phrase: Normalised label of the entity.
        ent_type: Entity type; must be a key of ``entity_counters``.

    Returns:
        The entity id, e.g. 'C001' or 'STR004'.

    Raises:
        KeyError: if ``ent_type`` has no counter in ``entity_counters``.
    """
    lookup_key = f"{ent_type}|{norm_phrase}"
    known_id = entity_to_id.get(lookup_key)
    if known_id is not None:
        return known_id

    sequence = entity_counters[ent_type] + 1
    entity_counters[ent_type] = sequence

    id_prefixes = dict(
        Condition='C',
        Symptom='S',
        Trigger='T',
        Treatment='TRT',
        Strategy='STR',
        Professional='PRO',
    )
    issued_id = f"{id_prefixes.get(ent_type, 'E')}{sequence:03d}"

    entity_to_id[lookup_key] = issued_id
    record = {
        'id': issued_id,
        'label_malay': norm_phrase,
        'label_english': '',
        'type': ent_type,
    }
    nodes.append(record)
    return issued_id


In [8]:
def extract_definition(raw_content: str) -> str:
    """Build a short definition from the opening of an article.

    HTML-like tags are removed, the first paragraph (text before the first
    blank line) is split into sentences and the first two sentences are
    joined. If that result is shorter than 20 characters and more sentences
    are available, the first three sentences are used instead.

    Args:
        raw_content: Article body, possibly containing HTML tags.

    Returns:
        The definition text (possibly empty).
    """
    plain = re.sub(r'<[^>]+>', '', raw_content).strip()
    paragraphs = re.split(r'\n\n|\r\n\r\n', plain)
    lead = paragraphs[0] if paragraphs else plain

    # Split after sentence-ending punctuation followed by whitespace.
    sentences = re.split(r'(?<=[\.！？\?])\s+', lead)

    summary = ' '.join(sentences[:2]).strip()
    too_short = len(summary) < 20
    if too_short and len(sentences) > 2:
        summary = ' '.join(sentences[:3]).strip()
    return summary


In [9]:
def extract_list_after_keyword(content: str, keyword_patterns: list) -> list:
    """Extract list items from the text that follows the first matching keyword.

    Tags are stripped from ``content``; the patterns are tried in order
    (case-insensitively) and the first one that matches decides the result.
    The text from the end of that match up to the next blank line is split on
    newlines, commas, semicolons, numbered-list markers and the word 'dan'.
    Items are trimmed of whitespace and surrounding periods; items shorter
    than three characters or containing the words 'apa', 'bagaimana' or
    'mengapa' are dropped.

    Args:
        content: Article body, possibly containing HTML tags.
        keyword_patterns: Regular expressions marking the start of a list.

    Returns:
        The cleaned items for the first matching pattern (possibly empty), or
        an empty list when no pattern matches.
    """
    plain = re.sub(r'<[^>]+>', '', content).strip()

    for keyword_pattern in keyword_patterns:
        hit = re.search(keyword_pattern, plain, re.IGNORECASE)
        if hit is None:
            continue

        tail = plain[hit.end():]
        block = re.split(r'\n\n|\r\n\r\n', tail)[0]
        trimmed = (
            piece.strip().strip('.').strip()
            for piece in re.split(r'\n|\r\n|,|;|\d+\.\s*|\s+dan\s+', block)
        )
        return [
            entry
            for entry in trimmed
            if len(entry) >= 3
            and not re.search(r'\b(apa|bagaimana|mengapa)\b', entry.lower())
        ]

    return []


In [10]:
def extract_condition_from_title(raw_title: str) -> str:
    title = raw_title.strip()
    t = title.lower()

    # 1) Definition patterns
    m = re.search(r'\b(?:apa itu|apa yang harus anda tahu|perlu tahu|info tentang|ketahui lebih lanjut)\s+(?:penyakit\s+|gangguan\s+)?(.+?)[\?|:]', title, re.IGNORECASE)
    if m:
        cond = m.group(1).strip()
    else:
        # 2) Common patterns
        patterns = [
            r'(?:Simptom|Tanda(?: Umum)?|Gejala)\s+(.+?):',
            r'(?:Punca Penyakit|Punca)\s+(.+?):',
            r'(?:Ubat|Rawat|Cara Mengatasi|Cara Atasi|Terapi|Kaunseling|Pantang Larang)\s+(.+?):',
            r'(?:Penyakit|Gangguan)\s+(.+?):',
        ]
        cond = None
        for pat in patterns:
            match = re.search(pat, title, re.IGNORECASE)
            if match:
                cond = match.group(1).strip()
                cond = re.sub(r'\b(ini|boleh|untuk).+', '', cond, flags=re.IGNORECASE).strip()
                break
        if cond is None:
            # 3) Before comma
            m2 = re.match(r'^\s*([^,，]+)', title)
            if m2:
                part = m2.group(1).strip()
                cond = part
            else:
                # 4) '&' or 'dan' multiple conditions: take first
                if '&' in title:
                    first = title.split('&')[0].strip()
                    cond = first
                elif ' dan ' in t:
                    first = title.lower().split(' dan ')[0].strip()
                    cond = first
                else:
                    # 5) Fallback: before '?' or ':' last word
                    if '?' in title:
                        before = title.split('?', 1)[0]
                    elif ':' in title:
                        before = title.split(':', 1)[0]
                    else:
                        before = title
                    tokens = re.findall(r'\b\w+\b', before.strip())
                    cond = tokens[-1] if tokens else before.strip()
    # 6) Remove prefixes 'gangguan ' / 'penyakit '
    cond_lower = cond.lower().strip()
    for prefix in ['gangguan ', 'penyakit ']:
        if cond_lower.startswith(prefix):
            cond = cond[len(prefix):].strip()
            break
    return cond


In [11]:
def detect_relation(clean_title_tokens: list, raw_title: str) -> str:
    """Classify an article title into the relation its content describes.

    Rules are evaluated in a fixed order and the first match wins:
    definition phrasing, 'cara atasi' style titles, symptom keywords, trigger
    keywords, treatment keywords, phobia titles, multi-condition titles
    ('&' / 'dan'), 'muncul selepas' titles, PTSD titles and finally titles
    that are essentially just a condition name.

    Args:
        clean_title_tokens: Pre-processed title tokens.
        raw_title: The original title text.

    Returns:
        One of 'definition', 'treated_by', 'has_symptom', 'triggered_by' or
        'related_to'; ``None`` when no rule applies.
    """
    tokens = {tok.lower() for tok in clean_title_tokens}
    lowered = raw_title.lower()

    # 1) Definition patterns
    if (
        'apa itu' in lowered
        or re.search(r'\bapa yang harus anda tahu\b', lowered)
        or re.search(r'\bperlu tahu\b', lowered)
        or re.search(r'ketahui lebih lanjut', lowered)
        or re.search(r'info tentang', lowered)
    ):
        return 'definition'

    # 2) 'Cara Atasi' implies treated_by
    if 'cara atasi' in lowered or 'lakukan cara ini' in lowered:
        return 'treated_by'

    # 3)-5) Keyword rules: a rule fires when a clean token is in its
    # vocabulary or its word pattern occurs in the raw title.
    keyword_rules = (
        ('has_symptom', {'simptom', 'tanda', 'gejala'},
         r'\b(simptom|tanda|gejala)\b'),
        ('triggered_by', {'punca', 'faktor', 'picu'},
         r'\b(punca|faktor|picu)\b'),
        ('treated_by', {'ubat', 'rawat', 'cara', 'terapi', 'kaunseling', 'pantang', 'larang'},
         r'\b(ubat|rawat|cara mengatasi|terapi|kaunseling|pantang larang)\b'),
    )
    for label, vocabulary, word_pattern in keyword_rules:
        if tokens & vocabulary or re.search(word_pattern, lowered):
            return label

    # 6) Fobia handling: definition unless the title is framed around
    # risk/sufferers and contains none of the question-like markers.
    if 'fobia' in lowered:
        question_like = any(k in lowered for k in ('apa', 'tahu', 'info tentang'))
        risk_framed = any(k in lowered for k in ('kondisi', 'risiko', 'penghidap'))
        if risk_framed and not question_like:
            return 'triggered_by'
        return 'definition'

    # 7) related_to: '&' or 'dan' with at least two extractable conditions
    if '&' in raw_title:
        pieces = raw_title.split('&')
    elif ' dan ' in lowered:
        pieces = re.split(r'\sdan\s', raw_title, flags=re.IGNORECASE)
    else:
        pieces = []
    named_conditions = 0
    for piece in pieces:
        candidate = extract_condition_from_title(piece.strip())
        if candidate and normalize_entity(candidate):
            named_conditions += 1
    if named_conditions >= 2:
        return 'related_to'

    # 8) 'muncul selepas' indicates triggered_by
    if 'muncul selepas' in lowered:
        return 'triggered_by'

    # 9) PTSD / Stres Pasca Trauma: triggered_by for risk/sufferer titles,
    # definition otherwise.
    if 'ptsd' in lowered or 'stres pasca trauma' in lowered:
        if 'risiko' in lowered or 'penghidap' in lowered:
            return 'triggered_by'
        return 'definition'

    # 10) Title is essentially a condition name: definition when it is short
    # with no colon, or when it carries none of the relation keywords.
    guess = extract_condition_from_title(raw_title)
    guess_norm = normalize_entity(guess) if guess else ''
    if guess_norm and guess_norm in lowered:
        short_plain = ':' not in raw_title and len(re.findall(r'\b\w+\b', raw_title)) <= 6
        keyword_free = not any(
            k in lowered for k in ['simptom', 'punca', 'ubat', 'cara', 'rawat', 'tanda', 'gejala']
        )
        if short_plain or keyword_free:
            return 'definition'

    return None
    

In [12]:
# ---------------------------------------------------------------------------
# Main extraction pass: turn every (raw, clean) article pair into graph nodes,
# edges and condition definitions.
# ---------------------------------------------------------------------------

# Reset every accumulator in place so that re-running this cell starts from a
# clean slate. Without this only `unhandled_entries` was reset, so a second run
# appended every edge again and kept stale counters and definitions.
nodes.clear()
edges.clear()
entity_to_id.clear()
condition_definitions.clear()
for _ent_type in entity_counters:
    entity_counters[_ent_type] = 0
unhandled_entries = []

for idx, raw_entry in enumerate(raw_data):
    raw_title = raw_entry.get('title', '').strip()
    raw_content = raw_entry.get('content', '').strip()
    # Raw and clean entries are paired by position; tolerate a shorter clean list.
    clean_entry = clean_data[idx] if idx < len(clean_data) else {}
    clean_title_tokens = []
    if isinstance(clean_entry, dict) and 'clean_title' in clean_entry:
        try:
            # 'clean_title' is expected to be a list whose first element is the token list.
            clean_title_tokens = clean_entry.get('clean_title', [[]])[0]
        except (IndexError, KeyError, TypeError):
            # Empty or malformed 'clean_title' value: fall back to no tokens.
            clean_title_tokens = []

    # Skip if title or content is empty
    if not raw_title or not raw_content:
        unhandled_entries.append({
            'index': idx, 'title': raw_title,
            'reason': 'empty title or content'
        })
        continue

    # 1) Detect relation
    relation = detect_relation(clean_title_tokens, raw_title)
    if relation is None:
        unhandled_entries.append({
            'index': idx, 'title': raw_title,
            'clean_title_tokens': clean_title_tokens,
            'relation': None,
            'reason': 'detect_relation returned None'
        })
        continue

    # 2) Extract condition name and normalize
    cond_phrase = extract_condition_from_title(raw_title)
    cond_norm = normalize_entity(cond_phrase) if cond_phrase else ''
    if not cond_norm:
        unhandled_entries.append({
            'index': idx, 'title': raw_title,
            'relation': relation,
            'reason': f"condition extraction empty, raw: {cond_phrase!r}"
        })
        continue

    # Handle definition relation: store definition and ensure Condition node.
    # The first definition found for a condition wins.
    if relation == 'definition':
        if cond_norm not in condition_definitions:
            definition_text = extract_definition(raw_content)
            if definition_text:
                condition_definitions[cond_norm] = definition_text
        get_or_create_entity(cond_norm, 'Condition')
        continue

    # Handle related_to: extract multiple conditions from title and create bidirectional edges
    if relation == 'related_to':
        parts = []
        if '&' in raw_title:
            parts = raw_title.split('&')
        elif ' dan ' in raw_title.lower():
            parts = re.split(r'\sdan\s', raw_title, flags=re.IGNORECASE)
        conds = []
        for part in parts:
            part = part.strip()
            cond = extract_condition_from_title(part)
            norm = normalize_entity(cond) if cond else ''
            if norm:
                conds.append(norm)
        if len(conds) >= 2:
            ids = [get_or_create_entity(norm, 'Condition') for norm in conds]
            # One edge in each direction for every unordered pair of conditions.
            for i in range(len(ids)):
                for j in range(i+1, len(ids)):
                    edges.append({
                        'source_id': ids[i],
                        'relation': 'related_to',
                        'target_id': ids[j],
                        'source_reference': raw_title
                    })
                    edges.append({
                        'source_id': ids[j],
                        'relation': 'related_to',
                        'target_id': ids[i],
                        'source_reference': raw_title
                    })
            continue
        else:
            unhandled_entries.append({
                'index': idx, 'title': raw_title,
                'relation': relation,
                'reason': 'related_to pattern: fewer than 2 conditions extracted'
            })
            continue

    # For has_symptom / triggered_by / treated_by, create Condition node first
    # NOTE(review): when the 'muncul selepas' branch below takes its `continue`,
    # this node ends up with no edges from this entry - confirm that is intended.
    cond_id = get_or_create_entity(cond_norm, 'Condition')

    # Heading patterns that introduce the relevant list inside the article body.
    if relation == 'has_symptom':
        patterns = [r'Simptom.*?:', r'Tanda(?: Umum)? .*?:', r'Gejala.*?:']
    elif relation == 'triggered_by':
        patterns = [r'Punca(?: Penyakit)? .*?:', r'Faktor .*?:', r'Picu .*?:', r'Penghidap Berisiko.*?:']
    elif relation == 'treated_by':
        patterns = [r'(?:Cara Mengatasi|Cara Atasi|Ubat|Rawat|Terapi|Kaunseling|Pantang Larang|Lakukan Cara Ini).*?:']
    else:
        patterns = []

    # Special case: "Perasaan <symptom> muncul selepas <condition>" titles carry
    # the symptom and the condition in the title itself.
    if relation == 'triggered_by' and 'muncul selepas' in raw_title.lower():
        m = re.search(r'Perasaan\s+(.+?)\s+muncul selepas\s+(.+)', raw_title, re.IGNORECASE)
        if m:
            symptom_raw = m.group(1).strip()
            cond_raw = m.group(2).strip().split('?')[0].strip()
            for p in ['penyakit ', 'gangguan ']:
                if cond_raw.lower().startswith(p):
                    cond_raw = cond_raw[len(p):].strip()
            cond_norm2 = normalize_entity(cond_raw)
            symptom_norm = normalize_entity(symptom_raw)
            if cond_norm2:
                cond_id2 = get_or_create_entity(cond_norm2, 'Condition')
                if symptom_norm:
                    ent_id = get_or_create_entity(symptom_norm, 'Symptom')
                    edges.append({
                        'source_id': cond_id2,
                        'relation': 'has_symptom',
                        'target_id': ent_id,
                        'source_reference': raw_title
                    })
                continue

    # Extract list items from content
    items = extract_list_after_keyword(raw_content, patterns)
    if not items:
        unhandled_entries.append({
            'index': idx, 'title': raw_title,
            'relation': relation,
            'reason': 'list extraction returned empty'
        })
        continue

    # Process each extracted item: normalize, create entity node, and edge
    for phrase in items:
        norm = normalize_entity(phrase)
        if not norm:
            continue
        if relation == 'has_symptom':
            ent_type = 'Symptom'
        elif relation == 'triggered_by':
            ent_type = 'Trigger'
        elif relation == 'treated_by':
            # Determine Strategy vs Treatment by keywords
            if re.search(r'\b(teknik|relaksasi|senaman|yoga|meditasi|pantang|larang|kaunseling)\b', norm):
                ent_type = 'Strategy'
            else:
                ent_type = 'Treatment'
        else:
            ent_type = 'Symptom'
        ent_id = get_or_create_entity(norm, ent_type)
        edges.append({
            'source_id': cond_id,
            'relation': relation,
            'target_id': ent_id,
            'source_reference': raw_title
        })

logger.info(f"Extraction complete: nodes={len(nodes)}, edges={len(edges)}, unhandled_entries={len(unhandled_entries)}, definitions={len(condition_definitions)}")


[19:02:58] INFO: Extraction complete: nodes=738, edges=657, unhandled_entries=19, definitions=70


In [13]:
# Explicit column schemas: with them the node and edge CSVs always carry a
# header row, even when extraction produced nothing, so the later cells that
# read the files back and index columns by name keep working. For non-empty
# data the column order is the same as the key order of the records.
NODE_COLUMNS = ['id', 'label_malay', 'label_english', 'type']
EDGE_COLUMNS = ['source_id', 'relation', 'target_id', 'source_reference']

df_nodes = pd.DataFrame(nodes, columns=NODE_COLUMNS)
df_edges = pd.DataFrame(edges, columns=EDGE_COLUMNS)
# Unhandled records have varying keys, so their columns are left to pandas.
df_unhandled = pd.DataFrame(unhandled_entries)

nodes_csv = os.path.join(OUTPUT_DIR, 'kg_nodes.csv')
edges_csv = os.path.join(OUTPUT_DIR, 'kg_edges.csv')
unhandled_json = os.path.join(OUTPUT_DIR, 'unhandled_entries.json')
unhandled_csv = os.path.join(OUTPUT_DIR, 'unhandled_entries.csv')

# 'utf-8-sig' writes a BOM at the start of each CSV.
df_nodes.to_csv(nodes_csv, index=False, encoding='utf-8-sig')
df_edges.to_csv(edges_csv, index=False, encoding='utf-8-sig')
df_unhandled.to_json(unhandled_json, force_ascii=False, orient='records', indent=2)
df_unhandled.to_csv(unhandled_csv, index=False, encoding='utf-8-sig')

defs_path = os.path.join(OUTPUT_DIR, 'condition_definitions.json')
with open(defs_path, 'w', encoding='utf-8') as f:
    json.dump(condition_definitions, f, ensure_ascii=False, indent=2)

logger.info(f"Saved nodes CSV: {nodes_csv}")
logger.info(f"Saved edges CSV: {edges_csv}")
logger.info(f"Saved unhandled entries JSON: {unhandled_json}")
logger.info(f"Saved unhandled entries CSV: {unhandled_csv}")
logger.info(f"Saved condition definitions JSON: {defs_path}")
logger.info("Sample nodes (first 10 rows):")
logger.info(df_nodes.head(10).to_string(index=False))
logger.info("Sample edges (first 10 rows):")
logger.info(df_edges.head(10).to_string(index=False))
logger.info("Sample unhandled entries (first 10 rows):")
logger.info(df_unhandled.head(10).to_string(index=False))


[19:02:58] INFO: Saved nodes CSV: kr_output\kg_nodes.csv
[19:02:58] INFO: Saved edges CSV: kr_output\kg_edges.csv
[19:02:58] INFO: Saved unhandled entries JSON: kr_output\unhandled_entries.json
[19:02:58] INFO: Saved condition definitions JSON: kr_output\condition_definitions.json
[19:02:58] INFO: Sample nodes (first 10 rows):
[19:02:58] INFO:     id                                                                                                                                                                                                                                                                                                          label_malay label_english      type
  C001                                                                                                                                                                                                                                                                                                          kegelisaha

In [14]:
if not HAVE_RDFLIB:
    logger.warning("rdflib not installed, skipping OWL generation")
else:
    # Rebuild the graph from the files written earlier rather than from the
    # in-memory lists.
    df_nodes2 = pd.read_csv(nodes_csv, encoding='utf-8-sig')
    df_edges2 = pd.read_csv(edges_csv, encoding='utf-8-sig')
    with open(defs_path, 'r', encoding='utf-8') as defs_file:
        cond_defs = json.load(defs_file)

    BASE = Namespace('http://example.org/mentalhealth#')
    g = Graph()
    for prefix_name, prefix_ns in (('mh', BASE), ('rdfs', RDFS), ('owl', OWL)):
        g.bind(prefix_name, prefix_ns)

    # One OWL class per entity type present in the node table.
    for class_name in df_nodes2['type'].unique():
        class_node = BASE[class_name]
        g.add((class_node, RDF.type, OWL.Class))
        g.add((class_node, RDFS.label, Literal(class_name, lang='en')))

    # Object properties with their domain and range classes.
    relation_defs = {
        'has_symptom': ('Condition', 'Symptom'),
        'triggered_by': ('Condition', 'Trigger'),
        'treated_by': ('Condition', 'Treatment'),
        'related_to': ('Condition', 'Condition'),
    }
    for relation_name, endpoints in relation_defs.items():
        domain_class, range_class = endpoints
        relation_node = BASE[relation_name]
        property_facts = (
            (RDF.type, OWL.ObjectProperty),
            (RDFS.domain, BASE[domain_class]),
            (RDFS.range, BASE[range_class]),
            (RDFS.label, Literal(relation_name, lang='en')),
        )
        for predicate, value in property_facts:
            g.add((relation_node, predicate, value))

    # Individuals: type, Malay/English labels and, for conditions with a
    # stored definition, an rdfs:comment.
    for _, node_row in df_nodes2.iterrows():
        individual = BASE[node_row['id']]
        node_type = node_row['type']
        malay_label = node_row['label_malay']
        english_label = node_row.get('label_english', '')

        g.add((individual, RDF.type, BASE[node_type]))
        # Only non-empty strings become labels (blank CSV cells are read back
        # as non-string missing values).
        if isinstance(malay_label, str) and malay_label:
            g.add((individual, RDFS.label, Literal(malay_label, lang='ms')))
        if isinstance(english_label, str) and english_label:
            g.add((individual, RDFS.label, Literal(english_label, lang='en')))

        stored_definition = cond_defs.get(malay_label) if node_type == 'Condition' else None
        if stored_definition:
            g.add((individual, RDFS.comment, Literal(stored_definition, lang='ms')))

    # One triple per edge row.
    edge_columns = zip(df_edges2['source_id'], df_edges2['relation'], df_edges2['target_id'])
    for source_id, relation_name, target_id in edge_columns:
        g.add((BASE[source_id], BASE[relation_name], BASE[target_id]))

    # Serialize to Turtle
    owl_path = os.path.join(OUTPUT_DIR, 'mental_health_ontology.ttl')
    g.serialize(destination=owl_path, format='turtle')
    logger.info(f"OWL ontology saved to: {owl_path}")


[19:02:59] INFO: OWL ontology saved to: kr_output\mental_health_ontology.ttl


In [15]:
def build_graph_dict():
    """Load the exported CSVs into a nested label-based adjacency dict.

    Reads the files at ``nodes_csv`` and ``edges_csv`` and returns
    ``{source_label: {relation: [target_label, ...]}}`` using each node's
    ``label_malay``. Edges whose source or target id has no label (or an
    empty one) are skipped.
    """
    node_table = pd.read_csv(nodes_csv, encoding='utf-8-sig')
    edge_table = pd.read_csv(edges_csv, encoding='utf-8-sig')
    label_of = dict(zip(node_table['id'], node_table['label_malay']))

    adjacency = {}
    edge_columns = zip(edge_table['source_id'], edge_table['relation'], edge_table['target_id'])
    for source_id, relation_name, target_id in edge_columns:
        source_label = label_of.get(source_id, '')
        target_label = label_of.get(target_id, '')
        if source_label and target_label:
            by_relation = adjacency.setdefault(source_label, {})
            by_relation.setdefault(relation_name, []).append(target_label)
    return adjacency

# Build the label-based adjacency view from the CSVs written earlier.
graph = build_graph_dict()
# Reload the saved definitions from disk; this rebinds the module-level
# `condition_definitions` name to the dict parsed from `defs_path`.
with open(defs_path, 'r', encoding='utf-8') as f:
    condition_definitions = json.load(f)