# Alias clustering: group the name variants of each character, per story

In [None]:
import pandas as pd
import textdistance

# Load the normalized data
# NOTE(review): assumes the CSV has a `normalized_characters` column whose cells
# hold stringified Python lists (they are concatenated as lists per story
# later in this file) — confirm against the producer of the CSV.
df = pd.read_csv("normalized_characters_all.csv")
# SECURITY: eval() executes whatever Python expression is stored in each cell.
# If this CSV can ever come from an untrusted source, parse the cells with
# ast.literal_eval instead.
df['normalized_characters'] = df['normalized_characters'].apply(eval)

# Pointer words: a name that starts with one of these prefixes is routed to the
# pointer-aware clustering instead of the plain similarity clustering.
# NOTE(review): these look like Indonesian/Malay titles and kinship terms —
# confirm with the data owner before extending the list.
POINTERS = [
    'ibu',
    'pak',
    'puteri',
    'permaisuri',
    'raja',
    'putera',
    'ayah',
    'istri',
    'suami',
    'uwak',
    'menteri',
    'bunda',
    'anak',
    'kakak',
    'adik',
    'kakek',
    'orang tua',
    'tetangga',
    'putri',
    'beru tandang',
    'putroe',
    'telangkai',
    'tuhan',
    'abang',
]

# Anti-merge exclusion pairs: when one name contains the first word of a pair
# and the other name contains the second, the two names are never merged
# (the words are semantically contradictory).
EXCLUDE_PAIRS = [('bungsu', 'sulung'), ('muda', 'tua'), ('mahkota', 'biasa')]

# Compute similarity scores
def compute_similarity(s1, s2):
    """Score two strings with the textdistance Jaccard and Jaro measures.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A dict with the keys ``'jaccard'`` and ``'jaro'`` holding the value
        returned by ``textdistance.jaccard`` and ``textdistance.jaro``.
        Callers in this file compare these values with ``>= threshold``,
        i.e. they treat a higher score as "more similar".
    """
    jaccard_score = textdistance.jaccard(s1, s2)
    jaro_score = textdistance.jaro(s1, s2)
    return {'jaccard': jaccard_score, 'jaro': jaro_score}

# Normalize alias
def normalize(text):
    """Return *text* lower-cased with leading/trailing whitespace removed."""
    lowered = text.lower()
    return lowered.strip()

# Check if an exclusion pair exists
def is_excluded(name1, name2):
    """Tell whether two names carry contradictory words and must not merge.

    Both names are normalized first. The pair is excluded when, for any entry
    ``(a, b)`` of ``EXCLUDE_PAIRS``, one name contains ``a`` and the other
    contains ``b`` (substring test, in either direction).

    Returns:
        ``True`` if the names hit an exclusion pair, ``False`` otherwise.
    """
    first = normalize(name1)
    second = normalize(name2)
    return any(
        (a in first and b in second) or (b in first and a in second)
        for a, b in EXCLUDE_PAIRS
    )

# Cluster characters without pointers

# Centralized false-merge list: pairs that look alike as strings but must
# never be merged on similarity alone.
_FALSE_MERGE_BLACKLIST = frozenset({
    ('raja', 'rajawali'),
    ('putri', 'putri malu'),
    ('raja', 'rajagaluh'),
})


def _is_alias_match(character, name, threshold):
    """Decide whether two already-normalized names belong to one cluster.

    The rules are evaluated in order and the first applicable one decides.
    """
    shares_whole_words = (
        f" {character} " in f" {name} " or f" {name} " in f" {character} "
    )

    # Names shorter than 4 characters are too noisy for fuzzy matching: two
    # different names only match when one occurs as whole word(s) in the other.
    if (len(character) < 4 or len(name) < 4) and character != name:
        return shares_whole_words

    # Both names end in "nya": compare only what precedes that suffix.
    if character.endswith("nya") and name.endswith("nya"):
        return compute_similarity(character[:-3], name[:-3])['jaro'] >= threshold

    if shares_whole_words:
        return True

    if (character, name) in _FALSE_MERGE_BLACKLIST or (name, character) in _FALSE_MERGE_BLACKLIST:
        return False

    if compute_similarity(character, name)['jaro'] >= threshold:
        # A raw substring hit that was not a whole-word hit (one name embedded
        # inside a longer word of the other) is not accepted as an alias.
        return not (character in name or name in character)
    return False


def cluster_without_pointers(characters_list, aliases_clusters, threshold):
    """Greedily group names that do not start with a pointer word.

    Each name is normalized and compared with the members of the existing
    clusters, in cluster insertion order. It joins the first cluster that has
    at least one matching member; otherwise it opens a new cluster keyed
    ``"person-<n>"``.

    A dissimilar pair of "-nya" names now only rejects that pair. Previously
    it abandoned the whole cluster, so the result depended on the iteration
    order of the cluster set, which is not stable between interpreter runs.

    Args:
        characters_list: Iterable of name strings to cluster.
        aliases_clusters: Dict mapping cluster key to a ``set`` of names.
            It is updated in place and also returned.
        threshold: Minimum Jaro score for two names to count as similar.

    Returns:
        The same ``aliases_clusters`` dict, extended with the given names.
    """
    cluster_id = len(aliases_clusters) + 1

    for raw_character in characters_list:
        character = normalize(raw_character)
        for cluster in aliases_clusters.values():
            # any() finishes scanning before the set is modified, so the
            # cluster is never mutated while it is being iterated.
            if any(_is_alias_match(character, normalize(name), threshold) for name in cluster):
                cluster.add(character)
                break
        else:
            # No existing cluster accepted the name: start a new one.
            aliases_clusters[f"person-{cluster_id}"] = {character}
            cluster_id += 1
    return aliases_clusters

# Cluster characters with pointers
def _is_pointer_alias_match(character, name, pointer, threshold):
    """Decide whether two normalized names sharing *pointer* are one character."""
    # Contradictory qualifiers (see EXCLUDE_PAIRS) always block a merge.
    if is_excluded(character, name):
        return False

    # "<stem>nya" matches any name that contains the stem.
    if character.endswith("nya") and character[:-3] in name:
        return True

    # Compare only what follows the pointer word.
    suffix_char = character[len(pointer):].strip()
    suffix_name = name[len(pointer):].strip()

    # Short suffixes must be identical; fuzzy matching them causes false merges.
    if (len(suffix_char) < 4 or len(suffix_name) < 4) and suffix_char != suffix_name:
        return False

    return (
        suffix_char in suffix_name
        or compute_similarity(suffix_char, suffix_name)['jaccard'] >= threshold
    )


def cluster_with_pointers(characters_with_pointer, pointers, threshold):
    """Group names that start with a pointer word, one pointer at a time.

    For every pointer, the names starting with it are normalized and handled
    as follows:

    * Multi-token names are greedily assigned to the first existing cluster
      that has a matching member, or open a new cluster. The search now stops
      at the first matching cluster. Previously the cluster loop kept running,
      so one name could be appended to several clusters.
    * Single-token names (the bare pointer, or one word starting with it) are
      attached to the pointer's cluster when exactly one exists, form their
      own cluster when none exists, and are left out when several exist.

    Args:
        characters_with_pointer: Iterable of name strings.
        pointers: Iterable of pointer words.
        threshold: Minimum Jaccard score between the post-pointer suffixes.

    Returns:
        A list of clusters, each a list of normalized names, in pointer order.
    """
    all_pointer_item_clusters = []
    for p in pointers:
        prefix = p.lower()
        pointer_cluster = {}
        pointer_cluster_id = 1
        one_token_list = []

        for raw_character in characters_with_pointer:
            character = normalize(raw_character)
            if not character.startswith(prefix):
                continue
            if len(character.split()) == 1:
                one_token_list.append(character)
                continue

            for cluster in pointer_cluster.values():
                # Cluster members are stored normalized, so they can be
                # compared directly. any() completes before the append.
                if any(_is_pointer_alias_match(character, name, p, threshold) for name in cluster):
                    cluster.append(character)
                    break
            else:
                pointer_cluster[pointer_cluster_id] = [character]
                pointer_cluster_id += 1

        if one_token_list:
            if len(pointer_cluster) == 1:
                # The only cluster always has id 1 (ids start at 1).
                pointer_cluster[1].extend(one_token_list)
            elif not pointer_cluster:
                pointer_cluster[pointer_cluster_id] = one_token_list
        all_pointer_item_clusters.extend(pointer_cluster.values())
    return all_pointer_item_clusters

# Merge the two types of clusters
def merge_clusters(aliases_clusters, pointer_clusters):
    """Combine pointer clusters and plain alias clusters under ``Tokoh-<n>`` keys.

    Pointer clusters are numbered first, followed by the alias clusters in the
    dict's iteration order. Numbering starts at 1.

    Args:
        aliases_clusters: Dict mapping a cluster key to an iterable of names;
            only the values are used.
        pointer_clusters: Iterable of clusters, each an iterable of names.

    Returns:
        A dict mapping ``"Tokoh-<n>"`` to the de-duplicated names of that
        cluster as a sorted list. Sorting keeps the output identical between
        runs; a bare ``list(set(...))`` follows string hash order, which
        changes from one interpreter run to the next.
    """
    all_clusters = [*pointer_clusters, *aliases_clusters.values()]
    return {
        f"Tokoh-{counter}": sorted(set(members))
        for counter, members in enumerate(all_clusters, start=1)
    }

# Main clustering per story
def cluster_character_aliases(characters_list, pointers, *, alias_threshold=0.82, pointer_threshold=0.75):
    """Cluster the character names of one story into alias groups.

    Names whose normalized form starts with any pointer word go through
    ``cluster_with_pointers``; all other names go through
    ``cluster_without_pointers``. The two results are combined by
    ``merge_clusters``.

    Args:
        characters_list: Iterable of name strings. Clustering is greedy, so
            the order of this iterable influences the result.
        pointers: Collection of pointer words.
        alias_threshold: Similarity threshold passed to
            ``cluster_without_pointers``. Defaults to 0.82.
        pointer_threshold: Similarity threshold passed to
            ``cluster_with_pointers``. Defaults to 0.75.

    Returns:
        The dict produced by ``merge_clusters``.
    """
    # str.startswith accepts a tuple, so each name is tested in one call.
    prefixes = tuple(pointer.lower() for pointer in pointers)

    # Split in a single pass; this replaces a per-name `not in list` scan.
    characters_with_pointer = []
    characters_without_pointer = []
    for character in characters_list:
        if normalize(character).startswith(prefixes):
            characters_with_pointer.append(character)
        else:
            characters_without_pointer.append(character)

    aliases_clusters = cluster_without_pointers(characters_without_pointer, {}, alias_threshold)
    pointer_clusters = cluster_with_pointers(characters_with_pointer, pointers, pointer_threshold)
    return merge_clusters(aliases_clusters, pointer_clusters)

# Run on all stories
all_results = []
# Flatten the per-row name lists into one list per story. A comprehension
# avoids the repeated list copying of sum(lists, []).
grouped = df.groupby("story_id")['normalized_characters'].apply(
    lambda lists: [name for names in lists for name in names]
)

for story_id, characters in grouped.items():
    # Longest names first, as before. Equal-length names are now ordered
    # alphabetically: the clustering is greedy and order-sensitive, and the
    # iteration order of a set of strings differs between interpreter runs,
    # so without a tie-break the output was not reproducible.
    sorted_characters = sorted(set(characters), key=lambda name: (-len(name), name))
    result = cluster_character_aliases(sorted_characters, POINTERS)
    for person, aliases in result.items():
        all_results.append({
            'story_id': story_id,
            'person': person,
            'aliases': aliases
        })

# Save results
df_result = pd.DataFrame(all_results)
df_result.to_csv("alias_clusters_all.csv", index=False)
print("✅ Alias clustering complete with exclusion logic! Saved as alias_clusters_all.csv")


✅ Alias clustering complete with exclusion logic! Saved as alias_clusters_all.csv


: 