# Post-processing: character alias normalization

In [1]:
# Hand-picked aliases used to compare the two normalisers side by side.
# Order and duplicates are kept exactly as given.
sample_aliases = [
    'puterinya', 'Puteri Ayu', 'permaisurilah',
    'ayahnya', 'Pak Gendut', 'Istriku',
    'Doyan Nada', 'Allah', 'karyawannya',
    'Nabilah', 'petani', 'Petani',
    'ikan', 'Pak', 'gadis',
    'Puteri', 'desa', 'penduduk desa',
    'bidadari', 'suami', 'Putera',
    'ayahnya', 'istri petani', 'suaminya',
    'istrinya', 'Kanda', 'ayah',
]


# Lower-cased words that must never be trimmed even though some of them end
# in a suffix-like sequence (e.g. 'nabilah' ends in 'lah').
EXCEPTIONS = {'allah', 'nabilah', 'kahlil', 'permaisuri', 'istri', 'puteri', 'karyawan'}

def normalize_alias_custom(name):
    """Lower-case an alias and trim suffix-like endings from its last word.

    Rules:
      * The name is lower-cased and surrounding whitespace is removed.
      * If the whole name, or the word currently being trimmed, is in
        ``EXCEPTIONS``, trimming stops and the name is returned as it stands.
      * Possessive endings ('nya', 'ku', 'mu') are removed only from
        single-word names, so multi-word names such as "kepala suku" are not
        cut to "kepala su".
      * The endings 'lah', 'kah', 'tah', 'nda' are removed from the last word
        of any name.
      * An ending is removed only when more than two characters of that word
        would remain.

    Args:
        name: The raw alias string.

    Returns:
        The normalised alias. Whitespace between words is left unchanged.
    """
    name = name.lower().strip()
    if not name or name in EXCEPTIONS:
        return name

    tokens = name.split()
    word = tokens[-1]
    # Everything before the last word, kept verbatim (including its spacing).
    prefix = name[:len(name) - len(word)]

    suffix_groups = []
    if len(tokens) == 1:
        suffix_groups.append(['nya', 'ku', 'mu'])      # possessive
    suffix_groups.append(['lah', 'kah', 'tah', 'nda'])  # emphatic particles

    for group in suffix_groups:
        for suffix in group:
            # Re-check after every trim: "nabilahnya" -> "nabilah" must not
            # go on to lose its "lah".
            if word in EXCEPTIONS:
                return prefix + word
            # Measure the guard on the word itself, not on the whole name.
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                word = word[:-len(suffix)]

    return prefix + word


In [2]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build a single Sastrawi stemmer once; the function below reuses it.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def normalize_alias_stem(name):
    """Lower-case and trim ``name``, then return ``stemmer.stem`` of the result."""
    cleaned = name.lower().strip()
    return stemmer.stem(cleaned)


In [3]:
# Print every sample alias next to the result of the rule-based normaliser
# and the result of the stemmer-based one.
for alias in sample_aliases:
    custom, stemmed = normalize_alias_custom(alias), normalize_alias_stem(alias)
    print(f"{alias:<25} | Custom → {custom:<15} | Stemmer → {stemmed}")


puterinya                 | Custom → puteri          | Stemmer → puter
Puteri Ayu                | Custom → puteri ayu      | Stemmer → puter ayu
permaisurilah             | Custom → permaisuri      | Stemmer → permaisuri
ayahnya                   | Custom → ayah            | Stemmer → ayah
Pak Gendut                | Custom → pak gendut      | Stemmer → pak gendut
Istriku                   | Custom → istri           | Stemmer → istri
Doyan Nada                | Custom → doyan nada      | Stemmer → doyan nada
Allah                     | Custom → allah           | Stemmer → allah
karyawannya               | Custom → karyawan        | Stemmer → karyawannya
Nabilah                   | Custom → nabilah         | Stemmer → nabi
petani                    | Custom → petani          | Stemmer → tani
Petani                    | Custom → petani          | Stemmer → tani
ikan                      | Custom → ikan            | Stemmer → ikan
Pak                       | Custom → pak             | St

In [7]:
import pandas as pd
import ast

# Read the character annotations from disk
df = pd.read_csv("pseudo_characters_final.csv")

# 'characters' holds a stringified list; parse each entry back into a Python list
df['characters'] = df['characters'].map(ast.literal_eval)

# Stories to keep for this run
target_story_ids = [1, 7, 10, 111, 87, 109, 16, 23, 35, 25, 26]

# Keep only rows of the selected stories, as an independent copy
df = df.loc[df['story_id'].isin(target_story_ids)].copy()

# Lower-cased words that must never be trimmed even though some of them end
# in a suffix-like sequence (e.g. 'nabilah' ends in 'lah').
EXCEPTIONS = {'allah', 'nabilah', 'kahlil', 'permaisuri', 'istri', 'puteri', 'karyawan'}

def normalize_alias_custom(name):
    """Lower-case an alias and trim suffix-like endings from its last word.

    Rules:
      * The name is lower-cased and surrounding whitespace is removed.
      * If the whole name, or the word currently being trimmed, is in
        ``EXCEPTIONS``, trimming stops and the name is returned as it stands.
      * Possessive endings ('nya', 'ku', 'mu') are removed only from
        single-word names, so multi-word names such as "kepala suku" keep
        their last word intact.
      * The endings 'lah', 'kah', 'tah', 'nda' are removed from the last word
        of any name.
      * An ending is removed only when more than two characters of that word
        would remain.

    Args:
        name: The raw alias string.

    Returns:
        The normalised alias. Whitespace between words is left unchanged.
    """
    name = name.lower().strip()
    if not name or name in EXCEPTIONS:
        return name

    tokens = name.split()
    word = tokens[-1]
    # Everything before the last word, kept verbatim (including its spacing).
    prefix = name[:len(name) - len(word)]

    suffix_groups = []
    if len(tokens) == 1:
        suffix_groups.append(['nya', 'ku', 'mu'])      # possessive
    suffix_groups.append(['lah', 'kah', 'tah', 'nda'])  # emphatic particles

    for group in suffix_groups:
        for suffix in group:
            # Re-check after every trim: "nabilahnya" -> "nabilah" must not
            # go on to lose its "lah".
            if word in EXCEPTIONS:
                return prefix + word
            # Measure the guard on the word itself, not on the whole name,
            # so "pak ulah" is not cut down to "pak u".
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                word = word[:-len(suffix)]

    return prefix + word

# Normalise every alias inside each row's character list
df['normalized_characters'] = df['characters'].map(
    lambda aliases: list(map(normalize_alias_custom, aliases))
)

# Restrict the frame to the columns of interest and print it
df = df.loc[:, ['story_id', 'sentence_id', 'characters', 'normalized_characters']]
print(df)


      story_id  sentence_id               characters    normalized_characters
0            1            0                 [petani]                 [petani]
1            1            3                 [petani]                 [petani]
2            1           12                 [Petani]                 [petani]
3            1           13                   [ikan]                   [ikan]
4            1           15                 [petani]                 [petani]
...        ...          ...                      ...                      ...
5582       111          188          [tikus, anjing]          [tikus, anjing]
5583       111          189              [anak raja]              [anak raja]
5584       111          192  [kucing, anjing, tikus]  [kucing, anjing, tikus]
5585       111          193  [kucing, anjing, tikus]  [kucing, anjing, tikus]
8813        10           66              [mataniari]              [mataniari]

[1190 rows x 4 columns]


In [8]:
# Write the current frame to CSV without the index column
df.to_csv(path_or_buf="normalized_characters_filtered.csv", index=False)


## Normalize characters for all story IDs

In [None]:
import pandas as pd
import ast

# Read the character annotations from disk
df = pd.read_csv("pseudo_characters_final.csv")

# 'characters' holds a stringified list; parse each entry back into a Python list
df['characters'] = df['characters'].map(ast.literal_eval)

# Lower-cased words that must never be trimmed even though some of them end
# in a suffix-like sequence (e.g. 'nabilah' ends in 'lah').
EXCEPTIONS = {'allah', 'nabilah', 'kahlil', 'permaisuri', 'istri', 'puteri', 'karyawan'}

def normalize_alias_custom(name):
    """Lower-case an alias and trim suffix-like endings from its last word.

    Rules:
      * The name is lower-cased and surrounding whitespace is removed.
      * If the whole name, or the word currently being trimmed, is in
        ``EXCEPTIONS``, trimming stops and the name is returned as it stands.
      * Possessive endings ('nya', 'ku', 'mu') are removed only from
        single-word names; multi-word names keep their last word's
        possessive-looking ending.
      * The endings 'lah', 'kah', 'tah', 'nda' are removed from the last word
        of any name.
      * An ending is removed only when more than two characters of that word
        would remain.

    Args:
        name: The raw alias string.

    Returns:
        The normalised alias. Whitespace between words is left unchanged.
    """
    name = name.lower().strip()
    if not name or name in EXCEPTIONS:
        return name

    tokens = name.split()
    word = tokens[-1]
    # Everything before the last word, kept verbatim (including its spacing).
    prefix = name[:len(name) - len(word)]

    suffix_groups = []
    if len(tokens) == 1:
        suffix_groups.append(['nya', 'ku', 'mu'])      # possessive
    suffix_groups.append(['lah', 'kah', 'tah', 'nda'])  # emphatic particles

    for group in suffix_groups:
        for suffix in group:
            # Re-check after every trim: "nabilahnya" -> "nabilah" must not
            # go on to lose its "lah".
            if word in EXCEPTIONS:
                return prefix + word
            # Measure the guard on the word itself, not on the whole name,
            # so "pak ulah" is not cut down to "pak u".
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                word = word[:-len(suffix)]

    return prefix + word

# Normalise every alias inside each row's character list
df['normalized_characters'] = df['characters'].map(
    lambda aliases: list(map(normalize_alias_custom, aliases))
)

# Restrict the frame to the columns of interest
df = df.loc[:, ['story_id', 'sentence_id', 'characters', 'normalized_characters']]

# Write the result to CSV without the index column
df.to_csv(path_or_buf="normalized_characters_all.csv", index=False)

print("All stories processed and saved as normalized_characters_all.csv")


All stories processed and saved as normalized_characters_all.csv
