# feature engineering (all data)

In [1]:
import pandas as pd
from ast import literal_eval

# Load the raw NER output and the normalized-character table.
ner_df = pd.read_csv("ner_sentence_characters.csv")
norm_df = pd.read_csv("normalized_characters.csv")

# The "characters" column is stored as a stringified list; parse it back into real lists.
ner_df['characters'] = ner_df['characters'].apply(literal_eval)
norm_df['characters'] = norm_df['characters'].apply(literal_eval)

# Unique, lower-cased mentions found in each file.
all_ner_chars = {char.lower() for sublist in ner_df['characters'] for char in sublist}
all_norm_chars = {char.lower() for sublist in norm_df['characters'] for char in sublist}

# NER mentions that are absent from the normalized file.
missing = all_ner_chars - all_norm_chars

# "Covered" must be the overlap of the two sets, not the size of the normalized
# set: the latter can exceed (or fall short of) the NER total and would hide gaps.
covered = all_ner_chars & all_norm_chars

# Report the result.
print(f"🔍 Total unique NER characters: {len(all_ner_chars)}")
print(f"✅ Covered by normalized list: {len(covered)}")
print(f"❌ Missing from normalized list: {len(missing)}")
# Sort before sampling so the examples are the same on every run
# (set iteration order is not stable across interpreter sessions).
print("Missing examples:", sorted(missing)[:10])


🔍 Total unique NER characters: 1692
✅ Covered by normalized list: 1692
❌ Missing from normalized list: 0
Missing examples: []


In [2]:
import pandas as pd
from ast import literal_eval

# Read the normalized-character table and the alias clusters.
norm_df = pd.read_csv("normalized_characters.csv")
alias_df = pd.read_csv("alias_clusters.csv")

# Both list columns are stored as strings in the CSV; turn them back into lists.
norm_df['normalized_characters'] = norm_df['normalized_characters'].apply(literal_eval)
alias_df['aliases'] = alias_df['aliases'].apply(literal_eval)

# Lower-cased vocabulary of every normalized character name.
all_norm_chars = set()
for names in norm_df['normalized_characters']:
    all_norm_chars.update(name.lower() for name in names)

# Lower-cased vocabulary of every alias already recorded in a cluster.
all_aliases = set()
for names in alias_df['aliases']:
    all_aliases.update(name.lower() for name in names)

# Normalized names that no alias cluster mentions yet.
missing = all_norm_chars.difference(all_aliases)

# Report the result.
print(f"🔍 Total unique normalized characters: {len(all_norm_chars)}")
print(f"✅ Covered by alias list: {len(all_norm_chars.intersection(all_aliases))}")
print(f"❌ Missing from alias list: {len(missing)}")
print("Missing examples:", list(missing)[:10])


🔍 Total unique normalized characters: 1617
✅ Covered by alias list: 1615
❌ Missing from alias list: 2
Missing examples: ['putroe', 'telangkai']


## merge alias clusters with NER normalized characters

In [4]:
import pandas as pd
import ast

# === 1. Load the file ===
df = pd.read_csv("normalized_characters.csv")

# Both list columns arrive as stringified lists; parse them into real Python lists.
for list_col in ("characters", "normalized_characters"):
    df[list_col] = df[list_col].apply(ast.literal_eval)

# === 2. Explode so one mention per row ===
# The inner zip pairs each original mention with its normalized form by position
# (zip stops at the shorter list if the two lengths ever differ).
rows = [
    {
        "story_id": story,
        "sentence_id": sentence,
        "characters": [raw],                 # keep list format with a single item
        "normalized_characters": [canon],
    }
    for story, sentence, raw_list, canon_list in zip(
        df["story_id"], df["sentence_id"], df["characters"], df["normalized_characters"]
    )
    for raw, canon in zip(raw_list, canon_list)
]

out = pd.DataFrame(rows)

# === 3. Save the expanded file ===
out.to_csv("normalized_characters_expanded.csv", index=False)
print("✅ Saved as 'normalized_characters_expanded.csv'")


✅ Saved as 'normalized_characters_expanded.csv'


In [5]:
import pandas as pd
import ast

# === 1. Load data ===
exp = pd.read_csv("normalized_characters_expanded.csv")   # one mention per row
ali = pd.read_csv("alias_clusters.csv")

# The list columns are stored as strings; parse them back into lists.
exp["normalized_characters"] = exp["normalized_characters"].apply(ast.literal_eval)
ali["aliases"]               = ali["aliases"].apply(ast.literal_eval)

# Each list holds a single name; unwrap it and lower-case it for matching.
exp["norm_tok"] = exp["normalized_characters"].str[0].str.lower()

# === 2. Build the sentence-ID list for each alias cluster ===
rows = []
for story, who, alias_list in zip(ali["story_id"], ali["person"], ali["aliases"]):
    wanted = [name.lower() for name in alias_list]       # match on normalized tokens

    # Mentions in the same story whose normalized token is one of the aliases.
    in_story = exp["story_id"] == story
    is_alias = exp["norm_tok"].isin(wanted)
    sent_ids = sorted(exp.loc[in_story & is_alias, "sentence_id"].unique().tolist())

    rows.append({
        "story_id": story,
        "person": who,
        "aliases": alias_list,            # keep list form
        # comma + space separated string, e.g. "3, 5, 7"; empty when nothing matched
        "sentence_id": ", ".join(str(s) for s in sent_ids),
    })

out = pd.DataFrame(rows)

# === 3. Save result ===
out.to_csv("alias_sentence_map.csv", index=False)
print("✅ Saved as 'alias_sentence_map.csv'")


✅ Saved as 'alias_sentence_map.csv'


In [6]:
import pandas as pd
import ast

# === 1. Load data ===
exp = pd.read_csv("normalized_characters_expanded.csv")
ali = pd.read_csv("alias_clusters.csv")

# Parse the stringified list columns back into Python lists.
for list_col in ("normalized_characters", "characters"):
    exp[list_col] = exp[list_col].apply(ast.literal_eval)
ali["aliases"] = ali["aliases"].apply(ast.literal_eval)

# Unwrap the single-item lists: a lower-cased key for matching and the
# untouched original spelling for reporting.
exp["norm_tok"]     = exp["normalized_characters"].str[0].str.lower()
exp["orig_mention"] = exp["characters"].str[0]

# === 2. Build the mapping ===
rows = []
for story, who, alias_list in zip(ali["story_id"], ali["person"], ali["aliases"]):
    wanted = [name.lower() for name in alias_list]              # case-insensitive

    # Mentions of this cluster inside its own story.
    hits = (exp["story_id"] == story) & exp["norm_tok"].isin(wanted)
    sub = exp[hits]

    sent_ids = sorted(sub["sentence_id"].unique().tolist())

    # One entry per distinct (sentence, spelling) pair, ordered by sentence.
    pairs = sub[["sentence_id", "orig_mention"]].drop_duplicates()
    orig_mentions = pairs.sort_values("sentence_id")["orig_mention"].tolist()

    rows.append({
        "story_id"        : story,
        "person"          : who,
        "aliases"         : alias_list,
        "sentence_id"     : ", ".join(str(s) for s in sent_ids),
        "original_mention": ", ".join(orig_mentions),
    })

out = pd.DataFrame(rows)

# === 3. Save result ===
out.to_csv("alias_sentence_original.csv", index=False)
print("✅ Saved as 'alias_sentence_original.csv'")


✅ Saved as 'alias_sentence_original.csv'


In [7]:
import pandas as pd
import ast

# === 1. Load the file made earlier ===
df = pd.read_csv("alias_sentence_original.csv")


def _parse_alias_cell(cell):
    """Turn a stringified list into a real list; any non-string cell becomes []."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return []


# Parse the list-in-string from the aliases column into real Python lists.
df["aliases"] = df["aliases"].apply(_parse_alias_cell)

# Helper to preserve order while deduplicating
def uniq(seq):
    """Return the items of `seq` in first-seen order with duplicates removed.

    Items must be hashable.
    """
    # dict keys are unique and remember insertion order
    return list(dict.fromkeys(seq))

# === 2. Merge the original mentions into aliases (normalised to lowercase) ===
def add_mentions(row):
    """Merge a row's original mentions into its alias list.

    Parameters
    ----------
    row : mapping with keys "aliases" (list of str) and "original_mention"
        (comma-separated string, or a missing value such as NaN/None).

    Returns
    -------
    list of str
        Lower-cased aliases followed by lower-cased mentions, de-duplicated
        while keeping first-seen order.
    """
    aliases = [a.lower() for a in row["aliases"]]              # baseline list

    raw = row["original_mention"]
    if pd.isna(raw):
        # An empty CSV field is read back as NaN; str(NaN) would be "nan" and
        # end up as a bogus alias, so treat a missing value as "no mentions".
        mentions = []
    else:
        # split "Tuhanku, Tuhan, Tuhan" → ["tuhanku", "tuhan", "tuhan"]
        mentions = [m.strip().lower() for m in str(raw).split(",") if m.strip()]

    return uniq(aliases + mentions)                            # keep order / dedup

# Replace each alias list with its augmented version (aliases + original mentions).
augmented_aliases = df.apply(add_mentions, axis=1)

# === 3. Drop the original_mention column, which is no longer needed ===
df = df.assign(aliases=augmented_aliases).drop(columns=["original_mention"])

# === 4. Save the result ===
df.to_csv("alias_with_augmented_aliases.csv", index=False)
print("✅ Saved as 'alias_with_augmented_aliases.csv'")


✅ Saved as 'alias_with_augmented_aliases.csv'


## merge alias + sentences with the text

In [1]:
import pandas as pd
import ast

# ── 1. LOAD CLEANED-UP ALIAS TABLE ───────────────────────────
# Columns: story_id | person | aliases | sentence_id
alias_df = pd.read_csv("alias_with_augmented_aliases.csv")
# "aliases" is a stringified list in the CSV; parse it back into a Python list.
alias_df["aliases"] = alias_df["aliases"].map(ast.literal_eval)

def split_ids(val):
    """Parse a cell of sentence ids into a list of ints.

    Accepts a comma-separated string such as "3, 5, 7", a single number, or a
    missing value. Pieces that are not non-negative whole numbers are skipped.

    A column that holds only single ids plus blanks is inferred as float by
    `read_csv`, so an id can arrive as 5.0 (or the text "5.0"); such values are
    accepted as 5 instead of being silently dropped.
    """
    if pd.isna(val) or str(val).strip() == "":
        return []

    ids = []
    for piece in str(val).split(","):
        piece = piece.strip()
        if piece.isdigit():
            ids.append(int(piece))
            continue
        # Fall back to a float parse so "5.0" is recognised as the id 5.
        try:
            number = float(piece)
        except ValueError:
            continue
        if number.is_integer() and number >= 0:
            ids.append(int(number))
    return ids

# Parse the comma-separated sentence ids of every cluster into a list of ints.
alias_df["sentence_ids"] = alias_df["sentence_id"].apply(split_ids)

# ── 2. LOAD WORD-LEVEL TOKEN TABLE & BUILD FULL SENTENCES ────
# Columns: story_id | judul | sentence_id | word
tok_df = pd.read_csv("tokenized_sentences.csv")

# Drop rows without a word, then force the remaining words to str.
tok_df = tok_df.dropna(subset=["word"]).copy()
tok_df["word"] = tok_df["word"].astype(str)

# Re-assemble each sentence by joining its words with single spaces.
words_by_sentence = tok_df.groupby(["story_id", "sentence_id"])["word"]
sent_df = words_by_sentence.agg(" ".join).reset_index(name="text")

# (story_id, sentence_id) → sentence text
sent_lookup = dict(
    zip(zip(sent_df["story_id"], sent_df["sentence_id"]), sent_df["text"])
)

# ── 3. EXPAND alias_df (one row per sentence) ────────────────
# A sentence that is absent from the token table gets an empty text.
rows = [
    {
        "story_id"   : r.story_id,
        "person"     : r.person,
        "aliases"    : r.aliases,
        "sentence_id": sid,
        "text"       : sent_lookup.get((r.story_id, sid), ""),
    }
    for r in alias_df.itertuples()
    for sid in r.sentence_ids
]

out = pd.DataFrame(rows)

# ── 4. SAVE ──────────────────────────────────────────────────
out.to_csv("alias_sentence_text.csv", index=False)
print("✅ Saved as 'alias_sentence_text.csv'")


✅ Saved as 'alias_sentence_text.csv'


## add mention count

In [1]:
import pandas as pd
import ast
import re
from collections import defaultdict, Counter

# ═══ 1. LOAD DATA ════════════════════════════════════════════════════════
alias_df = pd.read_csv("alias_sentence_text.csv")   # story_id | person | aliases | sentence_id | text
token_df = pd.read_csv("tokenized_sentences.csv")   # story_id | judul  | sentence_id | word

alias_df["aliases"] = alias_df["aliases"].apply(ast.literal_eval)
token_df  = token_df.dropna(subset=["word"]).copy()
token_df["word"] = token_df["word"].astype(str)

# ═══ 2. PREPARE TOKENS PER STORY ═════════════════════════════════════════
story_tokens = defaultdict(list)        # {story_id: [token1, token2, ...]}
for r in token_df.itertuples():
    story_tokens[r.story_id].append(r.word.lower())

# Full lower-cased story text, used for multi-word aliases.
story_strings = {sid: " ".join(tok) for sid, tok in story_tokens.items()}
# Token frequencies per story, built once so a single-word alias is a dict
# lookup instead of a scan over the whole story for every alias.
story_token_counts = {sid: Counter(tok) for sid, tok in story_tokens.items()}

# ═══ 3. COMPUTE mention_count PER CHARACTER (story-level) ════════════════
mention_dict = {}   # key = (story_id, person) -> total count
_no_tokens = Counter()

# Group so that each character (story_id, person) is processed exactly once.
for (sid, person), sub in alias_df.groupby(["story_id", "person"]):
    # Union of the aliases found on all rows of this character.
    alias_set = {a.lower() for alist in sub["aliases"] for a in alist}

    # A story that is missing from the token table simply has no mentions.
    tok_counts = story_token_counts.get(sid, _no_tokens)
    txt        = story_strings.get(sid, "")

    # NOTE(review): a single-word alias that is also part of a multi-word alias
    # of the same character (e.g. "raja" / "raja muda") is counted by both
    # branches — confirm whether that overlap is intended.
    count = 0
    for a in alias_set:
        if not a.strip():
            # A blank alias would compile to r"\b\b", which matches at every
            # word boundary and inflates the count.
            continue
        if len(a.split()) == 1:                 # single-word alias: exact token match
            count += tok_counts[a]
        else:                                   # multi-word alias: phrase match
            count += len(re.findall(r"\b" + re.escape(a) + r"\b", txt))

    mention_dict[(sid, person)] = count

# ═══ 4. ATTACH TO THE alias_sentence_text TABLE ══════════════════════════
alias_df["mention_count"] = [
    mention_dict[key] for key in zip(alias_df["story_id"], alias_df["person"])
]

# ═══ 5. (OPTIONAL) SORT & SAVE ═══════════════════════════════════════════
alias_df = alias_df.sort_values(["story_id", "sentence_id", "person"])
alias_df.to_csv("alias_sentence_features_revised.csv", index=False)

print("✅  mention_count kini dihitung per tokoh & disimpan di 'alias_sentence_features_revised.csv'")


✅  mention_count kini dihitung per tokoh & disimpan di 'alias_sentence_features_revised.csv'


## add word count

In [2]:
import pandas as pd
import ast

# ── 1. LOAD the file produced by the previous step ────────────
df = pd.read_csv("alias_sentence_features_revised.csv")   # story_id | person | aliases | sentence_id | text | mention_count
df["aliases"] = df["aliases"].apply(ast.literal_eval)

# ── 2. COMPUTE the length of every sentence (number of words) ─
# An empty text field is read back from the CSV as NaN; count it as 0 words so
# the lengths (and the totals below) stay integers instead of turning into floats.
df["sent_len"] = df["text"].fillna("").astype(str).str.split().str.len()

# ── 3. AGGREGATE word_count per CHARACTER (story-level) ──────
total_wc = (                                              # {(story_id, person): total_words}
    df.groupby(["story_id", "person"])["sent_len"]
      .sum()
      .to_dict()
)

# Plain dict lookups per row; avoids the overhead of a row-wise DataFrame.apply.
df["word_count"] = [
    total_wc[key] for key in zip(df["story_id"], df["person"])
]

# ── 4. CLEAN UP the temporary column & save ──────────────────
df = df.drop(columns=["sent_len"])
df.to_csv("alias_sentence_features_wc.csv", index=False)

print("✅  Kolom word_count (total kata per tokoh di cerita) sudah ditambahkan dan disimpan ke 'alias_sentence_features_wc.csv'")


✅  Kolom word_count (total kata per tokoh di cerita) sudah ditambahkan dan disimpan ke 'alias_sentence_features_wc.csv'


In [3]:
import pandas as pd
import re

# ── 1. LOAD file ───────────────────────────────────────────────
df = pd.read_csv("alias_sentence_features_wc.csv")

# ── 2. EXTRACT the character number (Tokoh-7 → 7) ──────────────
# The number is needed as an int so that Tokoh-10 sorts after Tokoh-2.
person_number = df["person"].str.extract(r"Tokoh-(\d+)", expand=False).astype(int)

# ── 3. SORT: story_id → character number → sentence_id ─────────
# ── 4. DROP the helper column & SAVE ───────────────────────────
df = (
    df.assign(person_num=person_number)
      .sort_values(["story_id", "person_num", "sentence_id"])
      .reset_index(drop=True)
      .drop(columns=["person_num"])
)
df.to_csv("alias_sentence_features_sorted.csv", index=False)

print("✅  File disimpan sebagai 'alias_sentence_features_sorted.csv' dengan urutan Tokoh-1, Tokoh-2, dst per cerita.")


✅  File disimpan sebagai 'alias_sentence_features_sorted.csv' dengan urutan Tokoh-1, Tokoh-2, dst per cerita.


## add BERT context, ML context, text next, text prev

In [4]:
import pandas as pd
import ast
import re

# ── 1.  LOAD both files ──────────────────────────────────────────────
# Feature table: story_id | person | ... | text
feat = pd.read_csv("alias_sentence_features_wc.csv")
feat["aliases"] = feat["aliases"].map(ast.literal_eval)

# Token table: story_id | judul | sentence_id | word
# Rows without a word are discarded and the rest are forced to str.
tok = pd.read_csv("tokenized_sentences.csv").dropna(subset=["word"]).copy()
tok["word"] = tok["word"].astype(str)

# ── 2.  REBUILD whole sentences & the lookup ─────────────────────────
words_by_sentence = tok.groupby(["story_id", "sentence_id"])["word"]
sent_df = words_by_sentence.agg(" ".join).reset_index(name="full_text")

# (story_id, sentence_id) → sentence text
lookup = dict(
    zip(zip(sent_df["story_id"], sent_df["sentence_id"]), sent_df["full_text"])
)

def _neighbour_text(row, offset):
    """Look up, in the module-level `lookup`, the sentence `offset` ids away
    from `row` within the same story; return "" when there is no such key."""
    neighbour_key = (row.story_id, row.sentence_id + offset)
    return lookup.get(neighbour_key, "")

def get_prev(row):
    """Text of the sentence whose id is one less than the row's ("" if none)."""
    return _neighbour_text(row, -1)

def get_next(row):
    """Text of the sentence whose id is one more than the row's ("" if none)."""
    return _neighbour_text(row, 1)

# ── 3.  ADD the context columns ──────────────────────────────────────
feat["text_prev"] = feat.apply(get_prev, axis=1)
feat["text_next"] = feat.apply(get_next, axis=1)

# An empty text field is read back from the CSV as NaN, and NaN in a string
# concatenation turns the whole bert_context into NaN. Use "" for the current
# sentence in that case so the neighbouring context is still kept.
current_text = feat["text"].fillna("").astype(str)

# previous [SEP] current [SEP] next, with outer whitespace trimmed
feat["bert_context"] = (
    feat["text_prev"].str.strip() + " [SEP] " +
    current_text.str.strip()      + " [SEP] " +
    feat["text_next"].str.strip()
).str.strip()

# ── 4.  SAVE ─────────────────────────────────────────────────────────
feat.to_csv("alias_sentence_features_context.csv", index=False)
print("✅  text_prev, text_next, bert_context ditambahkan (pakai tokenized_sentences.csv) ➜ alias_sentence_features_context.csv")


✅  text_prev, text_next, bert_context ditambahkan (pakai tokenized_sentences.csv) ➜ alias_sentence_features_context.csv


## add is primary in sentence

In [7]:
import pandas as pd
import ast
import re

# ── 1. LOAD the current feature file (with text, aliases, etc.) ──
df = pd.read_csv("alias_sentence_features_context.csv")   # change the name if yours differs
df["aliases"] = df["aliases"].apply(ast.literal_eval)      # real lists

# ── 2. Collect ALL aliases and build word-boundary regexes ──────
all_aliases = (
    df["aliases"]
      .explode()
      .dropna()
      .map(lambda x: x.lower().strip())
      .unique()
      .tolist()
)
# A blank alias would compile to r"\b\b", which matches at position 0 of almost
# any sentence and would always win; leave it out.
all_aliases = [a for a in all_aliases if a]
alias_patterns = {a: re.compile(r"\b" + re.escape(a) + r"\b") for a in all_aliases}

# Aliases grouped by story. Characters are defined per story, so only aliases
# of the row's own story may compete for "first mention in the sentence";
# an alias from another story is not a character here.
story_aliases = {}
for _story, _alist in zip(df["story_id"], df["aliases"]):
    story_aliases.setdefault(_story, set()).update(
        a.lower().strip() for a in _alist
    )

# ── 3. Decide whether this character owns the FIRST alias in the sentence ──
def is_primary(row):
    """Return 1 if the earliest alias mention in the row's sentence belongs to
    the row's character, else 0.

    Only aliases recorded for the row's story are considered. When several
    aliases start at the same position, the longest one wins (so "raja muda"
    beats "raja"), which makes the result independent of dict ordering.
    Returns 0 when the text is missing or no alias occurs in it.
    """
    text = row["text"]
    if pd.isna(text):
        # str(NaN) would be the literal word "nan"; a missing text has no mentions.
        return 0
    sent = str(text).lower()

    own_aliases = {a.lower().strip() for a in row["aliases"]}
    candidates = story_aliases.get(row["story_id"], own_aliases)

    best_key, first_alias = None, None
    for alias in candidates:
        pat = alias_patterns.get(alias)
        if pat is None:          # blank alias, no pattern compiled
            continue
        m = pat.search(sent)
        if m is None:
            continue
        # earliest start first; on a tie prefer the longer alias
        key = (m.start(), -len(alias))
        if best_key is None or key < best_key:
            best_key, first_alias = key, alias

    if first_alias is None:
        return 0
    return 1 if first_alias in own_aliases else 0

# Flag, per row, whether the row's character is the first alias in its sentence.
df["is_primary_in_sentence"] = df.apply(is_primary, axis=1)

# ── 4. Sort: story_id → character number → sentence_id ──────────
# The numeric part of "Tokoh-N" is used so that Tokoh-10 sorts after Tokoh-2.
person_number = df["person"].str.extract(r"Tokoh-(\d+)", expand=False).astype(int)
df = (
    df.assign(person_num=person_number)
      .sort_values(["story_id", "person_num", "sentence_id"])
      .reset_index(drop=True)
      .drop(columns=["person_num"])
)

# ── 5. Save the result ──────────────────────────────────────────
df.to_csv("alias_sentence_features_primary_sorted.csv", index=False)
print("✅  is_primary_in_sentence diperbaiki & file di-sort. Hasil: 'alias_sentence_features_primary_sorted.csv'")


✅  is_primary_in_sentence diperbaiki & file di-sort. Hasil: 'alias_sentence_features_primary_sorted.csv'
