### Preprocessing Data Teks
#### Membersihkan Teks sesuai tahapan NLP:
 - case folding
 - hapus angka & karakter non-alfabet
 - tokenisasi
 - normalisasi
 - filtering stopwords
 - stemming (pakai Sastrawi untuk Bahasa Indonesia)


In [12]:
import os, re, pandas as pd
from collections import Counter

# Input/output locations used by the preprocessing cell.
IN_PATH = "scrapped_data_honkai_star_rail.csv"
OUT_PATH = "cleaned_data_hsr_final.csv"

# Fail early with a readable message when the scraped CSV is missing.
if not os.path.exists(IN_PATH):
    raise FileNotFoundError(f"Input file tidak ditemukan: {IN_PATH}")

# 1) Load the raw reviews and report what was read.
df = pd.read_csv(IN_PATH)
print("Loaded:", IN_PATH, "shape:", df.shape)
print("Columns:", df.columns.tolist())

# 2) Choose the text column (edit the list if needed); the first name that
#    exists in the dataframe wins.
candidates = [
    "cleaned_text", "clean_final", "final_preprocessed", "clean_norm",
    "content", "review", "text", "ulasan", "comments",
]
text_col = None
for name in candidates:
    if name in df.columns:
        text_col = name
        break

if text_col is None:
    # Fallback: the first column whose non-null values are >60% strings.
    for name in df.columns:
        values = df[name].dropna()
        if values.empty:
            continue
        if values.apply(lambda v: isinstance(v, str)).mean() > 0.6:
            text_col = name
            break

if text_col is None:
    raise ValueError("Tidak dapat mendeteksi kolom teks. Sebutkan nama kolom ulasan.")
print("Menggunakan kolom:", text_col)

# 3) Short diagnostics on the chosen column: Python types, NaN and blanks.
col = df[text_col]
type_counts = col.apply(lambda value: type(value).__name__).value_counts()
print("Tipe data (top counts):")
print(type_counts.head())

print("NaN count:", col.isna().sum())
blank_mask = col.astype(str).str.strip().eq("")
print("Empty-string count:", blank_mask.sum())

# 4) Token frequencies over the raw text (optional; helps when building the
#    slang dictionary).
all_text = " ".join(col.dropna().astype(str)).lower()
tokens = re.findall(r"\b[\w']+\b", all_text)
freq = Counter(tokens)
print("Top tokens:", freq.most_common(20))

# 5) Preprocessing functions
# Emoji removal. The `emoji` package (pip install emoji) is preferred. When it
# is missing, or too old to provide `replace_emoji`, fall back to a regex that
# targets common emoji code-point ranges only. The fallback must NOT strip
# ordinary punctuation: URL removal later in the pipeline still needs "://"
# and ".", and apostrophes are meant to survive.
try:
    import emoji as _emoji

    if not hasattr(_emoji, "replace_emoji"):
        raise ImportError("installed `emoji` package has no replace_emoji()")

    def remove_emoji(s):
        """Return *s* with every emoji replaced by a single space."""
        return _emoji.replace_emoji(s, replace=" ")
except ImportError:
    # Approximate emoji coverage (less complete than the `emoji` package).
    _EMOJI_FALLBACK_RE = re.compile(
        "["
        "\U0001F000-\U0001FAFF"  # pictographs, emoticons, transport, flags
        "\u2300-\u23FF"          # miscellaneous technical (watch, hourglass)
        "\u2600-\u27BF"          # miscellaneous symbols and dingbats
        "\u2B00-\u2BFF"          # stars and assorted arrows
        "\u200D\u20E3\uFE0F"     # ZWJ, keycap mark, variation selector-16
        "]"
    )

    def remove_emoji(s):
        """Return *s* with each emoji code point replaced by a single space."""
        return _EMOJI_FALLBACK_RE.sub(" ", s)

# Small built-in slang map (extendable).
#
# Lookups happen per token AFTER the text has been lowercased and after
# reduce_repeats(max_rep=2), so a key only matches when it is lowercase and
# contains no character repeated three or more times in a row. The previously
# unreachable key "Gabe" is now lowercase, and reduced spellings ("aminn",
# "bestt") were added next to the triple-letter ones. The triple-letter keys
# are kept only for callers that use the map without reduce_repeats.
slang_map = {
    "ga": "tidak",
    "gak": "tidak",
    "gk": "tidak",
    "gg": "hebat",
    "udah": "sudah",
    "udahh": "sudah",
    "udahhh": "sudah",
    "aja": "saja",
    "gw": "saya",
    "gue": "saya",
    "aq": "saya",
    "sy": "saya",
    "tdk": "tidak",
    "yg": "yang",
    "dgn": "dengan",
    "dlm": "dalam",
    "trs": "terus",
    "tp": "tapi",
    "sm": "sama",
    "lg": "lagi",
    "dr": "dari",
    "krn": "karena",
    "pls": "tolong",
    "thx": "terima kasih",
    "makasih": "terima kasih",
    "gemnya": "game nya",
    "plis": "tolong",
    "bgt": "banget",
    "rerol": "reroll",
    "nyaa": "nya",
    "aminn": "aamiin",
    "aminnn": "aamiin",
    "bestt": "best",
    "besttt": "best",
    "mantapzz": "mantap",
    "gabe": "game",
    "gampangin": "mudahkan",
    "baiq": "baik",
    "emang": "memang",
    "gamampu": "tidak mampu",
    "lodingnya": "loading nya",
    "bagustapi": "bagus tapi",
    "tlonk": "tolong",
    "oomagad": "oh tuhan",
    "nyoba": "coba",
    "jg": "juga",
}

# Multi-word game phrases that are temporarily joined with "_" so that the
# token-level steps treat each phrase as a single token.
protected_phrases = [
    "power creep", "black screen", "turn based", "turn-based", "turnbased",
    "early game", "late game", "gacha rate", "rate up", "event boss", "update patch",
]
# phrase -> underscore-joined form. Phrases without a space map to themselves.
protected_map = {p: p.replace(" ", "_") for p in protected_phrases}


def protect_phrases(text):
    """Replace each protected phrase in *text* with its underscore-joined form.

    Matching is case-insensitive and anchored on word edges, so a phrase is
    not picked up inside longer words (e.g. "late game" no longer matches in
    "translate gameplay"). The replacement is the lowercase form stored in
    ``protected_map``.
    """
    for phrase, joined in protected_map.items():
        # Lookarounds instead of \b so the anchors also work for phrases that
        # might start or end with a non-word character.
        pattern = r"(?<!\w)" + re.escape(phrase) + r"(?!\w)"
        text = re.sub(pattern, joined, text, flags=re.IGNORECASE)
    return text


def restore_phrases(text):
    """Turn underscore-joined protected phrases in *text* back into spaced form."""
    for phrase, joined in protected_map.items():
        text = text.replace(joined, phrase)
    return text

# Collapse elongated characters (e.g. "yaaa" -> "yaa"; use max_rep=1 for "ya").
def reduce_repeats(s, max_rep=2):
    """Shorten runs of the same character in *s* to at most *max_rep*.

    Digits are left untouched so that numbers keep their value ("1000" must
    not turn into "100"); every other character except newline is handled as
    before.

    Raises:
        ValueError: if *max_rep* is smaller than 1 (that would delete text).
    """
    if max_rep < 1:
        raise ValueError("max_rep must be >= 1")
    # The group captures one non-digit character; \1{max_rep,} then requires
    # at least max_rep further copies, i.e. a run longer than max_rep.
    return re.sub(r"([^\d\n])\1{" + str(max_rep) + ",}", r"\1" * max_rep, s)

# Main per-string preprocessing routine.
def preprocess_string(s):
    """Clean one raw review and return the normalised text, or None.

    None is returned for non-string input and for text that is empty before
    or after cleaning. Steps, in order: strip, emoji removal, lowercasing,
    phrase protection, URL removal, repeated-character reduction, character
    filtering, whitespace collapsing, per-token slang replacement, and
    finally restoring the protected phrases to their spaced form.
    """
    if not isinstance(s, str):
        return None

    text = s.strip()
    if not text:
        return None

    text = remove_emoji(text).lower()
    text = protect_phrases(text)
    # Drop URLs before the character filter would break them apart.
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = reduce_repeats(text, max_rep=2)
    # Keep letters, digits, underscore (used by protected phrases),
    # apostrophe and whitespace; everything else becomes a space.
    text = re.sub(r"[^0-9a-zA-Z_'_\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return None

    # Slang normalisation works on whole whitespace-separated tokens.
    normalised = " ".join(slang_map.get(token, token) for token in text.split())
    return restore_phrases(normalised)

# 6) Run the preprocessing with strict filtering of the input rows.
# Only genuine string cells are kept; they are stripped, and blanks as well as
# textual null placeholders are discarded before any cleaning happens.
_NULL_PLACEHOLDERS = {"nan", "none", "null", "n/a", "na"}

is_string = df[text_col].apply(lambda value: isinstance(value, str))
series = df.loc[is_string, text_col].str.strip()
series = series[series.ne("") & ~series.str.lower().isin(_NULL_PLACEHOLDERS)]

print("Rows left before preprocess:", len(series))

# preprocess_string returns None for rows that end up empty; drop those too.
processed = series.map(preprocess_string).dropna()
processed = processed[processed.str.strip().ne("")]

print("Rows after preprocess:", len(processed))

# Assemble the cleaned dataframe: original metadata for the surviving rows,
# plus the raw text and the cleaned text side by side.
kept_index = processed.index
cleaned_df = df.loc[kept_index].copy()
cleaned_df["_raw_text"] = df.loc[kept_index, text_col].astype(str)
cleaned_df["cleaned_text"] = processed.to_numpy()


# 7) Optional: apply Sastrawi stemming when the package is installed.
# Only the import is guarded. A failure while building the stemmer or while
# stemming is a real error and must not be reported as "Sastrawi not available".
try:
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
except ImportError:
    # No stemmer: the stemmed column is simply a copy of the cleaned text.
    cleaned_df["cleaned_text_stemmed"] = cleaned_df["cleaned_text"]
    stem_applied = False
    print("Note: Sastrawi not available; stemming skipped. Install Sastrawi to enable stemming.")
else:
    stemmer = StemmerFactory().create_stemmer()

    def stem_sentence(s):
        """Stem *s* word by word; return *s* unchanged if stemming fails."""
        try:
            stems = (stemmer.stem(w) for w in s.split())
            # Drop words whose stem came back empty so they do not leave
            # double spaces in the joined result.
            return " ".join(stem for stem in stems if stem)
        except Exception:
            # Deliberate best effort: one odd value must not abort the run.
            return s

    cleaned_df["cleaned_text_stemmed"] = cleaned_df["cleaned_text"].apply(stem_sentence)
    stem_applied = True

# 8) Write the cleaned data to disk and report what was saved.
cleaned_df.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print("Saved cleaned CSV to:", OUT_PATH)
print("Total cleaned rows:", len(cleaned_df))
print("Stemming applied:", stem_applied)

# Quick preview of raw, cleaned and stemmed text for the first rows.
preview_cols = ["_raw_text", "cleaned_text", "cleaned_text_stemmed"]
print(cleaned_df[preview_cols].head(8).to_string(index=False))

# 9) Suggest slang candidates: tokens of the cleaned text that occur at least
#    25 times (threshold adjustable), most frequent first.
joined_clean = " ".join(cleaned_df["cleaned_text"].astype(str)).lower()
clean_tokens = re.findall(r"\b[\w']+\b", joined_clean)
freq_clean = Counter(clean_tokens)
candidates = [word for word in freq_clean if freq_clean[word] >= 25]
# most_common() sorts by descending count and keeps first-seen order on ties.
candidates_sorted = [word for word, count in freq_clean.most_common() if count >= 25]
print("\nSuggested high-frequency tokens (freq>=25) to check (first 80):")
print(candidates_sorted[:80])

# End of preprocessing cell.


Loaded: scrapped_data_honkai_star_rail.csv shape: (25413, 4)
Columns: ['userName', 'score', 'at', 'content']
Menggunakan kolom: content
Tipe data (top counts):
content
str    25413
Name: count, dtype: int64
NaN count: 0
Empty-string count: 0
Top tokens: [('game', 11629), ('nya', 6115), ('bagus', 5493), ('di', 4690), ('dan', 4308), ('yang', 3994), ('ini', 3942), ('saya', 3454), ('bisa', 3006), ('tapi', 2489), ('banget', 2025), ('ada', 2019), ('untuk', 2003), ('main', 1994), ('baik', 1969), ('yg', 1857), ('juga', 1794), ('tolong', 1773), ('karakter', 1771), ('lagi', 1756)]
Rows left before preprocess: 25413
Rows after preprocess: 25047
Saved cleaned CSV to: cleaned_data_hsr_final.csv
Total cleaned rows: 25047
Stemming applied: True
                                                                                                                                                                                     _raw_text                                                                      

#### Ambil Sampel dari data penuh (Opsional)

In [14]:
# Sanity check: report the first entry of `docs` that is not a string. The
# `else` branch of the loop runs only when no such entry was found.
for i, d in enumerate(docs):
    if not isinstance(d, str):
        print("FOUND NON-STRING at idx:", i, "value:", d, "type:", type(d))
        break
else:
    # The check mark was mis-encoded ("âœ”") in the original message.
    print("Semua item di docs sudah string ✔")


Semua item di docs sudah string ✔


In [1]:
import pandas as pd

# File locations: the full cleaned data and the sample written from it.
CLEANED_PATH = "cleaned_data_hsr_final.csv"
SAMPLE_PATH = "cleaned_data_hsr_sample1000.csv"

dfc = pd.read_csv(CLEANED_PATH, encoding="utf-8-sig")

# Draw 1000 random rows (or every row when fewer are available); the fixed
# seed keeps the sample reproducible.
sample_size = min(1000, len(dfc))
df_sample = dfc.sample(n=sample_size, random_state=42)

# Persist the sample so it can be reused without re-sampling.
df_sample.to_csv(SAMPLE_PATH, index=False, encoding="utf-8-sig")
print("Sample saved:", SAMPLE_PATH, "rows:", len(df_sample))


Sample saved: cleaned_data_hsr_sample1000.csv rows: 1000


#### Untuk Uji Coba 1000 sampel

In [2]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap, hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

# Load the sample documents. Rows whose stemmed text is missing are dropped
# before the string conversion; otherwise astype(str) would turn NaN into the
# literal document "nan". Textual null placeholders are filtered as well, the
# same way the full-data modeling cell does.
_PLACEHOLDER_DOCS = {"nan", "none", "null"}
docs = [
    d.strip()
    for d in df_sample["cleaned_text_stemmed"].dropna().astype(str)
    if d.strip() and d.strip().lower() not in _PLACEHOLDER_DOCS
]

# IndoBERT embedder
MODEL_NAME = "indobenchmark/indobert-base-p1"
embedder = SentenceTransformer(MODEL_NAME)

# Extra stopwords (Indonesian + English + game slang)
extra_stopwords = [
    "game", "play", "server", "turn", "based", "screen", "pls", "gg", "hp", "mp",
    "update", "lag", "crash", "error", "login", "event", "hero", "meta"
]

# Pre-encode the documents once so BERTopic does not have to embed them again.
embeddings = embedder.encode(docs, batch_size=32, show_progress_bar=False, normalize_embeddings=True)

# NOTE(review): BERTopic appears to fit this vectorizer on one concatenated
# document per topic, so min_df would count topics rather than reviews —
# confirm that 20 is feasible for a 1000-document sample.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=20,  # higher than before, to drop rarely occurring terms
    stop_words=extra_stopwords
)
# Tuned dimensionality-reduction and clustering parameters
umap_model = umap.UMAP(n_neighbors=30, n_components=5, metric="cosine", random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=30, min_samples=15, metric="euclidean", cluster_selection_method="eom")

topic_model = BERTopic(
    embedding_model=embedder,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    language="indonesian",
    # NOTE(review): per the BERTopic docs min_topic_size is not used when a
    # custom hdbscan_model is supplied; min_cluster_size above is what counts.
    min_topic_size=30,
    calculate_probabilities=False,
    verbose=True
)

# Fix: the original passed the undefined name `full_embeddings` here; the
# embeddings computed above for `docs` are the ones to use.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Reduce the number of topics for a tidier result. The model is updated in
# place, so the return value is deliberately not assigned back.
topic_model.reduce_topics(docs, nr_topics=10)

print("Topic count (sample 1000):", len(topic_model.get_topic_info()))

# Save the topic overview of the sample run.
topic_model.get_topic_info().to_csv("topic_info_sample1000.csv", index=False, encoding="utf-8-sig")
print("Saved topic_info_sample1000.csv")


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /api/models/indobenchmark/indobert-base-p1/tree/main/additional_chat_templates?recursive=False&expand=False (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000017D964553D0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: c1b78700-d383-49d3-88ba-0350cda25ae4)')

### Evaluasi topik 1000 sample

In [2]:
# === Evaluation for the 1000-document sample ===
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.metrics import silhouette_score


def evaluate_topic_model(model, documents, doc_embeddings):
    """Score an already fitted BERTopic model without refitting it.

    Args:
        model: fitted BERTopic instance; its ``topics_`` must align with
            *documents*.
        documents: the documents the model was fitted on.
        doc_embeddings: one embedding row per document.

    Returns:
        ``(coherence, silhouette)``. Coherence is gensim's c_v over the
        model's topic words. Silhouette is computed on the embeddings of
        non-outlier documents. Either value is ``nan`` when it cannot be
        computed (no usable topic words / fewer than two clusters).

    Raises:
        ValueError: if documents, topics and embeddings differ in length.
    """
    labels = np.asarray(model.topics_)
    vectors = np.asarray(doc_embeddings)
    if not (len(documents) == len(labels) == len(vectors)):
        raise ValueError("documents, model.topics_ and embeddings must have the same length")

    # Tokenise with the model's own vectorizer so n-gram topic words can be
    # found in the dictionary.
    analyzer = model.vectorizer_model.build_analyzer()
    tokenized = [analyzer(doc) for doc in documents]
    dictionary = Dictionary(tokenized)

    topic_words = []
    for topic_id in sorted(set(labels.tolist()) - {-1}):  # -1 is the outlier topic
        words = [w for w, _ in (model.get_topic(topic_id) or []) if w in dictionary.token2id]
        if words:
            topic_words.append(words)

    coherence = float("nan")
    if topic_words:
        coherence = CoherenceModel(
            topics=topic_words, texts=tokenized, dictionary=dictionary, coherence="c_v"
        ).get_coherence()

    # Outliers are not a cluster; leave them out of the silhouette score.
    keep = labels != -1
    n_clusters = len(set(labels[keep].tolist()))
    silhouette = float("nan")
    if 2 <= n_clusters < int(keep.sum()):
        silhouette = silhouette_score(vectors[keep], labels[keep])
    return coherence, silhouette


# Assumes `docs` and `embeddings` are the sample documents and their
# embeddings from the sample modeling cell — confirm when running cells out of
# order. The names below are kept because the visualisation cell uses them.
docs_sample = list(docs)
sample_embeddings = np.asarray(embeddings)
sample_topics = list(topic_model.topics_)

c_score, s_score = evaluate_topic_model(topic_model, docs_sample, sample_embeddings)
print("Coherence Score (Sample 1000):", c_score)
print("Silhouette Score (Sample 1000):", s_score)

# === Evaluation for the full data ===
# After fitting on the full corpus, reuse the same helper:
#   c_score_full, s_score_full = evaluate_topic_model(topic_model, docs, embeddings)


NameError: name 'topic_model' is not defined

### Visualisasi 1000 sample

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Topic distribution (top words per topic)
fig = topic_model.visualize_barchart(top_n_topics=10)
fig.show()

# Topic hierarchy
fig = topic_model.visualize_hierarchy()
fig.show()

# Documents in the embedding space
fig = topic_model.visualize_documents(docs_sample, embeddings=sample_embeddings)
fig.show()

# Word cloud for every topic. A dedicated name is used for the topic -> words
# mapping so the per-document `topics` list returned by fit_transform is not
# overwritten (it is still needed when the document/topic table is saved).
topic_words_by_id = topic_model.get_topics()
for topic_num, words in topic_words_by_id.items():
    if topic_num == -1:  # skip the outlier topic
        continue
    # Ignore empty words and non-positive weights; they cannot be drawn.
    word_freq = {word: weight for word, weight in words if word and weight > 0}
    if not word_freq:
        continue
    wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)

    plt.figure(figsize=(8, 6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Topic {topic_num}")
    plt.show()


In [13]:
# ====== Modeling: pre-encode + BERTopic (jalankan setelah preprocessing selesai) ======
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap, hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

CLEANED_PATH = "cleaned_data_hsr_final.csv"
dfc = pd.read_csv(CLEANED_PATH, encoding="utf-8-sig")

# 1) strict safety checks: drop missing cells before the string conversion,
#    then remove blanks and textual null placeholders.
_PLACEHOLDER_DOCS = {"nan", "none", "null"}
docs = [
    d.strip()
    for d in dfc["cleaned_text_stemmed"].dropna().astype(str)  # or use "cleaned_text"
    if d.strip() and d.strip().lower() not in _PLACEHOLDER_DOCS
]
print("Docs to model:", len(docs))

# 2) load embedding (IndoBERT)
MODEL_NAME = "indobenchmark/indobert-base-p1"   # or p2
embedder = SentenceTransformer(MODEL_NAME)

# 3) pre-encode embeddings (disable progress bar to avoid widget issues)
embeddings = embedder.encode(docs, batch_size=32, show_progress_bar=False, normalize_embeddings=True)
print("Embeddings shape:", getattr(embeddings, "shape", None))

# 4) build BERTopic with embeddings passed explicitly
umap_model = umap.UMAP(n_neighbors=15, n_components=5, metric="cosine", random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=30, min_samples=10, metric="euclidean", cluster_selection_method="eom")
vectorizer_model = CountVectorizer(ngram_range=(1, 2), min_df=10)
repr_model = KeyBERTInspired()

# The embedder is passed as well, like in the other modeling cells:
# KeyBERTInspired needs an embedding model to score candidate words, and it
# should be the same model that produced the document embeddings.
topic_model = BERTopic(embedding_model=embedder,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       vectorizer_model=vectorizer_model,
                       representation_model=repr_model,
                       language="indonesian",
                       min_topic_size=30,
                       calculate_probabilities=False,
                       verbose=True)

topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
print("Topic count:", len(topic_model.get_topic_info()))
# Fix: write next to the other outputs. The original hard-coded the directory
# "/mnt/data/", unlike the printed message and every other cell.
topic_model.get_topic_info().to_csv("topic_info_final.csv", index=False, encoding="utf-8-sig")
print("Saved topic_info_final.csv")


Docs to model: 25043


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


KeyboardInterrupt: 

### Model BERTopic + IndoBERT
 - IndoBERT sebagai embedding -> lebih kuat menangkap konteks bahasa Indonesia
 - BERTopic untuk clustering + class-based TF-IDF -> menghasilkan topik yang koheren

##### Cek versi

In [3]:
# Print the installed sentence-transformers version.
import sentence_transformers
print(sentence_transformers.__version__)

# Print the installed huggingface_hub version.
import huggingface_hub
print(huggingface_hub.__version__)


5.1.0
0.34.4


In [10]:
from sentence_transformers import SentenceTransformer

MODEL_NAME = "indobenchmark/indobert-base-p1"
embedder = SentenceTransformer(MODEL_NAME)

# Non-string entries in `docs` (e.g. NaN floats from empty CSV cells) are the
# most likely cause of "TypeError: 'float' object is not subscriptable" inside
# encode(). They are replaced with empty strings, not dropped, so `docs` keeps
# its length and stays aligned with any dataframe it came from.
n_non_string = sum(not isinstance(d, str) for d in docs)
if n_non_string:
    print("Note: replaced", n_non_string, "non-string docs with empty strings before encoding.")
    docs = [d if isinstance(d, str) else "" for d in docs]

# Encode a few documents first (smoke test)
_ = embedder.encode(docs[:8], show_progress_bar=False)

# Then encode everything (this can take a while)
embeddings = embedder.encode(
    docs,
    batch_size=32,
    show_progress_bar=False,   # no progress widget
    normalize_embeddings=True
)
print("Embeddings shape:", getattr(embeddings, "shape", None))


No sentence-transformers model found with name indobenchmark/indobert-base-p1. Creating a new one with mean pooling.


TypeError: 'float' object is not subscriptable

In [4]:
from bertopic import BERTopic
import umap, hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

# NOTE(review): USE_SAMPLE is expected to be defined by an earlier cell that is
# not part of this file — confirm before running.
umap_model = umap.UMAP(n_neighbors=15, n_components=5, metric="cosine", random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=15 if USE_SAMPLE else 30,
                                min_samples=10, metric="euclidean",
                                cluster_selection_method="eom")

vectorizer_model = CountVectorizer(ngram_range=(1, 2), min_df=5 if USE_SAMPLE else 10)
repr_model = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=embedder,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=repr_model,
    language="indonesian",
    # NOTE(review): per the BERTopic docs min_topic_size is not used when a
    # custom hdbscan_model is supplied; min_cluster_size above is what counts.
    min_topic_size=10 if USE_SAMPLE else 30,
    calculate_probabilities=False,
    verbose=True
)

# The embedding step fails with "TypeError: 'float' object is not
# subscriptable" when `docs` holds non-string entries (most likely NaN floats).
# Replace them with empty strings so the list keeps its length and alignment.
n_non_string = sum(not isinstance(d, str) for d in docs)
if n_non_string:
    print("Note: replaced", n_non_string, "non-string docs with empty strings before fitting.")
    docs = [d if isinstance(d, str) else "" for d in docs]

topics, probs = topic_model.fit_transform(docs)
print("Jumlah topik ditemukan:", len(topic_model.get_topic_info()))

2025-08-26 12:39:15,536 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

TypeError: 'float' object is not subscriptable

### Evaluasi Topik

In [None]:
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import numpy as np

# Tokenise with the model's own vectorizer analyzer. The vectorizer uses
# ngram_range=(1, 2), so topic words can be bigrams; a plain whitespace split
# would never contain them and they could not be looked up in the dictionary.
analyzer = topic_model.vectorizer_model.build_analyzer()
tokenized_texts = [analyzer(d) if isinstance(d, str) else [] for d in docs]
dictionary = Dictionary(tokenized_texts)

# Topic id -> topic words, skipping the outlier topic (-1), empty padding
# words and words that do not occur in the tokenised corpus.
topics_dict = {}
for t in topic_model.get_topic_info()["Topic"]:
    if t == -1:
        continue
    words = [w for w, _ in (topic_model.get_topic(t) or []) if w in dictionary.token2id]
    if words:
        topics_dict[t] = words

# One CoherenceModel for all topics instead of one per topic: same per-topic
# c_v scores, without rebuilding the model for every topic.
coherence_scores = []
if topics_dict:
    cm = CoherenceModel(topics=list(topics_dict.values()), texts=tokenized_texts,
                        dictionary=dictionary, coherence="c_v")
    coherence_scores = cm.get_coherence_per_topic()

avg_coherence = float(np.mean(coherence_scores)) if len(coherence_scores) else 0
print(f"Average coherence: {avg_coherence:.4f}")


### Visualisasi Hasil
#### Gunakan wordcloud dan diagram topik untuk memperkuat pembahasan.

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud for one specific topic
def plot_wordcloud_for_topic(topic_model, topic_id):
    """Draw a word cloud of the words of topic *topic_id*.

    Args:
        topic_model: object whose ``get_topic(topic_id)`` returns
            ``(word, weight)`` pairs, or a falsy value for an unknown topic.
        topic_id: id of the topic to draw.

    Raises:
        ValueError: if the topic is unknown or has no word with a positive
            weight (a word cloud cannot be built from that).
    """
    topic_words = topic_model.get_topic(topic_id)
    if not topic_words:
        raise ValueError(f"Topic {topic_id} not found in the topic model.")
    # Empty padding words and non-positive weights cannot be rendered.
    words = {word: weight for word, weight in topic_words if word and weight > 0}
    if not words:
        raise ValueError(f"Topic {topic_id} has no words with a positive weight.")
    wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(words)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Example: show the word cloud of topic 1
plot_wordcloud_for_topic(topic_model, 1)

# Visualise the topic distribution. This is the last expression of the cell,
# so its value is what the notebook would display.
topic_model.visualize_barchart(top_n_topics=10)


### Simpan Hasil

In [None]:
# Shared CSV options: no index column, UTF-8 with BOM.
_CSV_OPTIONS = {"index": False, "encoding": "utf-8-sig"}

# Topic overview table.
topic_info = topic_model.get_topic_info()
topic_info.to_csv("topic_info.csv", **_CSV_OPTIONS)

# Per-document topic assignment: a copy of df_model with a "topic" column.
doc_topics = df_model.assign(topic=topics)
doc_topics.to_csv("doc_topic_assignment.csv", **_CSV_OPTIONS)

print("Hasil disimpan ke CSV.")
