In [1]:
# =======================
# 1. IMPORT LIBRARY
# =======================
import pandas as pd
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# =======================
# 2. INISIALISASI TOOLS
# =======================
# Start from the stopword list shipped with Sastrawi's StopWordRemoverFactory.
stop_factory = StopWordRemoverFactory()
default_stopwords = {*stop_factory.get_stop_words()}

# Project-specific additions on top of the library list; extend as needed.
custom_stopwords = {
    'aja', 'biar', 'dah', 'deh', 'dong', 'eh', 'emang', 'ga',
    'gak', 'gue', 'ini', 'itu', 'kalo', 'kayak', 'loh', 'lu',
    'nih', 'nya', 'sih', 'tau', 'udah', 'yg',
}

# Combined lookup set consulted by the cleaning function.
stopwords = default_stopwords | custom_stopwords

# One stemmer instance, created once and reused for every token.
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()


In [3]:
# =======================
# 3. DEFINISI FUNGSI CLEANING
# =======================
def clean_text(text):
    """Normalize one raw text value into a space-joined string of stemmed tokens.

    Pipeline: lowercase -> replace every character that is not ``a-z`` or
    whitespace with a space -> split on whitespace -> drop tokens that are in
    the module-level ``stopwords`` set or are a single character -> stem each
    remaining token with the module-level ``stemmer`` -> join with spaces.

    Args:
        text: Raw text. ``None`` and float NaN (pandas' missing-value marker)
            are treated as empty; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, or ``''`` when nothing survives the filtering.
    """
    # Missing values have no .lower(); treat them as empty instead of crashing.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Step 1: case folding
    text = text.lower()

    # Step 2: blank out everything except a-z and whitespace. Digits, symbols,
    # emoji and non-ASCII letters all fall outside this class, so a separate
    # digit-stripping pass is unnecessary.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Step 3: tokenize and filter. str.split() with no argument already
    # collapses whitespace runs and ignores leading/trailing whitespace.
    tokens = [
        word for word in text.split()
        if len(word) > 1 and word not in stopwords
    ]

    # Step 4: stem each surviving token and re-join.
    return ' '.join(stemmer.stem(word) for word in tokens)

In [4]:
# =======================
# 4. BACA DATA & APLIKASIKAN
# =======================
# Change the file name to match your own file.
data = pd.read_csv('scrapped_data_honkai_star_rail.csv')

# Fail early with a clear message when the expected text column is absent.
if 'content' not in data.columns:
    raise ValueError("Kolom 'content' tidak ditemukan di file CSV!")

# Blank out missing values BEFORE the str cast: astype(str) alone turns NaN
# into the literal text 'nan', which would survive cleaning as a bogus token.
data['content_clean'] = data['content'].fillna('').astype(str).apply(clean_text)


In [5]:
# =======================
# 5. CEK HASIL
# =======================
# Show the first 10 rows with the raw text next to its cleaned version.
print(data.loc[:, ['content', 'content_clean']].iloc[:10])

                                             content  \
0  udah warp 140 kali masih blum dapat B5 yg ada ...   
1  Bagus game hoyo versi turn base, banyak bansos...   
2      bagus, tapi semenjak update besar bngt GB nya   
3        global passive? HP inflation? pfft* hell no   
4  game nyah sudah saya hapus susah gacha nyah ap...   
5  ga jelas bgt siii, masa pas story di tempat be...   
6  Buat game ini jadi seimbang, jangan buat chara...   
7  walaupun hampir smua nya chara b5 tapi hsr mas...   
8  tlonk adain karakter pacarku, dia mw jadi anak...   
9  GAME JAHAT, STELLAR 8K KU HILANG DI MAKAN BAIL...   

                                       content_clean  
0  warp kali blum banernya malah dapet clara warp...  
1  bagus game hoyo versi turn base banyak bansos ...  
2                bagus semenjak update besar bngt gb  
3           global passive hp inflation pfft hell no  
4  game nyah hapus susah gacha nyah apa lagih bua...  
5  jelas bgt siii masa pas story tempat salju mau... 

In [8]:
# =======================
# 6. SIMPAN KE FILE BARU
# =======================
# Keep the output name in one place so the file written and the message
# printed cannot drift apart.
output_path = 'cleaned_data_honkai_star_rail_FINAL.csv'

# index=False: do not write the DataFrame index as an extra CSV column.
data.to_csv(output_path, index=False)
print("\n✅ File hasil berhasil disimpan sebagai '{}'".format(output_path))

Berhasil disimpan sebagai 'cleaned_data_honkai_star_rail.csv'
