In [1]:
# =======================
# 1. IMPORT LIBRARY
# =======================
import pandas as pd
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# =======================
# 2. INISIALISASI TOOLS
# =======================
stop_factory = StopWordRemoverFactory()
default_stopwords = set(stop_factory.get_stop_words())

# Tambahkan stopword tambahan jika perlu
custom_stopwords = {
    'yg', 'nya', 'aja', 'dong', 'nih', 'loh', 'gak', 'ga', 'kalo', 'sih', 'deh',
    'itu', 'ini', 'gue', 'lu', 'tau', 'udah', 'dah', 'emang', 'eh', 'biar', 'kayak'
}
stopwords = default_stopwords.union(custom_stopwords)

stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()


In [3]:
#Fungsi Preprocessing

def preprocess_indonesian(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)  # Hapus simbol/angka
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(stemmed_tokens)

In [4]:
# Load Dataset

data = pd.read_csv('scrapped_data_honkai_star_rail.csv')

# Tampilkan beberapa baris awal
data[['content']].head()

Unnamed: 0,content
0,udah warp 140 kali masih blum dapat B5 yg ada ...
1,"Bagus game hoyo versi turn base, banyak bansos..."
2,"bagus, tapi semenjak update besar bngt GB nya"
3,global passive? HP inflation? pfft* hell no
4,game nyah sudah saya hapus susah gacha nyah ap...


In [7]:
# Proses Preprocessing

# Terapikan fungsi ke seluruh kolom content
data['content_clean'] = data['content'].astype(str).apply(preprocess_indonesian)

# Tampilkan hasil
data[['content', 'content_clean']].head()

Unnamed: 0,content,content_clean
0,udah warp 140 kali masih blum dapat B5 yg ada ...,udah warp kali blum b yg banernya malah dapet ...
1,"Bagus game hoyo versi turn base, banyak bansos...",bagus game hoyo versi turn base banyak bansos ...
2,"bagus, tapi semenjak update besar bngt GB nya",bagus semenjak update besar bngt gb nya
3,global passive? HP inflation? pfft* hell no,global passive hp inflation pfft hell no
4,game nyah sudah saya hapus susah gacha nyah ap...,game nyah hapus susah gacha nyah apa lagih bua...


In [8]:
# Simpan ke file baru
data.to_csv('cleaned_data_honkai_star_rail.csv', index=False)
print("Berhasil disimpan sebagai 'cleaned_data_honkai_star_rail.csv'")

Berhasil disimpan sebagai 'cleaned_data_honkai_star_rail.csv'
