In [43]:
import os
import json
import time
import re
import string

import nltk
import pandas as pd
import requests
from nltk.corpus import stopwords
from transformers import BertTokenizer

# Ensure the NLTK stopword corpus is present locally; filteringText reads it.
nltk.download('stopwords')

# Subword tokenizer shared by the notebook (consumed by tokenizingText).
INDOBERT_CHECKPOINT = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizer.from_pretrained(INDOBERT_CHECKPOINT)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# Read the merged reviews CSV and preview five random rows.
REVIEWS_CSV_PATH = "data_clean/all_reviews_merged.csv"
df = pd.read_csv(REVIEWS_CSV_PATH)
df.sample(5)

Unnamed: 0,nama_tempat,review
1294,Geprek Kak Rose,Tukang parkirnya cmn mau duitnya doang. Habis ...
7568,Mie Gacoan Stasiun Kota Malang,"layanan nya bagusss, makanan nyaa enakk toppp"
7261,Mie Gacoan Sawojajar Kota Malang,Taste tetap terjaga. Isi Pangsit agak kecil is...
2931,Geprek Kak Rose Suhat Malang,"Enak rasanya, luas juga untuk lantai 2 nya"
1503,Geprek Kak Rose,Cocok dengan harga


In [45]:
def cleaningText(text):
    """Strip noise from a raw review, leaving ASCII letters separated by single spaces.

    Removed outright: ``@mentions``, ``#hashtags``, ``http...`` links, digits and
    non-ASCII characters. Punctuation (including ``_``) is replaced by a space
    rather than deleted, so ``"enak,murah"`` becomes ``"enak murah"`` instead of
    the two words being glued together. All whitespace (newlines, tabs,
    non-breaking spaces) is collapsed to single spaces and the result is trimmed.

    Args:
        text: The raw review. ``None`` and NaN (what pandas yields for a missing
            cell) are treated as an empty review; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty). Letter case is left unchanged.
    """
    # Missing values reach .apply() as None or float NaN; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'@[A-Za-z0-9]+', '', text)   # mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text)   # hashtags
    text = re.sub(r"http\S+", '', text)         # links
    text = re.sub(r'[0-9]+', '', text)          # digits
    # Punctuation/symbols become a space so neighbouring words stay separate.
    # "_" is listed explicitly because \w treats it as a word character.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Normalise whitespace BEFORE dropping non-ASCII, otherwise a non-breaking
    # space would be deleted and glue the surrounding words together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Removing non-ASCII runs can leave doubled spaces; collapse and trim.
    return ' '.join(text.split())


def casefoldingText(text):
    """Return ``text`` lower-cased; falsy input (``None``, ``''``) yields ``''``."""
    if not text:
        return ''
    return text.lower()


def normalize_repeated_chars(text):
    """Shorten every run of three or more identical characters to exactly two.

    For example ``"enakkkk"`` becomes ``"enakk"``; runs of one or two characters
    are left alone. The rule applies to any character except a newline (not
    only letters). Falsy input (``None``, ``''``) yields ``''``.
    """
    if not text:
        return ''
    return re.sub(r'(.)\1{2,}', r'\1\1', text)


def tokenizingText(text):
    """Tokenize ``text`` with the module-level IndoBERT subword ``tokenizer``.

    Surrounding whitespace is stripped first. Falsy or whitespace-only input
    returns an empty list without calling the tokenizer.
    """
    stripped = text.strip() if text else ''
    return tokenizer.tokenize(stripped) if stripped else []


# Informal Indonesian fillers/particles removed in addition to the NLTK lists.
_EXTRA_STOPWORDS = (
    'iya', 'yaa', 'gk', 'gak', 'g', 'dr', 'nya', 'na', 'sih', 'ku', 'di', 'ga',
    'ya', 'gaa', 'loh', 'kah', 'woi', 'woii', 'woy', 'pas', 'c', 'deh', 'eh',
)

# Built lazily on first use and then reused for every later call.
_STOPWORD_CACHE = None


def _load_stopwords():
    """Return the combined Indonesian + English + extra stopword set (built once)."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        combined = set(stopwords.words('indonesian'))
        combined.update(stopwords.words('english'))
        combined.update(_EXTRA_STOPWORDS)
        _STOPWORD_CACHE = frozenset(combined)
    return _STOPWORD_CACHE


def filteringText(tokens):
    """Drop stopwords from a list of tokens.

    The stopword set is the union of NLTK's Indonesian and English lists plus a
    handful of informal fillers. It is assembled on the first call only; the
    function is applied row by row, and rebuilding the set each time would
    re-read both NLTK word lists for every row.

    Args:
        tokens: Iterable of token strings; ``None`` is treated as empty.

    Returns:
        A new list with the tokens that are not stopwords, in original order.
        Matching is exact, so case variants and subword pieces such as
        ``"##nya"`` are kept.
    """
    stop_set = _load_stopwords()
    return [word for word in (tokens or []) if word not in stop_set]


def toSentence(list_words):
    """Join the given tokens into one string separated by single spaces."""
    separator = ' '
    return separator.join(list_words)

In [46]:
def _get_json_with_retries(url, params=None, retries=3, timeout=30, backoff=1.5):
    last_err = None
    for i in range(retries):
        try:
            resp = requests.get(url, params=params, timeout=timeout)
            if resp.status_code == 200:
                try:
                    return resp.json()
                except Exception as je:
                    last_err = je
            else:
                last_err = RuntimeError(f"HTTP {resp.status_code}: {resp.text[:200]}")
        except Exception as e:
            last_err = e
        time.sleep(backoff ** i)
    raise RuntimeError(f"Failed to GET JSON from {url} after {retries} retries: {last_err}")


def load_slangwords_from_hf(cache_path="data_clean/slangwords.json", max_rows=None):
    """Load the slang -> formal word mapping, preferring a local JSON cache.

    If ``cache_path`` points to a readable JSON file holding a non-empty
    object, that mapping is returned (keys and values lower-cased) and no
    network request is made. Otherwise the ``theonlydo/indonesia-slang``
    dataset is paged through the Hugging Face datasets-server ``/rows``
    endpoint, 100 rows per request, and the result is written back to
    ``cache_path`` on a best-effort basis.

    Args:
        cache_path: JSON cache location; a falsy value disables caching. A bare
            filename with no directory part is allowed.
        max_rows: If a positive int, fetch at most this many rows. It has no
            effect when the mapping is served from the cache.

    Returns:
        dict mapping lower-cased slang words to lower-cased formal words. Rows
        whose ``slang`` or ``formal`` field is missing, null or blank are
        skipped.

    Raises:
        RuntimeError: If the dataset reports no rows, if no usable entry was
            loaded, or (from ``_get_json_with_retries``) if a request kept
            failing.
    """
    import warnings

    base = "https://datasets-server.huggingface.co"
    ds = "theonlydo/indonesia-slang"
    cfg = "default"
    split = "train"
    page_size = 100

    def _normalise(value):
        # str(None) would yield the literal word "none"; treat null as blank.
        return "" if value is None else str(value).strip().lower()

    # 1) Try the cache. A broken cache is reported and then ignored.
    if cache_path and os.path.exists(cache_path):
        try:
            with open(cache_path, "r", encoding="utf-8") as cache_file:
                cached = json.load(cache_file)
        except (OSError, ValueError) as exc:
            warnings.warn(f"Ignoring unreadable slang cache {cache_path!r}: {exc}")
        else:
            if isinstance(cached, dict) and cached:
                return {str(k).lower(): str(v).lower() for k, v in cached.items()}

    # 2) Ask for a single row just to learn the dataset size.
    meta = _get_json_with_retries(
        f"{base}/rows",
        params={"dataset": ds, "config": cfg, "split": split, "offset": 0, "length": 1},
        retries=4,
        timeout=30,
    )
    total = int(meta.get("num_rows_total", 0))
    if total <= 0:
        raise RuntimeError("Dataset returned zero rows or missing num_rows_total")

    if isinstance(max_rows, int) and max_rows > 0:
        total_to_fetch = min(total, max_rows)
    else:
        total_to_fetch = total

    # 3) Page through the rows.
    slang = {}
    offset = 0
    while offset < total_to_fetch:
        data = _get_json_with_retries(
            f"{base}/rows",
            params={
                "dataset": ds,
                "config": cfg,
                "split": split,
                "offset": offset,
                "length": min(page_size, total_to_fetch - offset),
            },
            retries=4,
            timeout=60,
        )
        rows = data.get("rows", [])
        if not rows:
            # An empty page means the server has nothing more; avoid looping forever.
            break
        for entry in rows:
            row = entry.get("row") or {}
            slang_word = _normalise(row.get("slang"))
            formal_word = _normalise(row.get("formal"))
            if slang_word and formal_word:
                slang[slang_word] = formal_word
        offset += len(rows)

    if not slang:
        raise RuntimeError("No slang entries loaded from Hugging Face dataset")

    # 4) Best-effort cache write: a failure is reported but never fatal.
    if cache_path:
        try:
            cache_dir = os.path.dirname(cache_path)
            if cache_dir:  # os.makedirs('') raises, so skip it for bare filenames
                os.makedirs(cache_dir, exist_ok=True)
            with open(cache_path, "w", encoding="utf-8") as cache_file:
                json.dump(slang, cache_file, ensure_ascii=False)
        except (OSError, ValueError) as exc:
            warnings.warn(f"Could not write slang cache {cache_path!r}: {exc}")

    return slang


# Load the slang -> formal lookup into the module-level name that
# fix_slangwords reads, then display how many entries it holds.
slangwords = load_slangwords_from_hf()
len(slangwords)

4417

In [47]:
def fix_slangwords(text):
    """Replace slang words in ``text`` using the module-level ``slangwords`` map.

    The text is split on whitespace; each word is looked up by its lower-cased
    form and swapped for the mapped value when present, otherwise kept as is.
    The words are joined back with single spaces.
    """
    return ' '.join(
        slangwords.get(token.lower(), token) for token in text.split()
    )

In [48]:
# Preprocessing pipeline: each stage reads one column and writes a new one.
# Stages run in the listed order, so every stage sees the previous stage's output.
PREPROCESSING_STAGES = [
    # (output column,          input column,            transform)
    ('text_clean',             'review',                cleaningText),              # strip noise
    ('text_casefoldingText',   'text_clean',            casefoldingText),           # lower-case
    ('text_normrepeat',        'text_casefoldingText',  normalize_repeated_chars),  # runs of >2 -> 2, before slang lookup
    ('text_slangwords',        'text_normrepeat',       fix_slangwords),            # slang -> standard words
    ('text_tokenizingText',    'text_slangwords',       tokenizingText),            # split into tokens
    ('text_stopword',          'text_tokenizingText',   filteringText),             # drop stopwords
    ('text_akhir',             'text_stopword',         toSentence),                # tokens -> sentence
]

for output_col, input_col, transform in PREPROCESSING_STAGES:
    df[output_col] = df[input_col].apply(transform)

# Keep the cleaned version used for later inference: 'text_clean' is
# overwritten with the slang-normalised text.
df['text_clean'] = df['text_slangwords']

# Final output
df['sentence'] = df['text_akhir']

In [49]:
# Write only the 'text_clean' column to the cleaned-reviews CSV.
output_cols = ['text_clean']
df_out = df[output_cols].copy()
output_csv = "data_clean/all_reviews_cleaned.csv"
df_out.to_csv(output_csv, index=False, encoding='utf-8-sig')

# Preview goes last: Jupyter only renders the final expression of a cell, so
# a bare df.sample(5) placed before the export was silently discarded.
df.sample(5)