<a href="https://colab.research.google.com/github/qoyyimil/pba/blob/main/notebooks/1A_Moovit_App_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# SETUP
!pip install -q google-play-scraper PySastrawi nltk emoji regex pandas transformers torch


In [6]:
import pandas as pd, re, emoji, nltk, math, os
import nltk
from google_play_scraper import reviews_all, Sort
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk import word_tokenize
from collections import Counter
from transformers import pipeline
from datetime import datetime

# NLTK resources
# 'punkt' is the tokenizer data requested for word_tokenize; 'punkt_tab' is
# requested as well on a best-effort basis (presumably for NLTK versions that
# ship the tokenizer data under that name -- TODO confirm).
nltk.download('punkt', quiet=True)
try:
    nltk.download('punkt_tab', quiet=True)
except Exception:
    # Deliberately best-effort: a failed optional download must not stop the
    # notebook. Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit (kernel interrupt) still propagate.
    pass

# Package id passed to the scraper and embedded in output file names.
APP_ID = "com.tranzmate"  # Moovit
# Timestamp of this run (YYYYMMDD_HHMM); embedded in output file names.
STAMP  = f"{datetime.now():%Y%m%d_%H%M}"

# 0) folders
# Create the output directories up front; existing directories are left as-is.
for _folder in ("data/raw", "data/processed"):
    os.makedirs(_folder, exist_ok=True)

# 1) SCRAPE
# Fetch the reviews for APP_ID and load them into a DataFrame.
raw = reviews_all(
    APP_ID,
    sleep_milliseconds=300,   # raise this if requests get rate-limited
    lang='id', country='id',  # focus on Indonesian-language reviews
    sort=Sort.MOST_RELEVANT
)
raw_df = pd.DataFrame(raw)

# Validate BEFORE persisting: otherwise a failed scrape would still write an
# empty CSV and print "Saved RAW" right before aborting.
if raw_df.empty:
    raise SystemExit("Tidak ada ulasan yang diambil. Coba ganti sort/lang/country.")

# SAVE RAW
# Keep an untouched copy of the scrape, tagged with the app id and run stamp.
raw_path = f"data/raw/moovit_{APP_ID}_raw_{STAMP}.csv"
raw_df.to_csv(raw_path, index=False, encoding="utf-8")
print("Saved RAW  :", raw_path)

# 2) CLEANING + BASELINE
def clean(s):
    """Normalize a single review text.

    Steps, in order: lowercase, replace URLs (``http...`` / ``www....``) with
    a space, replace emoji with a space, collapse runs of whitespace and trim.

    Args:
        s: Raw review text. Missing values (None, NaN, pd.NA) and other falsy
            values yield an empty string; anything else is converted with
            ``str()``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # `s or ""` alone is not enough: float NaN is truthy, so str() would turn
    # it into the literal text "nan" and it would survive as a fake review.
    missing = s is None or (pd.api.types.is_scalar(s) and pd.isna(s))
    s = "" if missing or not s else str(s).lower()
    s = re.sub(r"http\S+|www\.\S+", " ", s)
    s = emoji.replace_emoji(s, replace=" ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Drop rows with missing review text BEFORE cleaning: clean() always returns a
# string, so a dropna() placed after it can never match anything.
df = raw_df.dropna(subset=["content"]).copy()
df["content"] = df["content"].apply(clean)
# Discard reviews that are empty after cleaning, then de-duplicate on the
# cleaned text and renumber the rows 0..n-1.
df = df[df["content"].str.len() > 0].drop_duplicates(subset=["content"]).reset_index(drop=True)

def rating2sent(r):
    """Map a star rating to a sentiment label.

    The rating is converted with ``int()`` first. Values of 4 and above give
    "positif", exactly 3 gives "netral", and 2 or below gives "negatif".
    """
    stars = int(r)
    if stars >= 4:
        return "positif"
    return "netral" if stars == 3 else "negatif"

# Baseline sentiment derived from the star rating alone.
df["sentiment_baseline"] = df["score"].astype(int).map(rating2sent)

# 3) STEMMING + TOP TOKENS
# One stemmer instance is created here and reused for every review.
stemmer = StemmerFactory().create_stemmer()
def stemming_tokens(text):
    """Stem ``text`` with the module-level ``stemmer``, then tokenize the result.

    Returns whatever ``word_tokenize`` produces for the stemmed string.
    """
    stemmed = stemmer.stem(text)
    return word_tokenize(stemmed)

# Token list per review (stemmed, then tokenized).
df["stemmed_text"] = df["content"].apply(stemming_tokens)
# Flatten all reviews into one list, keeping only tokens longer than 2 characters.
all_tokens = [
    tok
    for review_tokens in df["stemmed_text"]
    for tok in review_tokens
    if len(tok) > 2
]
token_counts = Counter(all_tokens)
print("Top 30 tokens (setelah stemming):")
print(token_counts.most_common(30))

# 4) INDONESIAN MODEL
# Text-classification pipeline backed by the checkpoint named below.
# truncation / padding / max_length are passed through to the pipeline
# (presumably applied at tokenization time -- TODO confirm).
clf = pipeline(
    task="text-classification",
    model="w11wo/indonesian-roberta-base-sentiment-classifier",
    max_length=256,
    padding=True,
    truncation=True,
)

# Batching for speed: classify the reviews BATCH texts at a time.
BATCH = 64
preds = []
for i in range(0, len(df), BATCH):
    batch = df["content"].iloc[i:i+BATCH].tolist()
    outs = clf(batch)  # [{'label':'positive','score':...}, ...]
    preds.extend(o["label"].lower() for o in outs)

# Translate the model's English labels to the Indonesian labels used by the
# baseline; any label missing from the mapping is kept unchanged.
map_en2id = {"positive":"positif","neutral":"netral","negative":"negatif"}
# Build the Series on df's own index: column assignment aligns by index label,
# so a default 0..n-1 index would only line up if df happened to have one too.
pred_labels = pd.Series(preds, index=df.index)
df["sentiment_model"] = pred_labels.map(map_en2id).fillna(pred_labels)

# 5) QUICK SUMMARY
# Compute the three tables first, then print them in the same order as before.
baseline_counts = df["sentiment_baseline"].value_counts()
model_counts = df["sentiment_model"].value_counts()
# Row-normalized: each baseline label's row shows how the model's labels are
# distributed for reviews with that baseline label.
baseline_vs_model = pd.crosstab(
    df["sentiment_baseline"],
    df["sentiment_model"],
    normalize="index",
).round(3)

print("\nDistribusi (baseline dari rating):")
print(baseline_counts)

print("\nDistribusi (model Indo):")
print(model_counts)

print("\nPerbandingan baseline vs model (proporsi per baris baseline):")
print(baseline_vs_model)

# 6) SAVE
# Write the labelled reviews to the processed folder, tagged like the raw file.
proc_path = f"data/processed/moovit_{APP_ID}_sentiment_{STAMP}.csv"
cols = [
    "userName",
    "at",
    "appVersion",
    "score",
    "content",
    "sentiment_baseline",
    "sentiment_model",
]
# Export only the wanted columns that are actually present in df.
avail_cols = [name for name in cols if name in df.columns]
processed = df[avail_cols]
processed.to_csv(proc_path, index=False, encoding="utf-8")
print("Saved PROC:", proc_path)

Saved RAW  : data/raw/moovit_com.tranzmate_raw_20250902_0230.csv
Top 30 tokens (setelah stemming):
[('aplikasi', 473), ('bantu', 382), ('sangat', 371), ('dan', 295), ('iklan', 287), ('saya', 285), ('tidak', 278), ('ini', 271), ('ada', 262), ('nya', 223), ('rute', 195), ('jadwal', 190), ('bagus', 187), ('yang', 177), ('bisa', 176), ('gak', 170), ('guna', 160), ('banyak', 160), ('bus', 157), ('untuk', 152), ('akurat', 134), ('jalan', 123), ('waktu', 120), ('banget', 119), ('lagi', 115), ('mau', 110), ('jam', 110), ('tapi', 108), ('jadi', 106), ('sekali', 104)]


Device set to use cpu



Distribusi (baseline dari rating):
sentiment_baseline
positif    1144
negatif     775
netral      211
Name: count, dtype: int64

Distribusi (model Indo):
sentiment_model
negatif    938
positif    937
netral     255
Name: count, dtype: int64

Perbandingan baseline vs model (proporsi per baris baseline):
sentiment_model     negatif  netral  positif
sentiment_baseline                          
negatif               0.848   0.092    0.061
netral                0.540   0.223    0.237
positif               0.146   0.120    0.734
Saved PROC: data/processed/moovit_com.tranzmate_sentiment_20250902_0230.csv
