In [1]:
!pip install google-play-scraper



In [2]:
!python scrape_playstore.py --app_id com.whatsapp --n 20000 --out dataset_raw.csv --lang id --country id

saved: 10000 -> dataset_raw.csv


In [3]:
import re
import pandas as pd

def clean_text(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"http\S+|www\.\S+", " ", s)         # url
    s = re.sub(r"[^a-z0-9\s]", " ", s)              # non-alnum
    s = re.sub(r"\s+", " ", s).strip()
    return s

df = pd.read_csv("dataset_raw.csv")

# drop yang kosong
df = df[df["content"].notna()]
df["content"] = df["content"].astype(str).str.strip()
df = df[df["content"] != ""]

# label binary
df = df[df["score"].isin([1,2,4,5])].copy()
df["label"] = df["score"].map({1:"negatif", 2:"negatif", 4:"positif", 5:"positif"})

# bersihkan teks
df["text"] = df["content"].apply(clean_text)

# optional: buang kolom identitas
keep_cols = ["text", "label", "score", "at"]
df_clean = df[keep_cols].copy()
df_clean.to_csv("dataset_clean.csv", index=False)
print(df_clean["label"].value_counts())


label
positif    7557
negatif    1885
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

X = df_clean["text"]
y = df_clean["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model1 = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=200000)),
    ("clf", LinearSVC())
])
model1.fit(X_train, y_train)
pred = model1.predict(X_test)

print("ACC:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


ACC: 0.8808893594494441
              precision    recall  f1-score   support

     negatif       0.70      0.71      0.70       377
     positif       0.93      0.92      0.93      1512

    accuracy                           0.88      1889
   macro avg       0.81      0.82      0.81      1889
weighted avg       0.88      0.88      0.88      1889



In [5]:
from sklearn.linear_model import LogisticRegression

model2 = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=5)),
    ("clf", LogisticRegression(max_iter=300, n_jobs=-1))
])
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)
print("ACC:", accuracy_score(y_test, pred2))


ACC: 0.8883006881948121


In [6]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

# encode label
label2id = {"negatif":0, "positif":1}
y_arr = df_clean["label"].map(label2id).values

X_train, X_test, y_train, y_test = train_test_split(
    df_clean["text"].values, y_arr, test_size=0.2, random_state=42, stratify=y_arr
)

max_words = 50000
max_len = 80

vectorize = layers.TextVectorization(
    max_tokens=max_words,
    output_mode="int",
    output_sequence_length=max_len
)
vectorize.adapt(X_train)

model3 = keras.Sequential([
    vectorize,
    layers.Embedding(max_words, 128),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

model3.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model3.fit(X_train, y_train, validation_split=0.1, epochs=3, batch_size=256)
loss, acc = model3.evaluate(X_test, y_test, verbose=0)
print("ACC:", acc)


Epoch 1/3
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 597ms/step - accuracy: 0.8142 - loss: 0.4851 - val_accuracy: 0.8413 - val_loss: 0.3400
Epoch 2/3
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 538ms/step - accuracy: 0.8756 - loss: 0.3136 - val_accuracy: 0.8677 - val_loss: 0.2982
Epoch 3/3
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 536ms/step - accuracy: 0.8970 - loss: 0.2675 - val_accuracy: 0.8796 - val_loss: 0.2760
ACC: 0.8803600072860718


In [7]:
import joblib
joblib.dump(model1, "svm_tfidf.joblib")


['svm_tfidf.joblib']

In [8]:
import joblib

model = joblib.load("svm_tfidf.joblib")

samples = [
    "aplikasinya sering error dan lemot",
    "fiturnya bagus banget, mantap",
]

pred = model.predict(samples)
for s, p in zip(samples, pred):
    print(f"{p}\t-> {s}")


negatif	-> aplikasinya sering error dan lemot
positif	-> fiturnya bagus banget, mantap
