In [None]:
# ================================
# 0) Library
# ================================
import os, json, time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Reproducibility: a single seed, also reused as random_state by later cells
RSEED = 42
np.random.seed(seed=RSEED)

In [None]:
# ================================
# 1) Load the cleaned dataset
# ================================
df = pd.read_csv("tokopedia_reviews_clean.csv")

# Fail fast when either required column is missing
assert {"text", "sentiment"} <= set(df.columns), "Kolom 'text'/'sentiment' tidak ditemukan."

# Minimal cleanup: drop rows missing text or label, then de-duplicate on the text
df = df.dropna(subset=["text", "sentiment"])
df = df.drop_duplicates(subset=["text"])
df = df.reset_index(drop=True)

print(df.head(3))
print(df["sentiment"].value_counts())


In [None]:
# ================================
# 2) Train/Val/Test split (stratified)
# ================================
X = df["text"].astype(str).values
y = df["sentiment"].astype(str).values

# Target proportions: 70% train, 15% validation, 15% test.
# Step 1: hold out 15% of the full data as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=RSEED
)

# Step 2: take validation out of the remaining 85%
# (0.1765 * 0.85 ≈ 0.15 of the full data).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, stratify=y_temp, random_state=RSEED
)

print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")




In [None]:
# ================================
# 3) Pipeline: TF-IDF + Logistic Regression
#    - class_weight="balanced" so minority classes are not drowned out
#    - sublinear_tf=True to dampen very frequent terms
#    - random_state=RSEED so stochastic solvers give repeatable fits
# ================================
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),  # unigrams and bigrams
        min_df=2,            # ignore terms found in fewer than 2 documents
        max_df=0.95,         # ignore terms found in more than 95% of documents
        sublinear_tf=True,
    )),
    ("clf", LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        # The grid search also tries solver="saga", which shuffles the data.
        # Pin the seed on the estimator itself rather than relying on the
        # global NumPy seed, which parallel grid-search workers may not share.
        random_state=RSEED,
    )),
])

In [None]:
# ================================
# 4) Grid search over a small hyperparameter grid
# ================================
# Only C and the solver are tuned. To try an L1 penalty as well, add a
# "clf__penalty" entry and pair it with solver="saga" (lbfgs supports L2 only).
param_grid = dict(
    clf__C=[0.5, 1.0, 2.0, 4.0],
    clf__solver=["lbfgs", "saga"],
)

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("Best CV score (macro-F1):", gs.best_score_)

In [None]:
# ================================
# 5) Evaluate on the validation set
# ================================
# Best pipeline found by the grid search
best_model = gs.best_estimator_

y_val_pred = best_model.predict(X_val)
print("\n=== VAL SET REPORT ===")
print(classification_report(y_val, y_val_pred, digits=4))

# Fixed label order so the matrix rows/columns always read neg, neu, pos
class_labels = ["neg", "neu", "pos"]
cm_val = confusion_matrix(y_val, y_val_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_val, display_labels=class_labels)

# Draw on an explicit axes: disp.plot() creates its own figure when no `ax`
# is given, so a preceding plt.figure() would leave a stray empty figure.
fig, ax = plt.subplots()
disp.plot(ax=ax, values_format="d")
ax.set_title("Confusion Matrix - Validation")
fig.tight_layout()
plt.show()


In [None]:
# ================================
# 6) FINAL evaluation on the held-out test set
# ================================
y_test_pred = best_model.predict(X_test)
print("\n=== TEST SET REPORT ===")
print(classification_report(y_test, y_test_pred, digits=4))

# Fixed label order so the matrix rows/columns always read neg, neu, pos
class_labels = ["neg", "neu", "pos"]
cm_test = confusion_matrix(y_test, y_test_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_test, display_labels=class_labels)

# Draw on an explicit axes: disp.plot() creates its own figure when no `ax`
# is given, so a preceding plt.figure() would leave a stray empty figure.
fig, ax = plt.subplots()
disp.plot(ax=ax, values_format="d")
ax.set_title("Confusion Matrix - Test")
fig.tight_layout()
plt.show()


In [None]:
# ================================
# 7) Persist the fitted pipeline plus a JSON metadata file
# ================================
# One timestamp is shared by both file names so they can be matched up later
stamp = time.strftime("%Y%m%d-%H%M%S")
os.makedirs("artefacts", exist_ok=True)

model_path = f"artefacts/sentiment_lr_pipeline_{stamp}.joblib"
joblib.dump(best_model, model_path)

meta = {
    "created_at": stamp,
    "model_path": model_path,
    "algo": "TFIDF+LogReg",
    "params": gs.best_params_,
    "classes": ["neg", "neu", "pos"],
    "notes": "Baseline sentiment Tokopedia",
}
with open(f"artefacts/metadata_{stamp}.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved:", model_path)

In [None]:
# ================================
# 8) Quick inference helper
# ================================
def predict_sentiment(texts):
    """Predict sentiment labels for review texts using the saved pipeline.

    The pipeline is re-loaded from the module-level ``model_path`` on every
    call.

    Args:
        texts: An iterable of review strings. A single ``str`` is also
            accepted and treated as one review.

    Returns:
        tuple: ``(preds, proba)``. ``preds`` is whatever the loaded model's
        ``predict`` returns; ``proba`` is the result of ``predict_proba``,
        or ``None`` when the loaded model does not expose that method.
    """
    # Wrap a bare string so it is scored as one document, and materialise
    # one-shot iterables because `texts` is consumed twice below.
    texts = [texts] if isinstance(texts, str) else list(texts)

    # NOTE(review): joblib.load unpickles the file; only load artefacts
    # that were produced by a trusted run of this notebook.
    loaded = joblib.load(model_path)
    preds = loaded.predict(texts)

    # Explicit capability check instead of a blanket try/except, so genuine
    # failures inside predict_proba are no longer silently swallowed.
    predict_proba = getattr(loaded, "predict_proba", None)
    proba = predict_proba(texts) if predict_proba is not None else None
    return preds, proba

# Example usage:
samples = [
    "Barangnya bagus, pengiriman cepat banget. Recommended!",
    "Biasa aja sih. Tidak terlalu sesuai ekspektasi.",
    "Parah, barang rusak dan penjual susah dihubungi.",
]
preds, prob = predict_sentiment(samples)

# Print each review next to its predicted label
for label, review in zip(preds, samples):
    print(f"[{label}] {review}")