In [1]:
# ============================================================
# IMPORT LIBRARY & KONFIGURASI
# ============================================================

!pip install gensim
!pip install Sastrawi

import pandas as pd
import numpy as np
import re
import string
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# For Word2Vec + LSTM (note: Percobaan 4 in this notebook uses SBERT + Linear SVM instead)
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Stemming Bahasa Indonesia (Sastrawi)
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Seed passed to train_test_split and to the classifiers in the experiments below.
RANDOM_STATE = 42
# Path of the CSV file that the data-loading cell reads with pd.read_csv.
CSV_PATH = "data_manual_250.csv"

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [2]:
# ============================================================
# PREPROCESSING (STEMMING + "LEMMATISASI")
# ============================================================

# Build the Sastrawi stemmer once at cell level; preprocess_text reads the
# module-level `stemmer` on every call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Small hand-made normalisation dictionary (extend it as needed): the informal
# spelling on the left is mapped to the word on the right.
LEMMATIZATION_DICT = dict(
    nggak="tidak",
    gak="tidak",
    ga="tidak",
    ngga="tidak",
    bgt="banget",
    bener="benar",
    beneran="benar",
)

# Translation table that maps every ASCII punctuation character to a space.
# Built once instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def basic_clean(text: str) -> str:
    """Lowercase ``text`` and strip URLs, digits, punctuation and extra spaces.

    Punctuation is replaced by a space rather than deleted, so words that are
    only separated by a punctuation mark ("bagus,tapi", "anak-anak") stay
    separate tokens instead of being glued together.

    Args:
        text: Raw input; non-string values are converted with ``str()``.

    Returns:
        The cleaned, lowercased string with single spaces between tokens and
        no leading/trailing whitespace.
    """
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", " ", text)   # drop URLs
    text = re.sub(r"\d+", " ", text)              # drop digit runs
    text = text.translate(_PUNCT_TO_SPACE)        # punctuation -> space
    # Collapse the whitespace runs produced by the substitutions above.
    text = re.sub(r"\s+", " ", text).strip()
    return text

def apply_lemmatization(tokens):
    """Replace every token that has an entry in LEMMATIZATION_DICT.

    Tokens without an entry are passed through unchanged. The result is a new
    list with the same length and order as ``tokens``.
    """
    normalized = []
    for token in tokens:
        normalized.append(LEMMATIZATION_DICT.get(token, token))
    return normalized

def preprocess_text(text: str) -> str:
    """Run the full preprocessing pipeline on a single document.

    Steps, in order:
    1) ``basic_clean`` on the raw text
    2) stemming of the cleaned string with the module-level Sastrawi ``stemmer``
    3) per-token dictionary normalisation via ``apply_lemmatization``

    Returns the resulting tokens joined by single spaces.
    """
    stemmed_tokens = stemmer.stem(basic_clean(text)).split()
    return " ".join(apply_lemmatization(stemmed_tokens))


In [3]:
# ============================================================
# LOAD DATA & TRAIN-TEST SPLIT
# ============================================================

df = pd.read_csv(CSV_PATH)
print(df.head())
print(df["label"].value_counts())

# Drop rows with a missing text or label *before* casting to str: astype(str)
# can turn a missing value into the literal string "nan", which would then be
# treated as a real token / a real class. `df` itself is left untouched.
df_labeled = df.dropna(subset=["cleaned", "label"])
n_dropped = len(df) - len(df_labeled)
if n_dropped:
    print(f"[INFO] Membuang {n_dropped} baris tanpa teks/label.")

texts_raw = df_labeled["cleaned"].astype(str)
labels_raw = df_labeled["label"].astype(str)

print("\n[INFO] Melakukan preprocessing (stem + lemmatization)...")
texts_processed = texts_raw.apply(preprocess_text)

# Stratified 80/20 split so both splits keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    texts_processed,
    labels_raw,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=labels_raw
)

print("[INFO] Distribusi label train:", Counter(y_train))
print("[INFO] Distribusi label test :", Counter(y_test))


                                             cleaned     label
0  jelek asik main night temen mati sisa gameplay...  negative
1                   alhamdulillah update seru senang  positive
2  bagus iklan murah dikit beli mahal murah dikit...  positive
3       kasih bintang bagus map map mabar candu main  positive
4                        main lot buruk sekali error  negative
label
negative    100
positive    100
neutral      50
Name: count, dtype: int64

[INFO] Melakukan preprocessing (stem + lemmatization)...
[INFO] Distribusi label train: Counter({'negative': 80, 'positive': 80, 'neutral': 40})
[INFO] Distribusi label test : Counter({'positive': 20, 'negative': 20, 'neutral': 10})


In [9]:
# ============================================================
# PERCOBAAN 1 - BOW + LOGISTIC REGRESSION
# ============================================================

# Unigram bag-of-words features, capped at 5000 vocabulary entries.
bow_vectorizer_6 = CountVectorizer(
    max_features=5000,
    ngram_range=(1, 1)
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_bow_6 = bow_vectorizer_6.fit_transform(X_train)
X_test_bow_6 = bow_vectorizer_6.transform(X_test)

# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver on a problem with
# three or more classes the default already fits a multinomial model.
logreg_bow_clf = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    random_state=RANDOM_STATE
)
logreg_bow_clf.fit(X_train_bow_6, y_train)

y_pred_logreg_bow = logreg_bow_clf.predict(X_test_bow_6)

print("=== Percobaan 1: BoW + Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_logreg_bow))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_logreg_bow, digits=4))
print("\nConfusion Matrix:")
# No `labels` argument: rows/columns follow the sorted label order.
print(confusion_matrix(y_test, y_pred_logreg_bow))

=== Percobaan 1: BoW + Logistic Regression ===
Accuracy: 0.86

Classification Report:
              precision    recall  f1-score   support

    negative     0.8947    0.8500    0.8718        20
     neutral     0.7273    0.8000    0.7619        10
    positive     0.9000    0.9000    0.9000        20

    accuracy                         0.8600        50
   macro avg     0.8407    0.8500    0.8446        50
weighted avg     0.8633    0.8600    0.8611        50


Confusion Matrix:
[[17  2  1]
 [ 1  8  1]
 [ 1  1 18]]




In [10]:
# ============================================================
# PERCOBAAN 2 - TF-IDF KARAKTER + LINEAR SVM
# ============================================================

# TfidfVectorizer is already imported in the first cell, so the aliased
# re-import that used to live here is not needed.
char_tfidf_vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),   # character 3–5-gram
    max_features=8000
)

# Fit the n-gram vocabulary and IDF weights on the training split only.
X_train_char = char_tfidf_vectorizer.fit_transform(X_train)
X_test_char = char_tfidf_vectorizer.transform(X_test)

svm_char_clf = LinearSVC(random_state=RANDOM_STATE)
svm_char_clf.fit(X_train_char, y_train)

y_pred_char_svm = svm_char_clf.predict(X_test_char)

print("=== Percobaan 2: TF-IDF Karakter (3–5-gram) + Linear SVM ===")
print("Accuracy:", accuracy_score(y_test, y_pred_char_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_char_svm, digits=4))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_char_svm))


=== Percobaan 2: TF-IDF Karakter (3–5-gram) + Linear SVM ===
Accuracy: 0.82

Classification Report:
              precision    recall  f1-score   support

    negative     0.7727    0.8500    0.8095        20
     neutral     0.7000    0.7000    0.7000        10
    positive     0.9444    0.8500    0.8947        20

    accuracy                         0.8200        50
   macro avg     0.8057    0.8000    0.8014        50
weighted avg     0.8269    0.8200    0.8217        50


Confusion Matrix:
[[17  3  0]
 [ 2  7  1]
 [ 3  0 17]]


In [11]:
# ============================================================
# PERCOBAAN 3: BoW (1–2 gram) + Linear SVM
# ============================================================

# CountVectorizer, LinearSVC and the metric helpers are already imported in the
# first cell, so they are not re-imported here.
bow_1_2_vec = CountVectorizer(
    max_features=7000,
    ngram_range=(1, 2)   # unigram + bigram
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_bow_1_2 = bow_1_2_vec.fit_transform(X_train)
X_test_bow_1_2  = bow_1_2_vec.transform(X_test)

svm_bow = LinearSVC(
    C=1.0,
    random_state=RANDOM_STATE
)
svm_bow.fit(X_train_bow_1_2, y_train)

y_pred_bow_svm = svm_bow.predict(X_test_bow_1_2)

print("=== Percobaan 3: BoW (1–2 gram) + Linear SVM ===")
print("Accuracy:", accuracy_score(y_test, y_pred_bow_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_bow_svm, digits=4))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_bow_svm))


=== Percobaan 3: BoW (1–2 gram) + Linear SVM ===
Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

    negative     0.8947    0.8500    0.8718        20
     neutral     0.6667    0.6000    0.6316        10
    positive     0.8636    0.9500    0.9048        20

    accuracy                         0.8400        50
   macro avg     0.8083    0.8000    0.8027        50
weighted avg     0.8367    0.8400    0.8369        50


Confusion Matrix:
[[17  2  1]
 [ 2  6  2]
 [ 0  1 19]]


In [12]:
pip install sentence-transformers



In [13]:
# ============================================================
# PERCOBAAN 4 - TRANSFORMER (SBERT) + LINEAR SVM
# ============================================================

from sentence_transformers import SentenceTransformer

# Pretrained multilingual sentence encoder; in this cell it is only used to
# produce embeddings via encode().
sbert_model = SentenceTransformer("distiluse-base-multilingual-cased-v2")

print("[INFO] Menghasilkan embedding Transformer untuk data train...")
X_train_sbert = sbert_model.encode(list(X_train), show_progress_bar=True)
print("[INFO] Menghasilkan embedding Transformer untuk data test...")
X_test_sbert = sbert_model.encode(list(X_test), show_progress_bar=True)

# Linear SVM trained on the sentence embeddings (fit() returns the estimator).
svm_sbert_clf = LinearSVC(random_state=RANDOM_STATE).fit(X_train_sbert, y_train)

y_pred_sbert_svm = svm_sbert_clf.predict(X_test_sbert)

print("=== Percobaan 4: Transformer (SBERT) + Linear SVM ===")
print("Accuracy:", accuracy_score(y_test, y_pred_sbert_svm))
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_sbert_svm, digits=4),
    sep="\n",
)
print(
    "\nConfusion Matrix:",
    confusion_matrix(y_test, y_pred_sbert_svm),
    sep="\n",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

[INFO] Menghasilkan embedding Transformer untuk data train...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

[INFO] Menghasilkan embedding Transformer untuk data test...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

=== Percobaan 4: Transformer (SBERT) + Linear SVM ===
Accuracy: 0.82

Classification Report:
              precision    recall  f1-score   support

    negative     0.8500    0.8500    0.8500        20
     neutral     0.7143    0.5000    0.5882        10
    positive     0.8261    0.9500    0.8837        20

    accuracy                         0.8200        50
   macro avg     0.7968    0.7667    0.7740        50
weighted avg     0.8133    0.8200    0.8111        50


Confusion Matrix:
[[17  2  1]
 [ 2  5  3]
 [ 1  0 19]]
