# 1. Install Library

In [2]:
!pip install transformers datasets scikit-learn --quiet


# 2. Buat Dataset

In [3]:
import pandas as pd

data = [
    ("Barang sesuai deskripsi", "positive"),
    ("Pengiriman cepat dan aman", "positive"),
    ("Kualitas sangat bagus", "positive"),
    ("Sesuai harga, lumayan lah", "positive"),
    ("Packing rapi dan pengiriman cepat", "positive"),
    ("Barang bagus, tidak mengecewakan", "positive"),
    ("Kualitas mantap dan pelayanan baik", "positive"),
    ("Produk sesuai gambar dan cepat sampai", "positive"),
    ("Sangat puas dengan pembelian ini", "positive"),
    ("Pelayanan ramah dan produk bagus", "positive"),
    ("Barang bagus banget, makasih seller", "positive"),
    ("Cepat sampai dan langsung dicoba", "positive"),
    ("Penjual sangat responsif", "positive"),
    ("Barang sesuai ekspektasi", "positive"),
    ("Kualitas produk luar biasa", "positive"),
    ("Barang cacat dan tidak bisa dipakai", "negative"),
    ("Pengiriman sangat lama", "negative"),
    ("Kualitas jelek dan murahan", "negative"),
    ("Tidak sesuai deskripsi, mengecewakan", "negative"),
    ("Produk rusak saat diterima", "negative"),
    ("Packing asal-asalan dan produk kotor", "negative"),
    ("Barang tidak sesuai dengan gambar", "negative"),
    ("Sangat mengecewakan, tidak direkomendasikan", "negative"),
    ("Produk rusak dan pelayanan buruk", "negative"),
    ("Barang lama sampai dan kualitas buruk", "negative"),
    ("Pesanan tidak lengkap", "negative"),
    ("Kualitas oke tapi pengiriman lambat", "negative"),
    ("Barang bagus, tapi kemasan rusak", "negative"),
    ("Terlambat kirim dan tidak sesuai permintaan", "negative"),
    ("Kemasan rapi tapi isi produk rusak", "negative")
]

df = pd.DataFrame(data, columns=["review", "sentiment"])
df.to_csv("ulasan_produk.csv", index=False, quoting=1)


# 3. Load Dataset & Persiapkan Label

In [4]:
df = pd.read_csv("ulasan_produk.csv", quotechar='"')
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print(df.head())


                              review sentiment  label
0            Barang sesuai deskripsi  positive      1
1          Pengiriman cepat dan aman  positive      1
2              Kualitas sangat bagus  positive      1
3          Sesuai harga, lumayan lah  positive      1
4  Packing rapi dan pengiriman cepat  positive      1


# 4. Tokenisasi dengan IndoBERT Tokenizer

In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

tokens = tokenizer(
    list(df['review']),
    padding='max_length',
    truncation=True,
    max_length=100,
    return_tensors='tf'
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


# 5. Split Data Train & Test

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split

# Konversi Tensor menjadi NumPy array
input_ids = tokens['input_ids'].numpy()
attention_mask = tokens['attention_mask'].numpy()
labels = df['label'].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    input_ids, labels, test_size=0.2, random_state=42)

attn_train, attn_test = train_test_split(
    attention_mask, test_size=0.2, random_state=42)


# 6. Load IndoBERT Model for Classification

In [8]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

model = TFBertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=2
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)


tf_model.h5:   0%|          | 0.00/656M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 7. Training Model

In [9]:
history = model.fit(
    [X_train, attn_train],
    y_train,
    validation_split=0.2,
    epochs=4,
    batch_size=8
)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


# 8. Evaluasi Model

In [11]:
import numpy as np
from sklearn.metrics import classification_report

# Lakukan prediksi pada data test
preds = model.predict([X_test, attn_test])

# Ambil label prediksi dari logits
y_pred = np.argmax(preds.logits, axis=1)

# Tampilkan evaluasi performa model
print(classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))


              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00         4
    Positive       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



# 9. Prediksi Ulasan Baru

In [14]:
def prediksi_ulasan(text):
    inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=100)
    output = model.predict([inputs['input_ids'], inputs['attention_mask']])
    pred_label = np.argmax(output.logits, axis=1)[0]
    confidence = tf.nn.softmax(output.logits)[0][pred_label].numpy()
    label_str = "Positive" if pred_label == 1 else "Negative"
    return label_str, confidence

# Coba prediksi:
ulasan = "Kemasan rapi tapi isi produk rusak"
hasil, skor = prediksi_ulasan(ulasan)
print(f"Ulasan: {ulasan}\nPrediksi: {hasil} (Confidence: {skor:.2f})")


Ulasan: Kemasan rapi tapi isi produk rusak
Prediksi: Negative (Confidence: 0.91)
