In [1]:
!pip install transformers==4.41.2 datasets==2.18.0 accelerate==0.30.1 --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.9/170.9 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.2.0 which is incompatible.[0m

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import re


LOAD TRAIN AND TEST DATA


In [3]:
# Locations of the pre-split CSV files on the Colab file system.
TRAIN_CSV = '/content/train_70.csv'
TEST_CSV = '/content/test_30.csv'

train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

# Last expression of the cell: a tuple with the first five rows of each frame.
(train_df.head(), test_df.head())


(      id          header_review  \
 0  50201             Luar Biasa   
 1  93053  Recomended Bagus Bang   
 2  28246   Barangnya Udh Sampai   
 3  13659    Barang Sudah Sampai   
 4   7886  Barang Sesuai Pesanan   
 
                                review_sangat_singkat  label  
 0  Thank u gan.sudah sampai.sesuai pesanan mantap...      1  
 1  Respon cepat. Recomended gan..thanks packing r...      1  
 2      Barangnya sangat bagus dan bahannya jga bagus      1  
 3                    Terima kasih gaaaaaaannnnnnnnnn      1  
 4                     Barang bagus,sesuai foto, puas      1  ,
       id                       header_review  \
 0  51890               Barang Sudah Sampai..   
 1  29962  Mantap Gan Pengiriman Cepat Sampai   
 2  86194              Sip...................   
 3  34344                               Trims   
 4  53163                             Kecewa!   
 
                                review_sangat_singkat  label  
 0                              barang sudah

Preprocessing


In [4]:
def preprocess(text):
    """Normalise a raw review into lowercase, space-separated alphanumerics.

    Steps: lowercase, replace every character that is not an ASCII letter,
    digit or whitespace with a space, then collapse whitespace runs and trim.

    Args:
        text: The raw review. Non-string values are converted with ``str``.
            ``None`` and float NaN (how pandas represents an empty CSV cell)
            are treated as missing.

    Returns:
        The cleaned string; an empty string for missing input.
    """
    # Without this guard str(NaN) / str(None) would turn a missing review into
    # the literal tokens "nan" / "none" and feed them to the model.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add the cleaned review text as a new 'clean_review' column on both splits;
# the raw 'review_sangat_singkat' column is left as it is.
for frame in (train_df, test_df):
    frame['clean_review'] = frame['review_sangat_singkat'].apply(preprocess)


In [5]:
# Single checkpoint id shared by the tokenizer, the config and the encoder.
MODEL_NAME = "indolem/indobert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=2)
# The loaded encoder is wrapped by IndoBERTClassifier further down.
bert = AutoModel.from_pretrained(MODEL_NAME, config=config)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Dataset Class (disesuaikan dengan dataset kamu)

In [6]:
class ReviewDataset(Dataset):
    """Map-style dataset yielding one tokenised review and its label per index.

    Args:
        df: DataFrame with a 'clean_review' text column and a 'label' column.
        tokenizer: Callable used to encode a single text. When ``None``
            (the default) the module-level ``tokenizer`` is looked up each
            time an item is accessed.
        max_length: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer=None, max_length=128):
        self.texts = df['clean_review'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode review ``idx`` and return a dict of 1-D tensors plus 'labels'."""
        # `tokenizer` on the right is the module-level name, not the
        # constructor argument (which lives on self.tokenizer).
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoded = encode(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack items into a batch itself.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        # CrossEntropyLoss needs integer class indices; forcing long keeps
        # training working even if the label column was parsed as float.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [7]:
BATCH_SIZE = 16

train_dataset, test_dataset = ReviewDataset(train_df), ReviewDataset(test_df)

# Only the training loader shuffles; the test loader keeps row order, which
# the prediction-sampling cell relies on when it indexes predictions by row.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [8]:
class IndoBERTClassifier(torch.nn.Module):
    """Encoder followed by dropout and a linear classification layer.

    Args:
        bert: Encoder module. Called with ``input_ids`` and ``attention_mask``
            keyword arguments; its output must expose ``last_hidden_state``.
        num_labels: Number of output classes.
    """

    def __init__(self, bert, num_labels=2):
        super().__init__()
        self.bert = bert
        self.dropout = torch.nn.Dropout(0.3)
        # Take the feature width from the encoder's config so encoders of other
        # sizes work too; 768 (the previous hard-coded value) is the fallback.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.fc = torch.nn.Linear(hidden_size, num_labels)
        # Parameter-free, so building it once does not change the state dict.
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classify from the first token's hidden state.

        Returns:
            ``logits`` when ``labels`` is None, otherwise the tuple
            ``(loss, logits)`` with the cross-entropy loss against ``labels``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of every sequence is used as the sentence representation.
        cls_token = outputs.last_hidden_state[:, 0]
        logits = self.fc(self.dropout(cls_token))

        if labels is None:
            return logits
        return self.loss_fn(logits, labels), logits


In [9]:
# Seed torch's global RNG before the classification head is initialised and
# before the shuffled DataLoader draws its first permutation, so both are
# repeatable across kernel restarts.
SEED = 42
torch.manual_seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = IndoBERTClassifier(bert).to(device)
# transformers.AdamW is deprecated in favour of the PyTorch optimizer.
# eps and weight_decay are set to the deprecated class's defaults because
# torch.optim.AdamW defaults differ (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)




In [None]:
from tqdm.auto import tqdm

EPOCHS = 4  # number of full passes over the training data

for epoch in range(EPOCHS):
    print(f"\n===== EPOCH {epoch+1}/{EPOCHS} ====")
    model.train()  # enable dropout for the training pass
    total_loss = 0

    progress = tqdm(train_loader, desc=f"Training Epoch {epoch+1}")
    for batch in progress:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model return (loss, logits).
        loss, _ = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['labels'].to(device),
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"Loss akhir epoch {epoch+1}: {total_loss/len(train_loader):.4f}")


===== EPOCH 1/4 ====


Training Epoch 1:   0%|          | 0/4215 [00:00<?, ?it/s]

In [None]:
import torch
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Class ids in the order used for every table and plot below
# (0 = negatif, 1 = positif).
CLASS_IDS = [0, 1]

model.eval()  # disable dropout for inference
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()

        # Without labels the model returns the logits only.
        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

# --- Print text output ---
# Passing `labels` fixes the row/column order and keeps the matrix 2x2 even if
# one class never occurs, so it always matches the tick labels and target_names.
print("\nConfusion Matrix:")
cm = confusion_matrix(all_labels, all_preds, labels=CLASS_IDS)
print(cm)

print("\nClassification Report:")
print(classification_report(all_labels, all_preds, labels=CLASS_IDS,
                            target_names=['negatif','positif']))

# --- Visual heatmap confusion matrix ---
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negatif', 'Positif'],
            yticklabels=['Negatif', 'Positif'],
            ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()




Confusion Matrix:
[[ 2386  1269]
 [  893 24351]]

Classification Report:
              precision    recall  f1-score   support

     negatif       0.73      0.65      0.69      3655
     positif       0.95      0.96      0.96     25244

    accuracy                           0.93     28899
   macro avg       0.84      0.81      0.82     28899
weighted avg       0.92      0.93      0.92     28899



In [None]:
import pandas as pd
import numpy as np # Add numpy import as it's used for np.random.choice

# Make sure df_test is the original test frame with a clean 0..n-1 index, so
# positional indices into the prediction lists match its row labels.
df_test = test_df.reset_index(drop=True)

# Assign y_true and y_pred from the results of the previous cell
y_true = all_labels
y_pred = all_preds

# Predictions are matched to rows by position; fail loudly if the evaluation
# cell was run against a different test frame (stale kernel state).
assert len(y_true) == len(y_pred) == len(df_test), (
    "predictions are out of sync with test_df; re-run the evaluation cell"
)

# Pick up to 10 random row indices (fewer if the test set is smaller, where
# sampling 10 without replacement would raise).
n_samples = min(10, len(df_test))
indices = np.random.choice(len(df_test), n_samples, replace=False)

print("===== 10 CONTOH HASIL PREDIKSI MODEL =====\n")

for i in indices:
    teks = df_test.loc[i, "review_sangat_singkat"]
    label_asli = y_true[i]
    prediksi = y_pred[i]
    status = "BENAR" if label_asli == prediksi else "SALAH"

    print(f"Teks Review : {teks}")
    print(f"Label Asli  : {label_asli} ({'Negatif' if label_asli==0 else 'Positif'})")
    print(f"Prediksi    : {prediksi} ({'Negatif' if prediksi==0 else 'Positif'})")
    print(f"Hasil       : {status}")
    print("-" * 60)


===== 10 CONTOH HASIL PREDIKSI MODEL =====

Teks Review : Mabtap biar bintang yg menjawab
Label Asli  : 1 (Positif)
Prediksi    : 1 (Positif)
Hasil       : BENAR
------------------------------------------------------------
Teks Review : Packing baik.. lembut dan wangi balm nya.. Cocok dengan kulit bibir saya
Label Asli  : 1 (Positif)
Prediksi    : 1 (Positif)
Hasil       : BENAR
------------------------------------------------------------
Teks Review : Sukak, bgs ya walaupun ga sesuai yg di gambar
Label Asli  : 1 (Positif)
Prediksi    : 1 (Positif)
Hasil       : BENAR
------------------------------------------------------------
Teks Review : saya telah konfirmasi penerimaan karena kehabisan kuota data
Label Asli  : 1 (Positif)
Prediksi    : 1 (Positif)
Hasil       : BENAR
------------------------------------------------------------
Teks Review : Sesuai harga........................................
Label Asli  : 1 (Positif)
Prediksi    : 1 (Positif)
Hasil       : BENAR
-----------------