<a href="https://colab.research.google.com/github/stephanusanggoro401-hash/siakad-mini-cloudflare/blob/main/Stephanus_Tanpa_augmentasi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages for this notebook; -q keeps pip's output quiet.
!pip install -q torch transformers accelerate scikit-learn nltk sentencepiece sacremoses



In [None]:
# =====================================================
# INDO-BERT TANPA AUGMENTASI (MODEL PEMBANDING)
# =====================================================

import pandas as pd
import numpy as np
import torch
import re
import nltk
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Fetch the two NLTK corpora up front, in the same order as before.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# =====================================================
# LOAD DATASET
# =====================================================

from google.colab import files

# Upload step: the workbook read below is expected to be among the uploaded files.
files.upload()

# Read the workbook and keep only the two columns used downstream,
# exposing them under the names 'review' and 'score'.
df = pd.read_excel("shopee 5k.xlsx")[['content', 'score']]
df.columns = ['review', 'score']

# Scores that cannot be parsed as numbers become NaN; those rows are discarded.
df['score'] = pd.to_numeric(df['score'], errors='coerce')
df = df.dropna(subset=['score'])

def convert_label(score):
    """Map a numeric score to a class index.

    Returns 1 when the score equals 3, 0 when it is 2 or lower, and 2 for
    every other value (presumably neutral / negative / positive sentiment).
    """
    # The two tests are mutually exclusive, so handling the middle case first
    # leaves the remaining decision as a simple two-way choice.
    if score == 3:
        return 1
    return 0 if score <= 2 else 2

# Derive the class label from each score, keep only the text and label
# columns, and drop any row that still has a missing value.
df = (
    df.assign(label=df['score'].apply(convert_label))
    [['review', 'label']]
    .dropna()
)

print("Jumlah data awal:", len(df))

# =====================================================
# PREPROCESSING (SAMA)
# =====================================================

def clean_text(text):
    """Normalize one review string for tokenization.

    Steps, in order: lowercase, remove anything starting with ``http`` up to
    the next whitespace, remove every character that is not an ASCII letter
    or whitespace, collapse whitespace runs to a single space, and trim.

    Args:
        text: The raw review. Normally a ``str``; other values are tolerated
            because spreadsheet cells are not guaranteed to be text.

    Returns:
        The cleaned string. ``None`` and NaN yield ``""``; any other
        non-string is converted with ``str()`` first, so a purely numeric
        cell also ends up as ``""`` once its digits are stripped.
    """
    if not isinstance(text, str):
        # A missing cell must not turn into the literal word "none"/"nan".
        if text is None or (isinstance(text, float) and text != text):
            return ""
        # E.g. a review consisting only of digits is read from Excel as a
        # number, which has no .lower(); stringify it instead of crashing.
        text = str(text)

    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

# Clean every review in place.
df['review'] = df['review'].apply(clean_text)

# Keep only reviews that still contain at least three whitespace-separated words.
df = df.loc[df['review'].str.split().str.len().ge(3)]

print("Jumlah data setelah preprocessing:", len(df))

# =====================================================
# SPLIT DATA (TANPA AUGMENTASI)
# =====================================================

# Stratified 80/20 hold-out split; the fixed random_state keeps the
# partition identical across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'], df['label'],
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

# =====================================================
# INDO-BERT TOKENIZATION
# =====================================================

tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Both splits are encoded with the same settings (train first, then test):
# padded, and truncated to at most 128 tokens.
train_enc, test_enc = (
    tokenizer(split.tolist(), padding=True, truncation=True, max_length=128)
    for split in (train_texts, test_texts)
)

# =====================================================
# DATASET PYTORCH
# =====================================================

class ShopeeDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            per-example sequence, as produced by calling the tokenizer on a
            list of texts.
        labels: Any iterable of integer class labels in the same order as
            the encodings: a pandas Series (its index is ignored), a list,
            a NumPy array or a tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Iterating yields values in positional order, so a Series with a
        # shuffled index (as returned by train_test_split) lines up with the
        # encodings without needing pandas-specific reset_index().
        self.labels = [int(label) for label in labels]

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the target under ``labels``."""
        item = {
            field: torch.tensor(values[idx])
            for field, values in self.encodings.items()
        }
        # Explicit int64 so the label dtype does not depend on the input container.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

train_dataset = ShopeeDataset(train_enc, train_labels)
test_dataset = ShopeeDataset(test_enc, test_labels)

# =====================================================
# LOAD MODEL
# =====================================================

# The 3-way classification head is not part of the checkpoint and is
# initialized randomly at load time. Seeding torch first makes that
# initialization repeatable, so reruns of this baseline stay comparable.
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=3
)

# =====================================================
# TRAINING ARGUMENTS
# =====================================================

training_args = TrainingArguments(
    output_dir="./results_no_aug",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    logging_dir="./logs_no_aug",
    logging_steps=100,
    # Turn off external reporting integrations. With the default, training
    # pauses at an interactive Weights & Biases prompt, which blocks
    # unattended runs. Set this to "tensorboard" instead if event files in
    # logging_dir are wanted.
    report_to="none",
)

# =====================================================
# METRIK EVALUASI
# =====================================================

def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1 for the Trainer.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true class indices)
            and ``predictions`` (per-class scores, or a tuple whose first
            element holds them).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids

    scores = pred.predictions
    # Tolerate tuple-shaped predictions; the class scores are assumed to be
    # the first element, as is conventional for model outputs.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = np.argmax(scores, axis=-1)

    # zero_division=0 yields the same numbers as the default but without the
    # undefined-metric warning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# =====================================================
# TRAINING
# =====================================================

# Bundle the model, its hyper-parameters, both datasets and the metric
# callback into a single Trainer.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Fine-tune on the training split.
trainer.train()

# =====================================================
# EVALUATION
# =====================================================

# Score the fine-tuned model on the held-out split and print the metrics.
result_no_aug = trainer.evaluate()
print("Hasil Evaluasi Model TANPA Augmentasi:", result_no_aug)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Saving shopee 5k.xlsx to shopee 5k.xlsx
Jumlah data awal: 4999
Jumlah data setelah preprocessing: 3063


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
100,0.4563
200,0.3806
300,0.309
400,0.2234


Hasil Evaluasi Model TANPA Augmentasi: {'eval_loss': 0.4792124032974243, 'eval_accuracy': 0.8531810766721044, 'eval_precision': 0.810362451205831, 'eval_recall': 0.8531810766721044, 'eval_f1': 0.8304872404085009, 'eval_runtime': 4.0279, 'eval_samples_per_second': 152.189, 'eval_steps_per_second': 9.683, 'epoch': 3.0}
