<a href="https://colab.research.google.com/github/stephanusanggoro401-hash/siakad-mini-cloudflare/blob/main/Stephanus_Augmentasi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers torch scikit-learn nltk sentencepiece


In [None]:
# =====================================================
# OPTIMASI INDO-BERT + AUGMENTASI TEKS SHOPEE (5K DATA)
# Back Translation menggunakan MarianMT
# =====================================================

import pandas as pd
import numpy as np
import torch
import re
import nltk
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    MarianMTModel,
    MarianTokenizer
)

from nltk.corpus import wordnet

nltk.download('wordnet')
nltk.download('omw-1.4')

# =====================================================
# LOAD DATASET (SESUAI DATA ASLI)
# =====================================================

from google.colab import files
files.upload()   # upload: shopee 5k .csv

df = pd.read_excel("shopee 5k.xlsx")


# Dataset asli:
# ['userName', 'score', 'at', 'content']

df = df[['content', 'score']]
df.columns = ['review', 'score']

# Konversi score → label sentimen (3 kelas)
# =====================================================
# PERBAIKAN TIPE DATA SCORE (WAJIB)
# =====================================================

# Pastikan score numerik (hindari TypeError)
df['score'] = pd.to_numeric(df['score'], errors='coerce')

# Buang data dengan score tidak valid
df.dropna(subset=['score'], inplace=True)

# Konversi score → label sentimen (3 kelas)
def convert_label(score):
    if score <= 2:
        return 0   # negatif
    elif score == 3:
        return 1   # netral
    else:
        return 2   # positif

df['label'] = df['score'].apply(convert_label)

# Simpan hanya kolom final
df = df[['review', 'label']]
df.dropna(inplace=True)

print("Jumlah data awal:", len(df))


# =====================================================
# PREPROCESSING
# =====================================================

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['review'] = df['review'].apply(clean_text)
df = df[df['review'].str.split().str.len() >= 3]

print("Jumlah data setelah preprocessing:", len(df))

# =====================================================
# AUGMENTASI TEKS
# =====================================================

# A. Synonym Replacement (EDA)
def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_words = words.copy()

    random_words = list(set(words))
    np.random.shuffle(random_words)

    replaced = 0
    for word in random_words:
        synonyms = wordnet.synsets(word, lang='ind')
        if synonyms:
            synonym = synonyms[0].lemmas(lang='ind')[0].name()
            new_words = [synonym if w == word else w for w in new_words]
            replaced += 1
        if replaced >= n:
            break

    return " ".join(new_words)

# B. Back Translation (ID → EN → ID) MarianMT
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id_en_name = "Helsinki-NLP/opus-mt-id-en"
model_en_id_name = "Helsinki-NLP/opus-mt-en-id"

tokenizer_id_en = MarianTokenizer.from_pretrained(model_id_en_name)
model_id_en = MarianMTModel.from_pretrained(model_id_en_name).to(device)

tokenizer_en_id = MarianTokenizer.from_pretrained(model_en_id_name)
model_en_id = MarianMTModel.from_pretrained(model_en_id_name).to(device)

def marian_back_translation(text):
    try:
        batch = tokenizer_id_en(text, return_tensors="pt", truncation=True).to(device)
        translated = model_id_en.generate(**batch)
        en_text = tokenizer_id_en.decode(translated[0], skip_special_tokens=True)

        batch = tokenizer_en_id(en_text, return_tensors="pt", truncation=True).to(device)
        translated = model_en_id.generate(**batch)
        id_text = tokenizer_en_id.decode(translated[0], skip_special_tokens=True)

        return id_text
    except:
        return text

# Terapkan augmentasi
aug_text, aug_label = [], []

# =====================================================
# AUGMENTASI TEKS (VERSI AMAN 30%)
# =====================================================

df_sample = df.sample(frac=0.3, random_state=42)

aug_text, aug_label = [], []

for i, (_, row) in enumerate(df_sample.iterrows()):
    if i % 50 == 0:
        print(f"Augmentasi ke-{i}")

    aug_text.append(synonym_replacement(row['review']))
    aug_label.append(row['label'])

    aug_text.append(marian_back_translation(row['review']))
    aug_label.append(row['label'])


df_aug = pd.DataFrame({
    'review': aug_text,
    'label': aug_label
})

df_final = pd.concat([df, df_aug]).sample(frac=1, random_state=42).reset_index(drop=True)
print("Total data setelah augmentasi:", len(df_final))

# =====================================================
# SPLIT DATA
# =====================================================

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_final['review'],
    df_final['label'],
    test_size=0.2,
    stratify=df_final['label'],
    random_state=42
)

# =====================================================
# TOKENISASI INDO-BERT
# =====================================================

tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

train_enc = tokenizer(
    train_texts.tolist(),
    padding=True,
    truncation=True,
    max_length=128
)

test_enc = tokenizer(
    test_texts.tolist(),
    padding=True,
    truncation=True,
    max_length=128
)

# =====================================================
# DATASET PYTORCH
# =====================================================

class ShopeeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.reset_index(drop=True)

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ShopeeDataset(train_enc, train_labels)
test_dataset = ShopeeDataset(test_enc, test_labels)

# =====================================================
# LOAD MODEL INDO-BERT
# =====================================================

model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=3
)

# =====================================================
# TRAINING ARGUMENT
# =====================================================

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=100
)

# =====================================================
# METRIK EVALUASI
# =====================================================

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# =====================================================
# TRAINING MODEL
# =====================================================

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# =====================================================
# EVALUASI AKHIR
# =====================================================

result = trainer.evaluate()
print("Hasil Evaluasi Model:", result)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Saving shopee 5k.xlsx to shopee 5k (3).xlsx
Jumlah data awal: 4999
Jumlah data setelah preprocessing: 3063




Augmentasi ke-0
Augmentasi ke-50
Augmentasi ke-100
Augmentasi ke-150
Augmentasi ke-200
Augmentasi ke-250
Augmentasi ke-300
Augmentasi ke-350
Augmentasi ke-400
Augmentasi ke-450
Augmentasi ke-500
Augmentasi ke-550
Augmentasi ke-600
Augmentasi ke-650
Augmentasi ke-700
Augmentasi ke-750
Augmentasi ke-800
Augmentasi ke-850
Augmentasi ke-900
Total data setelah augmentasi: 4901


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"




Step,Training Loss
100,0.4864
200,0.3958
300,0.3539
400,0.3028
500,0.2185
600,0.1697
700,0.1593




Hasil Evaluasi Model: {'eval_loss': 0.38191595673561096, 'eval_accuracy': 0.9011213047910296, 'eval_precision': 0.8929282918373966, 'eval_recall': 0.9011213047910296, 'eval_f1': 0.892254519761056, 'eval_runtime': 270.7084, 'eval_samples_per_second': 3.624, 'eval_steps_per_second': 0.229, 'epoch': 3.0}
