In [None]:
# Fine-tuning Model BERT Spam Detection Indonesia
# Fine-tune ulang dari model nahiar/spam-detection-bert-v1

import json
import warnings

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

warnings.filterwarnings('ignore')

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")
if has_cuda:
    # Report name and total memory (in GiB) of GPU 0.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_total_bytes / 1024**3:.1f} GB")


In [None]:
# 1. Load the existing model and tokenizer
print("Loading model dan tokenizer...")

MODEL_NAME = "nahiar/spam-detection-bert-v1"

# NOTE(review): trust_remote_code=True permits Python code shipped in the Hub
# repository to run locally. Confirm the repository is trusted, or drop the
# flag if the checkpoint loads without it.
hub_options = {"trust_remote_code": True}
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **hub_options)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **hub_options)

# Place the model on the device selected in the first cell.
model.to(device)

# Short summary of what was loaded.
print(f"Model loaded: {MODEL_NAME}")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size:,}")
print(f"Max length: {tokenizer.model_max_length}")


In [None]:
# 2. Load and prepare the dataset
print("Loading dataset...")

# Load the combined dataset (or point this at a new dataset if one is available).
# The raw frame is kept under its own name so the cleaning step below never
# overwrites its input.
df_raw = pd.read_csv('../data/combined_dataset.csv')

print(f"Dataset shape: {df_raw.shape}")
print(f"Columns: {list(df_raw.columns)}")
print("\nDistribusi label:")
print(df_raw['label'].value_counts())

# Optional: subsample when the dataset is too large
# df_raw = df_raw.sample(n=5000, random_state=42)

# Clean data:
# - drop rows only when 'text' or 'label' is missing (a NaN in an unrelated
#   column is not a reason to lose a training example);
# - normalise text to stripped strings and discard empty ones;
# - cast labels to int: a label column that contained NaN is read as float,
#   and the sequence-classification model needs integer class ids.
df = (
    df_raw
    .dropna(subset=['text', 'label'])
    .assign(
        text=lambda frame: frame['text'].astype(str).str.strip(),
        label=lambda frame: frame['label'].astype(int),
    )
    .loc[lambda frame: frame['text'].str.len() > 0]
    .reset_index(drop=True)
)

print(f"\nAfter cleaning - Dataset shape: {df.shape}")
print("Sample data:")
print(df.head())


In [None]:
# 3. Tokenization dan Dataset Preparation
def tokenize_function(examples):
    """Tokenize a batch of examples for training.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``;
            ``examples['text']`` holds the raw strings to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding and no tensor conversion here: padding inside ``map`` would
    # pad every example to the longest one in the whole map batch, and the
    # resulting tensors would be stored back as plain lists anyway. The
    # Trainer is constructed with the tokenizer, so its default collator pads
    # each training batch to that batch's own longest sequence.
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=512,
    )

# Stratified 80/20 train/validation split with a fixed seed
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

# Wrap each split in a Dataset with 'text' and 'labels' columns
train_dataset, val_dataset = (
    Dataset.from_dict({'text': split_texts, 'labels': split_labels})
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

# Tokenize both splits in batches (train first, then validation)
print("Tokenizing datasets...")
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

print("Dataset tokenization completed!")


In [None]:
# 4. Metric Computation Function
def compute_metrics(eval_pred):
    """Compute evaluation metrics for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores with classes on axis 1 (or a tuple whose first
            element does); ``labels`` holds the reference class ids.

    Returns:
        dict: ``{'accuracy': <fraction of correctly predicted examples>}``.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=1)

    # Deliberately no 'eval_loss' entry: a hard-coded 0.0 is not a measurement
    # and would be reported as the loss whenever the real one is unavailable.
    return {'accuracy': accuracy_score(labels, predicted_classes)}

# 5. Training arguments
print("Setting up training arguments...")

OUTPUT_DIR = "../models/v2"  # Directory where the new model is saved

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,  # Adjustable; 1-3 epochs is usually enough when re-fine-tuning
    per_device_train_batch_size=16,  # Adjust to the available GPU memory
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{OUTPUT_DIR}/logs',
    logging_steps=100,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; rename it if the installed version rejects the old name.
    evaluation_strategy="steps",
    eval_steps=500,
    # Saving uses the same cadence as evaluation, as required when
    # load_best_model_at_end is enabled.
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # The string "none" disables wandb/tensorboard reporting. Passing None
    # does not: it falls back to the library default for integrations.
    report_to="none",
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is available
    dataloader_num_workers=4,
    save_total_limit=2,  # Cap the number of checkpoints kept on disk
)

print("Training arguments configured!")


In [None]:
# 6. Set up the Trainer
print("Setting up trainer...")

# Early stopping with a patience of 3 evaluations on the metric configured
# in the training arguments.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Trainer configured!")


In [None]:
# 7. Start fine-tuning
banner = "=" * 50
print(banner)
print("MEMULAI FINE-TUNING MODEL")
print(banner)

# Baseline: evaluate the model before any fine-tuning
print("Evaluating model sebelum fine-tuning...")
eval_results_before = trainer.evaluate()
acc_before = eval_results_before['eval_accuracy']
print(f"Accuracy sebelum fine-tuning: {acc_before:.4f}")

# Train, timing the run with wall-clock timestamps
print("\nMemulai training...")
training_start_time = pd.Timestamp.now()
trainer.train()
training_end_time = pd.Timestamp.now()
training_duration = training_end_time - training_start_time

print("\nTraining selesai!")
print(f"Durasi training: {training_duration}")

# Evaluate again after fine-tuning and report the change in accuracy
print("\nEvaluating model setelah fine-tuning...")
eval_results_after = trainer.evaluate()
acc_after = eval_results_after['eval_accuracy']
print(f"Accuracy setelah fine-tuning: {acc_after:.4f}")

print(f"\nImprovement: {acc_after - acc_before:.4f}")


In [None]:
# 8. Save model
print("Saving fine-tuned model...")

# Write the model via the Trainer and the tokenizer files into the same directory
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model berhasil disimpan di: {OUTPUT_DIR}")

# Run summary stored next to the model: base checkpoint, timestamp, split
# sizes, accuracy before/after and the training duration.
accuracy_before = eval_results_before['eval_accuracy']
accuracy_after = eval_results_after['eval_accuracy']
model_info = {
    "base_model": MODEL_NAME,
    "fine_tuned_on": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    "training_samples": len(train_texts),
    "validation_samples": len(val_texts),
    "accuracy_before": accuracy_before,
    "accuracy_after": accuracy_after,
    "improvement": accuracy_after - accuracy_before,
    "training_duration": str(training_duration)
}

# `json` comes from the notebook's import cell. The encoding is explicit so
# the file does not depend on the platform default.
with open(f"{OUTPUT_DIR}/model_info.json", "w", encoding="utf-8") as f:
    json.dump(model_info, f, indent=2)

print("Model info saved!")


In [None]:
# 9. Test the new model
print("=" * 50)
print("TESTING MODEL YANG SUDAH DI-FINE-TUNE")
print("=" * 50)

# A few hand-written sample messages
test_texts = [
    "Dapatkan hadiah gratis dengan klik link ini!",
    "Selamat pagi, bagaimana kabarnya hari ini?",
    "GRATIS! Klik link ini untuk mendapat hadiah jutaan rupiah!",
    "Terima kasih atas informasinya, sangat membantu",
    "URGENT! Transfer uang sekarang juga untuk mendapat bonus",
    "Besok ada rapat penting di kantor",
    "Klik www.gratishadiah.com untuk mendapat iPhone gratis!",
    "Selamat hari raya, semoga bahagia selalu"
]

print("Testing model dengan sample texts:")
print("-" * 40)

# Put the model in inference mode explicitly so the predictions do not depend
# on which cell happened to run last (training leaves dropout enabled).
model.eval()

for i, text in enumerate(test_texts, 1):
    # Tokenize and move the tensors to wherever the model currently lives;
    # the Trainer may have placed it on a device other than `device`.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Predict without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()

    # NOTE(review): assumes class id 1 means spam; confirm against model.config.id2label.
    label = "SPAM" if predicted_class == 1 else "HAM"

    print(f"{i}. Text: {text}")
    print(f"   Prediction: {label} (confidence: {confidence:.4f})")
    print()

print("Testing selesai!")


# Cara Menggunakan Model yang Sudah Di-Fine-tune

Setelah fine-tuning selesai, Anda bisa menggunakan model baru dengan cara berikut:

## 1. Load Model Lokal

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model dari direktori lokal
tokenizer = AutoTokenizer.from_pretrained("../models/v2")
model = AutoModelForSequenceClassification.from_pretrained("../models/v2")
```

## 2. Menggunakan Pipeline

```python
from transformers import pipeline

# Buat classifier pipeline
classifier = pipeline(
    "text-classification",
    model="../models/v2",
    tokenizer="../models/v2"
)

# Test
result = classifier("Dapatkan hadiah gratis sekarang!")
print(result)
```

## 3. Upload ke Hugging Face Hub (Opsional)

Jika ingin mengupload model baru ke Hugging Face:

```python
from huggingface_hub import HfApi

# Login ke Hugging Face (perlu token)
# huggingface-cli login

# Upload model
model.push_to_hub("nahiar/spam-detection-bert-v2")
tokenizer.push_to_hub("nahiar/spam-detection-bert-v2")
```

## Tips Fine-tuning:

1. **Epoch**: Untuk fine-tune ulang, biasanya 1-3 epoch sudah cukup
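2. **Learning Rate**: Gunakan learning rate yang kecil agar bobot hasil training sebelumnya tidak rusak; nilai default `TrainingArguments` (5e-5) biasanya sudah cukup, turunkan jika akurasi validasi malah menurun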
3. **Batch Size**: Sesuaikan dengan GPU memory yang tersedia
4. **Data**: Pastikan data bersih dan seimbang antara spam/ham
5. **Evaluasi**: Selalu evaluasi model sebelum dan sesudah fine-tuning
