In [None]:
# Test Emoji Support in Current Tokenizer
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from rich.console import Console
from rich.table import Table

# Rich console handle for formatted output
console = Console()

# Location of the currently used fine-tuned checkpoint
model_dir = "/Users/rhd/Documents/Raihan/Dev/Model-ML/spam-detection-twitter/models/v2"

# Current tokenizer - IndoBERTweet (author's note: already good for social media + emoji).
# NOTE(review): the tokenizer is pulled from the Hub while the weights come from
# model_dir — confirm the checkpoint was trained with this exact vocabulary.
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()  # switch to evaluation mode before running predictions

# Report what was loaded, one line per fact
print(
    "🤖 Model dan Tokenizer loaded!",
    f"📝 Tokenizer type: {type(tokenizer).__name__}",
    f"🧠 Model type: {type(model).__name__}",
    f"📖 Vocab size: {tokenizer.vocab_size}",
    sep="\n",
)


In [None]:
# Sample texts covering a range of emoji usage
test_texts = [
    # Text without emoji
    "Beli produk murah disini sekarang juga",

    # Text with simple emoji
    "Halo 😊 apa kabar?",
    "Spam message 🚨🔥 klik link ini sekarang!!!",

    # Text with complex emoji (several in a row, ZWJ sequences)
    "PROMO GILA 🎉🎊💥 Diskon 90% hari ini saja! 💰💸 Jangan sampai terlewat! ⏰🏃‍♂️",
    "Selamat pagi teman-teman 🌅☀️ semoga hari ini menyenangkan 💖",

    # Text with emoji that are commonly used in spam
    "🔥🔥🔥 URGENT!!! Transfer sekarang juga 💰💰💰",
    "❤️❤️❤️ Love you baby 💋💋💋",

    # Text with emoji in an Indonesian everyday context
    "Lagi hujan nih ☔ di Jakarta 🏙️",
    "Makan nasi gudeg 🍛 enak banget! 😋👍"
]

def _token_has_emoji(token):
    """Return True when ``token`` contains an emoji-like character.

    A character counts as emoji-like when its Unicode general category is
    "So" (Symbol, other), the category pictographic emoji belong to. This is
    a heuristic: a few non-emoji symbols (e.g. the degree sign) share the
    category, and keycap sequences are not matched. Unlike a plain
    "non-ASCII" test it does not flag curly quotes, ellipses or accented
    letters.
    """
    import unicodedata  # local import keeps this cell self-contained

    return any(unicodedata.category(ch) == "So" for ch in token)


def test_emoji_tokenization(text):
    """Tokenize ``text``, classify it and report which tokens carry emoji.

    Relies on the module-level ``tokenizer`` and ``model``. Prints the
    original text, the prediction with its confidence, the token list and
    the emoji-bearing tokens.

    Args:
        text: The message to analyse.

    Returns:
        dict with keys ``'text'``, ``'prediction'`` ("SPAM" or "HAM"),
        ``'confidence'`` (softmax probability of the predicted class),
        ``'tokens'`` (all tokens including special tokens) and
        ``'emoji_tokens'`` (the subset of tokens containing emoji).
    """

    print(f"\n{'='*80}")
    print(f"📝 Original: {text}")

    # Tokenize; inputs longer than 128 tokens are truncated
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])

    # Predict without tracking gradients
    with torch.no_grad():
        outputs = model(**encoded)
        probabilities = torch.softmax(outputs.logits, dim=-1)
        prediction = torch.argmax(outputs.logits, dim=-1).item()
        confidence = probabilities[0][prediction].item()

    # NOTE(review): assumes class index 1 means spam — confirm against the
    # label mapping used when the model was trained.
    label = "SPAM" if prediction == 1 else "HAM"
    emoji_result = "🚨" if label == "SPAM" else "✅"

    print(f"🎯 Prediction: {emoji_result} {label} (confidence: {confidence:.3f})")
    print(f"🔤 Tokens ({len(tokens)}): {tokens}")

    # Collect the tokens that actually contain emoji. Special tokens such as
    # [CLS]/[SEP]/[UNK] are plain ASCII and therefore never match, so emoji
    # that the tokenizer could not represent do not show up here.
    emoji_tokens = [t for t in tokens if _token_has_emoji(t)]
    print(f"😀 Emoji tokens detected: {len(emoji_tokens)} - {emoji_tokens}")

    return {
        'text': text,
        'prediction': label,
        'confidence': confidence,
        'tokens': tokens,
        'emoji_tokens': emoji_tokens
    }

# Run every sample text through the tokenizer/model and keep each report
results = [test_emoji_tokenization(sample) for sample in test_texts]


In [None]:
# Analyse the results and print recommendations
import unicodedata

import pandas as pd


def _text_has_emoji(text):
    """Return True when the raw ``text`` contains an emoji-like character.

    Heuristic: any code point whose Unicode general category is "So"
    (Symbol, other), the category pictographic emoji belong to. A few
    non-emoji symbols (e.g. the degree sign) share that category.
    """
    return any(unicodedata.category(ch) == "So" for ch in text)


# Build the summary table (one row per tested text)
summary_df = pd.DataFrame([
    {
        'Text': r['text'][:50] + "..." if len(r['text']) > 50 else r['text'],
        'Prediction': r['prediction'],
        'Confidence': f"{r['confidence']:.3f}",
        'Total_Tokens': len(r['tokens']),
        'Emoji_Tokens': len(r['emoji_tokens']),
        # Derived from the raw text, so a row with Has_Emoji == 'Yes' and
        # Emoji_Tokens == 0 exposes emoji that were lost during tokenization.
        'Has_Emoji': 'Yes' if _text_has_emoji(r['text']) else 'No'
    }
    for r in results
])

print("📊 SUMMARY HASIL TEST EMOJI:")
print(summary_df.to_string(index=False))

# Split the results by whether the ORIGINAL text contains emoji. Splitting on
# r['emoji_tokens'] instead would make the detection rate below always 100%
# and would file texts with dropped emoji under "no emoji".
emoji_texts = [r for r in results if _text_has_emoji(r['text'])]
non_emoji_texts = [r for r in results if not _text_has_emoji(r['text'])]

print("\n🔍 ANALISIS:")
print(f"✅ Text dengan emoji: {len(emoji_texts)}/{len(results)}")
print(f"📝 Text tanpa emoji: {len(non_emoji_texts)}/{len(results)}")

if emoji_texts:
    avg_emoji_confidence = sum(r['confidence'] for r in emoji_texts) / len(emoji_texts)
    print(f"🎯 Rata-rata confidence untuk text dengan emoji: {avg_emoji_confidence:.3f}")

    # Share of emoji-bearing texts for which at least one emoji token
    # survived tokenization
    emoji_detection_good = sum(1 for r in emoji_texts if r['emoji_tokens']) / len(emoji_texts)
    print(f"😀 Emoji detection rate: {emoji_detection_good:.2%}")

# NOTE(review): the recommendations below are static text; they are not
# derived from the numbers printed above.
print("\n💡 REKOMENDASI:")
print("1. ✅ IndoBERTweet tokenizer SUDAH support emoji dengan baik")
print("2. 🎯 Model bisa detect pattern spam dengan emoji")
print("3. 📚 Tapi untuk improve accuracy, perlu:")
print("   - Tambah data training dengan lebih banyak emoji")
print("   - Fine-tune dengan dataset yang kaya emoji")
print("   - Tambah preprocessing khusus emoji jika perlu")


In [None]:
# CARA IMPROVE EMOJI DETECTION - Data Augmentation
import random

def augment_text_with_emoji(text, label, *, rng=None):
    """Return emoji-decorated variants of ``text`` for data augmentation.

    Args:
        text: The original message.
        label: ``'spam'`` selects the spam emoji pool and placement
            strategy; any other value is handled as ham.
        rng: Optional object providing ``sample`` and ``choice`` (for example
            a seeded ``random.Random``) so the augmentation is reproducible.
            When omitted, the module-level ``random`` functions are used.

    Returns:
        A list of strings. For spam, three emoji groups are drawn (two
        distinct, three distinct, one emoji repeated three times); each group
        yields a prefixed and a suffixed variant, plus a mid-sentence variant
        when ``text`` has more than five whitespace-separated words. For ham,
        two emoji are drawn and each yields a suffixed and a prefixed variant.
    """
    picker = random if rng is None else rng

    # Emoji typical for spam messages
    spam_emojis = ["🔥", "💰", "💸", "🚨", "⚠️", "❗", "‼️", "💥", "🎉", "🎊", "⏰", "🏃‍♂️", "🏃‍♀️"]

    # Emoji typical for ham messages
    ham_emojis = ["😊", "😄", "👍", "❤️", "💖", "🌅", "☀️", "☔", "🏙️", "🍛", "😋", "🙏"]

    if label != 'spam':
        # Ham: a single "natural", friendly emoji at either end
        variants = []
        for emoji in picker.sample(ham_emojis, 2):
            variants.append(text + ' ' + emoji)
            variants.append(emoji + ' ' + text)
        return variants

    # Spam: "urgent", attention-grabbing emoji clusters
    emoji_groups = [
        picker.sample(spam_emojis, 2),
        picker.sample(spam_emojis, 3),
        [picker.choice(spam_emojis)] * 3,  # repeated emoji
    ]

    words = text.split()  # split once; reused for every mid-sentence variant
    variants = []
    for group in emoji_groups:
        cluster = ''.join(group)
        variants.append(cluster + ' ' + text)  # at the start
        variants.append(text + ' ' + cluster)  # at the end
        # In the middle, only when the text is long enough
        if len(words) > 5:
            mid = len(words) // 2
            variants.append(' '.join(words[:mid] + [cluster] + words[mid:]))

    return variants

# Try the augmentation on a handful of labelled samples
sample_texts = [
    ("Beli sekarang juga diskon besar", "spam"),
    ("Selamat pagi semua", "ham"),
    ("URGENT transfer uang sekarang", "spam"),
    ("Terima kasih atas bantuan nya", "ham"),
]

print("🔄 CONTOH DATA AUGMENTATION DENGAN EMOJI:", "=" * 60, sep="\n")

for original_text, label in sample_texts:
    print(f"\n📝 Original ({label}): {original_text}")
    augmented = augment_text_with_emoji(original_text, label)

    # Only the first three variants are shown
    preview = augmented[:3]
    for position, variant in enumerate(preview, start=1):
        print(f"   {position}. {variant}")

# Steps for improving emoji detection
print(
    "\n💡 LANGKAH-LANGKAH IMPROVE EMOJI DETECTION:",
    "1. 📊 Kumpulkan dataset spam/ham yang kaya emoji dari social media",
    "2. 🔄 Gunakan data augmentation seperti di atas",
    "3. 📚 Fine-tune model dengan dataset yang sudah di-augment",
    "4. 🎯 Test dan evaluate performa dengan emoji-rich test set",
    "5. 🔧 Adjust preprocessing jika perlu (normalize emoji, dll)",
    sep="\n",
)

# Alternative approaches
print(
    "\n🛠️ ALTERNATIF LAIN:",
    "• Gunakan model yang pre-trained khusus social media (seperti IndoBERTweet yang sudah kamu pakai)",
    "• Tambah emoji embedding layer",
    "• Preprocessing khusus: convert emoji ke text description",
    "• Ensemble dengan rule-based emoji detector",
    sep="\n",
)


In [None]:
# PRACTICAL IMPLEMENTATION - steps for emoji-enhanced training

# Banner line followed by a 60-character rule
print("🚀 STEP-BY-STEP GUIDE UNTUK EMOJI SUPPORT:", "=" * 60, sep="\n")

# The six-step guide is user-facing text and is emitted verbatim
print(
    """
📋 STEP 1: PREP DATASET DENGAN EMOJI
=====================================
1. Kumpulkan data spam/ham dari Twitter, Instagram, TikTok
2. Pastikan ada banyak emoji dalam dataset
3. Buat balanced dataset (50% ada emoji, 50% tanpa emoji)

📊 STEP 2: DATA AUGMENTATION
============================
1. Jalankan script augmentation (seperti function di atas)
2. Multiply dataset dengan emoji variations
3. Ensure emoji patterns match realistic usage

🧠 STEP 3: TOKENIZER CHECK
==========================
1. ✅ KAMU SUDAH PAKAI IndoBERTweet - ini BAGUS untuk emoji!
2. Verify emoji tokenization dengan cell test di atas
3. No need to retrain tokenizer, IndoBERTweet sudah optimal

🎯 STEP 4: MODEL FINE-TUNING
============================
1. Load pre-trained model kamu
2. Fine-tune dengan emoji-rich dataset
3. Monitor performance di emoji vs non-emoji texts

📏 STEP 5: EVALUATION & TESTING
===============================
1. Test dengan varied emoji combinations
2. Check false positive/negative rates
3. A/B test dengan model lama

🔧 STEP 6: DEPLOYMENT CONSIDERATIONS
===================================
1. Preprocessing pipeline harus handle emoji
2. Monitor emoji pattern changes over time
3. Regular updates dengan new emoji trends
"""
)

# Quick implementation example (printed as text, not executed here)
print("\n💻 QUICK IMPLEMENTATION EXAMPLE:")
print("-" * 40)

# The snippet iterates over the DataFrame it loads (df_emoji); it previously
# referenced an undefined name `df`, so copy-pasting it raised NameError.
code_example = '''
# 1. Load emoji-enhanced dataset
df_emoji = pd.read_csv('spam_dataset_with_emoji.csv')

# 2. Augment existing data
augmented_data = []
for idx, row in df_emoji.iterrows():
    original = row['text']
    label = row['label']

    # Add original
    augmented_data.append({'text': original, 'label': label})

    # Add emoji variants
    if random.random() < 0.3:  # 30% augmentation rate
        variants = augment_text_with_emoji(original, label)
        for variant in variants[:2]:  # Add 2 variants max
            augmented_data.append({'text': variant, 'label': label})

# 3. Fine-tune with emoji data
trainer = Trainer(
    model=model,
    train_dataset=emoji_dataset,
    eval_dataset=test_dataset,
    # ... other params
)
trainer.train()
'''

print(code_example)

# Expected outcomes of the emoji-aware fine-tuning
print(
    "\n🎯 HASIL YANG DIHARAPKAN:",
    "• Model bisa detect emoji patterns dalam spam",
    "• Better accuracy untuk social media content",
    "• Robust terhadap emoji-heavy messages",
    "• Maintain performance untuk text tanpa emoji",
    sep="\n",
)

# Closing conclusion
print(
    "\n✅ KESIMPULAN:",
    "Tokenizer kamu (IndoBERTweet) SUDAH BAGUS untuk emoji!",
    "Yang perlu: FINE-TUNE model dengan dataset yang kaya emoji.",
    "Gak perlu train tokenizer dari awal! 🎉",
    sep="\n",
)
