# Read Dataset

In [None]:
import pandas as pd

# Load the sample dataset from the Excel workbook into a DataFrame.
data = pd.read_excel("data/diy/sample_data_1.xlsx")
# Preview the first rows (rendered as the cell output).
data.head()

In [None]:
# Contoh: styling kolom agar wrap
def display_fullscreen_wrap(df):
    """Return ``df.style`` with text-wrapping CSS applied to every cell.

    The CSS properties are handed to ``set_properties`` as keyword
    arguments, so the result is whatever that call returns.
    """
    wrap_css = {}
    wrap_css["white-space"] = "pre-wrap"  # wrap the cell content
    wrap_css["word-break"] = "break-word"  # break anywhere when the text is too long
    wrap_css["width"] = "900px"  # adjust as needed
    wrap_css["max-width"] = "700px"  # upper bound for the column width

    styler = df.style
    return styler.set_properties(**wrap_css)

# EDA

In [None]:
# Show the column labels of the loaded dataset (rendered as the cell output).
data.columns

### Summary

In [None]:
# DataFrame.info() writes its report to a buffer (stdout by default) and
# returns None, so printing its return value only showed "Summary : None".
# Capture the report instead so `info` holds the actual summary text.
import io

_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
info = _info_buffer.getvalue()
print("Summary :\n" + info)

### Find Missing Values

In [None]:
# Count and display the number of missing values in each column
print("Jumlah missing values disetiap kolom:\n", data.isnull().sum())

### Drop columns that do not provide useful information

In [None]:
# Keep only the text column. The explicit .copy() makes `data` an independent
# frame, so the later `data["full_text"] = ...` assignments do not operate on
# a selection of the original frame (which triggers SettingWithCopyWarning).
data = data[["full_text"]].copy()
data.head()

### Check for duplicate rows in the `full_text` column

In [None]:
# Count rows flagged as duplicates of another row.
# NOTE(review): duplicates are only counted here; nothing in this notebook drops them.
data.duplicated().sum()

# Data Preparation

### Case Folding

In [None]:
# Convert the text to lowercase
data["full_text"] = data["full_text"].str.lower()
# Preview the result (rendered as the cell output).
data.head()

## Cleaning Data

### Emoji To Word

In [None]:
import pandas as pd
from indoNLP.preprocessing import (
    pipeline,
    replace_word_elongation,
    replace_slang,
    remove_html,
    remove_url,
    emoji_to_words
)
# Apply emoji_to_words (with lang="id") to every value of the text column.
# NOTE(review): str(x) turns a missing value (NaN) into the literal string
# "nan" — confirm the dataset has no missing texts or handle them beforehand.
data["full_text"] = data["full_text"].apply(lambda x: emoji_to_words(str(x), lang="id"))
# Show the first rows with wrapped cells.
display_fullscreen_wrap(data.head())

### Remove HTML

In [None]:
# data["full_text"] = data["full_text"].apply(lambda x: remove_html(str(x)))
# display_fullscreen_wrap(data.head())

### Remove URL

In [None]:
# data["full_text"] = data["full_text"].apply(lambda x: remove_url(str(x)))
# display_fullscreen_wrap(data.head())

### Replace Slang

In [None]:
# data["full_text"] = data["full_text"].apply(lambda x: replace_slang(str(x)))
# display_fullscreen_wrap(data.head())

### Replace Word Elongation

In [None]:
# Apply replace_word_elongation to every value of the text column
# (each value is converted with str() first).
data["full_text"] = data["full_text"].apply(lambda x: replace_word_elongation(str(x)))
# Show the first rows with wrapped cells.
display_fullscreen_wrap(data.head())

### Replace URLs and User Mentions

In [None]:
import re

# Function to replace URLs and user mentions
def replace_urls_and_mentions(text):
    """Mask links and user mentions in ``text`` with fixed placeholders.

    The substitutions run in a fixed order: ``http://``/``https://`` links
    and ``www.`` links both become ``HTTPURL``, then ``@name`` mentions
    become ``@USER``. The resulting string is returned.
    """
    substitutions = (
        # Links that start with a scheme
        (
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            "HTTPURL",
        ),
        # Links that start with "www."
        (
            r"www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            "HTTPURL",
        ),
        # User mentions
        (r"@[A-Za-z0-9_]+", "@USER"),
    )

    masked = text
    for pattern, placeholder in substitutions:
        masked = re.sub(pattern, placeholder, masked)
    return masked

# Apply the URL and mention replacement to every value of the text column
# (each value is converted with str() first).
data["full_text"] = data["full_text"].apply(lambda x: replace_urls_and_mentions(str(x)))
# Show the first rows with wrapped cells.
display_fullscreen_wrap(data.head())

In [None]:
# Check Ollama status and fix connection issues

import requests
import subprocess
import json

def check_ollama_status():
    """Probe the local Ollama API and list the models it reports.

    Returns a ``(is_running, models)`` tuple. ``models`` is the ``"models"``
    list from the ``/api/tags`` reply when the server answers with HTTP 200,
    and an empty list in every other case (non-200 status, connection
    failure, or any other error, all of which are printed).
    """
    try:
        # Ask the server for its model list; a reply also proves it is up
        reply = requests.get("http://localhost:11434/api/tags", timeout=5)
        if reply.status_code != 200:
            print(f"❌ Ollama API returned status: {reply.status_code}")
            return False, []

        payload = reply.json()
        print("✅ Ollama is running!")
        print("Available models:")
        installed = payload.get("models", [])
        for entry in installed:
            print(f"  - {entry['name']}")
        return True, installed
    except requests.exceptions.ConnectionError:
        print("❌ Cannot connect to Ollama. Is it running?")
        print("To start Ollama, run: ollama serve")
        return False, []
    except Exception as err:
        print(f"❌ Error checking Ollama: {err}")
        return False, []


def start_ollama():
    """Try to launch ``ollama serve`` as a background process.

    Returns:
        True when the process was spawned (after pausing 5 seconds so the
        server can initialize), False when spawning raised an exception.
        A True result only means the process was started; it does not
        confirm that the API is answering.
    """
    try:
        print("🔄 Trying to start Ollama...")
        # Discard the server's output. Nothing ever reads these streams, and
        # an unread PIPE fills its OS buffer and then blocks the long-running
        # server on its next write.
        subprocess.Popen(
            ["ollama", "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        print("✅ Ollama started! Wait a few seconds for it to initialize...")
        import time

        time.sleep(5)
        return True
    except Exception as e:
        print(f"❌ Failed to start Ollama: {e}")
        print("Please start Ollama manually by running 'ollama serve' in terminal")
        return False


def pull_model(model_name="llama3.1:8b"):
    """Download ``model_name`` by running ``ollama pull``.

    Returns True when the command exits with code 0. Returns False when it
    exits with any other code (its stderr is printed) or when running it
    raises, including exceeding the 300 second timeout.
    """
    try:
        print(f"🔄 Pulling model {model_name}...")
        command = ["ollama", "pull", model_name]
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=300
        )
        if completed.returncode != 0:
            print(f"❌ Failed to pull model: {completed.stderr}")
            return False
        print(f"✅ Model {model_name} pulled successfully!")
        return True
    except Exception as err:
        print(f"❌ Error pulling model: {err}")
        return False


# Probe the Ollama server; when it is down, try to launch it and probe again
print("Checking Ollama status...")
is_running, available_models = check_ollama_status()

if not is_running:
    print("\n🔧 Ollama is not running. Trying to start it...")
    started = start_ollama()
    if started:
        # Wait a little longer before the second probe
        import time

        time.sleep(3)
        is_running, available_models = check_ollama_status()

if is_running:
    # Names (including tags) of the models the server reported as installed
    model_names = [model["name"] for model in available_models]
    # Acceptable models, in order of preference
    required_models = ["llama3.1:8b", "llama3:latest", "llama3.1", "llama3"]

    # Walk the preferences in order and take the first installed model whose
    # name contains the preferred name. Keep the *installed* name (with its
    # tag) rather than the preferred pattern: an untagged name such as
    # "llama3.1" is resolved to the ":latest" tag, which is not necessarily
    # the variant that is actually installed (e.g. only "llama3.1:70b").
    available_model = None
    for required in required_models:
        available_model = next(
            (installed for installed in model_names if required in installed),
            None,
        )
        if available_model:
            break

    if not available_model:
        print(f"\n🔄 Required model not found. Available models: {model_names}")
        print("Pulling llama3.1:8b...")
        if pull_model("llama3.1:8b"):
            available_model = "llama3.1:8b"

    if available_model:
        print(f"\n✅ Using model: {available_model}")

        # Define the classifier with the selected model as its default
        def classify_hate_speech_with_ollama(text, model=available_model):
            """
            Classify text for hate speech using an Ollama-served model.

            Args:
                text: The text to analyse; it is embedded in the prompt.
                model: Model name sent to the Ollama API.

            Returns:
                The dict parsed from the model's JSON answer (the prompt asks
                for 'is_hate_speech', 'target_type', 'confidence' and
                'reasoning'). When the request fails or the answer cannot be
                parsed, a fallback dict with is_hate_speech=False,
                target_type="none", confidence="low" and a 'reasoning' that
                names the failure.
            """

            def _fallback(reasoning):
                # Conservative answer used whenever the model's reply is unusable
                return {
                    "is_hate_speech": False,
                    "target_type": "none",
                    "confidence": "low",
                    "reasoning": reasoning,
                }

            prompt = f"""
Analisis teks berikut untuk menentukan apakah mengandung hate speech (ucapan kebencian) atau tidak.

Hate Speech adalah tindakan komunikasi berupa provokasi, hasutan, atau hinaan kepada individu/kelompok berdasarkan ras, warna kulit, etnis, gender, cacat, orientasi seksual, kewarganegaraan, agama, status sosial ekonomi, pandangan politik, atau karakteristik identitas lainnya.

Kriteria Hate Speech:
1. Mengandung kata-kata kasar, cercaan, atau makian yang ditujukan pada kelompok tertentu
2. Menyebarkan kebencian atau diskriminasi terhadap identitas seseorang
3. Menghasut atau memprovokasi tindakan kekerasan terhadap individu/kelompok
4. Merendahkan atau menghina berdasarkan karakteristik pribadi atau identitas
5. Mengandung ancaman atau intimidasi terhadap kelompok tertentu

Teks: "{text}"

Berikan analisis dalam format JSON:
{{
    "is_hate_speech": true/false,
    "target_type": "individual/group/none",
    "confidence": "high/medium/low",
    "reasoning": "penjelasan singkat mengapa diklasifikasikan demikian"
}}

Jawab hanya dengan JSON, tanpa penjelasan tambahan.
"""

            try:
                response = requests.post(
                    "http://localhost:11434/api/generate",
                    json={
                        "model": model,
                        "prompt": prompt,
                        "stream": False,
                        "options": {
                            "temperature": 0.1,
                            "top_p": 0.9,
                        },
                    },
                    timeout=60,
                )

                if response.status_code != 200:
                    return _fallback(f"API error: {response.status_code}")

                generated_text = response.json()["response"].strip()

                import re

                # The model may wrap the JSON in extra text; take the span
                # from the first "{" to the last "}"
                json_match = re.search(r"\{.*\}", generated_text, re.DOTALL)
                if not json_match:
                    return _fallback("Failed to parse response")

                try:
                    return json.loads(json_match.group())
                except json.JSONDecodeError:
                    return _fallback("Invalid JSON response")

            except Exception as e:
                # Best effort: any failure is reported as a low-confidence answer
                return _fallback(f"Error: {str(e)}")

        # Test the function with a sample text
        sample_text = "Selamat pagi, bagaimana kabar Anda hari ini?"
        test_result = classify_hate_speech_with_ollama(sample_text)
        print("\n🧪 Test classification result:")
        print(json.dumps(test_result, indent=2, ensure_ascii=False))

    else:
        print("❌ Could not setup any model")
else:
    print("\n❌ Please start Ollama manually:")
    print("1. Open terminal")
    print("2. Run: ollama serve")
    print("3. In another terminal, run: ollama pull llama3.1:8b")
    print("4. Then run this cell again")

In [None]:
# Fixed Multi-Category Hate Speech Classification
import json


def classify_hate_speech_detailed(text, model="qwen2.5:latest", *, session=None):
    """
    Classify text for hate speech with detailed categories.

    Args:
        text: The text to analyse; it is embedded in the prompt.
        model: Model name sent to the Ollama API (use one that is installed).
        session: Optional object with a requests-compatible
            ``post(url, json=..., timeout=...)`` method, e.g. a
            ``requests.Session`` for connection reuse across many calls.
            When omitted, the ``requests`` module itself is used.

    Returns:
        dict with the binary labels "HS", "HS_INDIVIDU", "HS_KELOMPOK",
        "HS_RAS", "HS_GENDER", "HS_AGAMA", "HS_POLITIK", "HS_FISIK",
        "HS_SOSIAL" plus "confidence" and "reasoning". Keys missing from the
        model's answer keep their defaults (0 / "low" / "No classification").
        When the request fails or the answer cannot be parsed, every label
        is 0, confidence is "low" and "reasoning" names the failure.
    """

    def _defaults(reasoning):
        # Fresh all-negative result; built per call so callers never share state
        return {
            "HS": 0,
            "HS_INDIVIDU": 0,
            "HS_KELOMPOK": 0,
            "HS_RAS": 0,
            "HS_GENDER": 0,
            "HS_AGAMA": 0,
            "HS_POLITIK": 0,
            "HS_FISIK": 0,
            "HS_SOSIAL": 0,
            "confidence": "low",
            "reasoning": reasoning,
        }

    prompt = f"""
Kamu adalah ahli analisis hate speech bahasa Indonesia. Analisis teks berikut dengan SANGAT HATI-HATI.

DEFINISI HATE SPEECH:
- Ucapan yang mengandung KEBENCIAN, DISKRIMINASI, atau PENGHINAAN terhadap individu/kelompok
- Berdasarkan ras, agama, gender, etnis, orientasi seksual, disabilitas, dll
- Menghasut kekerasan atau menciptakan permusuhan

YANG BUKAN HATE SPEECH:
- Berita kecelakaan, bencana, kriminal
- Kritik konstruktif
- Keluhan layanan publik
- Diskusi politik normal

Teks: "{text}"

Analisis dengan format JSON (nilai 1=ada, 0=tidak ada):
{{
    "HS": 0,
    "HS_INDIVIDU": 0,
    "HS_KELOMPOK": 0,
    "HS_RAS": 0,
    "HS_GENDER": 0,
    "HS_AGAMA": 0,
    "HS_POLITIK": 0,
    "HS_FISIK": 0,
    "HS_SOSIAL": 0,
    "confidence": "high",
    "reasoning": "penjelasan detail"
}}

HANYA jawab JSON, tidak ada teks lain.
"""

    try:
        # Resolved inside the try so a missing HTTP client is reported like
        # any other request failure
        http = requests if session is None else session

        response = http.post(
            "http://localhost:11434/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.3,  # raised slightly for flexibility
                    "top_p": 0.9,
                },
            },
            timeout=60,
        )

        if response.status_code != 200:
            return _defaults(f"API error: {response.status_code}")

        generated_text = response.json()["response"].strip()

        import re

        # The model may wrap the JSON in extra text; take the span from the
        # first "{" to the last "}"
        json_match = re.search(r"\{.*\}", generated_text, re.DOTALL)
        if not json_match:
            return _defaults("Failed to parse response")

        try:
            classification = json.loads(json_match.group())
        except json.JSONDecodeError:
            return _defaults("Invalid JSON response")

        # Start from the defaults so every expected key exists, then overlay
        # whatever the model actually returned
        result = _defaults("No classification")
        result.update(classification)
        return result

    except Exception as e:
        # Best effort: any failure is reported as an all-negative answer
        return _defaults(f"Error: {str(e)}")


# Smoke test: two texts that are clearly NOT hate speech and one that is
# (fixed so each result is printed only once)
test_texts = [
    "palang pintu pelintasan terbuka saat ka malioboro ekspres melintas kecelakaan pun tak bisa dihindari",
    "selamat pagi semua, semoga hari ini menyenangkan",
    "dasar orang hitam jelek kayak monyet",  # this one is actual hate speech
]

print("Testing full JSON output:")
for i, text in enumerate(test_texts):
    result = classify_hate_speech_detailed(text)
    # One print call per text; the newline separator reproduces the
    # line-by-line report
    print(
        f"\n=== Text {i+1} ===",
        f"Text: {text[:50]}...",
        "Full JSON Result:",
        json.dumps(result, indent=2, ensure_ascii=False),
        "-" * 70,
        sep="\n",
    )

In [None]:
# Test klasifikasi dengan 3 data pertama
import time
import pandas as pd

print("=== Testing dengan 3 data pertama ===")
test_data = data.head(3).copy()

# Result columns filled in by the classifier
classification_columns = [
    "HS",
    "HS_INDIVIDU",
    "HS_KELOMPOK",
    "HS_RAS",
    "HS_GENDER",
    "HS_AGAMA",
    "HS_POLITIK",
    "HS_FISIK",
    "HS_SOSIAL",
    "confidence",
    "reasoning",
]

# Start every result column empty
for col in classification_columns:
    test_data[col] = None

# Classify the first three rows, one request per row
for idx, text in test_data["full_text"].items():
    print(f"\n🔄 Memproses data ke-{idx+1}...")
    print(f"Text: {text[:100]}...")

    result = classify_hate_speech_detailed(text)

    # Store the result; labels missing from it fall back to 0, the other
    # columns to "unknown"
    for col in classification_columns:
        fallback = 0 if col.startswith("HS") else "unknown"
        test_data.at[idx, col] = result.get(col, fallback)

    print(f"HS: {result['HS']}, Confidence: {result['confidence']}")

    # Pause between requests to avoid overloading the server
    time.sleep(2)

print("\n=== Hasil Testing 3 Data ===")
preview_columns = [
    "full_text",
    "HS",
    "HS_INDIVIDU",
    "HS_KELOMPOK",
    "confidence",
    "reasoning",
]
display_fullscreen_wrap(test_data[preview_columns])

In [None]:
# Once the small test run works, classify the whole dataset
print("=== Memproses SEMUA data ===")
print(f"Total data yang akan diproses: {len(data)}")

# Work on a copy that also holds the classification columns
final_data = data.copy()

# Result columns filled in by the classifier
classification_columns = [
    "HS",
    "HS_INDIVIDU",
    "HS_KELOMPOK",
    "HS_RAS",
    "HS_GENDER",
    "HS_AGAMA",
    "HS_POLITIK",
    "HS_FISIK",
    "HS_SOSIAL",
    "confidence",
    "reasoning",
]

for col in classification_columns:
    final_data[col] = None

# Progress tracking
total_data = len(final_data)
processed = 0
start_time = time.time()

print("🚀 Mulai klasifikasi semua data...")

for idx, row in final_data.iterrows():
    processed += 1
    text = row["full_text"]

    # Report progress for the first five rows and then every tenth row
    if processed % 10 == 0 or processed <= 5:
        # `processed` already counts the current, not yet classified row, so
        # only `processed - 1` rows are finished. Averaging over `processed`
        # understated the ETA and always reported 0.0 for the first row.
        completed = processed - 1
        if completed:
            elapsed = time.time() - start_time
            avg_time = elapsed / completed
            remaining = (total_data - completed) * avg_time
            eta = f"{remaining/60:.1f} menit"
        else:
            # No timing sample yet
            eta = "menghitung..."
        print(
            f"Progress: {processed}/{total_data} ({processed/total_data*100:.1f}%) - ETA: {eta}"
        )
        print(f"Current text: {text[:80]}...")

    # Classify the row
    result = classify_hate_speech_detailed(text)

    # Store the result; labels missing from it fall back to 0, the other
    # columns to "unknown"
    for col in classification_columns:
        final_data.at[idx, col] = result.get(
            col, 0 if col.startswith("HS") else "unknown"
        )

    # Short pause between requests for stability
    time.sleep(1)

print(f"\n✅ Selesai! Total waktu: {(time.time() - start_time)/60:.1f} menit")

In [None]:
# Simpan hasil klasifikasi
import os

# Create the output folder when it does not exist yet
output_dir = "data/diy"
os.makedirs(output_dir, exist_ok=True)

# Write the results to Excel
output_file = f"{output_dir}/hate_speech_classification_results.xlsx"
final_data.to_excel(output_file, index=False)
print(f"✅ Data berhasil disimpan ke: {output_file}")

# Also write a CSV copy as a backup
csv_file = f"{output_dir}/hate_speech_classification_results.csv"
final_data.to_csv(csv_file, index=False)
print(f"✅ Backup CSV disimpan ke: {csv_file}")

# Summary statistics
total_rows = len(final_data)
hs_count = final_data["HS"].sum()
non_hs_count = total_rows - hs_count

print("\n=== SUMMARY HASIL KLASIFIKASI ===")
print(f"Total data: {total_rows}")
print(f"Hate Speech terdeteksi: {hs_count} ({hs_count/total_rows*100:.1f}%)")
print(f"Non Hate Speech: {non_hs_count} ({non_hs_count/total_rows*100:.1f}%)")

# Per-category counts and their share of all rows
print("\n=== Breakdown per kategori ===")
category_columns = (
    "HS_INDIVIDU",
    "HS_KELOMPOK",
    "HS_RAS",
    "HS_GENDER",
    "HS_AGAMA",
    "HS_POLITIK",
    "HS_FISIK",
    "HS_SOSIAL",
)
for col in category_columns:
    count = final_data[col].sum()
    print(f"{col}: {count} ({count/total_rows*100:.1f}%)")

print("\n=== Confidence level ===")
print(final_data["confidence"].value_counts())

# Show the first ten classified rows
print("\n=== Sample Hasil Klasifikasi ===")
sample_columns = [
    "full_text",
    "HS",
    "HS_INDIVIDU",
    "HS_KELOMPOK",
    "confidence",
    "reasoning",
]
display_fullscreen_wrap(final_data[sample_columns].head(10))