In [None]:
import requests
import json
import pandas as pd
from IPython.display import display
import itables

class getConnectorDatabase:
    """Small HTTP client for a connector service exposed behind a gateway.

    URLs are built as ``{gateway_url}/{service_slug}/{endpoint_slug}``. When an
    API key is supplied it is sent in the ``x-api-key`` header on every request.
    """

    # Seconds to wait for the gateway before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self, gateway_url, service_slug, api_key=None):
        """Store connection settings and prepare the request headers.

        Args:
            gateway_url (str): Base URL of the gateway; a trailing slash is removed.
            service_slug (str): Path segment identifying the service.
            api_key (str, optional): Key sent as ``x-api-key`` when provided.
        """
        self.gateway_url = gateway_url.rstrip("/")  # Remove trailing slash
        self.service_slug = service_slug

        self.api_key = api_key
        self.headers = {}

        # Set API key in headers if provided
        if self.api_key:
            self.headers["x-api-key"] = self.api_key
            self.headers["Content-Type"] = "application/json"

    def build_url(self, endpoint_slug):
        """Build complete URL from components"""
        return f"{self.gateway_url}/{self.service_slug}/{endpoint_slug}"

    def _request_json(self, method, endpoint_slug, error_label, **kwargs):
        """Send one request and return the decoded JSON body.

        Args:
            method (str): HTTP verb, e.g. ``"GET"`` or ``"POST"``.
            endpoint_slug (str): Endpoint path appended by ``build_url``.
            error_label (str): Prefix used in the error raised for non-200 replies.
            **kwargs: Passed through to ``requests.request`` (``params``, ``json``...).

        Raises:
            Exception: On transport failure, a non-200 status or a non-JSON body.
        """
        url = self.build_url(endpoint_slug)

        try:
            response = requests.request(
                method,
                url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
                **kwargs,
            )
        except requests.exceptions.RequestException as e:
            # Chain the original error so the root cause stays in the traceback.
            raise Exception(f"Connection Error: {str(e)}") from e

        if response.status_code != 200:
            raise Exception(f"{error_label}: {response.status_code} - {response.text}")

        try:
            return response.json()
        except ValueError as e:
            # A 200 reply that is not JSON (e.g. a proxy HTML page) is reported
            # with the same label instead of leaking a decoder error.
            raise Exception(
                f"{error_label}: {response.status_code} - response is not valid JSON"
            ) from e

    def get_connection(self, params=None, pretty_print=True):
        """Get data from the ``get-all-connector`` endpoint.

        Args:
            params (dict, optional): Query-string parameters for the request.
            pretty_print (bool): When true, print the response as indented JSON.

        Returns:
            The decoded JSON response.

        Raises:
            Exception: On connection failure or a non-200 response.
        """
        data = self._request_json(
            "GET", "get-all-connector", "API Error", params=params
        )

        if pretty_print:
            print(json.dumps(data, indent=4, ensure_ascii=False))

        return data

    def execute_query(self, connector_id, query, params=None, max_rows=1000):
        """Execute SQL query on the database.

        Args:
            connector_id: Identifier of the connector to run the query on; sent
                as the ``connector_id`` query-string parameter.
            query (str): SQL text, sent in the JSON body.
            params (list, optional): Query parameters sent in the JSON body
                (an empty list when omitted).
            max_rows (int): Row cap sent in the JSON body.

        Returns:
            The decoded JSON response.

        Raises:
            Exception: On connection failure or a non-200 response.
        """
        payload = {"query": query, "params": params or [], "max_rows": max_rows}

        # Let requests encode connector_id instead of pasting it into the URL,
        # so ids containing reserved characters cannot corrupt the query string.
        return self._request_json(
            "POST",
            "query-sql",
            "Query Error",
            params={"connector_id": connector_id},
            json=payload,
        )

# Define Connector


In [None]:
# Usage Example
# The API key is read from the CONNECTOR_API_KEY environment variable, falling
# back to an interactive prompt, so the secret is never stored in the notebook.
# NOTE(review): a key that was previously committed here should be rotated.
import os
from getpass import getpass

connector = getConnectorDatabase(
    gateway_url="https://ai.admasolusi.space",
    service_slug="connector-service",
    api_key=os.environ.get("CONNECTOR_API_KEY") or getpass("Connector API key: "),
)

### List All Connector Connections


In [None]:
# List the connectors exposed by the service; the JSON reply is printed and kept in `data`
data = connector.get_connection(params=None, pretty_print=True)

In [None]:
class Database:
    """Notebook helper for exploring one database through a connector client.

    Every method sends SQL through ``connector.execute_query`` with the stored
    ``connector_id`` and reads the response as a dict carrying a ``success``
    flag, ``rows`` and, on failure, an optional ``message``.
    """

    def __init__(self, connector, connector_id):
        """Remember the connector client and the id of the target connector."""
        self.connector = connector
        self.connector_id = connector_id

    @staticmethod
    def _sql_literal(value):
        """Render ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so a name containing ``'`` cannot
        terminate the literal early.
        """
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _first_error(*results):
        """Return the message of the first unsuccessful result in ``results``."""
        for res in results:
            if not res["success"]:
                return res.get("message", "Unknown error")
        return "Unknown error"

    @staticmethod
    def _show_table(df, height):
        """Render ``df`` with itables using the notebook's left-aligned layout."""
        itables.show(
            df,
            lengthMenu=[10, 25, 50, 100],
            scrollY=height,
            scrollX=True,
            paging=True,
            maxBytes=10**6,
            maxColumns=50,
            style="text-align: left;",  # left alignment
            classes="display",
            columnDefs=[
                {"className": "dt-left", "targets": "_all"}  # Force left alignment for all columns
            ],
        )

    def view_table_interactive(self, data, limit=50, height="400px", title=None):
        """
        Display database table or DataFrame using itables with fixed height and interactive features

        Args:
            data: Either table name (str) or DataFrame
            limit (int): Maximum number of rows to display (only for table names)
            height (str): Fixed height for the table container
            title (str): Optional title override

        Any exception is caught and printed rather than raised.
        """
        try:
            # Case 1: data is a DataFrame
            if isinstance(data, pd.DataFrame):
                df = data
                display_title = title or "DataFrame Results"

                # Display info
                print(
                    f"📋 {display_title} | 📊 {len(df)} rows | 📈 {len(df.columns)} cols"
                )

                # Show sentiment distribution if sentiment columns exist
                if "sentiment_label" in df.columns:
                    print("\n📊 Sentiment Distribution:")
                    sentiment_counts = df["sentiment_label"].value_counts()
                    for sentiment, count in sentiment_counts.items():
                        percentage = (count / len(df)) * 100
                        # str() keeps non-string labels (ints, bools) from crashing here.
                        print(
                            f"  {str(sentiment).capitalize()}: {count} ({percentage:.1f}%)"
                        )

                self._show_table(df, height)

            # Case 2: data is a table name (string)
            elif isinstance(data, str):
                table_name = data

                # Get total row count and column info
                count_query = f"SELECT COUNT(*) as total_rows FROM {table_name}"
                count_result = self.connector.execute_query(
                    self.connector_id, count_query
                )

                # Get column count
                col_query = f"""
                SELECT COUNT(*) as total_cols
                FROM information_schema.columns
                WHERE table_name = {self._sql_literal(table_name)}
                """
                col_result = self.connector.execute_query(self.connector_id, col_query)

                # Get table data
                query = f"SELECT * FROM {table_name} LIMIT {limit}"
                result = self.connector.execute_query(
                    self.connector_id, query, max_rows=limit
                )

                if (
                    result["success"]
                    and count_result["success"]
                    and col_result["success"]
                ):
                    df = pd.DataFrame(result["rows"])

                    # Get actual totals
                    total_rows = count_result["rows"][0]["total_rows"]
                    total_cols = col_result["rows"][0]["total_cols"]

                    # Display info with actual table size vs displayed size
                    display_title = title or table_name
                    print(
                        f"📋 {display_title} | 📊 {total_rows} total rows ({len(df)} displayed) | 📈 {total_cols} cols | ⚡ {result['execution_time_ms']}ms"
                    )

                    self._show_table(df, height)

                else:
                    # Report the query that actually failed, not just the data query.
                    message = self._first_error(result, count_result, col_result)
                    print(f"❌ Error: {message}")
            else:
                print(
                    "❌ Error: Invalid data type. Expected DataFrame or table name string."
                )

        except Exception as e:
            print(f"❌ Error: {str(e)}")

    def get_table_list(self):
        """Get all available tables in the ``public`` schema.

        Returns:
            list: Table names; an empty list when the query fails.
        """
        query = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"
        result = self.connector.execute_query(self.connector_id, query)

        if result["success"]:
            tables = [row["table_name"] for row in result["rows"]]
            print("📋 Available Tables:")
            for i, table in enumerate(tables, 1):
                print(f"{i}. {table}")
            return tables

        print(f"❌ Error: {result.get('message', 'Unknown error')}")
        # Same failure value as get_columns, so callers can always iterate.
        return []

    def describe_table(self, table_name):
        """Get detailed table structure.

        Returns:
            pd.DataFrame: One row per column (name, type, nullability, default),
            or None when the query fails.
        """
        query = f"""
        SELECT column_name, data_type, is_nullable, column_default
        FROM information_schema.columns
        WHERE table_name = {self._sql_literal(table_name)}
        ORDER BY ordinal_position
        """
        result = self.connector.execute_query(self.connector_id, query)

        if result["success"]:
            df = pd.DataFrame(result["rows"])
            print(f"📋 Table Structure: {table_name}")
            print("=" * 60)
            display(df)
            return df

        print(f"❌ Error: {result.get('message', 'Unknown error')}")
        return None

    def get_dataframe(self, table_name, limit=100, columns=None):
        """
        Get DataFrame from table with optional column selection

        Args:
            table_name (str): Name of the table
            limit (int): Number of rows to fetch
            columns (list): Specific columns to select (optional)

        Returns:
            pd.DataFrame: DataFrame with the data, or None on any failure
        """
        try:
            # Build query
            # NOTE(review): table and column names are interpolated as-is; only
            # pass trusted identifiers here.
            select_list = ", ".join(columns) if columns else "*"
            query = f"SELECT {select_list} FROM {table_name} LIMIT {limit}"

            result = self.connector.execute_query(self.connector_id, query, max_rows=limit)

            if result["success"]:
                df = pd.DataFrame(result["rows"])
                print(f"✅ Fetched {len(df)} rows from {table_name}")
                if columns:
                    print(f"📊 Selected columns: {list(df.columns)}")
                else:
                    print(f"📊 All columns: {list(df.columns)}")
                return df

            print(f"❌ Error: {result.get('message', 'Unknown error')}")
            return None

        except Exception as e:
            print(f"❌ Error: {str(e)}")
            return None

    def get_columns(self, table_name):
        """
        Get all column names from a table

        Args:
            table_name (str): Name of the table

        Returns:
            list: List of column names (empty when the query fails)
        """
        query = f"""
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = {self._sql_literal(table_name)}
        ORDER BY ordinal_position
        """
        result = self.connector.execute_query(self.connector_id, query)

        if result["success"]:
            columns = [row["column_name"] for row in result["rows"]]
            print(f"📋 Columns in {table_name}: {columns}")
            return columns

        print(f"❌ Error getting columns: {result.get('message', 'Unknown error')}")
        return []

In [None]:
# Bind the Database helper to one connector, then list the tables it exposes
connector_id = "a46449c2-3b09-453f-b0ee-381c91da9779"
db = Database(connector=connector, connector_id=connector_id)
tables = db.get_table_list()

In [None]:
# Preview the raw table with the default row limit and height
db.view_table_interactive(data="sample_data3")

# Preprocessing


### 1. IndoNLP Cleaning

In [None]:
from indoNLP.preprocessing import (
    remove_html,
    remove_url,
    remove_stopwords,
    replace_slang,
    replace_word_elongation,
    emoji_to_words,
)

# indoNLP's `pipeline` is imported under an alias because this notebook also does
# `from transformers import pipeline`; sharing the bare name means whichever
# cell ran last silently decides which function the other cells call.
from indoNLP.preprocessing import pipeline as indonlp_pipeline

# 1. Setup pipeline
nlp_pipeline = indonlp_pipeline(
    [remove_html, remove_url, emoji_to_words, replace_slang, replace_word_elongation]
)

# 2. Fetch data
df = db.get_dataframe("sample_data3", limit=500)

# get_dataframe returns None on failure; stop here with a clear message rather
# than failing later with "'NoneType' object is not subscriptable".
if df is None:
    raise RuntimeError("Could not load 'sample_data3'; see the error printed above.")


# 3. Simple preprocessing function
def clean_text(text):
    """Run one comment through the IndoNLP pipeline.

    Args:
        text: Raw comment; may be None, NaN or an empty string.

    Returns:
        str: ``"No Comment"`` for missing/empty input, the pipeline output
        otherwise, or the unprocessed text if the pipeline raises.
    """
    if text is None or pd.isna(text) or text == "":
        return "No Comment"
    try:
        return nlp_pipeline(str(text))
    except Exception:
        # Best effort: keep the raw text when the pipeline fails on one row.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through so a long-running apply can still be interrupted.
        return str(text)


# 4. Apply preprocessing
print("🔄 Preprocessing...")
df["full_text_clean"] = df["full_text"].apply(clean_text)

# 5. Show the result
print("✅ Done!")
print(f"📊 Processed {len(df)} rows")

# Show examples (at most three; fewer when the table returned fewer rows,
# which previously raised IndexError)
for i in range(min(3, len(df))):
    print(f"\nExample {i+1}:")
    print(f"Original:  {df['full_text'].iloc[i]}")
    print(f"Processed: {df['full_text_clean'].iloc[i]}")

# 6. View interactive
db.view_table_interactive(df[["full_text", "full_text_clean"]], limit=50, title="Processed Comments", height="400px")

### 2. Twitter Cleaning

In [None]:
import re


class TwitterCleaner:
    """Regex-based normaliser for tweet-style text."""

    # Characters accepted inside a URL after the scheme / "www." prefix.
    _URL_BODY = r"(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

    # Ordered (pattern, replacement) pairs; later rules rely on earlier ones
    # having run (e.g. mentions are removed before the leading "RT" marker).
    _SUBSTITUTIONS = (
        # 1. Remove URLs
        (re.compile(r"http[s]?://" + _URL_BODY), ""),
        (re.compile(r"www\." + _URL_BODY), ""),
        # 2. Remove mentions and replies
        (re.compile(r"@\w+"), ""),
        # 3. Clean hashtags (remove # but keep the word)
        (re.compile(r"#(\w+)"), r"\1"),
        # 4. Remove retweet markers
        (re.compile(r"^RT\s*:?\s*", re.IGNORECASE), ""),
        (re.compile(r"\bRT\b", re.IGNORECASE), ""),
        # 5. Remove a leading quoted span. Non-greedy, so only the first quoted
        #    span is dropped; the greedy form erased everything up to the LAST
        #    quote character on the line.
        (re.compile(r'^".*?"'), ""),
        # 6. Clean excessive punctuation
        (re.compile(r"[!]{2,}"), "!"),
        (re.compile(r"[?]{2,}"), "?"),
        (re.compile(r"[.]{3,}"), "..."),
        # 7. Remove special characters but keep important punctuation
        (re.compile(r"[^\w\s!?.,\-]"), " "),
        # 8. Collapse all whitespace runs (newlines included) to one space
        (re.compile(r"\s+"), " "),
    )

    @staticmethod
    def advanced_social_media_cleaning(text):
        """
        More advanced social media cleaning

        Args:
            text: Raw or pre-cleaned text; may be None, NaN or empty.

        Returns:
            str: Lower-cased, stripped text with URLs, mentions, hashtag signs,
            retweet markers and special characters removed. ``"No Comment"``
            when the input is missing or nothing is left after cleaning. If
            cleaning raises, the error is printed and ``str(text)`` returned.
        """
        if pd.isna(text) or text is None or text == "":
            return "No Comment"

        try:
            text = str(text)

            for pattern, replacement in TwitterCleaner._SUBSTITUTIONS:
                text = pattern.sub(replacement, text)

            # 9-10. Lowercase and trim
            text = text.lower().strip()

            return text if text else "No Comment"

        except Exception as e:
            print(f"Error in advanced cleaning: {e}")
            return str(text)

    # Now apply Twitter cleaning to your IndoNLP cleaned text
print("🔄 Applying Twitter cleaning to IndoNLP processed text...")

# Second pass: feed the IndoNLP output column through the regex cleaner
df["full_text_twitter_clean"] = df["full_text_clean"].apply(TwitterCleaner.advanced_social_media_cleaning)

# Side-by-side view of both cleaning stages
db.view_table_interactive(
    df[["full_text_clean", "full_text_twitter_clean"]],
    title="IndoNLP vs Twitter Cleaning",
    height="400px",
    limit=50,
)

# Model Inspection

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
from tqdm import tqdm

In [1]:
from huggingface_hub import HfApi
import humanize  # Untuk format angka (pip install humanize)


def inspect_model_details(model_names):
    """
    Inspect multiple models for a comprehensive comparison, including
    architecture, tokenizer, and Hub metadata.

    Args:
        model_names (list): Hugging Face model ids to inspect.

    Returns:
        dict: Maps each model id to a dict of collected details, or to
        ``{"error": <message>}`` when any step for that model raised.
    """
    print("🔍 COMPREHENSIVE MODEL INSPECTION")
    print("=" * 80)

    # Client used to fetch metadata from the Hugging Face Hub
    hf_api = HfApi()
    results = {}

    for model_name in model_names:
        print(f"\n📋 Inspecting: {model_name}")
        print("-" * 60)

        try:
            # 1. Inspect the model configuration
            config = AutoConfig.from_pretrained(model_name)
            info = {
                "model_type": config.model_type,
                # Architecture details
                "hidden_size": getattr(config, "hidden_size", "N/A"),
                "num_layers": getattr(config, "num_hidden_layers", "N/A"),
                "num_heads": getattr(config, "num_attention_heads", "N/A"),
                # Reported only when the config object exposes a callable
                # `num_parameters`; otherwise "N/A".
                "num_parameters": (
                    humanize.intword(config.num_parameters())
                    if hasattr(config, "num_parameters")
                    and callable(config.num_parameters)
                    else "N/A"
                ),
                # Classification details
                "num_labels": config.num_labels,
                "labels": dict(config.id2label) if hasattr(config, "id2label") else {},
                "problem_type": getattr(config, "problem_type", "Not specified"),
            }

            print("   [Architecture]")
            print(f"   - Model Type: {info['model_type']}")
            print(f"   - Parameters: {info['num_parameters']}")
            print(
                f"   - Layers: {info['num_layers']}, Hidden Size: {info['hidden_size']}, Heads: {info['num_heads']}"
            )

            print("\n   [Classification Task]")
            print(f"   - Problem Type: {info['problem_type']}")
            print(f"   - Number of Labels: {info['num_labels']}")
            if info["labels"]:
                print(f"   - Categories: {list(info['labels'].values())}")

            # 2. Inspect the tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            info["tokenizer_class"] = tokenizer.__class__.__name__
            info["vocab_size"] = humanize.intword(tokenizer.vocab_size)

            print("\n   [Tokenizer]")
            print(f"   - Class: {info['tokenizer_class']}")
            print(f"   - Vocabulary Size: {info['vocab_size']}")

            # 3. Inspect metadata from the Hugging Face Hub
            model_info_hub = hf_api.model_info(model_name)
            info["downloads"] = humanize.intword(model_info_hub.downloads)
            info["likes"] = humanize.intword(model_info_hub.likes)

            # The timestamp attribute is `last_modified` or `lastModified`
            # depending on the huggingface_hub version, and may be a datetime
            # or an ISO string; handle every combination.
            last_modified = getattr(model_info_hub, "last_modified", None)
            if last_modified is None:
                last_modified = getattr(model_info_hub, "lastModified", None)
            if last_modified is None:
                info["last_modified"] = "N/A"
            elif hasattr(last_modified, "date"):
                info["last_modified"] = last_modified.date().isoformat()
            else:
                info["last_modified"] = str(last_modified).split("T")[0]

            print("\n   [Hub Info]")
            print(f"   - Downloads: {info['downloads']}")
            print(f"   - Likes: {info['likes']}")
            print(f"   - Last Modified: {info['last_modified']}")

            results[model_name] = info

        except Exception as e:
            print(f"   ❌ Error processing {model_name}: {e}")
            results[model_name] = {"error": str(e)}

    return results

ModuleNotFoundError: No module named 'humanize'

In [None]:
# Hugging Face model ids to inspect: hate speech, sentiment and emotion classifiers
models_to_compare = [
    "PaceKW/indobert-base-p1-multilabel-indonesian-hate-speech-new",
    "Aardiiiiy/indobertweet-base-Indonesian-sentiment-analysis",
    "Aardiiiiy/EmoSense-ID-Indonesian-Emotion-Classifier",
]

# Collect config, tokenizer and Hub metadata for each model
detailed_results = inspect_model_details(model_names=models_to_compare)

# Sentiment Analysis

In [None]:
class SentimentAnalyzer:
    """Sentiment classifier built on a Hugging Face text-classification pipeline.

    Predictions are dicts with ``label``, ``score`` and ``confidence``
    (``"high"``, ``"medium"``, ``"low"`` or ``"error"``).
    """

    # Text used upstream for missing comments; such rows are never sent to the model.
    PLACEHOLDER = "No Comment"
    # Label reported for blank inputs and for predictions that failed.
    DEFAULT_LABEL = "NEUTRAL"

    def __init__(
        self, model_name="Aardiiiiy/indobertweet-base-Indonesian-sentiment-analysis"
    ):
        """Initialize sentiment analyzer with IndoBERTweet model"""
        print("🔄 Loading IndoBERTweet sentiment model...")

        # Using pipeline (simplest approach)
        self.pipe = pipeline(
            "text-classification",
            model=model_name,
            device=0 if torch.cuda.is_available() else -1,
        )

        # Load tokenizer and model separately for more control if needed
        # NOTE(review): this loads the weights a second time; `self.pipe.model`
        # and `self.pipe.tokenizer` could be reused if no caller needs separate copies.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Labels used for reporting
        self.sentiment_labels = ["NEGATIVE", "NEUTRAL", "POSITIVE"]

        print("✅ Sentiment model loaded successfully!")
        print(f"🔧 Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
        print(f"🏷️ Labels: {', '.join(self.sentiment_labels)}")

    @staticmethod
    def _is_blank(text):
        """Return True for None, NaN, empty strings and the placeholder text."""
        return (
            text is None
            or pd.isna(text)
            or text == ""
            or text == SentimentAnalyzer.PLACEHOLDER
        )

    @staticmethod
    def _confidence_level(score):
        """Map a model score to ``"high"`` (>= 0.8), ``"medium"`` (>= 0.6) or ``"low"``."""
        if score >= 0.8:
            return "high"
        if score >= 0.6:
            return "medium"
        return "low"

    def _default_result(self, confidence):
        """Build the fallback prediction used for blank or failed inputs."""
        return {"label": self.DEFAULT_LABEL, "score": 0.0, "confidence": confidence}

    def _format_prediction(self, prediction):
        """Convert one raw pipeline prediction into the analyzer's result dict."""
        return {
            "label": prediction["label"],
            "score": prediction["score"],
            "confidence": self._confidence_level(prediction["score"]),
        }

    def predict_single(self, text):
        """Predict sentiment for a single text.

        Returns:
            dict: ``label``, ``score`` and ``confidence``. Blank input gives a
            neutral result with ``"low"`` confidence; a failed prediction
            gives a neutral result with ``"error"`` confidence.
        """
        if self._is_blank(text):
            return self._default_result("low")

        try:
            # truncation=True keeps over-long texts from failing the forward pass.
            result = self.pipe(str(text), truncation=True)
            return self._format_prediction(result[0])

        except Exception as e:
            print(f"Error predicting sentiment: {e}")
            return self._default_result("error")

    def predict_batch(self, texts, batch_size=32):
        """Predict sentiment for multiple texts.

        Args:
            texts: List or pandas Series of texts.
            batch_size (int): Number of texts handed to the pipeline per call.

        Returns:
            list: Exactly one result dict per input text, in input order.
        """
        results = []

        # Convert to list if pandas Series
        if hasattr(texts, "tolist"):
            texts = texts.tolist()

        print(f"🔄 Processing {len(texts)} texts for sentiment analysis...")
        print(f"📊 Batch size: {batch_size}")

        # Process in batches with progress bar
        for i in tqdm(range(0, len(texts), batch_size), desc="Analyzing sentiment"):
            batch = texts[i : i + batch_size]

            # Blank rows keep the neutral default; only real texts reach the model.
            batch_results = [self._default_result("low") for _ in batch]
            real_positions = [
                pos for pos, text in enumerate(batch) if not self._is_blank(text)
            ]

            if real_positions:
                try:
                    predictions = self.pipe(
                        [str(batch[pos]) for pos in real_positions], truncation=True
                    )
                    for pos, prediction in zip(real_positions, predictions):
                        batch_results[pos] = self._format_prediction(prediction)

                except Exception as e:
                    print(f"Error in sentiment batch {i//batch_size + 1}: {e}")
                    # Replace the whole batch, so a failure part-way through
                    # cannot leave more results than inputs.
                    batch_results = [self._default_result("error") for _ in batch]

            results.extend(batch_results)

        return results

    def analyze_results(self, results):
        """Analyze and display sentiment analysis results.

        Args:
            results (list): Result dicts as produced by the predict methods.

        Returns:
            dict: ``total``, ``sentiment_distribution`` and
            ``confidence_distribution`` counts.
        """
        total = len(results)

        def percent(count):
            # An empty result list reports 0.0% instead of dividing by zero.
            return (count / total) * 100 if total else 0.0

        print("\n📊 SENTIMENT ANALYSIS SUMMARY")
        print("=" * 50)
        print(f"📝 Total texts: {total}")

        # Sentiment distribution
        sentiment_counts = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        for result in results:
            label = result["label"]
            sentiment_counts[label] = sentiment_counts.get(label, 0) + 1

        emojis = {"POSITIVE": "😊", "NEGATIVE": "😞"}
        print("\n💭 Sentiment Distribution:")
        for sentiment, count in sentiment_counts.items():
            emoji = emojis.get(sentiment, "😐")
            print(f"   {emoji} {sentiment}: {count} ({percent(count):.1f}%)")

        # Confidence distribution
        conf_counts = {"high": 0, "medium": 0, "low": 0, "error": 0}
        for result in results:
            level = result["confidence"]
            conf_counts[level] = conf_counts.get(level, 0) + 1

        print("\n🎯 Confidence Distribution:")
        for conf, count in conf_counts.items():
            print(f"   {conf.capitalize()}: {count} ({percent(count):.1f}%)")

        return {
            "total": total,
            "sentiment_distribution": sentiment_counts,
            "confidence_distribution": conf_counts,
        }

In [None]:
# Smoke test: run SentimentAnalyzer on a handful of hand-written sentences

print("🚀 Testing SentimentAnalyzer...")

# Load the model, then classify each sample one at a time
try:
    sentiment_analyzer = SentimentAnalyzer()

    # Sample sentences for the check
    test_texts = [
        "Selamat pagi semua! Hari ini sangat menyenangkan",
        "Pelayanan buruk sekali, mengecewakan banget",
        "Biasa saja, tidak ada yang istimewa",
        "Makanannya enak banget, sangat memuaskan!",
        "Harganya terlalu mahal untuk kualitas begini",
        "Terima kasih banyak atas bantuan Anda",
    ]

    print("\n🧪 Testing individual sentiment predictions:")
    print("-" * 60)

    for text in test_texts:
        result = sentiment_analyzer.predict_single(text)
        # Any label other than POSITIVE/NEGATIVE is shown with the neutral face
        sentiment_emoji = {"POSITIVE": "😊", "NEGATIVE": "😞"}.get(result["label"], "😐")
        print(
            f"Text: '{text}'",
            f"   💭 Sentiment: {sentiment_emoji} {result['label']} ({result['score']:.3f}) - {result['confidence']}",
            "",
            sep="\n",
        )

    print("✅ SentimentAnalyzer test completed!")

except Exception as e:
    print(f"❌ Error in SentimentAnalyzer: {e}")
    import traceback

    traceback.print_exc()

# Emotion Analysis

In [None]:
class EmotionAnalyzer:
    """Emotion classifier built on a Hugging Face text-classification pipeline.

    Predictions are dicts with ``label``, ``score`` and ``confidence``
    (``"high"``, ``"medium"``, ``"low"`` or ``"error"``).
    """

    # Text used upstream for missing comments; such rows are never sent to the model.
    PLACEHOLDER = "No Comment"
    # Label reported for blank inputs and for predictions that failed.
    # NOTE(review): defaulting to "Trust" inflates that class in the
    # distribution; kept because callers may rely on it.
    DEFAULT_LABEL = "Trust"

    def __init__(
        self, model_name="Aardiiiiy/EmoSense-ID-Indonesian-Emotion-Classifier"
    ):
        """Initialize emotion analyzer with EmoSense model"""
        print("🔄 Loading EmoSense Indonesian emotion model...")

        # Using pipeline (simplest approach)
        self.pipe = pipeline(
            "text-classification",
            model=model_name,
            device=0 if torch.cuda.is_available() else -1,
        )

        # Load tokenizer and model separately for more control if needed
        # NOTE(review): this loads the weights a second time; `self.pipe.model`
        # and `self.pipe.tokenizer` could be reused if no caller needs separate copies.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Labels used for reporting (Plutchik's 8 emotions)
        self.emotion_labels = [
            "Anger",
            "Anticipation",
            "Disgust",
            "Fear",
            "Joy",
            "Sadness",
            "Surprise",
            "Trust",
        ]

        # Emotion emojis for better display
        self.emotion_emojis = {
            "Anger": "😡",
            "Anticipation": "🤔",
            "Disgust": "🤢",
            "Fear": "😨",
            "Joy": "😊",
            "Sadness": "😢",
            "Surprise": "😲",
            "Trust": "🤝",
        }

        print("✅ Emotion model loaded successfully!")
        print(f"🔧 Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
        print(f"🎭 Emotions: {', '.join(self.emotion_labels)}")

    @staticmethod
    def _is_blank(text):
        """Return True for None, NaN, empty strings and the placeholder text."""
        return (
            text is None
            or pd.isna(text)
            or text == ""
            or text == EmotionAnalyzer.PLACEHOLDER
        )

    @staticmethod
    def _confidence_level(score):
        """Map a model score to ``"high"`` (>= 0.8), ``"medium"`` (>= 0.6) or ``"low"``."""
        if score >= 0.8:
            return "high"
        if score >= 0.6:
            return "medium"
        return "low"

    def _default_result(self, confidence):
        """Build the fallback prediction used for blank or failed inputs."""
        return {"label": self.DEFAULT_LABEL, "score": 0.0, "confidence": confidence}

    def _format_prediction(self, prediction):
        """Convert one raw pipeline prediction into the analyzer's result dict."""
        return {
            "label": prediction["label"],
            "score": prediction["score"],
            "confidence": self._confidence_level(prediction["score"]),
        }

    def predict_single(self, text):
        """Predict emotion for a single text.

        Returns:
            dict: ``label``, ``score`` and ``confidence``. Blank input gives the
            default label with ``"low"`` confidence; a failed prediction gives
            the default label with ``"error"`` confidence.
        """
        if self._is_blank(text):
            return self._default_result("low")

        try:
            # truncation=True keeps over-long texts from failing the forward pass.
            result = self.pipe(str(text), truncation=True)
            return self._format_prediction(result[0])

        except Exception as e:
            print(f"Error predicting emotion: {e}")
            return self._default_result("error")

    def predict_batch(self, texts, batch_size=32):
        """Predict emotion for multiple texts.

        Args:
            texts: List or pandas Series of texts.
            batch_size (int): Number of texts handed to the pipeline per call.

        Returns:
            list: Exactly one result dict per input text, in input order.
        """
        results = []

        # Convert to list if pandas Series
        if hasattr(texts, "tolist"):
            texts = texts.tolist()

        print(f"🔄 Processing {len(texts)} texts for emotion analysis...")
        print(f"📊 Batch size: {batch_size}")

        # Process in batches with progress bar
        for i in tqdm(range(0, len(texts), batch_size), desc="Analyzing emotions"):
            batch = texts[i : i + batch_size]

            # Blank rows keep the default result; only real texts reach the model.
            batch_results = [self._default_result("low") for _ in batch]
            real_positions = [
                pos for pos, text in enumerate(batch) if not self._is_blank(text)
            ]

            if real_positions:
                try:
                    predictions = self.pipe(
                        [str(batch[pos]) for pos in real_positions], truncation=True
                    )
                    for pos, prediction in zip(real_positions, predictions):
                        batch_results[pos] = self._format_prediction(prediction)

                except Exception as e:
                    print(f"Error in emotion batch {i//batch_size + 1}: {e}")
                    # Replace the whole batch, so a failure part-way through
                    # cannot leave more results than inputs.
                    batch_results = [self._default_result("error") for _ in batch]

            results.extend(batch_results)

        return results

    def analyze_results(self, results):
        """Analyze and display emotion analysis results.

        Args:
            results (list): Result dicts as produced by the predict methods.

        Returns:
            dict: ``total``, ``emotion_distribution`` and
            ``confidence_distribution`` counts.
        """
        total = len(results)

        def percent(count):
            # An empty result list reports 0.0% instead of dividing by zero.
            return (count / total) * 100 if total else 0.0

        print("\n📊 EMOTION ANALYSIS SUMMARY")
        print("=" * 50)
        print(f"📝 Total texts: {total}")

        # Emotion distribution, seeded so every known emotion appears even at zero
        emotion_counts = {emotion: 0 for emotion in self.emotion_labels}
        for result in results:
            label = result["label"]
            emotion_counts[label] = emotion_counts.get(label, 0) + 1

        print("\n🎭 Emotion Distribution:")
        # Sort by count, descending
        sorted_emotions = sorted(
            emotion_counts.items(), key=lambda x: x[1], reverse=True
        )
        for emotion, count in sorted_emotions:
            emoji = self.emotion_emojis.get(emotion, "🎭")
            print(f"   {emoji} {emotion}: {count} ({percent(count):.1f}%)")

        # Confidence distribution
        conf_counts = {"high": 0, "medium": 0, "low": 0, "error": 0}
        for result in results:
            level = result["confidence"]
            conf_counts[level] = conf_counts.get(level, 0) + 1

        print("\n🎯 Confidence Distribution:")
        for conf, count in conf_counts.items():
            print(f"   {conf.capitalize()}: {count} ({percent(count):.1f}%)")

        return {
            "total": total,
            "emotion_distribution": emotion_counts,
            "confidence_distribution": conf_counts,
        }

In [None]:
# Smoke test: run EmotionAnalyzer on one hand-written sentence per emotion

print("🚀 Testing EmotionAnalyzer...")

# Load the model, then classify each sample one at a time
try:
    emotion_analyzer = EmotionAnalyzer()

    # (sentence, note naming the emotion the author expects)
    emotion_test_texts = [
        ("Saya sangat marah dengan pelayanan ini!", "Expected: Anger"),
        ("Wah senang sekali dapat hadiah ini!", "Expected: Joy"),
        ("Saya merasa sedih sekali hari ini", "Expected: Sadness"),
        ("Ngeri banget nonton film horror tadi", "Expected: Fear"),
        ("Kaget banget ternyata dia datang!", "Expected: Surprise"),
        ("Jijik banget lihat yang begitu", "Expected: Disgust"),
        ("Saya percaya sepenuhnya dengan tim ini", "Expected: Trust"),
        ("Tidak sabar menunggu acara besok!", "Expected: Anticipation"),
    ]

    print("\n🧪 Testing individual emotion predictions:")
    print("-" * 60)

    for text, expected in emotion_test_texts:
        result = emotion_analyzer.predict_single(text)
        emotion_emoji = emotion_analyzer.emotion_emojis.get(result["label"], "🎭")
        print(
            f"Text: '{text}'",
            f"   {expected}",
            f"   🎭 Emotion: {emotion_emoji} {result['label']} ({result['score']:.3f}) - {result['confidence']}",
            "",
            sep="\n",
        )

    print("✅ EmotionAnalyzer test completed!")

except Exception as e:
    print(f"❌ Error in EmotionAnalyzer: {e}")
    import traceback

    traceback.print_exc()

# Hate Speech Analysis

In [None]:
class HateSpeechAnalyzer:
    """Thresholded multilabel hate-speech scoring on top of a transformers pipeline.

    The class wraps a ``text-classification`` pipeline that returns a score for
    every label and converts those scores into a small result dict (see
    ``predict_single``).
    """

    def __init__(
        self, model_name="PaceKW/indobert-base-p1-multilabel-indonesian-hate-speech-new"
    ):
        """Load the classification pipeline plus a standalone tokenizer/model pair.

        Args:
            model_name: Hugging Face model identifier to load.
        """
        print("🔄 Loading Indonesian Hate Speech model...")

        # Using pipeline (simplest approach). Scores for all labels are
        # requested because the model is multilabel.
        # NOTE(review): newer transformers releases deprecate
        # `return_all_scores=True` in favour of `top_k=None` - confirm against
        # the installed version before switching.
        self.pipe = pipeline(
            "text-classification",
            model=model_name,
            device=0 if torch.cuda.is_available() else -1,
            return_all_scores=True,
        )

        # Load tokenizer and model separately for more control if needed.
        # Nothing in this class uses them; they are kept as public attributes.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Based on our inspection - we know exactly what labels exist
        self.hate_categories = [
            "HS",
            "Abusive",
            "HS_Individual",
            "HS_Group",
            "HS_Religion",
            "HS_Race",
            "HS_Physical",
            "HS_Gender",
            "HS_Other",
            "HS_Weak",
            "HS_Moderate",
            "HS_Strong",
        ]

        print("✅ Hate speech model loaded successfully!")
        print(f"🔧 Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
        print(f"🏷️ Categories: {', '.join(self.hate_categories)}")

    @staticmethod
    def _empty_result(confidence):
        """Build the result dict returned when there is nothing to report."""
        return {
            "is_hate_speech": False,
            "categories": [],
            "scores": {},
            "max_score": 0.0,
            "confidence": confidence,
        }

    @staticmethod
    def _flatten_predictions(raw):
        """Normalise the pipeline output to a flat list of prediction items."""
        if not isinstance(raw, list):
            return [raw]  # wrap a single non-list result
        if len(raw) > 0 and isinstance(raw[0], list):
            return raw[0]  # nested list: take the first (only) input's scores
        return raw  # already a flat list

    @staticmethod
    def _extract_label_score(prediction):
        """Return ``(label, score)`` for one prediction, or ``None`` if unrecognised."""
        if not isinstance(prediction, dict):
            print(f"Debug - Unexpected prediction type: {type(prediction)}")
            return None
        if "label" in prediction and "score" in prediction:
            return prediction["label"], prediction["score"]
        if "LABEL" in prediction and "SCORE" in prediction:
            return prediction["LABEL"], prediction["SCORE"]
        print(f"Debug - Unknown prediction format: {prediction}")
        return None

    @staticmethod
    def _confidence_bucket(max_score):
        """Map the highest label score to "high" / "medium" / "low"."""
        if max_score >= 0.8:
            return "high"
        if max_score >= 0.6:
            return "medium"
        return "low"

    def predict_single(self, text, threshold=0.5, debug=False):
        """Predict hate speech for a single text.

        Args:
            text: Input text. Missing values (``None``/NaN), the empty string
                and the literal ``"No Comment"`` are not sent to the model.
            threshold: Minimum score for a label to count as active.
            debug: When true, print the raw pipeline output. Off by default so
                that batch runs do not flood the notebook output.

        Returns:
            dict with keys ``is_hate_speech`` (True when any label reaches
            ``threshold``, whichever label it is), ``categories`` (active
            labels), ``scores`` (label -> score), ``max_score`` and
            ``confidence`` ("high", "medium", "low", or "error" when the
            prediction raised).
        """
        if pd.isna(text) or text is None or text == "" or text == "No Comment":
            return self._empty_result("low")

        try:
            # Get predictions for all labels
            raw = self.pipe(str(text))

            if debug:
                print(f"Debug - Raw result type: {type(raw)}")
                print(f"Debug - Raw result: {raw}")

            active_categories = []
            all_scores = {}
            max_score = 0.0

            for prediction in self._flatten_predictions(raw):
                pair = self._extract_label_score(prediction)
                if pair is None:
                    continue  # unrecognised item: already reported, skip it
                label, score = pair

                all_scores[label] = score
                max_score = max(max_score, score)

                # Add to active categories if above threshold
                if score >= threshold:
                    active_categories.append(label)

            return {
                "is_hate_speech": len(active_categories) > 0,
                "categories": active_categories,
                "scores": all_scores,
                "max_score": max_score,
                "confidence": self._confidence_bucket(max_score),
            }

        except Exception as e:
            # Best effort: report the failure and return an "error" result so
            # one bad text does not abort a whole batch.
            print(f"Error predicting hate speech: {e}")
            import traceback

            traceback.print_exc()
            return self._empty_result("error")

    def predict_batch(self, texts, batch_size=16, threshold=0.5):
        """Predict hate speech for multiple texts.

        Texts are scored one at a time through ``predict_single``;
        ``batch_size`` only controls how many texts are processed per
        progress-bar step.

        Args:
            texts: Sequence of texts, or any object with ``tolist()`` (for
                example a pandas Series).
            batch_size: Number of texts per progress-bar step.
            threshold: Forwarded to ``predict_single``.

        Returns:
            list of result dicts, in the same order as ``texts``.
        """
        results = []

        # Convert to list if pandas Series
        if hasattr(texts, "tolist"):
            texts = texts.tolist()

        print(f"🔄 Processing {len(texts)} texts for hate speech analysis...")
        print(f"📊 Threshold: {threshold} | Batch size: {batch_size}")

        for i in tqdm(range(0, len(texts), batch_size), desc="Analyzing hate speech"):
            batch = texts[i : i + batch_size]
            results.extend(self.predict_single(text, threshold) for text in batch)

        return results

In [None]:
# Smoke test: exercise HateSpeechAnalyzer on its own - FIXED VERSION

print("🚀 Testing HateSpeechAnalyzer...")

# Construction and predictions share one try block so any failure is reported
# with a traceback.
try:
    # Default construction keeps the test simple
    hate_analyzer = HateSpeechAnalyzer()

    # (sentence, expectation note) pairs. The note is only echoed next to the
    # prediction; nothing is asserted.
    hate_test_texts = [
        ("Selamat pagi semua!", "Expected: Clean"),
        ("Terima kasih atas bantuannya", "Expected: Clean"),
        ("Dasar bodoh tidak tahu apa-apa", "Expected: Abusive/HS_Individual"),
        ("Agama kalian sesat semua", "Expected: HS_Religion/HS_Group"),
        ("Perempuan memang inferior", "Expected: HS_Gender"),
        ("Orang ras itu memang jelek", "Expected: HS_Race"),
        ("Bunuh saja dia", "Expected: HS_Strong"),
        ("Agak aneh sih orangnya", "Expected: HS_Weak"),
    ]

    print("\n🧪 Testing individual hate speech predictions:", "-" * 60, sep="\n")

    for text, expected in hate_test_texts:
        result = hate_analyzer.predict_single(text, threshold=0.5)

        print(f"Text: '{text}'", f"   {expected}", sep="\n")

        if result["is_hate_speech"]:
            hate_status = "🚨 YES"
        else:
            hate_status = "✅ NO"
        print(
            f"   🚨 Hate Speech: {hate_status} ({result['max_score']:.3f}) - {result['confidence']}"
        )

        # The category line is only shown when at least one label is active
        if result["categories"]:
            print(f"   🏷️ Categories: {', '.join(result['categories'])}")
        print()

    print("✅ HateSpeechAnalyzer test completed!")

except Exception as e:
    import traceback

    print(f"❌ Error in HateSpeechAnalyzer: {e}")
    traceback.print_exc()

In [None]:
# Ad-hoc check: build a fresh analyzer and score one sample sentence.
# The bare call on the last line lets the notebook display the returned dict.
hate_analyzer = HateSpeechAnalyzer()

hate_analyzer.predict_single(text="Dasar lu bego banget sih.")

In [None]:
# Simple debugging version
# Loads the hate speech model in a bare pipeline and prints the raw output of
# several call styles, so the result structure can be inspected by eye.
from transformers import pipeline
import json

print("=== DEBUGGING HATE SPEECH MODEL ===\n")

# Load the model
print("1. Loading model...")
try:
    pipe = pipeline(
        "text-classification",
        model="PaceKW/indobert-base-p1-multilabel-indonesian-hate-speech-new",
    )
    print("✅ Model loaded successfully!\n")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    # Re-raise to stop the cell. Calling exit() here would ask the Jupyter
    # kernel to shut down and throw away all notebook state; a propagated
    # exception stops execution just as well and keeps the kernel alive.
    raise

# Test text
test_text = "Kamu jelek banget"
print(f"2. Testing with: '{test_text}'\n")

# Method 1: Default prediction (no return_all_scores)
print("--- Method 1: Default prediction ---")
try:
    result1 = pipe(test_text)
    print(f"Type: {type(result1)}")
    print(f"Content: {result1}")
    print(f"JSON: {json.dumps(result1, indent=2, ensure_ascii=False)}\n")
except Exception as e:
    # Each probe is independent: report the failure and move on to the next
    print(f"❌ Error: {e}\n")

# Method 2: With return_all_scores=True
print("--- Method 2: With return_all_scores=True ---")
try:
    result2 = pipe(test_text, return_all_scores=True)
    print(f"Type: {type(result2)}")
    print(f"Length: {len(result2) if hasattr(result2, '__len__') else 'N/A'}")
    print(f"Content: {result2}")
    print(f"JSON: {json.dumps(result2, indent=2, ensure_ascii=False)}\n")
except Exception as e:
    print(f"❌ Error: {e}\n")

# Method 3: Multiple texts
print("--- Method 3: Multiple texts ---")
texts = ["Kamu jelek", "Selamat pagi"]
try:
    result3 = pipe(texts)
    print(f"Type: {type(result3)}")
    print(f"Length: {len(result3) if hasattr(result3, '__len__') else 'N/A'}")
    print(f"Content: {result3}")
    print(f"JSON: {json.dumps(result3, indent=2, ensure_ascii=False)}\n")
except Exception as e:
    print(f"❌ Error: {e}\n")

# Method 4: Check model config
print("--- Method 4: Model info ---")
try:
    print(f"Model name: {pipe.model.name_or_path}")
    print(f"Task: {pipe.task}")
    # id2label / problem_type are printed only when the config defines them
    if hasattr(pipe.model.config, "id2label"):
        print(f"Labels: {pipe.model.config.id2label}")
    if hasattr(pipe.model.config, "problem_type"):
        print(f"Problem type: {pipe.model.config.problem_type}")
except Exception as e:
    print(f"❌ Error getting model info: {e}")

print("\n=== DEBUG COMPLETE ===")
print("Run this first, then tell me what you see!")

In [None]:
from transformers import pipeline

# 1. Initialise the "fill-mask" pipeline
# (the model is downloaded if it is not in the cache yet)
print("Mengunduh model (jika diperlukan)...")
tebak_kata = pipeline("fill-mask", model="cahya/bert-base-indonesian-1.5G")
print("Model siap digunakan.")

# 2. Prepare a few test sentences, each with one [MASK] slot
kalimat1 = "Ibu kota negara Indonesia adalah [MASK]."
kalimat2 = "Orang yang bekerja di rumah sakit biasanya adalah seorang [MASK]."
kalimat3 = "Setelah lelah bekerja seharian, enaknya minum [MASK] dingin."
kalimat4 = "Dia membeli mobil baru berwarna [MASK]."

# 3. Run the predictions and show the results

# First sentence: pipeline defaults, show each candidate token with its score
print(f"\n--- Tes untuk: '{kalimat1}' ---")
hasil1 = tebak_kata(kalimat1)
for prediksi in hasil1:
    print(
        f"Kata: {prediksi['token_str']:<15} | Skor Keyakinan: {prediksi['score']:.4f}"
    )

# Remaining sentences: ask for the top 3 guesses and show the completed sentence
hasil_top3 = []
for kalimat in (kalimat2, kalimat3, kalimat4):
    print(f"\n--- Tes untuk: '{kalimat}' ---")
    hasil = tebak_kata(kalimat, top_k=3)
    hasil_top3.append(hasil)
    for prediksi in hasil:
        print(f"Kalimat Lengkap: {prediksi['sequence']}")

# Keep the per-sentence result names available to later cells
hasil2, hasil3, hasil4 = hasil_top3