# Multilingual Stopwords Dataset with English Translation

This notebook demonstrates the complete process of creating and using a comprehensive multilingual stopwords dataset with English translations, optimized for Indonesian social media text analysis.

## Overview
- **Total entries**: 2,386
- **Languages**: Indonesian (formal/colloquial), English, Javanese, Sundanese
- **English coverage**: 46.6% (1,113 translations)
- **Translation method**: Dictionary-based mapping with manual curation

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Silence every warning category for the rest of the session to keep cell output compact.
warnings.filterwarnings('ignore')

# Pandas display options: show all columns, print at most 20 rows.
for _option, _value in (('display.max_columns', None), ('display.max_rows', 20)):
    pd.set_option(_option, _value)

print("📚 Libraries loaded successfully!")

In [None]:
# Read the translated multilingual stopword table (path is relative to the working directory)
STOPWORDS_CSV = 'multilingual_stopwords_dict_only.csv'
df = pd.read_csv(STOPWORDS_CSV)

column_names = df.columns.tolist()
print("📊 Dataset loaded successfully!")
print(f"Total entries: {df.shape[0]:,}")
print(f"Columns: {column_names}")
print("\n📋 First 5 rows:")
df.head()

## 2. Dataset Statistics and Analysis

In [None]:
# Calculate language coverage statistics
def calculate_coverage(df):
    """Summarise how many cells of each column hold a usable value.

    A cell counts as filled when it is neither missing nor the empty string.

    Returns:
        dict mapping every column name to ``{'count': ..., 'percentage': ...}``,
        where ``percentage`` is the filled count relative to the number of rows.
    """
    row_total = len(df)

    def _column_summary(series):
        # Boolean mask of filled cells; summing it yields the filled count.
        filled = (series.notna() & (series != '')).sum()
        return {'count': filled, 'percentage': (filled / row_total) * 100}

    return {name: _column_summary(df[name]) for name in df.columns}

coverage = calculate_coverage(df)

# Display labels for the dataset's column codes; any other column is shown under its raw name.
COLUMN_LABELS = {
    'en': 'English',
    'id': 'Indonesian (Colloquial)',
    'jv': 'Javanese',
    'su': 'Sundanese',
    'formal_id': 'Indonesian (Formal)'
}

print("🌍 Language Coverage Statistics:")
print("=" * 50)
for column, column_stats in coverage.items():
    label = COLUMN_LABELS.get(column, column)
    filled = column_stats['count']
    share = column_stats['percentage']
    print(f"{label:25}: {filled:4,} entries ({share:5.1f}%)")

In [None]:
# Visualize language coverage
COVERAGE_COLUMNS = ['en', 'id', 'jv', 'su', 'formal_id']   # bar order, left to right
BAR_COLORS = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
LABEL_OFFSET = 20   # vertical gap (in entries) between a bar top and its percentage label

# Tick labels are listed in the same order as COVERAGE_COLUMNS
languages = ['English', 'Indonesian\n(Colloquial)', 'Javanese', 'Sundanese', 'Indonesian\n(Formal)']
counts = [coverage[column]['count'] for column in COVERAGE_COLUMNS]
percentages = [coverage[column]['percentage'] for column in COVERAGE_COLUMNS]

plt.figure(figsize=(12, 6))
bars = plt.bar(languages, counts, color=BAR_COLORS)

# Write each column's coverage percentage just above its bar
for bar, pct in zip(bars, percentages):
    x_center = bar.get_x() + bar.get_width()/2.
    y_text = bar.get_height() + LABEL_OFFSET
    plt.text(x_center, y_text,
             f'{pct:.1f}%', ha='center', va='bottom', fontweight='bold')

plt.title('Multilingual Stopwords Dataset - Language Coverage', fontsize=16, fontweight='bold')
plt.ylabel('Number of Entries', fontsize=12)
plt.xlabel('Languages', fontsize=12)
plt.xticks(rotation=45)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

english_count = coverage['en']['count']
print(f"📈 English coverage improved significantly with {english_count:,} translations!")

## 3. Translation Quality Analysis

In [None]:
# Analyze translation quality by categories
# Default grouping of English translations into grammatical categories.
# Note that 'when' appears under both 'Question Words' and 'Conjunctions',
# so a row translated as 'when' is counted in both categories.
DEFAULT_TRANSLATION_CATEGORIES = {
    'Pronouns': ['i', 'you', 'he', 'she', 'they', 'we', 'it'],
    'Question Words': ['what', 'how', 'why', 'when', 'where', 'who', 'which'],
    'Intensifiers': ['very', 'too', 'quite', 'enough', 'most', 'more', 'less', 'much'],
    'Demonstratives': ['this', 'that', 'like this', 'like that'],
    'Conjunctions': ['and', 'or', 'but', 'because', 'when', 'if', 'while'],
    'Prepositions': ['in', 'on', 'at', 'to', 'from', 'with', 'for', 'by'],
    'Modal Verbs': ['can', 'may', 'must', 'should', 'will', 'would'],
    'Time Words': ['now', 'later', 'yesterday', 'tomorrow', 'today', 'before', 'after']
}


def analyze_translation_categories(df, categories=None):
    """Count how many rows have an English translation in each category.

    Matching is case-insensitive, in line with the other helpers in this
    notebook that lowercase dataset values before comparing them.

    Args:
        df: DataFrame with an ``'en'`` column of English translations.
        categories: optional mapping of category name to a list of English
            words. Defaults to ``DEFAULT_TRANSLATION_CATEGORIES``.

    Returns:
        dict mapping each category name to the number of matching rows.
        Categories may share words, so the counts are not mutually exclusive.
    """
    if categories is None:
        categories = DEFAULT_TRANSLATION_CATEGORIES

    # Lowercase string cells only; missing values pass through and never match.
    english = df['en'].map(lambda value: value.lower() if isinstance(value, str) else value)

    category_stats = {}
    for category, words in categories.items():
        wanted = [word.lower() for word in words]
        category_stats[category] = int(english.isin(wanted).sum())

    return category_stats

category_stats = analyze_translation_categories(df)

print("🎯 Translation Quality by Category:")
print("=" * 40)
for name in category_stats:
    print(f"{name:15}: {category_stats[name]:3d} entries")

# The category word lists overlap ('when' is both a question word and a conjunction),
# so this sum can count the same dataset row more than once.
total_categorized = sum(category_stats.values())
print(f"\n📊 Total categorized translations: {total_categorized}")

In [None]:
# Show sample high-quality translations
# English words to look up, grouped by category
sample_categories = {
    'Pronouns': ['i', 'you', 'he', 'they', 'we'],
    'Question Words': ['what', 'how', 'why', 'when', 'where'],
    'Intensifiers': ['very', 'too', 'enough'],
    'Demonstratives': ['this', 'that', 'like this', 'like that']
}

print("✨ Sample High-Quality Translations:")
print("=" * 50)

for category, words in sample_categories.items():
    print(f"\n{category}:")
    for word in words:
        rows = df.loc[df['en'] == word]
        if rows.empty:
            continue
        # Only the first matching row is shown
        first = rows.iloc[0]
        # Prefer the formal spelling; fall back to the colloquial one when it is missing
        indonesian = first['id'] if pd.isna(first['formal_id']) else first['formal_id']
        if pd.isna(indonesian):
            continue
        print(f"  {indonesian:12} → {word}")

## 4. Translation Dictionary Implementation

In [None]:
# Indonesian → English lookup table used for the dictionary-based translation.
# A value of '' marks a word that is dropped instead of translated.
INDONESIAN_ENGLISH_DICT = {
    # --- pronouns ---
    'saya': 'i', 'aku': 'i', 'kamu': 'you', 'dia': 'he', 'mereka': 'they',
    'kita': 'we', 'kami': 'we', 'kalian': 'you', 'beliau': 'he',

    # --- function words ---
    'yang': 'which', 'dengan': 'with', 'untuk': 'for', 'dari': 'from',
    'pada': 'on', 'dalam': 'in', 'oleh': 'by', 'ke': 'to', 'di': 'in',

    # --- intensifiers / adverbs ---
    'banget': 'very', 'sangat': 'very', 'sekali': 'very', 'agak': 'quite',
    'cukup': 'enough', 'terlalu': 'too', 'lebih': 'more', 'kurang': 'less',

    # --- question words ---
    'apa': 'what', 'bagaimana': 'how', 'kenapa': 'why', 'kapan': 'when',
    'dimana': 'where', 'siapa': 'who', 'mana': 'which',

    # --- demonstratives ---
    'ini': 'this', 'itu': 'that', 'begitu': 'like that', 'begini': 'like this',
    'seperti': 'like', 'kayak': 'like', 'kaya': 'like',

    # --- conjunctions ---
    'dan': 'and', 'atau': 'or', 'tetapi': 'but', 'karena': 'because',
    'jika': 'if', 'ketika': 'when', 'saat': 'when', 'waktu': 'when',

    # --- modal verbs ---
    'bisa': 'can', 'dapat': 'can', 'mau': 'want', 'ingin': 'want',
    'harus': 'must', 'perlu': 'need', 'boleh': 'may',

    # --- social media slang / abbreviations ---
    'gitu': 'like that', 'gini': 'like this', 'gimana': 'how',
    'bgt': 'very', 'bngt': 'very', 'yg': 'which', 'dgn': 'with',

    # --- particles: empty string means "remove" ---
    'lah': '', 'kah': '', 'pun': '', 'sih': '', 'dong': '', 'kok': '',
    'deh': '', 'tuh': '', 'nih': ''
}

# Every entry is either a real translation (truthy value) or a removal (empty value)
total_mappings = len(INDONESIAN_ENGLISH_DICT)
translated_count = sum(1 for target in INDONESIAN_ENGLISH_DICT.values() if target)
removal_count = total_mappings - translated_count

print("📖 Translation Dictionary Statistics:")
print(f"Total mappings: {total_mappings}")
print(f"Non-empty translations: {translated_count}")
print(f"Particle removals: {removal_count}")

In [None]:
# Demonstrate the translation process
def translate_with_dictionary(text, dictionary):
    """Replace each whitespace-separated word of ``text`` via ``dictionary``.

    The text is lowercased first. Words found in the dictionary are swapped
    for their mapped value, or dropped when that value is empty. Words not in
    the dictionary are kept unchanged. Tokens are split on whitespace only, so
    punctuation attached to a word is part of the lookup key.
    """
    def _rendered(tokens):
        for token in tokens:
            if token not in dictionary:
                yield token          # unknown word: pass through
                continue
            target = dictionary[token]
            if target:               # empty mapping means "remove this word"
                yield target

    return ' '.join(_rendered(text.lower().split()))

# Sentences used to demonstrate the dictionary-based translation
test_sentences = [
    "saya sangat suka dengan makanan ini",
    "dia banget pintar dalam matematika",
    "aku mau pergi ke sana dong",
    "bagaimana cara untuk membuat kue yang enak",
    "mereka bisa datang kapan saja"
]

print("🔄 Translation Examples:")
print("=" * 60)
for source_sentence in test_sentences:
    english_version = translate_with_dictionary(source_sentence, INDONESIAN_ENGLISH_DICT)
    for line in (f"Original : {source_sentence}", f"Translated: {english_version}", ""):
        print(line)

## 5. Practical Applications

In [None]:
# Create stopword removal function
def create_stopword_remover(language='id'):
    """Build a stopword filter for one language from the global ``df``.

    Args:
        language: one of ``'id'``, ``'en'``, ``'jv'``, ``'su'``. ``'id'`` merges
            the colloquial (``id``) and formal (``formal_id``) columns.

    Returns:
        ``(remove_stopwords, stopwords)``: the filter function and the
        lowercased stopword set it uses.

    Raises:
        ValueError: if ``language`` is not one of the supported codes.
    """
    # Language code -> dataset columns that feed its stopword set
    columns_by_language = (
        ('id', ('id', 'formal_id')),
        ('en', ('en',)),
        ('jv', ('jv',)),
        ('su', ('su',)),
    )
    source_columns = next(
        (columns for code, columns in columns_by_language if language == code),
        None,
    )
    if source_columns is None:
        raise ValueError(f"Unsupported language: {language}")

    stopwords = set()
    for column in source_columns:
        stopwords.update(df[column].dropna().str.lower().tolist())

    def remove_stopwords(text):
        # Tokens come from a plain whitespace split, so a word with punctuation
        # attached (e.g. "dong!") is compared as-is and will not match "dong".
        kept = (token for token in text.lower().split() if token not in stopwords)
        return ' '.join(kept)

    return remove_stopwords, stopwords

# Build the Indonesian (colloquial + formal) and English removers with their stopword sets
id_remover, id_stopwords = create_stopword_remover('id')
en_remover, en_stopwords = create_stopword_remover('en')

print("📊 Stopword Statistics:")
for label, vocabulary in (("Indonesian", id_stopwords), ("English", en_stopwords)):
    print(f"{label} stopwords: {len(vocabulary):,}")

In [None]:
# Demonstrate social media text preprocessing
social_media_texts = [
    "wah banget sih kak, aku tuh pengen banget ke sana dong!",
    "gimana caranya untuk daftar kuliah yang bagus?",
    "dia itu sangat pintar dalam bidang teknologi loh",
    "aku mau beli makanan yang enak di warung itu",
    "mereka bisa datang ke acara kita nanti sore"
]

print("🔧 Social Media Text Preprocessing:")
print("=" * 70)

# Show every post next to its stopword-filtered version
for position, raw_text in enumerate(social_media_texts, start=1):
    cleaned_text = id_remover(raw_text)
    print(f"Text {position}:")
    print(f"  Original: {raw_text}")
    print(f"  Cleaned : {cleaned_text}")
    print()

In [None]:
# Cross-language mapping demonstration
def get_cross_language_mapping(indonesian_word):
    """Look up the English/Javanese/Sundanese counterparts of an Indonesian word.

    The word is compared case-insensitively against both the ``id`` and
    ``formal_id`` columns of the global ``df``. Only the first matching row
    is used.

    Returns:
        ``None`` when no row matches. Otherwise a dict keyed by ``'en'``,
        ``'jv'``, ``'su'`` containing only the non-missing, non-empty values
        (it can be empty).
    """
    needle = indonesian_word.lower()
    in_colloquial = df['id'].str.lower() == needle
    in_formal = df['formal_id'].str.lower() == needle
    hits = df[in_colloquial | in_formal]

    if hits.empty:
        return None

    first = hits.iloc[0]
    return {
        code: first[code]
        for code in ('en', 'jv', 'su')
        if not (pd.isna(first[code]) or first[code] == '')
    }

# Words to look up in the cross-language demonstration
test_words = ['saya', 'yang', 'dengan', 'banget', 'apa', 'ini']
TARGET_LANGUAGE_LABELS = {'en': 'English', 'jv': 'Javanese', 'su': 'Sundanese'}

print("🌐 Cross-Language Mapping Examples:")
print("=" * 50)

for word in test_words:
    mapping = get_cross_language_mapping(word)
    if not mapping:
        # No matching row, or a row without any translations: print nothing
        continue
    print(f"Indonesian: {word}")
    for code in mapping:
        print(f"  {TARGET_LANGUAGE_LABELS[code]:10}: {mapping[code]}")
    print()

## 6. Sentiment Analysis Application

In [None]:
# Simulate sentiment analysis with stopword removal
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Sample Indonesian social media data for sentiment analysis, as (text, label) pairs
sample_data = [
    ("film ini sangat bagus dan menarik sekali", "positive"),
    ("makanan di restoran itu enak banget", "positive"),
    ("pelayanan yang buruk dan mengecewakan", "negative"),
    ("harga terlalu mahal untuk kualitas segitu", "negative"),
    ("tempat wisata yang indah dan nyaman", "positive"),
    ("produk ini tidak sesuai dengan ekspektasi", "negative"),
    ("pengalaman berbelanja yang menyenangkan", "positive"),
    ("kualitas produk sangat mengecewakan sekali", "negative"),
    ("rekomendasi tempat makan yang enak", "positive"),
    ("pelayanan lambat dan tidak profesional", "negative")
]

# Unzip the pairs into two parallel lists
texts, labels = (list(column) for column in zip(*sample_data))

print("📊 Sample Data for Sentiment Analysis:")
for position, (sentence, sentiment) in enumerate(sample_data[:5], start=1):
    print(f"{position}. [{sentiment.upper()}] {sentence}")

In [None]:
# Compare sentiment analysis with and without stopword removal
def compare_sentiment_analysis(texts, labels, stopwords=None):
    """Extract TF-IDF features from ``texts``, optionally removing stopwords first.

    Args:
        texts: iterable of raw text strings.
        labels: sentiment labels; accepted for interface compatibility but not
            used by this function.
        stopwords: optional collection of stopwords. When non-empty, each text
            is lowercased, split on whitespace, and every token found in this
            collection (compared in lowercase) is dropped before vectorising.
            When ``None`` or empty, the texts are vectorised unchanged.

    Returns:
        ``(X, vectorizer)``: the fitted TF-IDF matrix and the vectorizer.
    """
    # Preprocess texts
    if stopwords:
        # Filter with the collection that was passed in, so the result reflects
        # the caller's stopword list instead of a fixed global remover.
        blocked = {str(word).lower() for word in stopwords}
        processed_texts = [
            ' '.join(token for token in text.lower().split() if token not in blocked)
            for text in texts
        ]
        print("🔧 Using stopword removal")
    else:
        processed_texts = texts
        print("📝 Without stopword removal")

    # Unigrams and bigrams, capped at 100 features
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    X = vectorizer.fit_transform(processed_texts)

    # Show the first ten feature names in the order the vectorizer returns them
    feature_names = vectorizer.get_feature_names_out()
    print(f"\n📋 Top 10 features: {list(feature_names[:10])}")
    print(f"Total features: {len(feature_names)}")

    return X, vectorizer

print("Comparison: Sentiment Analysis Feature Extraction")
print("=" * 60)

# Baseline run: raw texts, no stopword filtering (stopwords defaults to None)
X_without, vec_without = compare_sentiment_analysis(texts, labels)

separator = "-" * 40
print(f"\n{separator}")

# Second run: Indonesian stopwords removed before vectorising
X_with, vec_with = compare_sentiment_analysis(texts, labels, stopwords=id_stopwords)

## 7. Export and Usage Guidelines

In [None]:
# Copy-paste snippets showing how to plug the stopword list into common NLP libraries.
# The sklearn, nltk and spacy snippets reuse `indonesian_stopwords` from the pandas snippet.
usage_examples = {
    'pandas': '''
# Load and use with pandas
import pandas as pd
df = pd.read_csv('multilingual_stopwords_dict_only.csv')
indonesian_stopwords = set(df['id'].dropna().str.lower().tolist())
''',

    'sklearn': '''
# Use with scikit-learn TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=list(indonesian_stopwords))
''',

    'nltk': '''
# Use with NLTK
import nltk
from nltk.corpus import stopwords
# Add custom Indonesian stopwords to NLTK
nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords.update(indonesian_stopwords)
''',

    'spacy': '''
# Use with spaCy (custom component)
import spacy
nlp = spacy.blank('id')  # Indonesian language model
nlp.Defaults.stop_words.update(indonesian_stopwords)
'''
}

print("📚 Framework Integration Examples:")
print("=" * 50)

# Print each snippet under its upper-cased framework name, trimming the surrounding newlines
for framework in usage_examples:
    snippet = usage_examples[framework].strip()
    print(f"\n{framework.upper()}:")
    print(snippet)

In [None]:
# Final summary and recommendations
# NOTE(review): 245 is presumably the number of English entries before translation — confirm.
BASELINE_EN_ENTRIES = 245


def _print_section(heading, lines):
    """Print a heading followed by each line indented by two spaces."""
    print(heading)
    for line in lines:
        print(f"  {line}")


english_stats = coverage['en']

print("🎯 MULTILINGUAL STOPWORDS DATASET SUMMARY")
print("=" * 60)
print(f"📊 Total entries: {len(df):,}")
print("🌍 Languages supported: 5 (Indonesian formal/colloquial, English, Javanese, Sundanese)")
print(f"🔤 English translations: {english_stats['count']:,} ({english_stats['percentage']:.1f}%)")
print(f"📈 Translation improvement: +{english_stats['count'] - BASELINE_EN_ENTRIES:,} new English entries")

features = [
    "✓ Comprehensive Indonesian social media slang coverage",
    "✓ High-quality manual translations for core stopwords",
    "✓ Support for formal and colloquial Indonesian variants",
    "✓ Regional language support (Javanese, Sundanese)",
    "✓ Cross-language mapping capabilities",
    "✓ Optimized for sentiment analysis applications",
    "✓ Ready for integration with popular NLP frameworks"
]
_print_section("\n✨ KEY FEATURES:", features)

applications = [
    "• Indonesian social media sentiment analysis",
    "• Cross-language text classification",
    "• Multilingual information retrieval",
    "• Regional Indonesian language processing",
    "• Social media monitoring and analytics"
]
_print_section("\n🚀 RECOMMENDED APPLICATIONS:", applications)

files = [
    "• multilingual_stopwords_dict_only.csv - Final translated dataset",
    "• translate_with_dictionary.py - Translation script",
    "• README_Translated_Multilingual_Stopwords.md - Documentation",
    "• multilingual_stopwords_translation_demo.ipynb - This notebook"
]
_print_section("\n📁 FILES CREATED:", files)

print("\n🎉 Translation project completed successfully!")