In [1]:
import re
import unicodedata
import warnings
from typing import Any, Dict, List

import pandas as pd
warnings.filterwarnings('ignore')

try:
    import demoji
    from nltk.corpus import stopwords
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    print("Core libraries loaded successfully")
except ImportError as e:
    print(f"Error: {e}")
    print("Install required packages: pip install demoji nltk Sastrawi")
    raise

try:
    from indonlp.preprocessing import replace_slang, replace_word_elongation
    INDONLP_AVAILABLE = True
    print("indoNLP loaded")
except ImportError:
    INDONLP_AVAILABLE = False
    replace_slang = lambda x: x
    replace_word_elongation = lambda x: x
    print("indoNLP not found - using fallback mode")

Core libraries loaded successfully
indoNLP not found - using fallback mode


In [None]:
class Config:
    """Static settings for the preprocessing run: file names and word lists."""

    # --- File names -------------------------------------------------------
    INPUT_FILE = 'data_filtered.csv'
    LABEL_FILE = 'dataset_labeled.csv'
    OUTPUT_FILE = 'data_preprocessed.csv'

    # --- Words that must survive stopword removal and stemming unchanged ---
    KEYWORDS = [
        'pemerintah',
        'penduduk',
        'pengangguran',
        'pendapatan',
        'perusahaan',
        'pemimpin',
        'mematikan',
        'kemiskinan',
        'keadilan',
        'kebijakan',
        'berantakan',
    ]

    # --- Negation markers (collapsed to a single token downstream) ---------
    NEGATION_WORDS = {
        'tidak', 'bukan', 'jangan', 'belum', 'tanpa',
        'ga', 'gak', 'enggak', 'nggak', 'ndak', 'engga',
    }

    # --- Extra stopwords added on top of the library-provided list ---------
    CUSTOM_STOPWORDS = {
        'yg', 'dg', 'rt', 'dgn', 'ny', 'd', 'klo', 'kalo', 'amp',
        'biar', 'bkn', 'na', 'nya', 'nih', 'sih', 'si', 'tau', 'tuh',
        'utk', 'ya', 'gaes', 'bang', 'bro', 'sob', 'gw', 'gua', 'lu',
        'lo', 'wkwk', 'haha', 'wkwkwk', 'amin', 'amiin', 'aamiin',
        'yuk', 'dong', 'deh', 'kok',
    } | {
        # English-language tokens
        'government', 'tax', 'salary', 'system', 'netizen',
        'the', 'and', 'or', 'in', 'of', 'to', 'is', 'for', 'it', 'on', 'at',
    }

# Confirmation message emitted once the Config class has been defined.
print("Configuration loaded (Cleaned)")

Configuration loaded (Cleaned)


In [4]:
class PreprocessingTools:
    """Holds the shared resources built from a config object.

    After construction the instance exposes:
      * ``stemmer``     - stemmer created by ``StemmerFactory``
      * ``keyword_set`` - lowercase words taken from ``config.KEYWORDS``
      * ``stopwords``   - stopword set with negations and keywords removed
    """

    def __init__(self, config: Config):
        self.config = config
        # Order matters: the stopword set is derived from the keyword set.
        self._initialize_tools()
        self._build_keyword_set()
        self._build_stopwords()

    def _initialize_tools(self):
        """Create the stemmer instance."""
        self.stemmer = StemmerFactory().create_stemmer()

    def _build_keyword_set(self):
        """Collect every whitespace-separated, lowercased word of each configured keyword."""
        self.keyword_set = {
            word
            for phrase in self.config.KEYWORDS
            for word in phrase.lower().split()
        }

    def _build_stopwords(self):
        """Combine library and custom stopwords, then drop negations and keywords."""
        candidates = set(stopwords.words('indonesian')).union(self.config.CUSTOM_STOPWORDS)
        protected = self.config.NEGATION_WORDS | self.keyword_set
        self.stopwords = candidates - protected

# Instantiate the configuration and build the preprocessing tools from it.
config = Config()
tools = PreprocessingTools(config)

In [5]:
class TextPreprocessor:
    """Turn a raw comment into cleaned, negation-normalised, filtered and stemmed tokens.

    The stemmer, stopword set, keyword set and config are read from the
    ``tools`` object supplied at construction time.
    """

    def __init__(self, tools: "PreprocessingTools"):
        # Forward-reference string: the annotation is not evaluated when the
        # class is defined, so this cell can be executed on its own.
        self.tools = tools

    def clean_text(self, text: str) -> str:
        """Reduce one raw comment to lowercase ``a-z`` words separated by single spaces.

        Args:
            text: Raw comment; values that ``pd.isna`` reports as missing are accepted.

        Returns:
            The cleaned string, or ``""`` for missing input.
        """
        if pd.isna(text):
            return ""

        text = str(text)
        text = re.sub(r'http\S+|www\.\S+', '', text)  # drop URLs
        text = re.sub(r'<.*?>', '', text)              # drop tag-like spans
        text = demoji.replace(text, '')

        # NFKD splits an accented letter into base letter + combining mark.
        # The marks must be removed here; otherwise the [^a-z\s] substitution
        # below turns each mark into a space and breaks the word in two
        # (e.g. "naive" written with a diaeresis would become "nai ve").
        text = unicodedata.normalize('NFKD', text)
        text = ''.join(ch for ch in text if not unicodedata.combining(ch))
        text = text.lower()

        if INDONLP_AVAILABLE:
            text = replace_slang(text)
            text = replace_word_elongation(text)

        text = re.sub(r'[^a-z\s]', ' ', text)  # anything not a-z/whitespace becomes a space
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize(self, text: str) -> List[str]:
        """Split cleaned text on whitespace; empty input yields an empty list."""
        if not text:
            return []
        return text.split()

    def normalize_negation(self, tokens: List[str]) -> List[str]:
        """Replace every token found in ``config.NEGATION_WORDS`` with ``'tidak'``."""
        negations = self.tools.config.NEGATION_WORDS
        return ['tidak' if token in negations else token for token in tokens]

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Drop stopwords while always keeping ``'tidak'`` and configured keywords."""
        keywords = self.tools.keyword_set
        stop = self.tools.stopwords
        return [
            token
            for token in tokens
            if token == 'tidak' or token in keywords or token not in stop
        ]

    def stem_tokens(self, tokens: List[str]) -> List[str]:
        """Stem each token, leaving ``'tidak'`` and configured keywords untouched."""
        keywords = self.tools.keyword_set
        stem = self.tools.stemmer.stem
        return [
            token if token == 'tidak' or token in keywords else stem(token)
            for token in tokens
        ]

    def process(self, text: str) -> Dict[str, Any]:
        """Run the full pipeline and return every intermediate stage.

        Stage order: clean -> tokenize -> negation normalisation ->
        stopword removal -> stemming -> re-join.

        Returns:
            Dict with keys ``original``, ``cleaned``, ``tokens_raw``,
            ``tokens_negation``, ``tokens_filtered``, ``tokens_stemmed``,
            ``tokens_final`` (same list as ``tokens_stemmed``) and
            ``final_text`` (space-joined ``tokens_final``).
        """
        result = {'original': text}
        result['cleaned'] = self.clean_text(text)
        result['tokens_raw'] = self.tokenize(result['cleaned'])
        result['tokens_negation'] = self.normalize_negation(result['tokens_raw'])
        result['tokens_filtered'] = self.remove_stopwords(result['tokens_negation'])
        result['tokens_stemmed'] = self.stem_tokens(result['tokens_filtered'])
        result['tokens_final'] = result['tokens_stemmed']
        result['final_text'] = " ".join(result['tokens_final'])

        return result

# Build the pipeline object on top of the shared tools and announce the stage order.
preprocessor = TextPreprocessor(tools)
print("Preprocessing pipeline initialized (Optimized: Cleaning -> Negation -> Stopwords -> Stemming)")

Preprocessing pipeline initialized (Optimized: Cleaning -> Negation -> Stopwords -> Stemming)


In [6]:
def load_data(filepath: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and report how many rows were loaded.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist (re-raised after logging).
        Exception: Any other read error is logged and re-raised unchanged.
    """
    try:
        frame = pd.read_csv(filepath)
        print(f"Loaded {len(frame):,} comments from '{filepath}'")
        return frame
    except Exception as err:
        # Pick the message by error type, then propagate the original exception.
        if isinstance(err, FileNotFoundError):
            print(f"Error: File '{filepath}' not found")
        else:
            print(f"Error loading data: {err}")
        raise

# Load the input CSV named in the config and report its shape and column names.
df = load_data(config.INPUT_FILE)
print(f"Data shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Loaded 1,072 comments from 'data_filtered.csv'
Data shape: (1072, 3)
Columns: ['Video_ID', 'Teks_Komentar', 'text_normalized_temp']


In [7]:
def process_dataframe(df: pd.DataFrame, preprocessor: TextPreprocessor) -> pd.DataFrame:
    """Run the preprocessor over every row and return a copy with result columns added.

    The text is read from ``text_normalized_temp`` when that column exists,
    otherwise from ``Teks_Komentar``. Three columns are added to the copy
    (``teks_cleaned``, ``tokens_final``, ``teks_final``), and rows whose
    ``teks_final`` is blank are dropped. ``df`` itself is not modified.
    """
    print("\nProcessing data...")

    if 'text_normalized_temp' in df.columns:
        text_col = 'text_normalized_temp'
    else:
        text_col = 'Teks_Komentar'

    total = len(df)
    results = []
    for position, raw_text in enumerate(df[text_col]):
        # Overwrite the same console line every 100 rows.
        if position % 100 == 0:
            print(f"Progress: {position}/{total} ({position/total*100:.1f}%)", end='\r')
        results.append(preprocessor.process(raw_text))

    print(f"Progress: {total}/{total} (100.0%)")

    df_processed = df.copy()
    column_sources = (
        ('teks_cleaned', 'cleaned'),
        ('tokens_final', 'tokens_final'),
        ('teks_final', 'final_text'),
    )
    for column, key in column_sources:
        df_processed[column] = [item[key] for item in results]

    # Keep only rows that still contain text after preprocessing.
    has_text = df_processed['teks_final'].str.strip() != ''
    df_valid = df_processed[has_text].copy()
    removed_count = len(df_processed) - len(df_valid)

    print(f"Processing complete: {len(df_valid):,} valid, {removed_count:,} empty removed")

    return df_valid

# Apply the preprocessing pipeline to the loaded comments.
df_processed = process_dataframe(df, preprocessor)


Processing data...
Progress: 1072/1072 (100.0%)
Processing complete: 1,072 valid, 0 empty removed


In [11]:
def merge_sentiment_labels(df: pd.DataFrame, label_file: str) -> pd.DataFrame:
    """Attach the ``sentiment`` column from a label CSV to ``df`` via ``Teks_Komentar``.

    Labels are de-duplicated on ``Teks_Komentar`` (first occurrence wins) and
    left-joined, so every row of ``df`` is kept. Rows without a matching label
    end up with a missing ``sentiment``; their count is reported because
    ``value_counts()`` omits missing values and would otherwise hide them.

    Args:
        df: Frame containing a ``Teks_Komentar`` column.
        label_file: Path of a CSV with ``Teks_Komentar`` and ``sentiment`` columns.

    Returns:
        The merged frame, or ``df`` unchanged when the label file is missing
        or the merge fails (a warning is printed in both cases).
    """
    try:
        df_labels = pd.read_csv(label_file)
        print(f"Loaded {len(df_labels):,} labeled records")

        df_labels = df_labels.drop_duplicates(subset=['Teks_Komentar'], keep='first')
        print(f"After removing duplicates: {len(df_labels):,} unique records")

        df_merged = df.merge(
            df_labels[['Teks_Komentar', 'sentiment']],
            on='Teks_Komentar',
            how='left',
        )

        print(f"Merge complete: {len(df_merged):,} records")

        # Surface rows the left join could not label instead of passing them on silently.
        unlabeled = int(df_merged['sentiment'].isna().sum())
        if unlabeled:
            print(f"Warning: {unlabeled:,} records have no sentiment label")

        print("\nSentiment distribution:")
        print(df_merged['sentiment'].value_counts())

        return df_merged

    except FileNotFoundError:
        print(f"Warning: Label file '{label_file}' not found")
        return df
    except Exception as e:
        # Best-effort step: report the problem and continue with unlabeled data.
        print(f"Warning: Error merging labels: {e}")
        return df

# Join the sentiment labels from the configured label file onto the processed comments.
df_final = merge_sentiment_labels(df_processed, config.LABEL_FILE)

Loaded 1,072 labeled records
After removing duplicates: 1,068 unique records
Merge complete: 1,072 records

Sentiment distribution:
sentiment
Positif    691
Netral     202
Negatif    179
Name: count, dtype: int64


In [12]:
def save_results(df: pd.DataFrame, output_file: str):
    """Write the original and final text (plus ``sentiment`` when present) to a UTF-8 CSV.

    Args:
        df: Frame containing at least ``Teks_Komentar`` and ``teks_final``.
        output_file: Destination path of the CSV file.
    """
    output_cols = ['Teks_Komentar', 'teks_final']
    if 'sentiment' in df.columns:
        output_cols.append('sentiment')

    selected = df[output_cols]
    selected.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Data saved to '{output_file}' ({len(df):,} rows)")

# Persist the final frame to the output path named in the config.
save_results(df_final, config.OUTPUT_FILE)

Data saved to 'data_preprocessed.csv' (1,072 rows)


In [13]:
def display_examples(df: pd.DataFrame, n_examples: int = 3):
    """Print the first ``n_examples`` rows as before/after previews.

    Text fields are cut to 100 characters and token lists to 10 items; the
    trailing ``...`` is printed only when something was actually cut off.
    Missing text values are shown as empty instead of raising.

    Args:
        df: Frame with ``Teks_Komentar`` and ``teks_final``; the columns
            ``sentiment``, ``teks_cleaned`` and ``tokens_final`` are shown
            when present.
        n_examples: Maximum number of rows to print.
    """
    def _preview(value, limit: int = 100) -> str:
        # Non-string cells (e.g. NaN for a missing comment) cannot be sliced directly.
        if not isinstance(value, str):
            value = "" if pd.isna(value) else str(value)
        return value if len(value) <= limit else f"{value[:limit]}..."

    print("\nPreprocessing Examples:")
    print("="*60)

    for i in range(min(n_examples, len(df))):
        print(f"\nExample {i+1}")
        print("-"*60)

        if 'sentiment' in df.columns:
            print(f"Sentiment: {df['sentiment'].iloc[i]}")

        print(f"\nOriginal:\n{_preview(df['Teks_Komentar'].iloc[i])}")
        if 'teks_cleaned' in df.columns:
            print(f"\nCleaned:\n{_preview(df['teks_cleaned'].iloc[i])}")
        print(f"\nFinal:\n{_preview(df['teks_final'].iloc[i])}")

        if 'tokens_final' in df.columns:
            tokens = df['tokens_final'].iloc[i]
            more = "..." if len(tokens) > 10 else ""
            print(f"\nTokens ({len(tokens)}): {tokens[:10]}{more}")

# Show a few before/after samples, then print a summary of the run.
display_examples(df_final)

print("\n" + "="*60)
print("PREPROCESSING COMPLETE")
print("="*60)
print("\nPipeline stages:")
# Listed in the order TextPreprocessor.process applies them:
# stopword removal runs before stemming.
print("1. Text Cleaning")
print("2. Tokenization")
print("3. Negation Normalization")
print("4. Stopword Removal (preserve negations)")
print("5. Stemming (preserve keywords)")
print("6. Text Reconstruction")
print(f"\nFinal dataset: {len(df_final):,} records")
print(f"Output file: '{config.OUTPUT_FILE}'")
print("="*60)


Preprocessing Examples:

Example 1
------------------------------------------------------------
Sentiment: Negatif

Original:
Sempat mikir mau pindah ke negara sebelah, ngeapply citizenship. Tapi yah, aku cinta negara dan hara...

Cleaned:
sempat mikir mau pindah ke negara sebelah ngeapply citizenship tapi yah aku cinta negara dan harapan...

Final:
mikir pindah negara belah ngeapply citizenship yah cinta negara harap moga negara sembuh cepat...

Tokens (14): ['mikir', 'pindah', 'negara', 'belah', 'ngeapply', 'citizenship', 'yah', 'cinta', 'negara', 'harap']...

Example 2
------------------------------------------------------------
Sentiment: Negatif

Original:
Kalo kabur mau kemana ke Singapur ,emang di Singapur tinggal dimana rakyat Singapur aja nggak punya ...

Cleaned:
kalau kabur mau kemana ke singapur emang di singapur tinggal dimana rakyat singapur saja tidak punya...

Final:
kabur mana singapur emang singapur tinggal mana rakyat singapur tidak rumah nyicil negara kaya singa...