In [1]:
import os
import re
import pandas as pd

In [None]:
def categorize_and_save(input_csv, output_dir="."):
    """Split a CSV of texts into per-camp CSV files using keyword matching.

    The lower-cased ``full_text`` of every row is searched for each camp's
    keywords. A row is written to the file of every camp it matches; rows
    matching no camp go to ``general.csv``; rows matching two or more camps
    are additionally written to ``multi_category.csv`` together with a
    ``categories`` column listing the matched camps.

    Files written into ``output_dir``:
        kubu_01.csv, kubu_02.csv, kubu_03.csv, general.csv,
        multi_category.csv, overlap_counts.csv

    Args:
        input_csv: Path of the CSV file to read. It must contain a
            ``full_text`` column.
        output_dir: Directory that receives the output files. It is created
            when missing. Defaults to the current working directory.

    Raises:
        ValueError: If the dataset has no ``full_text`` column.
    """
    # Keywords per camp. Every entry is used as a regular expression: plain
    # names behave as substring matches, while the candidate numbers carry
    # \b anchors so that e.g. "2024" does not count as "02".
    categories = {
        "kubu_01": ["anies", "anis", "cak imin", "muhaimin iskandar", r"\b01\b"],
        "kubu_02": ["prabowo", "gibran", r"\b02\b"],
        "kubu_03": ["ganjar", "mahfud", r"\b03\b"]
    }

    df = pd.read_csv(input_csv)

    if 'full_text' not in df.columns:
        raise ValueError("The dataset must contain a 'full_text' column.")

    # One compiled alternation per camp, so each text is scanned once per camp.
    patterns = {
        cat: re.compile("|".join(f"(?:{keyword})" for keyword in keywords))
        for cat, keywords in categories.items()
    }

    # Row positions are collected first and the frames are sliced once at the
    # end; growing DataFrames with pd.concat inside the loop is quadratic.
    rows_by_category = {cat: [] for cat in categories}
    general_rows = []
    multi_rows = []
    multi_labels = []

    # Declared up front so every combination is reported, including zeros.
    overlap_counts = {
        "01_02": 0,
        "01_03": 0,
        "02_03": 0,
        "01_02_03": 0
    }

    for position, value in enumerate(df['full_text']):
        text = str(value).lower()

        # Iterating the dict keeps the matched camps in declaration order, so
        # the "categories" label is identical from one run to the next.
        matched = [cat for cat, pattern in patterns.items() if pattern.search(text)]

        for cat in matched:
            rows_by_category[cat].append(position)

        if not matched:
            general_rows.append(position)
        elif len(matched) > 1:
            multi_rows.append(position)
            multi_labels.append(", ".join(matched))
            # "kubu_01" + "kubu_03" -> "01_03"
            overlap_key = "_".join(cat.split("_", 1)[1] for cat in matched)
            overlap_counts[overlap_key] = overlap_counts.get(overlap_key, 0) + 1

    # An empty output_dir means "current directory"; os.makedirs("") would fail.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # iloc with a (possibly empty) position list keeps all columns and dtypes.
    for cat, positions in rows_by_category.items():
        df.iloc[positions].to_csv(os.path.join(output_dir, f"{cat}.csv"), index=False)
    df.iloc[general_rows].to_csv(os.path.join(output_dir, "general.csv"), index=False)

    multi_category_df = df.iloc[multi_rows].copy()
    # Append as the last column; duplicates are allowed so an input that
    # already has a "categories" column keeps it.
    multi_category_df.insert(
        len(multi_category_df.columns), "categories", multi_labels, allow_duplicates=True
    )
    multi_category_df.to_csv(os.path.join(output_dir, "multi_category.csv"), index=False)

    overlap_df = pd.DataFrame(list(overlap_counts.items()), columns=["Overlap", "Count"])
    overlap_df.to_csv(os.path.join(output_dir, "overlap_counts.csv"), index=False)

    print("Processing complete. Files saved.")


# Categorise the rows of dataset/general.csv; the function writes the result CSVs.
categorize_and_save(input_csv='dataset/general.csv')

Processing complete. Files saved.
