# In [None]:
"""
===============================================================================
TIER PROTOCOL – DATA APPENDIX SCRIPT
Script Name: generate_data_appendix.py

Authors: Justin Li, Aiden Lee, Nathaniel Chen
Date Created: 7/25/2025
Project: Polarization Networks

Purpose:
This script generates summary statistics, visualizations, and tables for the
Data Appendix. It uses the Analysis Data Files created by the Processing Scripts
and saves the output in the DataAppendixOutput/ folder.

!!! IMPORTANT !!!
This script must be run **after all Processing Scripts**, as it depends on
the cleaned and constructed data from the AnalysisData/ folder.
===============================================================================
"""

# ========== Import Libraries ==========
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ========== Define Input and Output Paths ==========
# All locations are relative paths, so they resolve against the directory the
# script is launched from.
analysis_data_path_directed = r"../../Data/AnalysisData/clean_tweets_directed.csv"
analysis_data_path_notdirected = r"../../Data/AnalysisData/clean_tweets_notdirected.csv"
output_dir = r"../../Output/DataAppendixOutput"

# Create the output folder up front; an already-existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

# ========== Load Analysis Data Files ==========
# Read both Analysis Data Files, directed first, then not-directed.
df_directed, df_notdirected = (
    pd.read_csv(csv_path)
    for csv_path in (analysis_data_path_directed, analysis_data_path_notdirected)
)

# ========== Helper Function to Summarize Dataset ==========
# Path separators and characters that Windows rejects in file names. Column
# names come from the data, so these are mapped to "_" before a column name is
# used as part of a plot file name.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})


def _plot_path(dataset_output_dir, col, suffix):
    """Return the PNG path for column `col`, with unsafe characters replaced."""
    safe_col = str(col).translate(_UNSAFE_FILENAME_CHARS)
    return os.path.join(dataset_output_dir, f"{safe_col}_{suffix}.png")


def _save_histogram(col_data, col, path):
    """Save a 30-bin histogram with a KDE overlay of the non-missing values."""
    fig, ax = plt.subplots()
    try:
        sns.histplot(col_data.dropna(), bins=30, kde=True, ax=ax)
        ax.set_title(f"{col} Distribution")
        ax.set_xlabel(col)
        fig.tight_layout()
        fig.savefig(path)
    finally:
        # Close even when plotting/saving fails so figures do not accumulate.
        plt.close(fig)


def _save_top_categories(col_data, col, order, path):
    """Save a horizontal count plot restricted to the categories in `order`."""
    fig, ax = plt.subplots()
    try:
        sns.countplot(y=col_data, order=order, ax=ax)
        ax.set_title(f"{col} Top Categories")
        fig.tight_layout()
        fig.savefig(path)
    finally:
        plt.close(fig)


def summarize_dataset(df, dataset_name, output_root=None):
    """Write a per-column summary table and one plot per supported column.

    Every column gets its dtype, missing-value count and unique-value count.
    Numeric (non-boolean) columns also get mean, std, min, quartiles and max,
    plus a histogram. Boolean, object, string and categorical columns get the
    counts of their 10 most frequent values, plus a count plot. Columns of any
    other dtype (e.g. datetimes) only get the basic row.

    Args:
        df: DataFrame to summarize.
        dataset_name: Name of the sub-folder created for this dataset and
            prefix of the summary CSV file name.
        output_root: Folder under which the dataset sub-folder is created.
            Defaults to the module-level ``output_dir``.

    Returns:
        The summary as a DataFrame with one row per column of ``df``; the same
        table is written to ``<dataset_name>_summary.csv``.
    """
    root = output_dir if output_root is None else output_root
    dataset_output_dir = os.path.join(root, dataset_name)
    os.makedirs(dataset_output_dir, exist_ok=True)

    summary = []
    for col in df.columns:
        col_data = df[col]
        dtype = col_data.dtype
        summary_row = {
            'Variable': col,
            'Type': str(dtype),
            'Missing': col_data.isna().sum(),
            'Unique': col_data.nunique()
        }

        # Booleans count as numeric for pandas, but quantile() is not supported
        # for boolean data, so they are summarized as categories instead.
        is_bool = pd.api.types.is_bool_dtype(col_data)

        # Numeric summary + plot
        if pd.api.types.is_numeric_dtype(col_data) and not is_bool:
            summary_row.update({
                'Mean': col_data.mean(),
                'Std': col_data.std(),
                'Min': col_data.min(),
                '25%': col_data.quantile(0.25),
                'Median': col_data.median(),
                '75%': col_data.quantile(0.75),
                'Max': col_data.max()
            })
            _save_histogram(
                col_data, col, _plot_path(dataset_output_dir, col, "distribution")
            )

        # Categorical summary + plot
        elif (
            is_bool
            or pd.api.types.is_object_dtype(col_data)
            # Covers pandas' dedicated string dtype, which is not object dtype.
            or pd.api.types.is_string_dtype(col_data)
            # Replaces the deprecated pd.api.types.is_categorical_dtype().
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            if is_bool:
                # Plot booleans as the labels "True"/"False"; missing values
                # are dropped first so they do not become a label of their own.
                col_data = col_data.dropna().astype(str)

            top_counts = col_data.value_counts().head(10)
            for val, count in top_counts.items():
                summary_row[f"Top: {val}"] = count

            _save_top_categories(
                col_data,
                col,
                top_counts.index,
                _plot_path(dataset_output_dir, col, "top_categories"),
            )

        summary.append(summary_row)

    # Save summary table
    summary_df = pd.DataFrame(summary)
    summary_csv_path = os.path.join(dataset_output_dir, f"{dataset_name}_summary.csv")
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"Saved summary for {dataset_name} to {summary_csv_path}")
    return summary_df


# ========== Run Summaries ==========
# Summarize each Analysis Data File under a sub-folder named after the dataset.
summarize_dataset(df=df_directed, dataset_name="clean_tweets_directed")
summarize_dataset(df=df_notdirected, dataset_name="clean_tweets_notdirected")


# Output:
# Saved summary for clean_tweets_directed to Polarization_Networks/Output/DataAppendixOutput\clean_tweets_directed\clean_tweets_directed_summary.csv
# Saved summary for clean_tweets_notdirected to Polarization_Networks/Output/DataAppendixOutput\clean_tweets_notdirected\clean_tweets_notdirected_summary.csv
