In [1]:
# Pandas Descriptive Statistics with Grouped Analysis and Comments

# Import necessary libraries
import pandas as pd  # For handling tabular data
import os  # For checking file paths
import time  # For measuring performance
import matplotlib.pyplot as plt  # For plotting
import seaborn as sns  # For statistical visualization

# Ensure plots are displayed inside the notebook
%matplotlib inline

# Candidate input files; the names are relative paths, so they are looked up
# in the current working directory.
DATASETS = [
    "2024_fb_ads_president_scored_anon.csv",
    "2024_fb_posts_president_scored_anon.csv",
    "2024_tw_posts_president_scored_anon.csv",
]

# Keep only the candidates that are present on disk, preserving their order.
# Missing files are skipped without any message.
available_datasets = list(filter(os.path.exists, DATASETS))

# Function to analyze each dataset
def analyze_dataset(csv_file):
    """Print a descriptive-statistics report for one CSV file to stdout.

    The report contains, in order:

    1. A banner naming the file.
    2. The DataFrame shape and the full list of column names.
    3. For each of the first five columns: ``describe()`` when the column
       has a numeric dtype, otherwise its five most frequent values.
    4. If a ``page_id`` column exists: ``describe(include='all')`` for the
       first three ``page_id`` groups.
    5. If both ``page_id`` and ``ad_id`` exist: the same summary for the
       first three ``(page_id, ad_id)`` groups.
    6. The elapsed wall-clock time.

    Parameters
    ----------
    csv_file : str or path-like
        Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    None
        All results are printed. A file with no rows (or no content at all)
        prints "No data in file." and returns early.
    """
    # Function-scope import keeps this definition self-contained in the cell.
    from itertools import islice

    print(f"\n{'='*60}\nAnalyzing File: {csv_file}\n{'='*60}")  # Header for current file
    start_time = time.time()  # Record start time

    try:
        df = pd.read_csv(csv_file)  # Read the dataset into a DataFrame
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row, so read_csv raises instead of
        # returning an empty frame. Report it like any other empty input so
        # the caller's loop can continue with the remaining files.
        print("No data in file.")
        return

    if df.empty:
        # Header row present but no data rows.
        print("No data in file.")
        return

    print("\n--- Data Overview ---")
    print(f"Shape: {df.shape}")  # (rows, columns)
    print(f"Columns: {list(df.columns)}")

    print("\n--- Descriptive Statistics for First 5 Columns ---")
    for col in df.columns[:5]:  # Limit output to first 5 columns
        if pd.api.types.is_numeric_dtype(df[col]):
            print(f"\nColumn: {col} (Numeric)")
            print(df[col].describe())
        else:
            print(f"\nColumn: {col} (Categorical)")
            print(df[col].value_counts().head(5))  # Five most frequent values

    if 'page_id' in df.columns:
        print("\n--- Grouped by 'page_id' (First 3 Groups Only) ---")
        grouped = df.groupby('page_id')
        # islice stops after three groups; list(grouped) would build a
        # sub-DataFrame for every group only to discard all but three.
        for name, group in islice(grouped, 3):
            print(f"\nGroup: page_id = {name}")
            print(group.describe(include='all'))

    if 'ad_id' in df.columns and 'page_id' in df.columns:
        print("\n--- Grouped by 'page_id' and 'ad_id' (First 3 Groups Only) ---")
        grouped = df.groupby(['page_id', 'ad_id'])
        # With two keys, each group name is a (page_id, ad_id) tuple.
        for name, group in islice(grouped, 3):
            print(f"\nGroup: page_id = {name[0]}, ad_id = {name[1]}")
            print(group.describe(include='all'))

    print(f"\nFinished in {time.time() - start_time:.2f} seconds")  # Print time taken

# Produce one report per dataset that was found on disk, in listed order
for dataset in available_datasets:
    analyze_dataset(csv_file=dataset)



Analyzing File: 2024_fb_ads_president_scored_anon.csv

--- Data Overview ---
Shape: (246745, 41)
Columns: ['page_id', 'ad_id', 'ad_creation_time', 'bylines', 'currency', 'delivery_by_region', 'demographic_distribution', 'estimated_audience_size', 'estimated_impressions', 'estimated_spend', 'publisher_platforms', 'illuminating_scored_message', 'illuminating_mentions', 'scam_illuminating', 'election_integrity_Truth_illuminating', 'advocacy_msg_type_illuminating', 'issue_msg_type_illuminating', 'attack_msg_type_illuminating', 'image_msg_type_illuminating', 'cta_msg_type_illuminating', 'engagement_cta_subtype_illuminating', 'fundraising_cta_subtype_illuminating', 'voting_cta_subtype_illuminating', 'covid_topic_illuminating', 'economy_topic_illuminating', 'education_topic_illuminating', 'environment_topic_illuminating', 'foreign_policy_topic_illuminating', 'governance_topic_illuminating', 'health_topic_illuminating', 'immigration_topic_illuminating', 'lgbtq_issues_topic_illuminating', 'mil