In [6]:
# Install Polars library (run once in your notebook or environment)
#!pip install polars

# Import required libraries
import polars as pl  # High-performance data processing library (like pandas but faster)
import os           # Provides a way to check if files exist on the system
import time         # Helps measure how long tasks take

# List of CSV data files to check and analyze
DATASETS = [
    "2024_fb_ads_president_scored_anon.csv",
    "2024_fb_posts_president_scored_anon.csv",
    "2024_tw_posts_president_scored_anon.csv"
]

# Filter and keep only those files from the list that actually exist
available_datasets = [f for f in DATASETS if os.path.exists(f)]

# Function to analyze a single dataset
def analyze_dataset(csv_file):
    print(f"\n{'='*60}\nAnalyzing File: {csv_file}\n{'='*60}")  # Visual separator
    start_time = time.time()  # Start timing the operation

    # Read the CSV into a Polars DataFrame (fast and efficient tabular format)
    df = pl.read_csv(csv_file)

    # If Polars returns a LazyFrame (used for delayed operations), convert to regular DataFrame
    if hasattr(df, "collect"):
        df = df.collect()

    # Show what type of object we are working with (should be a Polars DataFrame)
    print(f"Data type after reading: {type(df)}")

    # If there's no data in the DataFrame, skip further analysis
    if df.is_empty():
        print("No data in file.")
        return

    # Display basic information about the dataset
    print("\n--- Data Overview ---")
    print(f"Number of rows and columns: {df.shape}")  # Shows dataset size
    print(f"Column names: {df.columns}")  # Lists all the column names

    # Show descriptive statistics for the first 5 columns (to avoid clutter)
    print("\n--- Descriptive Statistics for First 5 Columns ---")
    for col in df.columns[:5]:  # Limit to first 5 columns
        print(f"\nColumn: {col}")
        if df[col].dtype in [pl.Float64, pl.Int64, pl.Float32, pl.Int32]:  # Numeric columns
            print(df[col].describe())  # Show mean, min, max, etc.
        else:
            # For text or categorical data, show most frequent values
            vc_df = df[col].value_counts()
            count_col = [c for c in vc_df.columns if c.lower().startswith("count")][0]  # Get count column name
            print(vc_df.sort(count_col, descending=True).head(5))  # Show top 5 frequent values

    # If 'page_id' column exists, do grouped analysis
    if 'page_id' in df.columns:
        print("\n--- Grouped by 'page_id' (First 3 Groups Only) ---")
        grouped = df.group_by('page_id').agg([pl.count()])  # Group by page_id and count entries
        sample_ids = grouped.head(3)['page_id'].to_list()  # Pick first 3 group IDs to show as examples
        for pid in sample_ids:
            group_df = df.filter(pl.col('page_id') == pid)  # Filter rows belonging to this group
            print(f"\nGroup: page_id = {pid}")
            print(group_df.describe())  # Show descriptive stats for the group

    # If both 'page_id' and 'ad_id' exist, do combined grouped analysis
    if 'page_id' in df.columns and 'ad_id' in df.columns:
        print("\n--- Grouped by 'page_id' and 'ad_id' (First 3 Groups Only) ---")
        grouped = df.group_by(['page_id', 'ad_id']).agg([pl.count()])  # Group by both columns
        sample_keys = grouped.head(3).select(['page_id', 'ad_id']).to_dicts()  # Get 3 group keys as dicts
        for row in sample_keys:
            pid, aid = row['page_id'], row['ad_id']  # Extract group keys
            group_df = df.filter((pl.col('page_id') == pid) & (pl.col('ad_id') == aid))  # Filter rows
            print(f"\nGroup: page_id = {pid}, ad_id = {aid}")
            print(group_df.describe())  # Show stats for this group

    # Show how long the analysis took for this file
    print(f"\nFinished in {time.time() - start_time:.2f} seconds")

# Loop through each valid dataset and analyze it
for dataset in available_datasets:
    analyze_dataset(dataset)  # Run the function on each file



Analyzing File: 2024_fb_ads_president_scored_anon.csv
Data type after reading: <class 'polars.dataframe.frame.DataFrame'>

--- Data Overview ---
Number of rows and columns: (246745, 41)
Column names: ['page_id', 'ad_id', 'ad_creation_time', 'bylines', 'currency', 'delivery_by_region', 'demographic_distribution', 'estimated_audience_size', 'estimated_impressions', 'estimated_spend', 'publisher_platforms', 'illuminating_scored_message', 'illuminating_mentions', 'scam_illuminating', 'election_integrity_Truth_illuminating', 'advocacy_msg_type_illuminating', 'issue_msg_type_illuminating', 'attack_msg_type_illuminating', 'image_msg_type_illuminating', 'cta_msg_type_illuminating', 'engagement_cta_subtype_illuminating', 'fundraising_cta_subtype_illuminating', 'voting_cta_subtype_illuminating', 'covid_topic_illuminating', 'economy_topic_illuminating', 'education_topic_illuminating', 'environment_topic_illuminating', 'foreign_policy_topic_illuminating', 'governance_topic_illuminating', 'health_

(Deprecated in version 0.20.5)
  grouped = df.group_by('page_id').agg([pl.count()])  # Group by page_id and count entries
(Deprecated in version 0.20.5)
  grouped = df.group_by(['page_id', 'ad_id']).agg([pl.count()])  # Group by both columns



Group: page_id = 7db6e071ce6b8a2090db715fe0206fa17f8d23f7f0f2d1feaefe3fece96952c6
shape: (9, 42)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ page_id   ┆ ad_id     ┆ ad_creati ┆ … ┆ womens_is ┆ incivilit ┆ freefair_ ┆ fraud_il │
│ ---       ┆ ---       ┆ ---       ┆ on_time   ┆   ┆ sue_topic ┆ y_illumin ┆ illuminat ┆ luminati │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ _illumina ┆ ating     ┆ ing       ┆ ng       │
│           ┆           ┆           ┆ str       ┆   ┆ tin…      ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ ---       ┆ f64       ┆ f64       ┆ f64      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 6         ┆ 6         ┆ 6         ┆ … ┆ 6.0       ┆ 6.0       ┆ 6.0       ┆ 6.0 