In [6]:
import polars as pl
import numpy as np
import csv

In [9]:
# --- Helper functions remain the same ---
def parse_video_length(time_str: str) -> float:
    # ... (assuming this function is defined as before)
    if not isinstance(time_str, str) or not time_str: return np.nan
    parts = time_str.split(':')
    try:
        parts = [int(p) for p in parts]
        if len(parts) == 3: h, m, s = parts; return h * 60 + m + s / 60
        elif len(parts) == 2: m, s = parts; return m + s / 60
        return np.nan
    except (ValueError, TypeError): return np.nan

def analyze_dataset_polars(filepath, numeric_cols, date_cols, group_by_col, custom_conversions={}):
    """
    Loads, processes, and analyzes a dataset using the Polars library,
    now with robust loading to prevent ShapeErrors.
    """
    print(f"\n{'='*60}\n🔄 Processing file with Polars: {filepath}\n{'='*60}")

    # --- Using robust Python csv.reader to handle corrupted files ---
    header, good_lines = [], []
    try:
        with open(filepath, mode='r', encoding='utf-8', newline='') as infile:
            reader = csv.reader(infile)
            header = next(reader)
            n_cols = len(header)  # Getting the expected number of columns.
            
            for row in reader:
                # --- THIS IS THE FIX ---
                # Enforcing that each row has the same number of columns as the header.
                if len(row) == n_cols:
                    good_lines.append(row)
                else:
                    # Reporting the problematic row but continuing execution.
                    print(f"  ⚠️ Skipping malformed row with {len(row)} columns (expected {n_cols}).")

    except Exception as e:
        print(f"⚠️ Handled a reading error: {e}")

    if not good_lines:
        print(f"❌ No data loaded from {filepath}.")
        return

    # Creating a Polars DataFrame from the salvaged lines.
    # Adding orient='row' to silence the warning.
    df = pl.DataFrame(data=good_lines, schema=header, orient="row")
    
    # --- Cleaning and Type Casting with Polars Expressions ---
    expressions = []
    for col in numeric_cols:
        if col in df.columns:
            expressions.append(pl.col(col).cast(pl.Float64, strict=False))
    
    for col in date_cols:
        if col in df.columns:
            expressions.append(pl.col(col).str.to_datetime(strict=False))

    # Using .map_elements() for custom functions.
    for col, func in custom_conversions.items():
        if col in df.columns:
            expressions.append(pl.col(col).map_elements(func, return_dtype=pl.Float64).alias(col))
            
    df = df.with_columns(expressions)
    
    # --- Statistical Analysis ---
    print("\n#### 1. High-Level Descriptive Statistics ####")
    print(df.describe())

    print(f"\n#### 3. Grouped Analysis by '{group_by_col}' ####")
    target_numeric_col = next((col for col in numeric_cols if col in df.columns), None)
    if target_numeric_col:
        grouped_df = df.group_by(group_by_col).agg([
            pl.count().alias("count"),
            pl.col(target_numeric_col).mean().alias(f"mean_{target_numeric_col}")
        ]).sort("count", descending=True)
        print(grouped_df.head(5))

In [10]:
# ==============================================================================
# --- Executing Analysis for All Datasets ---
# ==============================================================================

# --- Facebook Ads Analysis ---
analyze_dataset_polars(
    filepath='data/2024_fb_ads_president_scored_anon.csv',
    numeric_cols=['estimated_spend', 'estimated_impressions'],
    date_cols=['ad_creation_time'],
    group_by_col='bylines'
)

# --- Facebook Posts Analysis ---
analyze_dataset_polars(
    filepath='data/2024_fb_posts_president_scored_anon.csv',
    numeric_cols=['Total Interactions', 'Likes', 'Comments', 'Shares', 'Post Views'],
    date_cols=['Post Created Date'],
    group_by_col='Page Category',
    custom_conversions={'Video Length': parse_video_length}
)

# --- Twitter Posts Analysis ---
analyze_dataset_polars(
    filepath='data/2024_tw_posts_president_scored_anon.csv',
    numeric_cols=['retweetCount', 'replyCount', 'likeCount', 'viewCount'],
    date_cols=['createdAt'],
    group_by_col='source'
)


🔄 Processing file with Polars: data/2024_fb_ads_president_scored_anon.csv


  ⚠️ Skipping malformed row with 6 columns (expected 41).

#### 1. High-Level Descriptive Statistics ####
shape: (9, 42)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ page_id   ┆ ad_id     ┆ ad_creati ┆ … ┆ womens_is ┆ incivilit ┆ freefair_ ┆ fraud_il │
│ ---       ┆ ---       ┆ ---       ┆ on_time   ┆   ┆ sue_topic ┆ y_illumin ┆ illuminat ┆ luminati │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ _illumina ┆ ating     ┆ ing       ┆ ng       │
│           ┆           ┆           ┆ str       ┆   ┆ tin…      ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ ---       ┆ str       ┆ str       ┆ str      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 31906     ┆ 31906     ┆ 31906     ┆ … ┆ 31906     ┆ 31906

(Deprecated in version 0.20.5)
  pl.count().alias("count"),



#### 1. High-Level Descriptive Statistics ####
shape: (9, 57)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ Facebook_ ┆ post_id   ┆ Page      ┆ … ┆ incivilit ┆ scam_illu ┆ freefair_ ┆ fraud_il │
│ ---       ┆ Id        ┆ ---       ┆ Category  ┆   ┆ y_illumin ┆ minating  ┆ illuminat ┆ luminati │
│ str       ┆ ---       ┆ str       ┆ ---       ┆   ┆ ating     ┆ ---       ┆ ing       ┆ ng       │
│           ┆ str       ┆           ┆ str       ┆   ┆ ---       ┆ str       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆           ┆ str       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 19009     ┆ 19009     ┆ 19009     ┆ … ┆ 19009     ┆ 19009     ┆ 19009     ┆ 19009    │
│ null_coun ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0        │
│ t         ┆           ┆   

(Deprecated in version 0.20.5)
  pl.count().alias("count"),



#### 1. High-Level Descriptive Statistics ####
shape: (9, 48)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ id        ┆ url       ┆ source    ┆ … ┆ incivilit ┆ scam_illu ┆ freefair_ ┆ fraud_il │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ y_illumin ┆ minating  ┆ illuminat ┆ luminati │
│ str       ┆ str       ┆ str       ┆ str       ┆   ┆ ating     ┆ ---       ┆ ing       ┆ ng       │
│           ┆           ┆           ┆           ┆   ┆ ---       ┆ str       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆           ┆ str       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 27304     ┆ 27304     ┆ 27304     ┆ … ┆ 27304     ┆ 27304     ┆ 27304     ┆ 27304    │
│ null_coun ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0        │
│ t         ┆           ┆   