### ANALYZE DATA WITH POLAR 

In [5]:
import time

start = time.time()
import polars as pl

df = pl.read_csv("Downloads/period_03/2024_fb_ads_president_scored_anon.csv")

# Descriptive statistics
print("📊 === Descriptive Statistics ===")
print(df.describe())

# Unique value counts
print("\n🔢 === Unique Value Counts ===")
print(df.n_unique())

# Top 3 most common values per string column
print("\n📋 === Top 3 Most Common Values Per Categorical Column ===")
cat_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype == pl.Utf8]

for col in cat_cols:
    print(f"\n🔸 {col}")
    top_vals = df.group_by(col).len().sort("len", descending=True).head(3)
    print(top_vals)



📊 === Descriptive Statistics ===
shape: (9, 42)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ page_id      ┆ ad_id        ┆ ad_creation_ ┆ … ┆ womens_issue ┆ incivility_ ┆ freefair_il ┆ fraud_illum │
│ ---        ┆ ---          ┆ ---          ┆ time         ┆   ┆ _topic_illum ┆ illuminatin ┆ luminating  ┆ inating     │
│ str        ┆ str          ┆ str          ┆ ---          ┆   ┆ inatin…      ┆ g           ┆ ---         ┆ ---         │
│            ┆              ┆              ┆ str          ┆   ┆ ---          ┆ ---         ┆ f64         ┆ f64         │
│            ┆              ┆              ┆              ┆   ┆ f64          ┆ f64         ┆             ┆             │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 246745       ┆ 246745       ┆ 246745       ┆ … ┆ 246745.0     ┆ 246745.0    ┆ 246745.0    

In [6]:
!pip install polars
!pip install --upgrade polars




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import polars as pl

# Define the analysis function
def analyze_dataset(path, name):
    print(f"\n📊 === Analyzing: {name} ===")

    # Load CSV as eager DataFrame
    df = pl.read_csv(path)

    # Descriptive statistics
    print("\n🔹 Descriptive Statistics:")
    print(df.describe())

    # Unique value counts
    print("\n🔹 Unique Value Counts:")
    print(df.n_unique())

    # Value counts (Top 3) for string/categorical columns
    print("\n📋 === Top 3 Most Common Values Per Categorical Column ===")
    cat_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype == pl.Utf8]

    for col in cat_cols:
        print(f"\n🔸 {col}")
        top_vals = df.group_by(col).len().sort("len", descending=True).head(3)
        print(top_vals)

# Paths and names for all datasets
datasets = {
    
    "Facebook Posts": "Downloads/period_03/2024_fb_posts_president_scored_anon.csv",
    "Twitter Posts": "Downloads/period_03/2024_tw_posts_president_scored_anon.csv"
}

# Run the analysis for all datasets
for name, path in datasets.items():
    analyze_dataset(path, name)




📊 === Analyzing: Facebook Posts ===

🔹 Descriptive Statistics:
shape: (9, 57)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ Facebook_Id  ┆ post_id      ┆ Page         ┆ … ┆ incivility_i ┆ scam_illumi ┆ freefair_il ┆ fraud_illum │
│ ---        ┆ ---          ┆ ---          ┆ Category     ┆   ┆ lluminating  ┆ nating      ┆ luminating  ┆ inating     │
│ str        ┆ str          ┆ str          ┆ ---          ┆   ┆ ---          ┆ ---         ┆ ---         ┆ ---         │
│            ┆              ┆              ┆ str          ┆   ┆ f64          ┆ f64         ┆ f64         ┆ f64         │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 19009        ┆ 19009        ┆ 16537        ┆ … ┆ 19009.0      ┆ 18060.0     ┆ 19009.0     ┆ 19009.0     │
│ null_count ┆ 0            ┆ 0            ┆ 2472         ┆ … ┆ 0.0       

# Computing Shared column in Datasets

In [8]:
import polars as pl

# Load all three datasets
df_ads = pl.read_csv("Downloads/period_03/2024_fb_ads_president_scored_anon.csv")
df_fb = pl.read_csv("Downloads/period_03/2024_fb_posts_president_scored_anon.csv")
df_tw = pl.read_csv("Downloads/period_03/2024_tw_posts_president_scored_anon.csv")

# Get sets of column names
cols_ads = set(df_ads.columns)
cols_fb = set(df_fb.columns)
cols_tw = set(df_tw.columns)

# Compute shared columns
shared_columns = sorted(cols_ads & cols_fb & cols_tw)

print("📌 Shared Columns Across All Three Datasets:")
for col in shared_columns:
    print(" -", col)


📌 Shared Columns Across All Three Datasets:
 - advocacy_msg_type_illuminating
 - attack_msg_type_illuminating
 - covid_topic_illuminating
 - cta_msg_type_illuminating
 - economy_topic_illuminating
 - education_topic_illuminating
 - engagement_cta_subtype_illuminating
 - environment_topic_illuminating
 - foreign_policy_topic_illuminating
 - fraud_illuminating
 - freefair_illuminating
 - fundraising_cta_subtype_illuminating
 - governance_topic_illuminating
 - health_topic_illuminating
 - image_msg_type_illuminating
 - immigration_topic_illuminating
 - incivility_illuminating
 - issue_msg_type_illuminating
 - lgbtq_issues_topic_illuminating
 - military_topic_illuminating
 - race_and_ethnicity_topic_illuminating
 - safety_topic_illuminating
 - scam_illuminating
 - social_and_cultural_topic_illuminating
 - technology_and_privacy_topic_illuminating
 - voting_cta_subtype_illuminating
 - womens_issue_topic_illuminating


# Analyzing Data between Shared columns

In [9]:
import polars as pl

# Load datasets
df_ads = pl.read_csv("Downloads/period_03/2024_fb_ads_president_scored_anon.csv")
df_fb = pl.read_csv("Downloads/period_03/2024_fb_posts_president_scored_anon.csv")
df_tw = pl.read_csv("Downloads/period_03/2024_tw_posts_president_scored_anon.csv")

# Identify shared columns
shared_cols = list(set(df_ads.columns) & set(df_fb.columns) & set(df_tw.columns))

def analyze_shared_polars(df, name):
    print(f"\n📊 === Analyzing Shared Columns in {name} ===")

    # Descriptive stats
    print("\n🔹 Descriptive Statistics:")
    print(df.select(shared_cols).describe())

    # Unique values per column
    print("\n🔹 Unique Value Counts:")
    for col in shared_cols:
        unique_count = df[col].n_unique()
        print(f" - {col}: {unique_count}")

    # Top 3 most frequent values
    print("\n📋 === Top 3 Most Common Values Per Shared Column ===")
    for col in shared_cols:
        # Check if dtype is categorical or numeric
        if df[col].dtype in [pl.Utf8, pl.Int64, pl.Float64]:
            try:
                freq = df.group_by(col).len().sort("len", descending=True).head(3)
                print(f"\n🔸 {col} (Top 3 Frequencies):")
                print(freq)
            except:
                print(f"⚠️ Could not compute value counts for {col}")

# Run for each dataset
analyze_shared_polars(df_ads, "Facebook Ads")
analyze_shared_polars(df_fb, "Facebook Posts")
analyze_shared_polars(df_tw, "Twitter Posts")

end = time.time()
print(f"⏱️ Time taken: {end - start:.2f} seconds")



📊 === Analyzing Shared Columns in Facebook Ads ===

🔹 Descriptive Statistics:
shape: (9, 28)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ fundraising_ ┆ issue_msg_ty ┆ immigration_ ┆ … ┆ safety_topic ┆ engagement_ ┆ foreign_pol ┆ advocacy_ms │
│ ---        ┆ cta_subtype_ ┆ pe_illuminat ┆ topic_illumi ┆   ┆ _illuminatin ┆ cta_subtype ┆ icy_topic_i ┆ g_type_illu │
│ str        ┆ illumi…      ┆ ing          ┆ nating       ┆   ┆ g            ┆ _illumin…   ┆ lluminat…   ┆ minating    │
│            ┆ ---          ┆ ---          ┆ ---          ┆   ┆ ---          ┆ ---         ┆ ---         ┆ ---         │
│            ┆ f64          ┆ f64          ┆ f64          ┆   ┆ f64          ┆ f64         ┆ f64         ┆ f64         │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 246745.0     ┆ 246745.0     ┆ 246745.0     ┆