# FB ads

In [12]:
import polars as pl

# Load the Facebook ads CSV.
# NOTE(review): this is a machine-specific absolute path, so the cell only runs
# where this exact file exists — consider moving it to a configurable data dir.
df = pl.read_csv("/Users/pranavdalvi/Research Analyst ischool/Task_03_Descriptive_Stats/facebook_ads_cleaned.csv")

# Show the available column names
print("Columns:", df.columns)

# Sanity check: the aggregations below reference these four columns.
required_cols = ["page_id", "ad_id", "estimated_spend", "estimated_impressions"]
missing = [col for col in required_cols if col not in df.columns]
if missing:
    # Report and skip the aggregations instead of failing inside Polars.
    print(f"Missing expected columns: {missing}")
else:
    # Cast the two metric columns to explicit numeric dtypes before aggregating.
    df = df.with_columns([
        pl.col("estimated_spend").cast(pl.Float64),
        pl.col("estimated_impressions").cast(pl.Int64)
    ])

    # Per-page summary: row count, spend mean/min/max, mean impressions.
    print("\n--- Grouped by page_id ---")
    grouped_by_page = df.group_by("page_id").agg([
        # pl.len() replaces the argument-less pl.count(), which is deprecated
        # since Polars 0.20.5 and emitted a warning on every run of this cell.
        pl.len().alias("row_count"),
        pl.col("estimated_spend").mean().alias("avg_spend"),
        pl.col("estimated_spend").min().alias("min_spend"),
        pl.col("estimated_spend").max().alias("max_spend"),
        pl.col("estimated_impressions").mean().alias("avg_impressions")
    ])
    print(grouped_by_page)

    # Per-(page, ad) summary: row count, mean spend, mean impressions.
    print("\n--- Grouped by page_id and ad_id ---")
    grouped_by_page_ad = df.group_by(["page_id", "ad_id"]).agg([
        pl.len().alias("row_count"),
        pl.col("estimated_spend").mean().alias("avg_spend"),
        pl.col("estimated_impressions").mean().alias("avg_impressions")
    ])
    print(grouped_by_page_ad)


Columns: ['page_id', 'ad_id', 'ad_creation_time', 'bylines', 'currency', 'delivery_by_region', 'estimated_audience_size', 'estimated_impressions', 'estimated_spend', 'publisher_platforms', 'illuminating_scored_message', 'illuminating_mentions', 'scam_illuminating', 'election_integrity_Truth_illuminating', 'advocacy_msg_type_illuminating', 'issue_msg_type_illuminating', 'attack_msg_type_illuminating', 'image_msg_type_illuminating', 'cta_msg_type_illuminating', 'engagement_cta_subtype_illuminating', 'fundraising_cta_subtype_illuminating', 'voting_cta_subtype_illuminating', 'covid_topic_illuminating', 'economy_topic_illuminating', 'education_topic_illuminating', 'environment_topic_illuminating', 'foreign_policy_topic_illuminating', 'governance_topic_illuminating', 'health_topic_illuminating', 'immigration_topic_illuminating', 'lgbtq_issues_topic_illuminating', 'military_topic_illuminating', 'race_and_ethnicity_topic_illuminating', 'safety_topic_illuminating', 'social_and_cultural_topic_il

(Deprecated in version 0.20.5)
  pl.count().alias("row_count"),


shape: (4_259, 6)
┌──────────────────────────────┬───────────┬─────────────┬───────────┬───────────┬─────────────────┐
│ page_id                      ┆ row_count ┆ avg_spend   ┆ min_spend ┆ max_spend ┆ avg_impressions │
│ ---                          ┆ ---       ┆ ---         ┆ ---       ┆ ---       ┆ ---             │
│ str                          ┆ u32       ┆ f64         ┆ f64       ┆ f64       ┆ f64             │
╞══════════════════════════════╪═══════════╪═════════════╪═══════════╪═══════════╪═════════════════╡
│ 04df4069408580c1b41901cd7549 ┆ 5018      ┆ 767.07493   ┆ 49.0      ┆ 54999.0   ┆ 38040.461538    │
│ d4…                          ┆           ┆             ┆           ┆           ┆                 │
│ b4229f5047d50fc5d3ab426f81a4 ┆ 24        ┆ 486.5       ┆ 449.0     ┆ 549.0     ┆ 54686.5         │
│ bd…                          ┆           ┆             ┆           ┆           ┆                 │
│ 68f943da6e1050fd8e96b38b4faa ┆ 18        ┆ 1949.0      ┆ 149.0     ┆ 37

(Deprecated in version 0.20.5)
  pl.count().alias("row_count"),


shape: (215_756, 5)
┌────────────────────────────┬───────────────────────────┬───────────┬───────────┬─────────────────┐
│ page_id                    ┆ ad_id                     ┆ row_count ┆ avg_spend ┆ avg_impressions │
│ ---                        ┆ ---                       ┆ ---       ┆ ---       ┆ ---             │
│ str                        ┆ str                       ┆ u32       ┆ f64       ┆ f64             │
╞════════════════════════════╪═══════════════════════════╪═══════════╪═══════════╪═════════════════╡
│ d2c9aab423e210c69f5b644301 ┆ 176a98ad845115dd2dd45c159 ┆ 18        ┆ 149.0     ┆ 22499.0         │
│ 235c…                      ┆ c4fe6…                    ┆           ┆           ┆                 │
│ 1487716df214a4964261a98485 ┆ 8810286d5f14fd8552d8e6d4c ┆ 18        ┆ 49.0      ┆ 27499.0         │
│ 3103…                      ┆ c5cb3…                    ┆           ┆           ┆                 │
│ e3342051b60393770363ffc029 ┆ 8e095f28702369a2d0c73945d ┆ 15        ┆ 

# FB posts

In [13]:
import polars as pl

# Read the cleaned Facebook posts export.
df = pl.read_csv("/Users/pranavdalvi/Research Analyst ischool/Task_03_Descriptive_Stats/facebook_posts_cleaned.csv")

# Metric columns to normalise and summarise (any that are absent are skipped).
numeric_cols = [
    "Total Interactions", "Likes", "Comments", "Shares", "Love", "Wow", "Haha",
    "Sad", "Angry", "Care", "Post Views", "Total Views", "Total Views For All Crossposts", 
    "Overperforming Score"
]

# Strip commas from the text form of each metric and cast it to float,
# applying every column's expression in a single with_columns call.
df = df.with_columns([
    pl.col(name).cast(pl.Utf8).str.replace_all(",", "").cast(pl.Float64).alias(name)
    for name in numeric_cols
    if name in df.columns
])

# Five summary expressions (mean, min, max, std, count) per available metric,
# in the same column-major order as the metric list.
summary_exprs = [
    expr
    for name in numeric_cols
    if name in df.columns
    for expr in (
        pl.col(name).mean().alias(f"{name}_mean"),
        pl.col(name).min().alias(f"{name}_min"),
        pl.col(name).max().alias(f"{name}_max"),
        pl.col(name).std().alias(f"{name}_std"),
        pl.col(name).count().alias(f"{name}_count"),
    )
]

print("\n=== Descriptive Statistics ===")
print(df.select(summary_exprs))

# Frequency tables for the categorical fields that are present.
categorical_cols = ["Type", "Video Share Status", "Is Video Owner?"]
for col in categorical_cols:
    if col not in df.columns:
        continue
    print(f"\n{col} value counts:")
    print(df.select(pl.col(col).value_counts()))



=== Descriptive Statistics ===
shape: (1, 70)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Total Int ┆ Total Int ┆ Total Int ┆ Total Int ┆ … ┆ Overperfo ┆ Overperfo ┆ Overperfo ┆ Overperf │
│ eractions ┆ eractions ┆ eractions ┆ eractions ┆   ┆ rming     ┆ rming     ┆ rming     ┆ orming   │
│ _mean     ┆ _min      ┆ _max      ┆ _std      ┆   ┆ Score_min ┆ Score_max ┆ Score_std ┆ Score_co │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ unt      │
│ f64       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ u32      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 3089.6066 ┆ 3.0       ┆ 696853.0  ┆ 16654.385 ┆ … ┆ -198.75   ┆ 246.78    ┆ 7.863076  ┆ 16280    │
│ 34        ┆           ┆           ┆ 377   

# Twitter posts

In [None]:
import polars as pl

# Read the cleaned Twitter posts export.
df = pl.read_csv("/Users/pranavdalvi/Research Analyst ischool/Task_03_Descriptive_Stats/twitter_posts_cleaned.csv")

# Engagement counters to normalise and summarise (any that are absent are skipped).
numeric_cols = [
    "retweetCount", "replyCount", "likeCount", "quoteCount", "viewCount",
    "bookmarkCount"
]

# Strip commas from the text form of each counter and cast it to float,
# applying every column's expression in a single with_columns call.
df = df.with_columns([
    pl.col(name).cast(pl.Utf8).str.replace_all(",", "").cast(pl.Float64).alias(name)
    for name in numeric_cols
    if name in df.columns
])

# Five summary expressions (mean, min, max, std, count) per available counter,
# in the same column-major order as the counter list.
summary_exprs = [
    expr
    for name in numeric_cols
    if name in df.columns
    for expr in (
        pl.col(name).mean().alias(f"{name}_mean"),
        pl.col(name).min().alias(f"{name}_min"),
        pl.col(name).max().alias(f"{name}_max"),
        pl.col(name).std().alias(f"{name}_std"),
        pl.col(name).count().alias(f"{name}_count"),
    )
]

print("\n=== Descriptive Statistics ===")
print(df.select(summary_exprs))

# Frequency tables for the categorical fields that are present.
categorical_cols = ["source", "lang", "isReply", "isQuote", "isRetweet"]
for col in categorical_cols:
    if col not in df.columns:
        continue
    print(f"\n{col} value counts:")
    print(df.select(pl.col(col).value_counts()))



=== Descriptive Statistics ===
shape: (1, 30)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ retweetCo ┆ retweetCo ┆ retweetCo ┆ retweetCo ┆ … ┆ bookmarkC ┆ bookmarkC ┆ bookmarkC ┆ bookmark │
│ unt_mean  ┆ unt_min   ┆ unt_max   ┆ unt_std   ┆   ┆ ount_min  ┆ ount_max  ┆ ount_std  ┆ Count_co │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ unt      │
│ f64       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ u32      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 1322.0551 ┆ 0.0       ┆ 144615.0  ┆ 3405.0042 ┆ … ┆ 0.0       ┆ 42693.0   ┆ 712.58029 ┆ 27304    │
│ 93        ┆           ┆           ┆ 4         ┆   ┆           ┆           ┆ 4         ┆          │
└───────────┴───────────┴───────────┴───────