In [3]:
import csv

# Read the whole ads CSV into memory as a list of dicts, one per row, keyed by
# the header names.
# newline="" follows the csv module documentation: without it, line breaks
# embedded inside quoted fields are not interpreted correctly.
# NOTE(review): the path below is absolute and machine-specific; adjust it
# before running this notebook anywhere else.
with open("C:\\Users\\s29sh\\OneDrive\\Documents\\Datasets\\period_03\\2024_fb_ads_president_scored_anon.csv", mode="r", encoding="utf-8", newline="") as file:
    reader = csv.DictReader(file)
    data = list(reader)

# Show number of rows and preview first 2 rows
print("Total rows:", len(data))
for row in data[:2]:
    print(row)
    print("-" * 60)

Total rows: 246745
{'page_id': '4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230', 'ad_id': '0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc', 'ad_creation_time': '2024-10-21', 'bylines': 'Texas Organizing Project PAC', 'currency': 'USD', 'delivery_by_region': "{'Texas': {'spend': 249, 'impressions': 47499}}", 'demographic_distribution': "{'female_18-24': {'spend': 28, 'impressions': 5507}, 'male_45-54': {'spend': 14, 'impressions': 2757}, 'male_65+': {'spend': 3, 'impressions': 714}, 'female_65+': {'spend': 3, 'impressions': 725}, 'unknown_55-64': {'spend': 0, 'impressions': 21}, 'male_55-64': {'spend': 7, 'impressions': 1502}, 'female_55-64': {'spend': 7, 'impressions': 1520}, 'unknown_45-54': {'spend': 0, 'impressions': 40}, 'female_45-54': {'spend': 14, 'impressions': 2735}, 'male_18-24': {'spend': 21, 'impressions': 4055}, 'unknown_35-44': {'spend': 0, 'impressions': 69}, 'male_35-44': {'spend': 26, 'impressions': 5149}, 'female_35-44': {'spend': 2

## Script 1 — Pure Python (no pandas or polars)



In [4]:
from statistics import mean, stdev

# Columns summarized below; their CSV text is parsed into floats first.
fields_numeric = ["estimated_audience_size", "estimated_impressions", "estimated_spend"]

for field in fields_numeric:
    # Collect every value of this column that parses as a float.
    # Commas and surrounding whitespace are removed before parsing;
    # anything float() rejects is left out of the statistics.
    parsed_values = []
    for entry in data:
        cleaned_text = entry[field].replace(",", "").strip()
        try:
            parsed = float(cleaned_text)
        except ValueError:
            continue
        parsed_values.append(parsed)

    # Nothing usable in this column: say so and move on to the next one.
    if not parsed_values:
        print(f"\nField: {field} has no numeric data")
        continue

    print(f"\nField: {field}")
    print(f"Total Count: {len(parsed_values)}")
    print(f"Average: {mean(parsed_values):.2f}")
    print(f"Minimum: {min(parsed_values)}")
    print(f"Maximum: {max(parsed_values)}")
    # stdev() needs at least two data points.
    if len(parsed_values) >= 2:
        print(f"Standard Deviation: {stdev(parsed_values):.2f}")



Field: estimated_audience_size
Total Count: 246745
Average: 556462.86
Minimum: 0.0
Maximum: 1000001.0
Standard Deviation: 409864.76

Field: estimated_impressions
Total Count: 246745
Average: 45601.53
Minimum: 499.0
Maximum: 1000000.0
Standard Deviation: 136790.77

Field: estimated_spend
Total Count: 246745
Average: 1061.29
Minimum: 49.0
Maximum: 474999.0
Standard Deviation: 4992.56


In [5]:
from collections import Counter

# Text columns whose value frequencies are reported below.
fields_categorical = ["currency", "publisher_platforms", "bylines"]

for field in fields_categorical:
    # Tally each distinct value after trimming whitespace; values that are
    # empty once trimmed are not counted.
    trimmed = (entry[field].strip() for entry in data)
    freq_counter = Counter(text for text in trimmed if text)

    print(f"\nField: {field}")
    print(f"Unique Values: {len(freq_counter)}")
    print("Top 5 Most Frequent Values:")
    for label, occurrences in freq_counter.most_common(5):
        print(f"  {label} → {occurrences}")
    print("-" * 50)



Field: currency
Unique Values: 18
Top 5 Most Frequent Values:
  USD → 246599
  INR → 63
  GBP → 17
  EUR → 11
  PKR → 8
--------------------------------------------------

Field: publisher_platforms
Unique Values: 9
Top 5 Most Frequent Values:
  ['facebook', 'instagram'] → 214434
  ['facebook'] → 23259
  ['instagram'] → 8395
  ['facebook', 'instagram', 'audience_network', 'messenger'] → 459
  ['facebook', 'instagram', 'audience_network'] → 79
--------------------------------------------------

Field: bylines
Unique Values: 3786
Top 5 Most Frequent Values:
  HARRIS FOR PRESIDENT → 49788
  HARRIS VICTORY FUND → 32612
  BIDEN VICTORY FUND → 15539
  DONALD J. TRUMP FOR PRESIDENT 2024, INC. → 15112
  Trump National Committee JFC → 7279
--------------------------------------------------


In [6]:
from collections import defaultdict
from statistics import mean

fields_numeric = ["estimated_audience_size", "estimated_impressions", "estimated_spend"]

# page_id -> {field name -> list of parsed floats}. A page's entry is created
# the first time one of its values parses successfully.
page_groups = defaultdict(lambda: {name: [] for name in fields_numeric})

for entry in data:
    page_id = entry["page_id"]
    for field in fields_numeric:
        cleaned_text = entry[field].replace(",", "").strip()
        try:
            parsed = float(cleaned_text)
        except ValueError:
            # Not a number: leave it out of this page's list.
            continue
        page_groups[page_id][field].append(parsed)

# Report only the first five pages, in the order they were first stored.
for position, (pid, stats) in enumerate(page_groups.items()):
    if position == 5:
        break
    print(f"\nPage ID: {pid}")
    for field, numbers in stats.items():
        # Fields with no parsed values produce no line.
        if not numbers:
            continue
        print(f"  {field} → Count: {len(numbers)}, Mean: {mean(numbers):.2f}, Min: {min(numbers)}, Max: {max(numbers)}")
    print("-" * 60)



Page ID: 4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230
  estimated_audience_size → Count: 33, Mean: 43636.36, Min: 30000.0, Max: 75000.0
  estimated_impressions → Count: 33, Mean: 66908.09, Min: 499.0, Max: 374999.0
  estimated_spend → Count: 33, Mean: 467.18, Min: 49.0, Max: 2249.0
------------------------------------------------------------

Page ID: b9eb7e353e596d5fc99568d4ef77d4b11ced3243537cbd0895dde3195b69b6be
  estimated_audience_size → Count: 3, Mean: 1000001.00, Min: 1000001.0, Max: 1000001.0
  estimated_impressions → Count: 3, Mean: 16165.67, Min: 499.0, Max: 47499.0
  estimated_spend → Count: 3, Mean: 82.33, Min: 49.0, Max: 149.0
------------------------------------------------------------

Page ID: 7167146d80bba9d877a246d9682c7eecf3cae7b63337cf8ec01fff2eff27c909
  estimated_audience_size → Count: 7, Mean: 1000001.00, Min: 1000001.0, Max: 1000001.0
  estimated_impressions → Count: 7, Mean: 641.86, Min: 499.0, Max: 1499.0
  estimated_spend → Count: 7, Mean

In [7]:
from collections import defaultdict
from statistics import mean

fields_numeric = ["estimated_audience_size", "estimated_impressions", "estimated_spend"]


def _new_field_lists():
    """Return a fresh {field name: empty list} mapping for one group."""
    return {name: [] for name in fields_numeric}


# (page_id, ad_id) -> {field name -> list of parsed floats}
page_ad_groups = defaultdict(_new_field_lists)

for entry in data:
    page_ad_key = (entry["page_id"], entry["ad_id"])
    for field in fields_numeric:
        try:
            numeric_value = float(entry[field].replace(",", "").strip())
        except ValueError:
            # Skip text that float() cannot parse.
            continue
        page_ad_groups[page_ad_key][field].append(numeric_value)

# Summarize the first five (page_id, ad_id) keys in stored order.
for pair_key in list(page_ad_groups)[:5]:
    stats = page_ad_groups[pair_key]
    print(f"\n(page_id, ad_id): {pair_key}")
    for field in stats:
        numbers = stats[field]
        if numbers:
            print(f"  {field} → Count: {len(numbers)}, Mean: {mean(numbers):.2f}, Min: {min(numbers)}, Max: {max(numbers)}")
    print("-" * 60)



(page_id, ad_id): ('4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230', '0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc')
  estimated_audience_size → Count: 1, Mean: 30000.00, Min: 30000.0, Max: 30000.0
  estimated_impressions → Count: 1, Mean: 47499.00, Min: 47499.0, Max: 47499.0
  estimated_spend → Count: 1, Mean: 249.00, Min: 249.0, Max: 249.0
------------------------------------------------------------

(page_id, ad_id): ('4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230', '86229868e6bde3661724fe02da93504bb4fb5da8c2550d7b7cf193c687e89fa6')
  estimated_audience_size → Count: 1, Mean: 75000.00, Min: 75000.0, Max: 75000.0
  estimated_impressions → Count: 1, Mean: 22499.00, Min: 22499.0, Max: 22499.0
  estimated_spend → Count: 1, Mean: 49.00, Min: 49.0, Max: 49.0
------------------------------------------------------------

(page_id, ad_id): ('4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230', '07b5aefc27e872e971f793e49a

## Script 2: With Pandas 

In [12]:
import pandas as pd

# Read the ads CSV into a pandas DataFrame.
csv_path = "C:\\Users\\s29sh\\OneDrive\\Documents\\Datasets\\period_03\\2024_fb_ads_president_scored_anon.csv"
df = pd.read_csv(csv_path)

# Print (rows, columns), then let the notebook render the first rows.
print("Shape:", df.shape)
df.head()


Shape: (246745, 41)


Unnamed: 0,page_id,ad_id,ad_creation_time,bylines,currency,delivery_by_region,demographic_distribution,estimated_audience_size,estimated_impressions,estimated_spend,...,lgbtq_issues_topic_illuminating,military_topic_illuminating,race_and_ethnicity_topic_illuminating,safety_topic_illuminating,social_and_cultural_topic_illuminating,technology_and_privacy_topic_illuminating,womens_issue_topic_illuminating,incivility_illuminating,freefair_illuminating,fraud_illuminating
0,4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef...,0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9...,2024-10-21,Texas Organizing Project PAC,USD,"{'Texas': {'spend': 249, 'impressions': 47499}}","{'female_18-24': {'spend': 28, 'impressions': ...",30000,47499,249,...,0,0,0,0,0,0,0,0,0,0
1,4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef...,86229868e6bde3661724fe02da93504bb4fb5da8c2550d...,2024-10-18,Texas Organizing Project PAC,USD,"{'Texas': {'spend': 49, 'impressions': 22499}}","{'female_18-24': {'spend': 8, 'impressions': 3...",75000,22499,49,...,0,0,0,0,0,0,0,0,0,0
2,4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef...,07b5aefc27e872e971f793e49aac38496fa62e484f3928...,2024-10-13,Texas Organizing Project PAC,USD,"{'Texas': {'spend': 149, 'impressions': 32499}}","{'female_18-24': {'spend': 26, 'impressions': ...",75000,32499,149,...,0,0,0,0,0,0,0,0,0,0
3,b9eb7e353e596d5fc99568d4ef77d4b11ced3243537cbd...,c62978153c04116d88ead49379916855f2cb58bf788631...,2024-11-02,,USD,{},{},1000001,499,49,...,0,0,0,0,0,0,0,0,0,0
4,b9eb7e353e596d5fc99568d4ef77d4b11ced3243537cbd...,785e91ef18a5794565af03a6df4e7077fe1d915bfb3402...,2024-11-02,,USD,{},{},1000001,499,49,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
estimated_audience_size,246745.0,556462.855969,409864.758823,0.0,75000.0,300000.0,1000001.0,1000001.0
estimated_impressions,246745.0,45601.525952,136790.769901,499.0,499.0,3499.0,22499.0,1000000.0
estimated_spend,246745.0,1061.291434,4992.560749,49.0,49.0,49.0,449.0,474999.0
scam_illuminating,246745.0,0.071633,0.257879,0.0,0.0,0.0,0.0,1.0
election_integrity_Truth_illuminating,246745.0,0.050088,0.218127,0.0,0.0,0.0,0.0,1.0
advocacy_msg_type_illuminating,246745.0,0.548631,0.49763,0.0,0.0,1.0,1.0,1.0
issue_msg_type_illuminating,246745.0,0.381649,0.485792,0.0,0.0,0.0,1.0,1.0
attack_msg_type_illuminating,246745.0,0.271856,0.444917,0.0,0.0,0.0,1.0,1.0
image_msg_type_illuminating,246745.0,0.222704,0.416062,0.0,0.0,0.0,0.0,1.0
cta_msg_type_illuminating,246745.0,0.572769,0.494677,0.0,0.0,1.0,1.0,1.0


In [14]:
# Text columns to profile: distinct-value count plus the most common values.
cat_cols = ["currency", "publisher_platforms", "bylines"]

for col in cat_cols:
    column_values = df[col]
    distinct_total = column_values.nunique()
    most_common = column_values.value_counts().head(5)

    print(f"\nColumn: {col}")
    print(f"Unique values: {distinct_total}")
    print("Top 5 most frequent values:")
    print(most_common)
    print("-" * 50)



Column: currency
Unique values: 18
Top 5 most frequent values:
currency
USD    246599
INR        63
GBP        17
EUR        11
EGP         8
Name: count, dtype: int64
--------------------------------------------------

Column: publisher_platforms
Unique values: 9
Top 5 most frequent values:
publisher_platforms
['facebook', 'instagram']                                     214434
['facebook']                                                   23259
['instagram']                                                   8395
['facebook', 'instagram', 'audience_network', 'messenger']       459
['facebook', 'instagram', 'audience_network']                     79
Name: count, dtype: int64
--------------------------------------------------

Column: bylines
Unique values: 3790
Top 5 most frequent values:
bylines
HARRIS FOR PRESIDENT                        49788
HARRIS VICTORY FUND                         32612
BIDEN VICTORY FUND                          15539
DONALD J. TRUMP FOR PRESIDENT 2024, INC. 

## Script 3: With Polars 

In [18]:
!pip install polars


Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting polars
  Downloading polars-1.31.0-cp39-abi3-win_amd64.whl.metadata (15 kB)
Downloading polars-1.31.0-cp39-abi3-win_amd64.whl (35.2 MB)
   ---------------------------------------- 0.0/35.2 MB ? eta -:--:--
   -- ------------------------------------- 1.8/35.2 MB 10.0 MB/s eta 0:00:04
   ---- ----------------------------------- 4.2/35.2 MB 11.4 MB/s eta 0:00:03
   ----- ---------------------------------- 4.5/35.2 MB 9.6 MB/s eta 0:00:04
   ------- -------------------------------- 6.3/35.2 MB 7.3 MB/s eta 0:00:04
   ---------- ----------------------------- 8.9/35.2 MB 8.4 MB/s eta 0:00:04
   ------------- -------------------------- 11.5/35.2 MB 9.0 MB/s eta 0:00:03
   --------------- ------------------------ 13.9/35.2 MB 9.5 MB/s eta 0:00:03
   ------------------ --------------------- 16.5/35.2 MB 9.8 MB/s eta 0:00:02
   -------------------- ------------------- 18.1/35.2 MB 9.9 MB/s eta 0:00:02
   -------------------- ------------------- 18.4/35.2 MB 9.1 MB/s eta 0:00:02
   --

In [19]:
import polars as pl

# Read the ads CSV into a Polars DataFrame.
csv_path = "C:\\Users\\s29sh\\OneDrive\\Documents\\Datasets\\period_03\\2024_fb_ads_president_scored_anon.csv"
df_pl = pl.read_csv(csv_path)

# Print (rows, columns), then let the notebook render the first rows.
print("Shape:", df_pl.shape)
df_pl.head()


Shape: (246745, 41)


page_id,ad_id,ad_creation_time,bylines,currency,delivery_by_region,demographic_distribution,estimated_audience_size,estimated_impressions,estimated_spend,publisher_platforms,illuminating_scored_message,illuminating_mentions,scam_illuminating,election_integrity_Truth_illuminating,advocacy_msg_type_illuminating,issue_msg_type_illuminating,attack_msg_type_illuminating,image_msg_type_illuminating,cta_msg_type_illuminating,engagement_cta_subtype_illuminating,fundraising_cta_subtype_illuminating,voting_cta_subtype_illuminating,covid_topic_illuminating,economy_topic_illuminating,education_topic_illuminating,environment_topic_illuminating,foreign_policy_topic_illuminating,governance_topic_illuminating,health_topic_illuminating,immigration_topic_illuminating,lgbtq_issues_topic_illuminating,military_topic_illuminating,race_and_ethnicity_topic_illuminating,safety_topic_illuminating,social_and_cultural_topic_illuminating,technology_and_privacy_topic_illuminating,womens_issue_topic_illuminating,incivility_illuminating,freefair_illuminating,fraud_illuminating
str,str,str,str,str,str,str,i64,i64,i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""4ff23a48b53d988df50ddfebb0e442…","""0ddb025b8544e2d58e6977ad417e74…","""2024-10-21""","""Texas Organizing Project PAC""","""USD""","""{'Texas': {'spend': 249, 'impr…","""{'female_18-24': {'spend': 28,…",30000,47499,249,"""['facebook', 'instagram']""","""362d68d42e34e070bc9f999033642b…","""['Kamala Harris', 'Tim Walz']""",0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""4ff23a48b53d988df50ddfebb0e442…","""86229868e6bde3661724fe02da9350…","""2024-10-18""","""Texas Organizing Project PAC""","""USD""","""{'Texas': {'spend': 49, 'impre…","""{'female_18-24': {'spend': 8, …",75000,22499,49,"""['facebook', 'instagram']""","""dc522d5aa4f91c326d105ec4c482cf…","""['Kamala Harris', 'Tim Walz']""",0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""4ff23a48b53d988df50ddfebb0e442…","""07b5aefc27e872e971f793e49aac38…","""2024-10-13""","""Texas Organizing Project PAC""","""USD""","""{'Texas': {'spend': 149, 'impr…","""{'female_18-24': {'spend': 26,…",75000,32499,149,"""['facebook', 'instagram']""","""6dc61896f4a44cf4fdbe564604bbeb…","""['Kamala Harris', 'Tim Walz']""",0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""b9eb7e353e596d5fc99568d4ef77d4…","""c62978153c04116d88ead493799168…","""2024-11-02""",,"""USD""","""{}""","""{}""",1000001,499,49,"""['facebook', 'instagram', 'aud…","""5ffb1d89916e779d01193e726cd880…","""['Tim Walz']""",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""b9eb7e353e596d5fc99568d4ef77d4…","""785e91ef18a5794565af03a6df4e70…","""2024-11-02""",,"""USD""","""{}""","""{}""",1000001,499,49,"""['facebook', 'instagram', 'aud…","""b7360494f7dd93ffa2320d88b10a58…","""['Tim Walz']""",0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# Summary statistics for every column of df_pl, not only the numeric ones:
# the rendered table also has count/null_count/min/max entries for the
# string columns, while mean/std/percentiles are empty for them.
df_pl.describe()


statistic,page_id,ad_id,ad_creation_time,bylines,currency,delivery_by_region,demographic_distribution,estimated_audience_size,estimated_impressions,estimated_spend,publisher_platforms,illuminating_scored_message,illuminating_mentions,scam_illuminating,election_integrity_Truth_illuminating,advocacy_msg_type_illuminating,issue_msg_type_illuminating,attack_msg_type_illuminating,image_msg_type_illuminating,cta_msg_type_illuminating,engagement_cta_subtype_illuminating,fundraising_cta_subtype_illuminating,voting_cta_subtype_illuminating,covid_topic_illuminating,economy_topic_illuminating,education_topic_illuminating,environment_topic_illuminating,foreign_policy_topic_illuminating,governance_topic_illuminating,health_topic_illuminating,immigration_topic_illuminating,lgbtq_issues_topic_illuminating,military_topic_illuminating,race_and_ethnicity_topic_illuminating,safety_topic_illuminating,social_and_cultural_topic_illuminating,technology_and_privacy_topic_illuminating,womens_issue_topic_illuminating,incivility_illuminating,freefair_illuminating,fraud_illuminating
str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""246745""","""246745""","""246745""","""245736""","""246745""","""246745""","""246745""",246745.0,246745.0,246745.0,"""246745""","""246745""","""246745""",246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0,246745.0
"""null_count""","""0""","""0""","""0""","""1009""","""0""","""0""","""0""",0.0,0.0,0.0,"""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,,,,,,,556462.855969,45601.525952,1061.291434,,,,0.071633,0.050088,0.548631,0.381649,0.271856,0.222704,0.572769,0.12487,0.228487,0.143845,0.024876,0.122122,0.014327,0.021249,0.005265,0.025642,0.10919,0.033569,0.00323,0.002176,0.012434,0.033723,0.105838,0.001155,0.080909,0.187526,0.006416,0.002638
"""std""",,,,,,,,409864.758823,136790.769901,4992.560749,,,,0.257879,0.218127,0.49763,0.485792,0.444917,0.416062,0.494677,0.330572,0.419859,0.350933,0.155747,0.327427,0.118833,0.144212,0.072366,0.158065,0.311878,0.180118,0.056742,0.046601,0.110812,0.180516,0.307631,0.033966,0.272697,0.390334,0.07984,0.051297
"""min""","""0005687f4ac876c135dda2bd5aa609…","""0000a88a64484883df6ca1fabd357e…","""2021-07-06""",""" 40th Senate District DFL ""","""AED""","""{'Alabama': {'spend': 0, 'impr…","""{'female_13-17': {'spend': 0, …",0.0,499.0,49.0,"""['facebook', 'audience_network…","""00002c515c57e0af2028d2b884fd30…","""['Asa Hutchinson']""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",,,,,,,,75000.0,499.0,49.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""50%""",,,,,,,,300000.0,3499.0,49.0,,,,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""75%""",,,,,,,,1000001.0,22499.0,449.0,,,,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""max""","""fffd9545553ab18ab56d93b9029630…","""fffffab6864bde31aa57a40152b4a0…","""2024-11-05""","""the Wes Kitchens Campaign""","""VND""","""{}""","""{}""",1000001.0,1000000.0,474999.0,"""['instagram']""","""ffff2ee32a78ae9ad3ac558dd998be…","""[]""",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Text columns to profile: distinct-value count plus the five most common values.
cat_cols = ["currency", "publisher_platforms", "bylines"]

for col in cat_cols:
    print(f"\nColumn: {col}")

    # Number of distinct values in the column.
    unique_count = df_pl.select(pl.col(col).n_unique()).item()
    print(f"Unique values: {unique_count}")

    # Top 5 most frequent values.
    # GroupBy.count() is deprecated in Polars; pl.len() is its replacement.
    # The alias keeps the result column named "count", so the printed table
    # keeps the same header as before.
    top_5 = (
        df_pl.group_by(col)
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
        .head(5)
    )
    print("Top 5 most frequent values:")
    print(top_5)
    print("-" * 50)



Column: currency
Unique values: 18
Top 5 most frequent values:
shape: (5, 2)
┌──────────┬────────┐
│ currency ┆ count  │
│ ---      ┆ ---    │
│ str      ┆ u32    │
╞══════════╪════════╡
│ USD      ┆ 246599 │
│ INR      ┆ 63     │
│ GBP      ┆ 17     │
│ EUR      ┆ 11     │
│ EGP      ┆ 8      │
└──────────┴────────┘
--------------------------------------------------

Column: publisher_platforms
Unique values: 9
Top 5 most frequent values:
shape: (5, 2)
┌─────────────────────────────────┬────────┐
│ publisher_platforms             ┆ count  │
│ ---                             ┆ ---    │
│ str                             ┆ u32    │
╞═════════════════════════════════╪════════╡
│ ['facebook', 'instagram']       ┆ 214434 │
│ ['facebook']                    ┆ 23259  │
│ ['instagram']                   ┆ 8395   │
│ ['facebook', 'instagram', 'aud… ┆ 459    │
│ ['facebook', 'instagram', 'aud… ┆ 79     │
└─────────────────────────────────┴────────┘
----------------------------------------------

  .count()
