# Facebook Posts Descriptive Statistics

## Script 1 — Pure Python (No Pandas, No Polars)


In [5]:
import csv

# Load file
file_path = "C:\\Users\\s29sh\\OneDrive\\Documents\\Datasets\\period_03\\2024_fb_posts_president_scored_anon.csv"

# Preview header and first row
with open(file_path, mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    header = next(reader)
    first_row = next(reader)

print("Header:\n", header)
print("\nFirst Row:\n", first_row)


Header:
 {'Facebook_Id': 'a6cb7db6850459b954f7272e14d770617022639f8847cf25e6859b9453a03813', 'post_id': '8570b69695e00d8f06b12398ed525497e1712b5369c6fc2138fe98f69811c138', 'Page Category': 'PERSON', 'Page Admin Top Country': 'US', 'Post Created': '2023-09-04 19:31:16 EDT', 'Post Created Date': '2023-09-04', 'Post Created Time': '19:31:16', 'Type': 'Photo', 'Total Interactions': '261', 'Likes': '104', 'Comments': '121', 'Shares': '8', 'Love': '12', 'Wow': '0', 'Haha': '14', 'Sad': '0', 'Angry': '2', 'Care': '0', 'Video Share Status': '', 'Is Video Owner?': '-', 'Post Views': '0.0', 'Total Views': '0.0', 'Total Views For All Crossposts': '0.0', 'Video Length': '', 'Sponsor Id': '', 'Sponsor Name': '', 'Sponsor Category': '', 'Overperforming Score': '2.29', 'illuminating_scored_messageelection_integrity_Truth_illuminating': '', 'advocacy_msg_type_illuminating': '0', 'issue_msg_type_illuminating': '0', 'attack_msg_type_illuminating': '0', 'image_msg_type_illuminating': '0', 'cta_msg_type_i

In [6]:
print("Column Names:\n")
for i, col in enumerate(header.keys(), start=1):
    print(f"{i}. {col}")


Column Names:

1. Facebook_Id
2. post_id
3. Page Category
4. Page Admin Top Country
5. Post Created
6. Post Created Date
7. Post Created Time
8. Type
9. Total Interactions
10. Likes
11. Comments
12. Shares
13. Love
14. Wow
15. Haha
16. Sad
17. Angry
18. Care
19. Video Share Status
20. Is Video Owner?
21. Post Views
22. Total Views
23. Total Views For All Crossposts
24. Video Length
25. Sponsor Id
26. Sponsor Name
27. Sponsor Category
28. Overperforming Score
29. illuminating_scored_messageelection_integrity_Truth_illuminating
30. advocacy_msg_type_illuminating
31. issue_msg_type_illuminating
32. attack_msg_type_illuminating
33. image_msg_type_illuminating
34. cta_msg_type_illuminating
35. engagement_cta_subtype_illuminating
36. fundraising_cta_subtype_illuminating
37. voting_cta_subtype_illuminating
38. covid_topic_illuminating
39. economy_topic_illuminating
40. education_topic_illuminating
41. environment_topic_illuminating
42. foreign_policy_topic_illuminating
43. governance_topic_il

In [7]:
import pandas as pd

# Load the data
df = pd.read_csv("2024_fb_posts_president_scored_anon.csv")

# Convert all illuminating columns + interaction metrics to numeric (if not already)
illuminating_cols = [col for col in df.columns if col.endswith("_illuminating")]
interaction_cols = [
    'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Care', 
    'Total Interactions', 'Overperforming Score'
]

# Combine all numeric columns
numeric_cols = illuminating_cols + interaction_cols

# Convert to numeric, forcing errors to NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Descriptive statistics
df[numeric_cols].describe().T.round(2)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
illuminating_scored_messageelection_integrity_Truth_illuminating,0.0,,,,,,,
advocacy_msg_type_illuminating,19009.0,0.55,0.5,0.0,0.0,1.0,1.0,1.0
issue_msg_type_illuminating,19009.0,0.46,0.5,0.0,0.0,0.0,1.0,1.0
attack_msg_type_illuminating,19009.0,0.22,0.41,0.0,0.0,0.0,0.0,1.0
image_msg_type_illuminating,19009.0,0.15,0.36,0.0,0.0,0.0,0.0,1.0
cta_msg_type_illuminating,19009.0,0.13,0.34,0.0,0.0,0.0,0.0,1.0
engagement_cta_subtype_illuminating,19009.0,0.09,0.29,0.0,0.0,0.0,0.0,1.0
fundraising_cta_subtype_illuminating,19009.0,0.02,0.13,0.0,0.0,0.0,0.0,1.0
voting_cta_subtype_illuminating,19009.0,0.02,0.15,0.0,0.0,0.0,0.0,1.0
covid_topic_illuminating,19009.0,0.05,0.22,0.0,0.0,0.0,0.0,1.0


In [8]:
import csv
from collections import Counter

csv_filename = "C:\\Users\\s29sh\\OneDrive\\Documents\\Datasets\\period_03\\2024_fb_posts_president_scored_anon.csv"
cat_fields = ["Page Category", "Page Admin Top Country", "Type"]

# Build a dictionary to hold all values for each categorical field
cat_value_lists = {field: [] for field in cat_fields}

with open(csv_filename, encoding="utf-8") as csvfile:
    data_reader = csv.DictReader(csvfile)
    for record in data_reader:
        for field in cat_fields:
            value = record[field].strip()
            cat_value_lists[field].append(value)

# Analyze and print statistics for each categorical column
for field in cat_fields:
    freq_counts = Counter(cat_value_lists[field])
    print(f"\nField: {field}")
    print(f"Unique Values: {len(freq_counts)}")
    print("Top 5 Most Frequent Values:")
    for val, freq in freq_counts.most_common(5):
        print(f"  {val}: {freq}")
    print("-" * 40)



Field: Page Category
Unique Values: 7
Top 5 Most Frequent Values:
  PERSON: 9453
  ACTOR: 3304
  POLITICIAN: 2595
  : 2472
  POLITICAL_CANDIDATE: 1161
----------------------------------------

Field: Page Admin Top Country
Unique Values: 2
Top 5 Most Frequent Values:
  US: 16280
  : 2729
----------------------------------------

Field: Type
Unique Values: 10
Top 5 Most Frequent Values:
  Link: 7404
  Photo: 3820
  Native Video: 2931
  : 2465
  Status: 1387
----------------------------------------


In [9]:
import csv
from collections import defaultdict
from math import sqrt

csv_path = "2024_fb_posts_president_scored_anon.csv"
with open(csv_path, encoding="utf-8") as csvfile:
    posts_data = list(csv.DictReader(csvfile))

reaction_fields = ["Likes", "Comments", "Shares", "Love", "Wow", "Haha", "Sad", "Angry", "Care"]

# --- GROUP BY PAGE CATEGORY ---
category_groups = defaultdict(list)
for post in posts_data:
    cat = post["Page Category"]
    category_groups[cat].append(post)

print("=" * 80)
print("GROUPED BY: PAGE CATEGORY\n")

for cat, group_posts in category_groups.items():
    print(f"Page Category: {cat}")
    print(f"Number of Posts: {len(group_posts)}")
    for reaction in reaction_fields:
        vals = [int(post[reaction]) for post in group_posts if post[reaction].isdigit()]
        if not vals:
            continue
        cnt = len(vals)
        avg = sum(vals) / cnt
        min_val = min(vals)
        max_val = max(vals)
        std = sqrt(sum((x - avg) ** 2 for x in vals) / cnt)
        print(f"{reaction}: count={cnt}, mean={round(avg,2)}, min={min_val}, max={max_val}, std={round(std,2)}")
    print("-" * 60)

# --- GROUP BY PAGE CATEGORY + TYPE ---
cat_type_groups = defaultdict(list)
for post in posts_data:
    composite_key = (post["Page Category"], post["Type"])
    cat_type_groups[composite_key].append(post)

print("\n" + "=" * 80)
print("GROUPED BY: PAGE CATEGORY + TYPE\n")

for (cat, typ), group_posts in cat_type_groups.items():
    print(f"Category: {cat} | Type: {typ}")
    print(f"Number of Posts: {len(group_posts)}")
    for reaction in reaction_fields:
        vals = [int(post[reaction]) for post in group_posts if post[reaction].isdigit()]
        if not vals:
            continue
        cnt = len(vals)
        avg = sum(vals) / cnt
        min_val = min(vals)
        max_val = max(vals)
        std = sqrt(sum((x - avg) ** 2 for x in vals) / cnt)
        print(f"{reaction}: count={cnt}, mean={round(avg,2)}, min={min_val}, max={max_val}, std={round(std,2)}")
    print("-" * 60)


GROUPED BY: PAGE CATEGORY

Page Category: PERSON
Number of Posts: 9453
Likes: count=9453, mean=403.61, min=1, max=83391, std=2050.43
Comments: count=9453, mean=159.81, min=0, max=18094, std=471.29
Shares: count=9453, mean=56.94, min=0, max=28313, std=512.44
Love: count=9453, mean=68.71, min=0, max=22342, std=381.06
Wow: count=9453, mean=2.8, min=0, max=608, std=11.79
Haha: count=9453, mean=38.43, min=0, max=32351, std=399.57
Sad: count=9453, mean=4.29, min=0, max=4922, std=63.13
Angry: count=9453, mean=8.97, min=0, max=2244, std=43.24
Care: count=9453, mean=8.76, min=0, max=13706, std=148.19
------------------------------------------------------------
Page Category: POLITICAL_CANDIDATE
Number of Posts: 1161
Likes: count=1161, mean=11401.89, min=0, max=315973, std=27339.8
Comments: count=1161, mean=4720.96, min=0, max=93872, std=10250.72
Shares: count=1161, mean=1608.84, min=0, max=52268, std=3624.49
Love: count=1161, mean=4530.11, min=0, max=244482, std=13997.06
Wow: count=1161, mean=4

In [10]:
import pandas as pd

# Load the Facebook Posts dataset
df = pd.read_csv("2024_fb_posts_president_scored_anon.csv")

# Preview shape and columns
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()


Shape: (19009, 56)
Columns: ['Facebook_Id', 'post_id', 'Page Category', 'Page Admin Top Country', 'Post Created', 'Post Created Date', 'Post Created Time', 'Type', 'Total Interactions', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Care', 'Video Share Status', 'Is Video Owner?', 'Post Views', 'Total Views', 'Total Views For All Crossposts', 'Video Length', 'Sponsor Id', 'Sponsor Name', 'Sponsor Category', 'Overperforming Score', 'illuminating_scored_messageelection_integrity_Truth_illuminating', 'advocacy_msg_type_illuminating', 'issue_msg_type_illuminating', 'attack_msg_type_illuminating', 'image_msg_type_illuminating', 'cta_msg_type_illuminating', 'engagement_cta_subtype_illuminating', 'fundraising_cta_subtype_illuminating', 'voting_cta_subtype_illuminating', 'covid_topic_illuminating', 'economy_topic_illuminating', 'education_topic_illuminating', 'environment_topic_illuminating', 'foreign_policy_topic_illuminating', 'governance_topic_illuminating', 'health_t

Unnamed: 0,Facebook_Id,post_id,Page Category,Page Admin Top Country,Post Created,Post Created Date,Post Created Time,Type,Total Interactions,Likes,...,military_topic_illuminating,race_and_ethnicity_topic_illuminating,safety_topic_illuminating,social_and_cultural_topic_illuminating,technology_and_privacy_topic_illuminating,womens_issue_topic_illuminating,incivility_illuminating,scam_illuminating,freefair_illuminating,fraud_illuminating
0,a6cb7db6850459b954f7272e14d770617022639f8847cf...,8570b69695e00d8f06b12398ed525497e1712b5369c6fc...,PERSON,US,2023-09-04 19:31:16 EDT,2023-09-04,19:31:16,Photo,261,104,...,0,0,0,0,0,0,0,0.0,0,0
1,a6cb7db6850459b954f7272e14d770617022639f8847cf...,41ec27cecd8af40007a9faf8c3e5c9225bcff0b8d58856...,PERSON,US,2023-09-06 20:00:56 EDT,2023-09-06,20:00:56,Photo,135,71,...,0,0,0,0,0,0,0,0.0,0,0
2,a6cb7db6850459b954f7272e14d770617022639f8847cf...,1dcb5e00cd1c8d7ee141922f50f29e59e96328231b6937...,PERSON,US,2023-09-21 09:48:09 EDT,2023-09-21,09:48:09,Link,124,33,...,0,0,1,0,0,0,1,0.0,0,0
3,a6cb7db6850459b954f7272e14d770617022639f8847cf...,3e5e0a047865ab02fe0f49c343963239fe7774b63e8ab6...,PERSON,US,2023-09-06 20:01:39 EDT,2023-09-06,20:01:39,Photo,109,34,...,0,0,0,0,0,0,0,0.0,0,0
4,7ec2cb4abf8effe3d91de57944c56b938b4f33059a6e33...,b83adc0e8ac0aedd39f55b72d723729e114a83d41fd48b...,POLITICAL_CANDIDATE,US,2023-09-27 20:13:08 EDT,2023-09-27,20:13:08,Live Video Complete,268841,126850,...,0,0,0,0,0,0,0,0.0,0,0


In [11]:
# Descriptive statistics for all numeric columns
df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Likes,19009.0,2377.695407,11253.469669,0.0,31.0,139.0,738.0,351979.0
Comments,19009.0,901.583197,3681.980096,0.0,8.0,48.0,354.0,93872.0
Shares,19009.0,320.538955,1722.159879,0.0,3.0,21.0,102.0,76150.0
Love,19009.0,413.877321,3730.94045,0.0,0.0,4.0,70.0,244482.0
Wow,19009.0,5.868326,52.946979,0.0,0.0,1.0,3.0,4345.0
Haha,19009.0,105.719712,942.034248,0.0,0.0,2.0,29.0,99276.0
Sad,19009.0,10.172182,418.325007,0.0,0.0,0.0,2.0,56111.0
Angry,19009.0,20.0555,156.020089,0.0,0.0,1.0,9.0,11814.0
Care,19009.0,34.929034,790.103213,0.0,0.0,0.0,6.0,85236.0
Post Views,16544.0,6485.058511,90392.958445,0.0,0.0,0.0,0.0,4276477.0


In [12]:
# Let's pick a few key categorical columns
cat_cols = ['Page Category', 'Page Admin Top Country', 'Type']

for col in cat_cols:
    print(f"\nColumn: {col}")
    print(f"Unique values: {df[col].nunique()}")
    print("Top 5 most frequent values:")
    print(df[col].value_counts().head(5))
    print("-" * 50)



Column: Page Category
Unique values: 6
Top 5 most frequent values:
Page Category
PERSON                 9453
ACTOR                  3304
POLITICIAN             2595
POLITICAL_CANDIDATE    1161
ENTREPRENEUR             23
Name: count, dtype: int64
--------------------------------------------------

Column: Page Admin Top Country
Unique values: 1
Top 5 most frequent values:
Page Admin Top Country
US    16280
Name: count, dtype: int64
--------------------------------------------------

Column: Type
Unique values: 9
Top 5 most frequent values:
Type
Link            7404
Photo           3820
Native Video    2931
Status          1387
YouTube          353
Name: count, dtype: int64
--------------------------------------------------


In [13]:
import polars as pl

# Load the dataset
df_pl = pl.read_csv("2024_fb_posts_president_scored_anon.csv")

# Preview shape and first few rows
print("Shape:", df_pl.shape)
df_pl.head()


Shape: (19009, 56)


Facebook_Id,post_id,Page Category,Page Admin Top Country,Post Created,Post Created Date,Post Created Time,Type,Total Interactions,Likes,Comments,Shares,Love,Wow,Haha,Sad,Angry,Care,Video Share Status,Is Video Owner?,Post Views,Total Views,Total Views For All Crossposts,Video Length,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score,illuminating_scored_messageelection_integrity_Truth_illuminating,advocacy_msg_type_illuminating,issue_msg_type_illuminating,attack_msg_type_illuminating,image_msg_type_illuminating,cta_msg_type_illuminating,engagement_cta_subtype_illuminating,fundraising_cta_subtype_illuminating,voting_cta_subtype_illuminating,covid_topic_illuminating,economy_topic_illuminating,education_topic_illuminating,environment_topic_illuminating,foreign_policy_topic_illuminating,governance_topic_illuminating,health_topic_illuminating,immigration_topic_illuminating,lgbtq_issues_topic_illuminating,military_topic_illuminating,race_and_ethnicity_topic_illuminating,safety_topic_illuminating,social_and_cultural_topic_illuminating,technology_and_privacy_topic_illuminating,womens_issue_topic_illuminating,incivility_illuminating,scam_illuminating,freefair_illuminating,fraud_illuminating
str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,f64,f64,f64,str,str,str,str,f64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64
"""a6cb7db6850459b954f7272e14d770…","""8570b69695e00d8f06b12398ed5254…","""PERSON""","""US""","""2023-09-04 19:31:16 EDT""","""2023-09-04""","""19:31:16""","""Photo""","""261""",104,121,8,12,0,14,0,2,0,,"""-""",0.0,0.0,0.0,,,,,2.29,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0
"""a6cb7db6850459b954f7272e14d770…","""41ec27cecd8af40007a9faf8c3e5c9…","""PERSON""","""US""","""2023-09-06 20:00:56 EDT""","""2023-09-06""","""20:00:56""","""Photo""","""135""",71,41,7,11,1,3,1,0,0,,"""-""",0.0,0.0,0.0,,,,,1.18,,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0
"""a6cb7db6850459b954f7272e14d770…","""1dcb5e00cd1c8d7ee141922f50f29e…","""PERSON""","""US""","""2023-09-21 09:48:09 EDT""","""2023-09-21""","""09:48:09""","""Link""","""124""",33,74,4,1,1,10,0,0,1,,"""-""",0.0,0.0,0.0,,,,,1.44,,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0.0,0,0
"""a6cb7db6850459b954f7272e14d770…","""3e5e0a047865ab02fe0f49c3439632…","""PERSON""","""US""","""2023-09-06 20:01:39 EDT""","""2023-09-06""","""20:01:39""","""Photo""","""109""",34,64,3,6,0,2,0,0,0,,"""-""",0.0,0.0,0.0,,,,,-1.05,,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0
"""7ec2cb4abf8effe3d91de57944c56b…","""b83adc0e8ac0aedd39f55b72d72372…","""POLITICAL_CANDIDATE""","""US""","""2023-09-27 20:13:08 EDT""","""2023-09-27""","""20:13:08""","""Live Video Complete""","""268,841""",126850,53191,20701,62827,162,2241,128,642,2099,"""crosspost""","""Yes""",1332813.0,1468579.0,1494352.0,"""01:08:20""",,,,2.79,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0


In [14]:
# Descriptive statistics for all numeric columns
df_pl.describe()


statistic,Facebook_Id,post_id,Page Category,Page Admin Top Country,Post Created,Post Created Date,Post Created Time,Type,Total Interactions,Likes,Comments,Shares,Love,Wow,Haha,Sad,Angry,Care,Video Share Status,Is Video Owner?,Post Views,Total Views,Total Views For All Crossposts,Video Length,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score,illuminating_scored_messageelection_integrity_Truth_illuminating,advocacy_msg_type_illuminating,issue_msg_type_illuminating,attack_msg_type_illuminating,image_msg_type_illuminating,cta_msg_type_illuminating,engagement_cta_subtype_illuminating,fundraising_cta_subtype_illuminating,voting_cta_subtype_illuminating,covid_topic_illuminating,economy_topic_illuminating,education_topic_illuminating,environment_topic_illuminating,foreign_policy_topic_illuminating,governance_topic_illuminating,health_topic_illuminating,immigration_topic_illuminating,lgbtq_issues_topic_illuminating,military_topic_illuminating,race_and_ethnicity_topic_illuminating,safety_topic_illuminating,social_and_cultural_topic_illuminating,technology_and_privacy_topic_illuminating,womens_issue_topic_illuminating,incivility_illuminating,scam_illuminating,freefair_illuminating,fraud_illuminating
str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,str,str,str,str,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""19009""","""19009""","""16537""","""16280""","""19009""","""19009""","""19009""","""16544""","""19009""",19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,"""3271""","""16544""",16544.0,16544.0,16544.0,"""3271""","""0""","""0""","""0""",16544.0,"""0""",19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,19009.0,18060.0,19009.0,19009.0
"""null_count""","""0""","""0""","""2472""","""2729""","""0""","""0""","""0""","""2465""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""15738""","""2465""",2465.0,2465.0,2465.0,"""15738""","""19009""","""19009""","""19009""",2465.0,"""19009""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,949.0,0.0,0.0
"""mean""",,,,,,,,,,2377.695407,901.583197,320.538955,413.877321,5.868326,105.719712,10.172182,20.0555,34.929034,,,6485.058511,7461.847014,3555.937984,,,,,-2.74416,,0.549266,0.460308,0.216687,0.148561,0.132832,0.090852,0.018412,0.02341,0.052396,0.090378,0.014993,0.022253,0.036877,0.031091,0.048661,0.040823,0.003367,0.005576,0.021569,0.032195,0.061708,0.002052,0.025462,0.127887,0.02021,0.002841,0.008627
"""std""",,,,,,,,,,11253.469669,3681.980096,1722.159879,3730.94045,52.946979,942.034248,418.325007,156.020089,790.103213,,,90392.958445,95976.278848,88094.048319,,,,,7.808477,,0.49758,0.498435,0.411998,0.355665,0.339402,0.287406,0.134441,0.151206,0.222831,0.28673,0.121527,0.147508,0.188465,0.173567,0.215164,0.197885,0.057928,0.074468,0.145274,0.176523,0.24063,0.04525,0.157527,0.333972,0.140723,0.053224,0.092485
"""min""","""058bd86861262fa71733f8515c34b7…","""0001e4a1dbadf84f0f43719972b605…","""ACTOR""","""US""","""2023-09-01 00:50:59 EDT""","""2023-09-01""","""00:00:03""","""Link""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""crosspost""","""-""",0.0,0.0,0.0,"""00:00:00""",,,,-198.75,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",,,,,,,,,,31.0,8.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,,,,,-3.87,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""50%""",,,,,,,,,,139.0,48.0,21.0,4.0,1.0,2.0,0.0,1.0,0.0,,,0.0,0.0,0.0,,,,,-1.62,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""75%""",,,,,,,,,,738.0,354.0,102.0,70.0,3.0,29.0,2.0,9.0,6.0,,,0.0,0.0,0.0,,,,,1.16,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""max""","""fedbfaab19b5112a6889b862314ca9…","""ffff3f550f95017a77a35388243b4f…","""YOUTH_ORGANIZATION""","""US""","""2024-11-06T20:00:35.000Z""","""2024-11-06""","""23:59:50""","""YouTube""","""999""",351979.0,93872.0,76150.0,244482.0,4345.0,99276.0,56111.0,11814.0,85236.0,"""share""","""Yes""",4276477.0,4462155.0,4499458.0,"""08:00:00""",,,,246.78,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# Categorical columns to inspect
cat_cols = ["Page Category", "Page Admin Top Country", "Type"]

for col in cat_cols:
    print(f"\nColumn: {col}")
    
    # Unique value count
    unique_count = df_pl.select(pl.col(col).n_unique()).item()
    print(f"Unique values: {unique_count}")
    
    # Top 5 most frequent values using group_by
    top_5 = (
        df_pl.group_by(col)
        .count()
        .sort("count", descending=True)
        .select([col, "count"])
        .head(5)
    )
    print("Top 5 most frequent values:")
    print(top_5)
    print("-" * 50)



Column: Page Category
Unique values: 7
Top 5 most frequent values:
shape: (5, 2)
┌─────────────────────┬───────┐
│ Page Category       ┆ count │
│ ---                 ┆ ---   │
│ str                 ┆ u32   │
╞═════════════════════╪═══════╡
│ PERSON              ┆ 9453  │
│ ACTOR               ┆ 3304  │
│ POLITICIAN          ┆ 2595  │
│ null                ┆ 2472  │
│ POLITICAL_CANDIDATE ┆ 1161  │
└─────────────────────┴───────┘
--------------------------------------------------

Column: Page Admin Top Country
Unique values: 2
Top 5 most frequent values:
shape: (2, 2)
┌────────────────────────┬───────┐
│ Page Admin Top Country ┆ count │
│ ---                    ┆ ---   │
│ str                    ┆ u32   │
╞════════════════════════╪═══════╡
│ US                     ┆ 16280 │
│ null                   ┆ 2729  │
└────────────────────────┴───────┘
--------------------------------------------------

Column: Type
Unique values: 10
Top 5 most frequent values:
shape: (5, 2)
┌──────────────┬───

  .count()
