# Analyzing the datasets with pure Python

In [7]:
import csv
import statistics
import time

start = time.time()
from collections import defaultdict, Counter

def load_data(filepath):
    """Read a CSV file into a list of row dictionaries.

    Args:
        filepath: Path to a UTF-8 encoded CSV file whose first row is the
            header.

    Returns:
        A list with one dict per data row, keyed by the header names, as
        produced by ``csv.DictReader``.
    """
    # newline='' leaves line-ending handling to the csv module, so line
    # breaks embedded in quoted fields are not rewritten by Python's
    # universal-newline translation.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))

def is_number(s):
    """Return True when ``s`` can be converted with ``float()``.

    Args:
        s: Value to test, typically the raw text of a CSV cell.

    Returns:
        True if ``float(s)`` succeeds, otherwise False. Empty strings,
        non-numeric text and ``None`` all yield False. Note that ``float()``
        also accepts spellings such as ``'nan'`` and ``'inf'``, so those
        count as numbers here.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    return True

def get_numeric_columns(data, sample_size=100):
    """Return the columns whose sampled cells are all numeric or missing.

    Args:
        data: List of row dicts; the keys of the first row define the
            candidate columns.
        sample_size: How many leading rows to inspect (default 100).

    Returns:
        A list of column names, in the first row's key order, for which
        every inspected cell is either missing (``''`` or ``None``) or
        accepted by ``is_number``. An empty ``data`` yields an empty list.
    """
    if not data:
        # No rows means no header to inspect.
        return []
    sample = data[:sample_size]
    return [
        col
        for col in data[0]
        # None is treated as missing, the same as an empty string, so a
        # short row does not disqualify an otherwise numeric column.
        if all(row[col] in ('', None) or is_number(row[col]) for row in sample)
    ]

def compute_stats(data, column):
    """Summarise the numeric cells of one column.

    Cells rejected by ``is_number`` (blank or non-numeric) are ignored.

    Args:
        data: Iterable of row dicts.
        column: Key of the column to summarise.

    Returns:
        A dict with the keys 'count', 'mean', 'min', 'max' and 'std', in that
        order. When no numeric cell is found, 'count' is 0 and the other
        entries are None. 'std' is the sample standard deviation
        (``statistics.stdev``), or 0 when there is only one value.
    """
    numbers = []
    for row in data:
        if is_number(row[column]):
            numbers.append(float(row[column]))

    summary = {
        'count': len(numbers),
        'mean': None,
        'min': None,
        'max': None,
        'std': None,
    }
    if numbers:
        summary['mean'] = statistics.mean(numbers)
        summary['min'] = min(numbers)
        summary['max'] = max(numbers)
        # stdev needs at least two data points; a single value reports 0.
        summary['std'] = 0 if len(numbers) == 1 else statistics.stdev(numbers)
    return summary


def compute_categorical_stats(data, column):
    """Summarise the non-empty values of one column by frequency.

    Args:
        data: Iterable of row dicts.
        column: Key of the column to summarise.

    Returns:
        A dict with 'count' (number of truthy cells), 'unique' (number of
        distinct values among them) and 'most_common' (a ``(value, count)``
        tuple for the most frequent value, or None when there are no values).
    """
    tally = Counter()
    for row in data:
        cell = row[column]
        if cell:  # skip empty / missing cells
            tally[cell] += 1

    top = tally.most_common(1)
    return {
        'count': sum(tally.values()),
        'unique': len(tally),
        'most_common': top[0] if top else None,
    }

def group_by(data, keys):
    """Bucket rows by the values they hold in the given columns.

    Args:
        data: Iterable of row dicts.
        keys: Sequence of column names to group on.

    Returns:
        A ``defaultdict(list)`` mapping each tuple of column values (in the
        order of ``keys``) to the list of rows carrying those values. Rows
        that lack any of the requested columns are left out.
    """
    buckets = defaultdict(list)
    for record in data:
        # A row missing even one grouping column cannot form a full key.
        if any(name not in record for name in keys):
            continue
        bucket_key = tuple(record[name] for name in keys)
        buckets[bucket_key].append(record)
    return buckets





In [8]:
# Dataset label (used only in the banner below) and the CSV file to analyze.
name = "Facebook Ads"
path = "Downloads/period_03/2024_fb_ads_president_scored_anon.csv"

print(f"\n\n📊 === Analyzing {name} ===")
# Load every row, then work out which columns look numeric.
data = load_data(path)
numeric_cols = get_numeric_columns(data)

# Whole-dataset summary: numeric stats for numeric columns, frequency
# stats for every other column. Column order follows the first row's keys.
print("\n=== Overall Stats ===")
for col in data[0]:
    print(f"\n{col}:")
    if col in numeric_cols:
        print(compute_stats(data, col))
    else:
        print(compute_categorical_stats(data, col))

# Per-page summary. Only the first three groups are printed to keep the
# output short.
print("\n=== Grouped by page_id ===")
for key, rows in list(group_by(data, ['page_id']).items())[:3]:
    print(f"\nGroup: {key}")
    for col in numeric_cols:
        print(f"  {col}:", compute_stats(rows, col))

# The finer (page_id, ad_id) grouping only runs when the first row has an
# ad_id column; again only the first three groups are printed.
if "ad_id" in data[0]:
    print("\n=== Grouped by page_id and ad_id ===")
    for key, rows in list(group_by(data, ['page_id', 'ad_id']).items())[:3]:
        print(f"\nGroup: {key}")
        for col in numeric_cols:
            print(f"  {col}:", compute_stats(rows, col))




📊 === Analyzing Facebook Ads ===

=== Overall Stats ===

page_id:
{'count': 246745, 'unique': 4475, 'most_common': ('4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d', 55503)}

ad_id:
{'count': 246745, 'unique': 246745, 'most_common': ('0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc', 1)}

ad_creation_time:
{'count': 246745, 'unique': 547, 'most_common': ('2024-10-27', 8619)}

bylines:
{'count': 245736, 'unique': 3790, 'most_common': ('HARRIS FOR PRESIDENT', 49788)}

currency:
{'count': 246745, 'unique': 18, 'most_common': ('USD', 246599)}

delivery_by_region:
{'count': 246745, 'unique': 141122, 'most_common': ('{}', 30989)}

demographic_distribution:
{'count': 246745, 'unique': 215622, 'most_common': ('{}', 30989)}

estimated_audience_size:
{'count': 246745, 'mean': 556462.8559687126, 'min': 0.0, 'max': 1000001.0, 'std': 409864.75882290734}

estimated_impressions:
{'count': 246745, 'mean': 45601.52595189366, 'min': 499.0, 'max': 1000000.0, 'std': 13

In [9]:
# Dataset label (used only in the banner below) and the CSV file to analyze.
name = "Facebook Posts"
path = "Downloads/period_03/2024_fb_posts_president_scored_anon.csv"

print(f"\n\n📊 === Analyzing {name} ===")
# Load every row, then work out which columns look numeric.
data = load_data(path)
numeric_cols = get_numeric_columns(data)

# Whole-dataset summary: numeric stats for numeric columns, frequency
# stats for every other column.
print("\n=== Overall Stats ===")
for col in data[0]:
    print(f"\n{col}:")
    if col in numeric_cols:
        print(compute_stats(data, col))
    else:
        print(compute_categorical_stats(data, col))

print("\n=== Grouped by page_id ===")
groups = list(group_by(data, ['page_id']).items())
print(f"📌 Total groups: {len(groups)}")
if not groups:
    # group_by drops rows that lack the grouping column, so zero groups
    # means no row carries page_id. Say so instead of printing nothing.
    print("⚠️ No rows have a 'page_id' column; nothing to group.")

# Only the first three groups are printed to keep the output short.
for key, rows in groups[:3]:
    print(f"\nGroup: {key}")
    for col in numeric_cols:
        print(f"  {col}:", compute_stats(rows, col))


# The finer (page_id, ad_id) grouping only runs when the first row has an
# ad_id column.
if "ad_id" in data[0]:
    print("\n=== Grouped by page_id and ad_id ===")
    for key, rows in list(group_by(data, ['page_id', 'ad_id']).items())[:3]:
        print(f"\nGroup: {key}")
        for col in numeric_cols:
            print(f"  {col}:", compute_stats(rows, col))




📊 === Analyzing Facebook Posts ===

=== Overall Stats ===

Facebook_Id:
{'count': 19009, 'unique': 21, 'most_common': ('32fc18da91029ff09bf74fe9887eace6b5d2145809d583f696e344530508b064', 9013)}

post_id:
{'count': 19009, 'unique': 19009, 'most_common': ('8570b69695e00d8f06b12398ed525497e1712b5369c6fc2138fe98f69811c138', 1)}

Page Category:
{'count': 16537, 'unique': 6, 'most_common': ('PERSON', 9453)}

Page Admin Top Country:
{'count': 16280, 'unique': 1, 'most_common': ('US', 16280)}

Post Created:
{'count': 19009, 'unique': 18951, 'most_common': ('2023-11-14 11:11:44 EST', 2)}

Post Created Date:
{'count': 19009, 'unique': 425, 'most_common': ('2024-10-31', 103)}

Post Created Time:
{'count': 19009, 'unique': 16102, 'most_common': ('19:42:00', 7)}

Type:
{'count': 16544, 'unique': 9, 'most_common': ('Link', 7404)}

Total Interactions:
{'count': 19009, 'unique': 5665, 'most_common': ('23', 115)}

Likes:
{'count': 19009, 'mean': 2377.6954074385817, 'min': 0.0, 'max': 351979.0, 'std':

In [10]:
# Dataset label (used only in the banner below) and the CSV file to analyze.
name = "Twitter Posts"
path = "Downloads/period_03/2024_tw_posts_president_scored_anon.csv"

print(f"\n\n📊 === Analyzing {name} ===")
# Load every row, then work out which columns look numeric.
data = load_data(path)
numeric_cols = get_numeric_columns(data)

# Whole-dataset summary: numeric stats for numeric columns, frequency
# stats for every other column.
print("\n=== Overall Stats ===")
for col in data[0]:
    print(f"\n{col}:")
    if col in numeric_cols:
        print(compute_stats(data, col))
    else:
        print(compute_categorical_stats(data, col))

print("\n=== Grouped by page_id ===")
groups = list(group_by(data, ['page_id']).items())
if not groups:
    # group_by drops rows that lack the grouping column, so zero groups
    # means no row carries page_id. Say so instead of printing nothing.
    print("⚠️ No rows have a 'page_id' column; nothing to group.")

# Only the first three groups are printed to keep the output short.
for key, rows in groups[:3]:
    print(f"\nGroup: {key}")
    for col in numeric_cols:
        print(f"  {col}:", compute_stats(rows, col))

# The finer (page_id, ad_id) grouping only runs when the first row has an
# ad_id column.
if "ad_id" in data[0]:
    print("\n=== Grouped by page_id and ad_id ===")
    for key, rows in list(group_by(data, ['page_id', 'ad_id']).items())[:3]:
        print(f"\nGroup: {key}")
        for col in numeric_cols:
            print(f"  {col}:", compute_stats(rows, col))

# `start` is assigned in an earlier cell, so this is wall-clock time since
# that cell ran, not just the duration of this cell.
end = time.time()
print(f"⏱️ Time taken: {end - start:.2f} seconds")




📊 === Analyzing Twitter Posts ===

=== Overall Stats ===

id:
{'count': 27304, 'unique': 27304, 'most_common': ('cc46051622b8a9c1b883a3bbf12c640b12ac1cbdc7f48a773b6cc2a65f03aa2d', 1)}

url:
{'count': 27304, 'unique': 27304, 'most_common': ('f70a206472e9deaf6e313297c1efb891729ced346a0aeb34e16935d78f74b937', 1)}

source:
{'count': 27304, 'unique': 14, 'most_common': ('Twitter Web App', 14930)}

retweetCount:
{'count': 27304, 'mean': 1322.0551933782597, 'min': 0.0, 'max': 144615.0, 'std': 3405.0042401645187}

replyCount:
{'count': 27304, 'mean': 1063.7850131848813, 'min': 0.0, 'max': 121270.0, 'std': 3174.981654139348}

likeCount:
{'count': 27304, 'mean': 6913.69282888954, 'min': 0.0, 'max': 915221.0, 'std': 21590.307989209447}

quoteCount:
{'count': 27304, 'mean': 128.08156314093173, 'min': 0.0, 'max': 123320.0, 'std': 1131.5334680019284}

viewCount:
{'count': 27304, 'mean': 507084.7318341635, 'min': 5.0, 'max': 333502775.0, 'std': 3212173.9862966556}

createdAt:
{'count': 27304, 'uniq

### 📌 Note:
The **group by `page_id`** step only produces results for the Facebook Ads dataset. The other two datasets (Facebook Posts and Twitter Posts) do not contain a `page_id` column.  
As a result, the "Grouped by page_id" sections are empty for those two datasets.

In [1]:
import csv
import math
from collections import defaultdict, Counter
import time

start = time.time()

file_path = "Downloads/period_03/trump_truths_dataset.csv"

# Load data: one dict per row, keyed by the CSV header.
with open(file_path, newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    data = list(reader)

# Determine numeric fields. A column is treated as numeric when its value in
# the FIRST row parses as a float; later rows are not consulted.
# An empty file simply yields no columns instead of raising IndexError.
sample = data[0] if data else {}
numeric_cols = []
for key in sample:
    try:
        float(sample[key])
    except (TypeError, ValueError):
        continue
    numeric_cols.append(key)

# Stats container: per-column running aggregates. "values" holds the parsed
# floats for numeric columns and the raw strings for all other columns.
stats = defaultdict(lambda: {"count": 0, "sum": 0.0, "min": float('inf'), "max": float('-inf'), "values": []})

# Loop over rows
for row in data:
    for key, val in row.items():
        # Skip missing cells: '' is an empty cell, None is what DictReader
        # fills in when a row has fewer fields than the header.
        if val is None or val == '':
            continue
        stats[key]["count"] += 1
        if key in numeric_cols:
            try:
                num = float(val)
            except (TypeError, ValueError):
                # Non-numeric text in a numeric column: it still counts as a
                # present cell but is excluded from the numeric aggregates.
                continue
            stats[key]["sum"] += num
            stats[key]["min"] = min(stats[key]["min"], num)
            stats[key]["max"] = max(stats[key]["max"], num)
            stats[key]["values"].append(num)
        else:
            stats[key]["values"].append(val)

# Compute and print results
for key, s in stats.items():
    print(f"\n📊 Column: {key}")
    print(f" - Count: {s['count']}")
    if key in numeric_cols:
        # Divide by the number of values that actually parsed, not by
        # 'count', which also includes cells that failed to parse.
        parsed = len(s['values'])
        mean = s['sum'] / parsed if parsed else 0
        # Population standard deviation (divides by N, not N - 1).
        std_dev = math.sqrt(sum((x - mean) ** 2 for x in s['values']) / parsed) if parsed else 0
        print(f" - Mean: {mean:.2f}")
        # Avoid printing the inf / -inf sentinels when nothing parsed.
        print(f" - Min: {s['min'] if parsed else None}")
        print(f" - Max: {s['max'] if parsed else None}")
        print(f" - Std Dev: {std_dev:.2f}")
    else:
        counter = Counter(s["values"])
        most_common = counter.most_common(3)
        print(f" - Unique: {len(counter)}")
        print(f" - Top 3 Most Common: {most_common}")

end = time.time()
print(f"\n⏱️ Time taken: {end - start:.2f} seconds")



📊 Column: account_name
 - Count: 5159
 - Unique: 126
 - Top 3 Most Common: [('Donald J. Trump', 4398), ('Team Trump', 284), ('Real America’s Voice', 81)]

📊 Column: account_handle
 - Count: 5159
 - Unique: 125
 - Top 3 Most Common: [('realDonaldTrump', 4398), ('TeamTrump', 284), ('realamericasvoice', 81)]

📊 Column: verified_badge
 - Count: 5159
 - Unique: 2
 - Top 3 Most Common: [('TRUE', 4975), ('FALSE', 184)]

📊 Column: post_date
 - Count: 5159
 - Unique: 4099
 - Top 3 Most Common: [('2024-09-21 18:22:00', 6), ('2024-09-15 01:36:00', 6), ('2024-08-27 18:20:00', 6)]

📊 Column: profile_link
 - Count: 5158
 - Unique: 124
 - Top 3 Most Common: [('https://truthsocial.com/@realDonaldTrump', 4398), ('https://truthsocial.com/@TeamTrump', 284), ('https://truthsocial.com/@realamericasvoice', 81)]

📊 Column: avatar_url
 - Count: 5159
 - Unique: 127
 - Top 3 Most Common: [('https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/780/257/626/128/497/original/454286ac07