In [2]:
# Pure Python Descriptive Statistics with Grouped Analysis and Error Handling

# Import necessary libraries from the Python Standard Library
import csv  # For reading CSV files
import math  # For performing mathematical operations
from collections import defaultdict, Counter  # For data aggregation and counting
import os  # For file path checking
import time  # For timing performance of the script

# CSV files this script looks for; the paths are relative, so they are
# resolved against the current working directory.
DATASETS = [
    "2024_fb_ads_president_scored_anon.csv",
    "2024_fb_posts_president_scored_anon.csv",
    "2024_tw_posts_president_scored_anon.csv",
]

# Keep only the entries that exist on disk (original order preserved), so a
# missing file is skipped instead of failing when it is opened later.
available_datasets = list(filter(os.path.exists, DATASETS))

# Predicate: can this value be converted with float()?
def is_numeric(value):
    """Return True if ``float(value)`` succeeds, otherwise False.

    Anything ``float()`` accepts counts as numeric, which includes strings
    such as "nan" and "inf". Values that raise ``ValueError`` (e.g. text or
    an empty string) or ``TypeError`` (e.g. ``None``) are reported as
    non-numeric.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    else:
        return True

# Sample standard deviation computed by hand (no statistics module)
def calculate_std(values, mean):
    """Return the sample standard deviation of ``values`` around ``mean``.

    Args:
        values: Sequence of numbers (must support ``len`` and iteration).
        mean: Precomputed mean of ``values``; it is not recomputed here.

    Returns:
        The square root of the sum of squared deviations divided by
        ``len(values) - 1``, or the integer 0 when there are fewer than two
        values (the n - 1 denominator would be zero or negative).
    """
    n = len(values)
    if n < 2:
        return 0
    squared_deviations = [(x - mean) ** 2 for x in values]
    variance = sum(squared_deviations) / (n - 1)
    return math.sqrt(variance)

# Helper: parse the usable numeric values of a column in a single pass
def _finite_floats(values):
    """Return the entries of ``values`` that parse as finite floats.

    Entries that ``float()`` rejects (text, empty strings, ``None``) are
    skipped. NaN and +/-infinity are skipped as well: ``float()`` accepts the
    strings "nan" and "inf", but a single such value would turn the mean and
    standard deviation into NaN and make min/max depend on row order.
    """
    parsed = []
    for raw in values:
        try:
            number = float(raw)
        except (ValueError, TypeError):
            continue
        if math.isfinite(number):
            parsed.append(number)
    return parsed

# Function to compute statistics for a column
def compute_column_stats(values):
    """Summarize one column as either numeric or categorical statistics.

    The column is treated as numeric as soon as at least one value parses as
    a finite float; non-numeric entries in such a column are ignored. If no
    value qualifies, the column is summarized as categorical.

    Args:
        values: List of raw cell values (typically strings; ``None`` and
            empty strings are treated as missing).

    Returns:
        For a numeric column, a dict with "Count", "Mean" and "StdDev"
        (both rounded to 2 decimals), "Min" and "Max".
        For a categorical column, a dict with "Count" (non-missing values),
        "Unique Values", and "Most Common" as a ``(value, occurrences)``
        tuple, or ``(None, 0)`` when there are no non-missing values.
    """
    numeric_vals = _finite_floats(values)
    if numeric_vals:
        count = len(numeric_vals)
        mean = sum(numeric_vals) / count
        std = calculate_std(numeric_vals, mean)
        return {
            "Count": count,
            "Mean": round(mean, 2),
            "Min": min(numeric_vals),
            "Max": max(numeric_vals),
            "StdDev": round(std, 2)
        }

    # csv.DictReader fills cells missing from a short row with None; treat
    # those like empty strings instead of counting None as a category.
    clean_vals = [v for v in values if v is not None and v != '']
    freq = Counter(clean_vals)
    most_common = freq.most_common(1)[0] if freq else (None, 0)
    return {
        "Count": len(clean_vals),
        "Unique Values": len(freq),  # one Counter key per distinct value
        "Most Common": most_common
    }

# Per-group column statistics, grouping rows by one or more key columns
def grouped_statistics(data, group_keys, columns):
    """Compute column statistics separately for each group of rows.

    Args:
        data: Iterable of row mappings (column name -> cell value).
        group_keys: Column names whose values, taken together as a tuple,
            identify a row's group.
        columns: Column names to summarize within each group.

    Returns:
        A dict mapping each group tuple to a dict of
        ``{column: compute_column_stats(...)}``. Groups appear in the order
        their first row was encountered. Rows lacking any of the grouping
        keys are skipped.
    """
    buckets = {}
    for record in data:
        try:
            group_id = tuple(record[name] for name in group_keys)
        except KeyError:
            # This row has no value for at least one grouping column.
            continue
        buckets.setdefault(group_id, []).append(record)

    summary = {}
    for group_id, members in buckets.items():
        per_column = {}
        for col in columns:
            # Only rows that have the column and a non-empty cell contribute.
            present = [
                record[col]
                for record in members
                if col in record and record[col] != ''
            ]
            per_column[col] = compute_column_stats(present)
        summary[group_id] = per_column
    return summary

# Helper: print one statistics dict, one "name: value" line per entry
def _print_stats(stats, indent):
    """Print every entry of ``stats`` as ``<indent><name>: <value>``."""
    for name, value in stats.items():
        print(f"{indent}{name}: {value}")

# Helper: print the grouped-statistics section for one set of grouping keys
def _print_grouped_report(data, group_keys, columns, max_groups=3):
    """Print per-column statistics for the first ``max_groups`` groups.

    Groups are formed by ``grouped_statistics(data, group_keys, columns)``
    and shown in the order that function returns them. Each group is
    labelled ``key = value`` for every grouping column.
    """
    quoted_keys = " and ".join(f"'{name}'" for name in group_keys)
    print(f"\n--- Grouped by {quoted_keys} ---")
    grouped = grouped_statistics(data, group_keys, columns)
    for key, result in list(grouped.items())[:max_groups]:
        # Label with the plain values rather than the raw key tuple, so a
        # single-key group reads "page_id = abc" and not "page_id = ('abc',)".
        label = ", ".join(f"{name} = {value}" for name, value in zip(group_keys, key))
        print(f"\nGroup: {label}")
        for col, stats in result.items():
            print(f"  Column: {col}")
            _print_stats(stats, "    ")

# Function to analyze a CSV file
def analyze_dataset(csv_file):
    """Print overall and grouped descriptive statistics for one CSV file.

    The whole file is loaded into memory. Statistics are printed for the
    first five columns only. If a 'page_id' column exists, the first three
    groups by 'page_id' are printed; if 'ad_id' exists as well, the first
    three groups by ('page_id', 'ad_id') are printed too. The elapsed time
    is printed at the end.

    Args:
        csv_file: Path of a UTF-8 encoded CSV file with a header row.

    Returns:
        None. All results are written to standard output. If the file has no
        data rows, a message is printed and the function returns early.
    """
    print(f"\n{'='*60}\nAnalyzing File: {csv_file}\n{'='*60}")
    start_time = time.time()

    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which
    # would otherwise become part of the first column name.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        data = list(reader)
        header = reader.fieldnames or []

    if not data:
        print("No data available in file.")
        return

    # Take column names from the header instead of the first row's keys: a
    # first row with surplus cells carries an extra None key holding a list.
    # dict.fromkeys drops repeated header names while preserving order.
    columns = list(dict.fromkeys(header))
    preview_columns = columns[:5]

    print("\n--- Overall Descriptive Statistics (First 5 Columns Only) ---")
    for col in preview_columns:
        stats = compute_column_stats([row[col] for row in data])
        print(f"\nColumn: {col}")
        _print_stats(stats, "  ")

    if 'page_id' in columns:
        _print_grouped_report(data, ['page_id'], preview_columns)

    if 'ad_id' in columns and 'page_id' in columns:
        _print_grouped_report(data, ['page_id', 'ad_id'], preview_columns)

    print(f"\nFinished in {time.time() - start_time:.2f} seconds")

# Analyze every dataset in available_datasets, one after another. Each call
# prints its own report; if the list is empty the loop does nothing.
for dataset in available_datasets:
    analyze_dataset(dataset)


Analyzing File: 2024_fb_ads_president_scored_anon.csv

--- Overall Descriptive Statistics (First 5 Columns Only) ---

Column: page_id
  Count: 246745
  Unique Values: 4475
  Most Common: ('4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d', 55503)

Column: ad_id
  Count: 246745
  Unique Values: 246745
  Most Common: ('0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc', 1)

Column: ad_creation_time
  Count: 246745
  Unique Values: 547
  Most Common: ('2024-10-27', 8619)

Column: bylines
  Count: 245736
  Unique Values: 3790
  Most Common: ('HARRIS FOR PRESIDENT', 49788)

Column: currency
  Count: 246745
  Unique Values: 18
  Most Common: ('USD', 246599)

--- Grouped by 'page_id' ---

Group: page_id = ('4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230',)
  Column: page_id
    Count: 33
    Unique Values: 1
    Most Common: ('4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230', 33)
  Column: ad_id
    Count: 33
    Unique Values