In [1]:
import csv
import math
from collections import Counter, defaultdict
from datetime import datetime

In [6]:
# ==============================================================================
# --- Helper Functions ---
# ==============================================================================

def calculate_stats(data_list):
    """
    Calculating basic statistics for a list of numbers without third-party libraries.

    Only finite ``int``/``float`` entries contribute to the result. ``None``,
    strings, booleans, ``NaN`` and infinities are skipped, so a single
    unparsable or "nan" cell cannot poison the mean of a whole column.

    Args:
        data_list: Iterable of arbitrary values (typically one parsed CSV column).

    Returns:
        A dict with the keys 'count', 'mean', 'min', 'max' and 'std_dev'.
        When at least one usable number is present, 'count' is an int,
        'min'/'max' are the original values, and 'mean'/'std_dev' are strings
        formatted with two decimals ('std_dev' is the population standard
        deviation, i.e. divided by N). When no usable number is present every
        value is the integer 0.
    """
    # Keeping finite real numbers only. bool is excluded explicitly because it
    # is a subclass of int; ints are accepted without the isfinite() check so
    # arbitrarily large integers do not overflow the float conversion.
    numbers = [
        x for x in data_list
        if not isinstance(x, bool)
        and (isinstance(x, int) or (isinstance(x, float) and math.isfinite(x)))
    ]
    if not numbers:
        return {'count': 0, 'mean': 0, 'min': 0, 'max': 0, 'std_dev': 0}

    count = len(numbers)
    mean = sum(numbers) / count
    min_val = min(numbers)
    max_val = max(numbers)

    # Calculating the population standard deviation; count is guaranteed to be
    # positive here because of the early return above.
    sum_sq_diff = sum((x - mean) ** 2 for x in numbers)
    std_dev = math.sqrt(sum_sq_diff / count)

    return {
        'count': count, 'mean': f"{mean:.2f}", 'min': min_val,
        'max': max_val, 'std_dev': f"{std_dev:.2f}"
    }

def parse_row(row_dict, numeric_cols, date_cols):
    """
    Processing a single row (a dictionary) to convert strings to numbers and dates.

    The row is modified in place and also returned. Parsing is idempotent:
    values that are already numbers or datetimes are left untouched, so a row
    can safely be passed through this function more than once.

    Args:
        row_dict: Mapping of column name to raw value for one row.
        numeric_cols: Column names whose values become int (preferred) or
            float; anything that cannot be parsed becomes None.
        date_cols: Column names whose values are parsed as ISO-8601 datetimes
            (a 'Z' suffix is read as UTC); anything else becomes None.

    Returns:
        The same ``row_dict`` object with the listed columns converted.
        Columns in neither list are left as they are.
    """
    # Only existing keys are reassigned, so the dict size never changes while
    # it is being iterated.
    for col, value in row_dict.items():
        if col in numeric_cols:
            # Already numeric: keeping it, because int() would silently
            # truncate an already-parsed float such as 2.5 down to 2.
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                continue
            try:
                row_dict[col] = int(value)
            except (ValueError, TypeError):
                try:
                    row_dict[col] = float(value)
                except (ValueError, TypeError):
                    row_dict[col] = None
        elif col in date_cols:
            # Already parsed: keeping it instead of resetting it to None.
            if isinstance(value, datetime):
                continue
            # None or any other non-string has no date text to parse.
            if not isinstance(value, str):
                row_dict[col] = None
                continue
            try:
                # Only ISO-8601 text is supported; 'Z' is rewritten because
                # fromisoformat() rejects it on Python versions before 3.11.
                row_dict[col] = datetime.fromisoformat(value.strip().replace('Z', '+00:00'))
            except ValueError:
                row_dict[col] = None
    return row_dict

# ==============================================================================
# --- Main Analysis Function ---
# ==============================================================================

def analyze_dataset_pure_python(filepath, numeric_cols, date_cols, group_by_col):
    """
    Loads, processes, and analyzes a dataset using only standard Python libraries.

    Prints a three-part report to stdout:
      1. the row count plus mean/max of every numeric column found in the header,
      2. the five most common values of ``group_by_col``,
      3. for the five groups with the most numeric values, the count and mean
         of the first numeric column found in the header.

    Args:
        filepath: Path to a CSV file whose first row is the header.
        numeric_cols: Column names to parse as numbers; names missing from the
            header are skipped.
        date_cols: Column names to parse as ISO-8601 datetimes.
        group_by_col: Column used for the frequency count and the grouping.

    Returns:
        None. If the file cannot be read or contains no data rows, a message
        is printed and the function returns early.
    """
    print(f"\n{'='*60}\n🔄 Processing file with Pure Python: {filepath}\n{'='*60}")

    # --- Loading and Cleaning Data ---
    header, data = [], []
    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged but strips a leading BOM,
        # which would otherwise stick to the first column name and make it
        # fail every `col in header` check below.
        with open(filepath, mode='r', encoding='utf-8-sig', newline='') as infile:
            reader = csv.reader(infile)
            # An empty file simply yields no header and no rows.
            header = next(reader, [])
            for row in reader:
                # csv.reader yields [] for blank lines; they are not records
                # and must not inflate the row count.
                if not row:
                    continue
                row_dict = dict(zip(header, row))
                data.append(parse_row(row_dict, numeric_cols, date_cols))
    except (OSError, UnicodeError, csv.Error) as e:
        # Only genuine reading problems are handled here; rows loaded before
        # the failure are still analyzed. Programming errors propagate.
        print(f"⚠️ Handled a reading error: {e}")

    if not data:
        print(f"❌ No data loaded from {filepath}.")
        return

    # --- 1. High-Level Descriptive Statistics ---
    print("\n#### 1. High-Level Descriptive Statistics ####")
    print(f"Total Rows: {len(data)}")

    for col in numeric_cols:
        if col in header:
            # Using row.get(col) because rows shorter than the header lack
            # the trailing keys.
            values = [row.get(col) for row in data]
            stats = calculate_stats(values)
            print(f"  - Column '{col}': Mean={stats['mean']}, Max={stats['max']}")

    # --- 2. Single-Column Analysis ---
    print("\n#### 2. Single-Column Analysis (Univariate) ####")
    if group_by_col in header:
        values = [row.get(group_by_col) for row in data]
        counts = Counter(values)
        print(f"Top 5 most common values in '{group_by_col}':")
        for value, count in counts.most_common(5):
            print(f"  - '{value}': {count} times")

    # --- 3. Grouped Analysis ---
    print(f"\n#### 3. Grouped Analysis by '{group_by_col}' ####")
    grouped_data = defaultdict(list)
    for row in data:
        # Rows with a missing or empty group key are left out of every group.
        if row.get(group_by_col):
            grouped_data[row[group_by_col]].append(row)

    # The first requested numeric column that actually exists in the file.
    target_numeric_col = next((col for col in numeric_cols if col in header), None)

    if target_numeric_col:
        print(f"Analyzing '{target_numeric_col}' for each group:")
        group_stats = {}
        for group, rows in grouped_data.items():
            col_values = [row.get(target_numeric_col) for row in rows]
            stats = calculate_stats(col_values)
            # 'count' is the number of usable numeric values in the group,
            # which can be smaller than the number of rows in the group.
            group_stats[group] = (stats['count'], float(stats['mean']))

        sorted_groups = sorted(group_stats.items(), key=lambda item: item[1][0], reverse=True)
        for group, (count, mean) in sorted_groups[:5]:
            print(f"  - Group '{group}': Count={count}, Avg '{target_numeric_col}'={mean:.2f}")

### Executing Analysis for All Datasets

In [7]:
# --- Facebook Ads Analysis ---
# Grouped by the 'bylines' column.
fb_ads_filepath = 'data/2024_fb_ads_president_scored_anon.csv'
fb_ads_numeric = [
    'estimated_spend',
    'estimated_impressions',
    'platform_count',
]
fb_ads_dates = ['ad_creation_time']
analyze_dataset_pure_python(
    filepath=fb_ads_filepath,
    numeric_cols=fb_ads_numeric,
    date_cols=fb_ads_dates,
    group_by_col='bylines',
)

# --- Facebook Posts Analysis ---
# Grouped by the 'Page Category' column.
fb_posts_filepath = 'data/2024_fb_posts_president_scored_anon.csv'
fb_posts_numeric = [
    'Total Interactions',
    'Likes',
    'Comments',
    'Shares',
    'Post Views',
]
fb_posts_dates = ['Post Created Date']
analyze_dataset_pure_python(
    filepath=fb_posts_filepath,
    numeric_cols=fb_posts_numeric,
    date_cols=fb_posts_dates,
    group_by_col='Page Category',
)

# --- Twitter Posts Analysis ---
# Grouped by the 'source' column.
tw_posts_filepath = 'data/2024_tw_posts_president_scored_anon.csv'
tw_posts_numeric = [
    'retweetCount',
    'replyCount',
    'likeCount',
    'viewCount',
]
tw_posts_dates = ['createdAt']
analyze_dataset_pure_python(
    filepath=tw_posts_filepath,
    numeric_cols=tw_posts_numeric,
    date_cols=tw_posts_dates,
    group_by_col='source',
)


🔄 Processing file with Pure Python: data/2024_fb_ads_president_scored_anon.csv



#### 1. High-Level Descriptive Statistics ####
Total Rows: 31907
  - Column 'estimated_spend': Mean=1653.13, Max=474999
  - Column 'estimated_impressions': Mean=71890.15, Max=1000000

#### 2. Single-Column Analysis (Univariate) ####
Top 5 most common values in 'bylines':
  - 'HARRIS FOR PRESIDENT': 9687 times
  - 'HARRIS VICTORY FUND': 4629 times
  - 'DONALD J. TRUMP FOR PRESIDENT 2024, INC.': 4323 times
  - 'Working America': 1356 times
  - 'Trump National Committee JFC': 1200 times

#### 3. Grouped Analysis by 'bylines' ####
Analyzing 'estimated_spend' for each group:
  - Group 'HARRIS FOR PRESIDENT': Count=9687, Avg 'estimated_spend'=1566.04
  - Group 'HARRIS VICTORY FUND': Count=4629, Avg 'estimated_spend'=1428.23
  - Group 'DONALD J. TRUMP FOR PRESIDENT 2024, INC.': Count=4323, Avg 'estimated_spend'=1027.68
  - Group 'Working America': Count=1355, Avg 'estimated_spend'=1221.69
  - Group 'Trump National Committee JFC': Count=1200, Avg 'estimated_spend'=1660.58

🔄 Processing file w