# Phase 1: Descriptive Statistics

**Swiss Ballot Chatbot Study - Measurement Analysis**

2×2 Factorial Design: Transparency (T0/T1) × Control (C0/C1)

| Condition | Transparency | Control |
|-----------|--------------|--------|
| A | T0 (Low) | C0 (Low) |
| B | T1 (High) | C0 (Low) |
| C | T0 (Low) | C1 (High) |
| D | T1 (High) | C1 (High) |

---

## Setup & Configuration

In [None]:
# Import required libraries
import os
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide setup: pandas display options, plot styling, output folder.

# Show every column at full width so wide result tables are not truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')

# Figure size defaults (cells that pass figsize= to plt.subplots override this)
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['figure.dpi'] = 100

# Every figure cell below saves into 'output/phase1/'. plt.savefig does not
# create missing directories, so make sure the folder exists up front;
# exist_ok=True makes this a no-op when it is already there.
os.makedirs('output/phase1', exist_ok=True)

print("Libraries loaded successfully!")

In [None]:
# Import functions from phase1_descriptive_statistics.py
from phase1_descriptive_statistics import (
    AnalysisConfig,
    get_db_connection,
    load_participant_data,
    prepare_variables,
    compute_sample_flow,
    compute_n_per_condition,
    compute_donation_rates,
    compute_demographics,
    compute_manipulation_checks,
    compute_risk_trust,
    compute_dashboard_frequencies,
    compute_q14_response_rate,
    wilson_ci
)

print("Analysis functions imported successfully!")

In [None]:
# ============================================================
# CONFIGURATION: Select participant type
# ============================================================
# Set to True for AI participants, False for human participants
IS_AI_PARTICIPANT = True

# Initialize configuration; this object is passed to load_participant_data()
# and prepare_variables() in the data-loading cells below.
config = AnalysisConfig(is_ai_participant=IS_AI_PARTICIPANT)

# Human-readable label reused in every figure title and in the summary header.
participant_label = "AI Test Users" if IS_AI_PARTICIPANT else "Human Participants"
print(f"Analyzing: {participant_label}")

## Data Loading & Preparation

In [None]:
# Load raw data from database.
# load_participant_data comes from phase1_descriptive_statistics; which rows it
# returns is presumably controlled by `config` (AI vs. human participants) --
# confirm in that module.
df_raw = load_participant_data(config)

# Quick sanity check of what was loaded: (rows, columns) and the column names.
print(f"\nRaw data shape: {df_raw.shape}")
print(f"Columns: {list(df_raw.columns)}")

In [None]:
# Prepare derived variables.
# Later cells read 'condition', 'transparency_level', 'control_level' and
# 'donation_decision' from the prepared frame; presumably prepare_variables
# derives or cleans these -- verify in phase1_descriptive_statistics.
df = prepare_variables(df_raw, config)

# Display first few rows (bare expression so the notebook renders the table)
df.head()

---
## 1.1 Sample Flow & Exclusions

**Exclusion criteria:**
1. Failed attention check (attention_check_correct = 0)
2. Missing condition
3. Missing donation_decision

In [None]:
# Compute sample flow.
# The result is indexed by string keys; this notebook reads 'df_filtered',
# 'initial_n', 'excluded_attention', 'excluded_missing_condition' and 'final_n'.
sample_flow = compute_sample_flow(df)
# df_filtered is the frame every analysis cell below works on.
df_filtered = sample_flow['df_filtered']

print(f"\nFinal sample size for analysis: N = {len(df_filtered)}")

In [None]:
# Sample-flow bar chart: participants remaining after each exclusion step.
fig, ax = plt.subplots(figsize=(8, 5))

# Running totals, derived step by step from the exclusion counts.
initial_n = sample_flow['initial_n']
after_attention_n = initial_n - sample_flow['excluded_attention']
after_condition_n = after_attention_n - sample_flow['excluded_missing_condition']

stages = ['Initial', 'After Attention\nCheck', 'After Missing\nCondition', 'Final Sample']
n_values = [initial_n, after_attention_n, after_condition_n, sample_flow['final_n']]
colors = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c']

bars = ax.bar(stages, n_values, color=colors, edgecolor='black', linewidth=1.2)

# Annotate each bar with its N, centred just above the bar top.
for bar, val in zip(bars, n_values):
    x_center = bar.get_x() + bar.get_width()/2
    y_label = bar.get_height() + 5
    ax.text(x_center, y_label, f'N={val}',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_ylabel('Number of Participants', fontsize=12)
ax.set_title(f'Sample Flow ({participant_label})', fontsize=14, fontweight='bold')
# 15% headroom above the tallest bar so the labels stay inside the axes.
ax.set_ylim(0, max(n_values) * 1.15)

plt.tight_layout()
plt.savefig('output/phase1/fig_sample_flow.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 1.2 N per Condition (A/B/C/D)

In [None]:
# Compute N per condition.
# The plotting cell below reads the 'Condition' and 'n' columns of this table
# and drops the row whose 'Condition' is 'Total'.
n_per_condition = compute_n_per_condition(df_filtered)
# Bare expression so the notebook renders the table.
n_per_condition

In [None]:
# Bar chart of per-condition sample sizes with a balanced-design reference line.
fig, ax = plt.subplots(figsize=(8, 5))

conditions = ['A\n(T0C0)', 'B\n(T1C0)', 'C\n(T0C1)', 'D\n(T1C1)']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

# Keep only the per-condition rows; the 'Total' row is not plotted.
# NOTE(review): bars are paired with the tick labels by position, so this
# assumes the remaining rows are A, B, C, D in that order -- confirm.
is_condition_row = n_per_condition['Condition'] != 'Total'
n_values = n_per_condition.loc[is_condition_row, 'n'].values

bars = ax.bar(conditions, n_values, color=colors, edgecolor='black', linewidth=1.2)

# Annotate each bar with its n, centred just above the bar top.
for bar, val in zip(bars, n_values):
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, bar.get_height() + 2, f'n={val}',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

ax.set_ylabel('Number of Participants', fontsize=12)
ax.set_xlabel('Condition', fontsize=12)
ax.set_title(f'Sample Size per Condition ({participant_label})', fontsize=14, fontweight='bold')

# Reference line: the cell size if the final sample were split evenly in four.
balanced_n = len(df_filtered)/4
ax.axhline(y=balanced_n, color='gray', linestyle='--', alpha=0.7, label='Expected (balanced)')
ax.legend()

plt.tight_layout()
plt.savefig('output/phase1/fig_n_per_condition.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 1.3 Donation Rate per Condition + 95% CI (Wilson)

In [None]:
# Compute donation rates.
# Cells below read the columns 'Condition', 'Rate (%)', '95% CI Lower',
# '95% CI Upper' and '95% CI' from this table, and expect one row whose
# 'Condition' is 'Overall' in addition to the per-condition rows.
donation_rates = compute_donation_rates(df_filtered)
# Bare expression so the notebook renders the table.
donation_rates

In [None]:
# Donation rate per condition with asymmetric 95% CI error bars and an
# overall-rate reference line.
fig, ax = plt.subplots(figsize=(10, 6))

# Per-condition rows only; the 'Overall' row is drawn separately as a line.
is_condition_row = donation_rates['Condition'] != 'Overall'
dr_conditions = donation_rates[is_condition_row].copy()

conditions = ['A\n(T0C0)', 'B\n(T1C0)', 'C\n(T0C1)', 'D\n(T1C1)']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

rates = dr_conditions['Rate (%)'].values
ci_lower = dr_conditions['95% CI Lower'].values
ci_upper = dr_conditions['95% CI Upper'].values

# matplotlib expects error-bar lengths (distance below / above the bar top),
# not absolute interval bounds.
err_below = rates - ci_lower
err_above = ci_upper - rates
errors = [err_below, err_above]

bars = ax.bar(conditions, rates, yerr=errors, capsize=8, error_kw={'linewidth': 2},
              color=colors, edgecolor='black', linewidth=1.2)

# Put each percentage label just above the upper error-bar cap.
for bar, rate, above in zip(bars, rates, err_above):
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, bar.get_height() + above + 2, f'{rate:.1f}%',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

# Horizontal reference line at the pooled donation rate.
is_overall_row = donation_rates['Condition'] == 'Overall'
overall_rate = donation_rates.loc[is_overall_row, 'Rate (%)'].values[0]
ax.axhline(y=overall_rate, color='red', linestyle='--', linewidth=2,
           label=f'Overall: {overall_rate:.1f}%')

ax.set_ylabel('Donation Rate (%)', fontsize=12)
ax.set_xlabel('Condition', fontsize=12)
ax.set_title(f'Donation Rate per Condition with 95% Wilson CI ({participant_label})',
             fontsize=14, fontweight='bold')
ax.set_ylim(0, 100)
ax.legend(loc='upper right')

plt.tight_layout()
plt.savefig('output/phase1/fig_donation_rates.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Donation rate by factor level: Transparency (T0 vs T1) and Control (C0 vs C1)


def _level_stats(factor_col, level):
    """Summarise donations for the rows of df_filtered at one factor level.

    Returns a (rate, n, ci) tuple: the mean of 'donation_decision' scaled to a
    percentage, the number of rows at that level, and whatever wilson_ci
    returns for (number of donations, n).
    """
    rows = df_filtered[df_filtered[factor_col] == level]
    decisions = rows['donation_decision']
    n = len(rows)
    return decisions.mean() * 100, n, wilson_ci(int(decisions.sum()), n)


def _plot_factor_panel(ax, tick_labels, rates, cis, bar_colors, title):
    """Draw one two-bar panel with CI error bars and percentage labels."""
    # The CI bounds are scaled by 100 to match the percentage rates, then
    # converted to distances below / above each bar top.
    err_below = [rate - ci[0]*100 for rate, ci in zip(rates, cis)]
    err_above = [ci[1]*100 - rate for rate, ci in zip(rates, cis)]
    panel_bars = ax.bar(tick_labels, rates,
                        color=bar_colors,
                        edgecolor='black', linewidth=1.2,
                        yerr=[err_below, err_above],
                        capsize=8)
    ax.set_ylabel('Donation Rate (%)', fontsize=12)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_ylim(0, 100)
    for bar, rate in zip(panel_bars, rates):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 5,
                f'{rate:.1f}%', ha='center', fontsize=12, fontweight='bold')
    return panel_bars


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# By Transparency
t0_rate, t0_n, t0_ci = _level_stats('transparency_level', 0)
t1_rate, t1_n, t1_ci = _level_stats('transparency_level', 1)
bars = _plot_factor_panel(
    axes[0],
    ['T0 (Low)\nConditions A, C', 'T1 (High)\nConditions B, D'],
    [t0_rate, t1_rate],
    [t0_ci, t1_ci],
    ['#3498db', '#e74c3c'],
    'By Transparency Level',
)

# By Control
c0_rate, c0_n, c0_ci = _level_stats('control_level', 0)
c1_rate, c1_n, c1_ci = _level_stats('control_level', 1)
bars = _plot_factor_panel(
    axes[1],
    ['C0 (Low)\nConditions A, B', 'C1 (High)\nConditions C, D'],
    [c0_rate, c1_rate],
    [c0_ci, c1_ci],
    ['#9b59b6', '#2ecc71'],
    'By Control Level',
)

plt.suptitle(f'Donation Rates by Factor Level ({participant_label})', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('output/phase1/fig_donation_rates_by_factor.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 1.4 Demographics

In [None]:
# Compute demographics for the pooled sample (by_condition=False).
# The plotting cell below looks up keys of the form '<variable>_overall' in the
# result and reads the 'Category', 'n' and '%' columns of each table.
demographics = compute_demographics(df_filtered, by_condition=False)

In [None]:
# Horizontal bar charts of the demographic distributions, one panel per variable.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# (variable name, panel title) pairs; panels are filled row by row.
demo_vars = [
    ('age', 'Age Distribution'),
    ('gender', 'Gender Distribution'),
    ('primary_language', 'Primary Language'),
    ('education', 'Education Level'),
    ('eligible_to_vote_ch', 'Voting Eligibility (CH)')
]

for ax, (var, title) in zip(axes.flat, demo_vars):
    key = f'{var}_overall'
    if key not in demographics:
        # No table for this variable: leave the panel empty.
        continue

    # Ascending order puts the most frequent category at the top of the panel.
    data_sorted = demographics[key].sort_values('n', ascending=True)
    category_names = data_sorted['Category'].astype(str)
    bar_colors = plt.cm.Set2(np.linspace(0, 1, len(data_sorted)))

    bars = ax.barh(category_names, data_sorted['n'], color=bar_colors)
    ax.set_xlabel('Count')
    ax.set_title(title, fontweight='bold')

    # Percentage label to the right of each bar, vertically centred on it.
    for bar, pct in zip(bars, data_sorted['%'].values):
        y_center = bar.get_y() + bar.get_height()/2
        ax.text(bar.get_width() + 1, y_center, f'{pct:.1f}%', va='center', fontsize=9)

# Five variables on a 2x3 grid: hide the unused sixth panel.
axes[1, 2].axis('off')

plt.suptitle(f'Demographic Distributions ({participant_label})', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('output/phase1/fig_demographics.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 1.5 Manipulation Checks (MC-T, MC-C)

In [None]:
# Compute manipulation checks.
# Cells below read four tables from the result by key: 'mc_t_by_condition',
# 'mc_t_by_t_level', 'mc_c_by_condition' and 'mc_c_by_c_level'.
manipulation_checks = compute_manipulation_checks(df_filtered)

In [None]:
# Show the four manipulation-check tables, each preceded by its heading.
mc_tables = [
    ("\nMC-T (Perceived Transparency) by Condition:", 'mc_t_by_condition'),
    ("\nMC-T by Transparency Level (T0 vs T1):", 'mc_t_by_t_level'),
    ("\nMC-C (Perceived Control) by Condition:", 'mc_c_by_condition'),
    ("\nMC-C by Control Level (C0 vs C1):", 'mc_c_by_c_level'),
]

for heading, table_key in mc_tables:
    print(heading)
    display(manipulation_checks[table_key])

In [None]:
# Visualize manipulation checks: mean perceived transparency / control per
# factor level, with error bars of +/- 1 SD, followed by a printed reminder
# of the expected direction of each difference.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Row labels of the two by-level tables, listed in plotting order. Rows are
# selected by label rather than taken in stored order, so every bar is
# guaranteed to sit under its matching tick label; if a level is missing the
# lookup raises KeyError instead of drawing a mislabelled chart.
t_levels = ['T0 (Low)', 'T1 (High)']
c_levels = ['C0 (Low)', 'C1 (High)']

# MC-T by Transparency Level
mc_t_data = manipulation_checks['mc_t_by_t_level']
mc_t_means = mc_t_data.loc[t_levels, 'Mean'].values
mc_t_sds = mc_t_data.loc[t_levels, 'SD'].values
bars = axes[0].bar(['T0 (Low Transparency)', 'T1 (High Transparency)'],
                   mc_t_means,
                   yerr=mc_t_sds,
                   color=['#3498db', '#e74c3c'],
                   edgecolor='black', linewidth=1.2,
                   capsize=8)
axes[0].set_ylabel('Mean Perceived Transparency (1-7)', fontsize=12)
axes[0].set_title('MC-T: Manipulation Check for Transparency', fontsize=13, fontweight='bold')
axes[0].set_ylim(1, 7)
# Place each mean label just above the upper error-bar cap.
for bar, mean, sd in zip(bars, mc_t_means, mc_t_sds):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + sd + 0.1,
                 f'M={mean:.2f}', ha='center', fontsize=11, fontweight='bold')

# MC-C by Control Level
mc_c_data = manipulation_checks['mc_c_by_c_level']
mc_c_means = mc_c_data.loc[c_levels, 'Mean'].values
mc_c_sds = mc_c_data.loc[c_levels, 'SD'].values
bars = axes[1].bar(['C0 (Low Control)', 'C1 (High Control)'],
                   mc_c_means,
                   yerr=mc_c_sds,
                   color=['#9b59b6', '#2ecc71'],
                   edgecolor='black', linewidth=1.2,
                   capsize=8)
axes[1].set_ylabel('Mean Perceived Control (1-7)', fontsize=12)
axes[1].set_title('MC-C: Manipulation Check for Control', fontsize=13, fontweight='bold')
axes[1].set_ylim(1, 7)
for bar, mean, sd in zip(bars, mc_c_means, mc_c_sds):
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + sd + 0.1,
                 f'M={mean:.2f}', ha='center', fontsize=11, fontweight='bold')

plt.suptitle(f'Manipulation Check Results ({participant_label})', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('output/phase1/fig_manipulation_checks.png', dpi=150, bbox_inches='tight')
plt.show()

# Validation note: a working manipulation should give the high level the
# larger mean. mc_t_data / mc_c_data are left unmodified because the summary
# cell at the end of the notebook reads them again.
print("\n" + "="*60)
print("MANIPULATION CHECK VALIDATION")
print("="*60)
print(f"MC-T: T1 ({mc_t_data.loc['T1 (High)', 'Mean']:.2f}) should be > T0 ({mc_t_data.loc['T0 (Low)', 'Mean']:.2f})")
print(f"MC-C: C1 ({mc_c_data.loc['C1 (High)', 'Mean']:.2f}) should be > C0 ({mc_c_data.loc['C0 (Low)', 'Mean']:.2f})")

---
## 1.6 Risk + Trust Descriptives

In [None]:
# Compute Risk + Trust.
# The plotting cell below selects rows 'A'..'D' by label and reads the columns
# 'OUT-RISK Mean', 'OUT-RISK SD', 'OUT-TRUST Mean' and 'OUT-TRUST SD'.
risk_trust = compute_risk_trust(df_filtered)
# Bare expression so the notebook renders the table.
risk_trust

In [None]:
# Mean risk perception and trust per condition, error bars of +/- 1 SD.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

conditions = ['A', 'B', 'C', 'D']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

# Pull the per-condition means and SDs by row label.
risk_means = risk_trust.loc[conditions, 'OUT-RISK Mean'].values
risk_sds = risk_trust.loc[conditions, 'OUT-RISK SD'].values
trust_means = risk_trust.loc[conditions, 'OUT-TRUST Mean'].values
trust_sds = risk_trust.loc[conditions, 'OUT-TRUST SD'].values

# One entry per panel: (axes, means, SDs, y-axis label, panel title).
panels = [
    (axes[0], risk_means, risk_sds, 'Mean Risk Perception (1-7)', 'OUT-RISK by Condition'),
    (axes[1], trust_means, trust_sds, 'Mean Trust (1-7)', 'OUT-TRUST by Condition'),
]

for panel_ax, means, sds, y_label, panel_title in panels:
    bars = panel_ax.bar(conditions, means, yerr=sds, color=colors,
                        edgecolor='black', linewidth=1.2, capsize=8)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_xlabel('Condition', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.set_ylim(1, 7)
    # Value label a fixed 0.3 above each bar top.
    for bar, mean in zip(bars, means):
        panel_ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
                      f'{mean:.2f}', ha='center', fontsize=10, fontweight='bold')

plt.suptitle(f'Risk Perception and Trust ({participant_label})', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('output/phase1/fig_risk_trust.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 1.7 Dashboard Option Frequencies (Conditions C & D only)

In [None]:
# Compute dashboard frequencies.
# NOTE(review): dashboard_freq is not read again in the cells shown in this
# notebook (the plotting cell recounts directly from df_filtered); presumably
# the call is kept for whatever the function itself prints or returns -- confirm.
dashboard_freq = compute_dashboard_frequencies(df_filtered)

In [None]:
# Grouped bar charts of the dashboard selections, conditions C vs D.
df_cd = df_filtered[df_filtered['condition'].isin(['C', 'D'])]

if len(df_cd) == 0:
    print("No participants in conditions C or D with dashboard data.")
else:
    dashboard_vars = ['dashboard_scope', 'dashboard_purpose', 'dashboard_storage', 'dashboard_retention']

    # Split once; each panel only needs a different column of these subsets.
    rows_c = df_cd[df_cd['condition'] == 'C']
    rows_d = df_cd[df_cd['condition'] == 'D']

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()
    width = 0.35

    for ax, var in zip(axes, dashboard_vars):
        # How often each option was chosen in each condition.
        c_counts = rows_c[var].value_counts()
        d_counts = rows_d[var].value_counts()

        # Shared x positions: every option seen in either condition, sorted;
        # an option missing from one condition is drawn with height 0.
        all_options = sorted(set(c_counts.index).union(d_counts.index))
        x = np.arange(len(all_options))
        c_vals = [c_counts.get(opt, 0) for opt in all_options]
        d_vals = [d_counts.get(opt, 0) for opt in all_options]

        # Side-by-side bars: C to the left of each tick, D to the right.
        ax.bar(x - width/2, c_vals, width, label='Condition C', color='#45B7D1')
        ax.bar(x + width/2, d_vals, width, label='Condition D', color='#96CEB4')

        ax.set_xlabel('Option')
        ax.set_ylabel('Count')
        ax.set_title(var.replace('dashboard_', '').title(), fontweight='bold')
        ax.set_xticks(x)
        # Tick labels are cut to 15 characters to keep them readable.
        tick_labels = [str(o)[:15] for o in all_options]
        ax.set_xticklabels(tick_labels, rotation=45, ha='right')
        ax.legend()

    plt.suptitle(f'Dashboard Selections - Conditions C & D ({participant_label})',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig('output/phase1/fig_dashboard_frequencies.png', dpi=150, bbox_inches='tight')
    plt.show()

---
## 1.8 Q14 Free-Text Response Rate

In [None]:
# Compute Q14 response rate.
# The plotting cell below reads the columns 'Condition', 'Response Rate (%)'
# and 'Median Char Length', and drops the row whose 'Condition' is 'Overall'.
q14_response = compute_q14_response_rate(df_filtered)
# Bare expression so the notebook renders the table.
q14_response

In [None]:
# Q14 free-text question: response rate and median answer length per condition.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
rate_ax, length_ax = axes

# Per-condition rows only; the 'Overall' row is not plotted.
q14_cond = q14_response[q14_response['Condition'] != 'Overall']
condition_names = q14_cond['Condition']
response_rates = q14_cond['Response Rate (%)']
median_lengths = q14_cond['Median Char Length']
palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

# Left panel: response rate by condition
bars = rate_ax.bar(condition_names, response_rates,
                   color=palette, edgecolor='black', linewidth=1.2)
rate_ax.set_ylabel('Response Rate (%)', fontsize=12)
rate_ax.set_xlabel('Condition', fontsize=12)
rate_ax.set_title('Q14 Response Rate by Condition', fontsize=13, fontweight='bold')
rate_ax.set_ylim(0, 100)
for bar, rate in zip(bars, response_rates.values):
    rate_ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2,
                 f'{rate:.1f}%', ha='center', fontsize=11, fontweight='bold')

# Right panel: median character length (y-range left to matplotlib)
bars = length_ax.bar(condition_names, median_lengths,
                     color=palette, edgecolor='black', linewidth=1.2)
length_ax.set_ylabel('Median Character Length', fontsize=12)
length_ax.set_xlabel('Condition', fontsize=12)
length_ax.set_title('Q14 Response Length by Condition', fontsize=13, fontweight='bold')
for bar, length in zip(bars, median_lengths.values):
    length_ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2,
                   f'{length}', ha='center', fontsize=11, fontweight='bold')

plt.suptitle(f'Q14 Open Text Analysis ({participant_label})', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('output/phase1/fig_q14_response.png', dpi=150, bbox_inches='tight')
plt.show()

---
## Summary: Phase 1 Key Results

In [None]:
# Text summary of the Phase 1 results computed in the cells above
# (reads df_filtered, donation_rates, mc_t_data and mc_c_data).
print("="*70)
print(f"PHASE 1 SUMMARY - {participant_label.upper()}")
print("="*70)

# 1. Final sample size and the number of participants in each condition
per_condition = ", ".join(
    f"{cond}={len(df_filtered[df_filtered['condition']==cond])}"
    for cond in ('A', 'B', 'C', 'D')
)
print("\n1. SAMPLE SIZE")
print(f"   Final N: {len(df_filtered)}")
print(f"   Per condition: {per_condition}")

# 2. One line per row of the donation-rate table
print("\n2. DONATION RATES")
for _, row in donation_rates.iterrows():
    print(f"   {row['Condition']}: {row['Rate (%)']:.1f}% {row['95% CI']}")

# 3. Mean manipulation-check ratings at the low and high level of each factor
t0_mean = mc_t_data.loc['T0 (Low)', 'Mean']
t1_mean = mc_t_data.loc['T1 (High)', 'Mean']
c0_mean = mc_c_data.loc['C0 (Low)', 'Mean']
c1_mean = mc_c_data.loc['C1 (High)', 'Mean']
print("\n3. MANIPULATION CHECKS")
print(f"   MC-T (Transparency): T0={t0_mean:.2f}, T1={t1_mean:.2f}")
print(f"   MC-C (Control): C0={c0_mean:.2f}, C1={c1_mean:.2f}")

# 4. A check passes when the high level has the strictly larger mean
mc_t_valid = t1_mean > t0_mean
mc_c_valid = c1_mean > c0_mean
print("\n4. VALIDATION CRITERIA")
print(f"   MC-T (T1 > T0): {'PASS' if mc_t_valid else 'FAIL'}")
print(f"   MC-C (C1 > C0): {'PASS' if mc_c_valid else 'FAIL'}")

print("\n" + "="*70)

In [None]:
# Print everything currently in the Phase 1 output folder, sorted by path.
import glob

saved_paths = sorted(glob.glob('output/phase1/*'))

print("\nSaved output files:")
for saved_path in saved_paths:
    print(f"  - {saved_path}")