## 1. Setup & Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import pearsonr, spearmanr
import warnings
# Silence only deprecation-style noise emitted by the data/plotting libraries.
# A blanket warnings.filterwarnings('ignore') would also hide numerical
# warnings (e.g. RuntimeWarning) that can point at real problems in the
# analysis cells below.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set style: white grid background and a wide default figure for all plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✓ Setup complete")

## 2. Create Thesis-Aligned Synthetic Data

Simulate N=194 subjects with realistic correlations based on behavioral science literature.

In [None]:
# Build the synthetic dataset (one row per subject) and store it in df_raw.
# NumPy's global RNG is seeded once here, so every value depends on the ORDER
# of the np.random calls below: reordering or inserting a draw changes all
# variables generated after that point.
np.random.seed(42)
n_subjects = 194  # Match thesis sample size

print(f"Simulating behavioral data for N={n_subjects} subjects")
print("="*80)

# Create synthetic data with realistic correlations
# Based on thesis structure and behavioral science theory
#
# Conventions used in the formulas below:
#   * (x / scale_max - 0.5) shifts an already-generated score to be roughly
#     zero-centered before it is mixed into a later variable.
#     NOTE(review): for the 1-5 scales this is zero at 2.5, not at the scale
#     midpoint 3 — confirm this is intended.
#   * The numeric weights (0.4, 0.3, ...) are mixing coefficients on those
#     centered terms. They are NOT the resulting Pearson correlations, which
#     also depend on the noise SD of each base draw.
#   * 1 + x * 4 / 5 linearly rescales a 0-5 value onto the 1-5 response scale.
#   * .clip(lo, hi) forces each score into its nominal scale range.

# 1. Cognitive Reflection Test (CRT) — 0-3 scale
# Drawn from N(1.5, 0.9) and clipped to [0, 3]; values stay continuous
# (they are not rounded to integer item counts).
crt_score = np.random.normal(loc=1.5, scale=0.9, size=n_subjects).clip(0, 3)

# 2. Need for Cognition (NFC) — 1-5 scale
# Positive dependence on CRT (weight 0.4 on the centered CRT term; the
# realized r is not fixed at 0.40 by construction)
nfc_base = np.random.normal(loc=3.7, scale=0.8, size=n_subjects)
nfc_score = 1 + (nfc_base + 0.4 * (crt_score / 3 - 0.5)) * 4 / 5
nfc_score = nfc_score.clip(1, 5)

# 3. Conspiracy Mentality (CM) — 1-5 scale
# Negative dependence on CRT and NFC (people who think critically believe fewer conspiracies)
cm_base = np.random.normal(loc=2.8, scale=1.0, size=n_subjects)
cm_score = 1 + (cm_base - 0.3 * (crt_score / 3 - 0.5) - 0.25 * (nfc_score / 5 - 0.5)) * 4 / 5
cm_score = cm_score.clip(1, 5)

# 4. Bullshit Receptivity (BSR) — 0-5 scale
# Negative dependence on CRT, NFC; positive on CM.
# Already on 0-5, so no 1 + x * 4 / 5 rescaling is applied.
bsr_base = np.random.normal(loc=2.4, scale=1.1, size=n_subjects)
bsr_score = (bsr_base - 0.35 * (crt_score / 3 - 0.5) - 0.2 * (nfc_score / 5 - 0.5) + 
             0.3 * (cm_score / 5 - 0.5)).clip(0, 5)

# 5. Rational Decision Style — 1-5 scale
# Positive dependence on CRT, NFC
# NOTE(review): the base draw uses loc=0.3, far below the other base means in
# this cell (2.3-3.7). After the 1 + x * 4 / 5 mapping a large share of values
# will fall at or below the lower clip of 1 — confirm this is intended and not
# a typo for a value around 3.
rational_style = 1 + (np.random.normal(loc=0.3, scale=0.8, size=n_subjects) + 
                      0.4 * (crt_score / 3 - 0.5) + 0.35 * (nfc_score / 5 - 0.5)) * 4 / 5
rational_style = rational_style.clip(1, 5)

# 6. Intuitive Decision Style — 1-5 scale
# Negative dependence on Rational Style: mirror image (5 - rational) plus
# N(0, 0.5) noise, then clipped to [1, 5]
intuitive_style = 5 - rational_style + np.random.normal(0, 0.5, n_subjects)
intuitive_style = intuitive_style.clip(1, 5)

# 7. Verification Behavior (fact-checking tendency) — 0-5 scale
# Positive dependence on CRT, NFC, Rational style
verification_behavior = (np.random.normal(loc=2.9, scale=1.0, size=n_subjects) + 
                        0.4 * (crt_score / 3 - 0.5) + 0.3 * (nfc_score / 5 - 0.5) +
                        0.25 * (rational_style / 5 - 0.5)).clip(0, 5)

# 8. TARGET: Misinformation Susceptibility — 0-5 scale
# Negative weights on CRT, NFC, Rational style, Verification behavior
# Positive weights on CM, BSR, Intuitive style
# (N(2.3, 1.0) base noise plus the weighted centered predictors, clipped to [0, 5])
misinfo_susceptibility = (np.random.normal(loc=2.3, scale=1.0, size=n_subjects) -
                         0.5 * (crt_score / 3 - 0.5) - 0.4 * (nfc_score / 5 - 0.5) +
                         0.45 * (cm_score / 5 - 0.5) + 0.4 * (bsr_score / 5 - 0.5) -
                         0.35 * (rational_style / 5 - 0.5) +
                         0.25 * (intuitive_style / 5 - 0.5) -
                         0.35 * (verification_behavior / 5 - 0.5)).clip(0, 5)

# Demographics
# Drawn independently of the behavioral scores above. Age and education are
# clipped to plausible bounds and then cast to int (fractional part dropped).
age = np.random.normal(loc=36, scale=13, size=n_subjects).clip(18, 75).astype(int)
education_years = np.random.normal(loc=14.2, scale=2.3, size=n_subjects).clip(8, 20).astype(int)
# Gender sampled with probabilities 45% M, 52% F, 3% Other
gender = np.random.choice(['M', 'F', 'Other'], size=n_subjects, p=[0.45, 0.52, 0.03])

# Create DataFrame
# Column order matters: columns 1-8 (right after subject_id) are the eight
# behavioral scales, and the data-quality cell selects them by position.
df_raw = pd.DataFrame({
    'subject_id': range(1, n_subjects + 1),
    'crt_score': crt_score,
    'nfc_score': nfc_score,
    'conspiracy_mentality': cm_score,
    'bs_receptivity': bsr_score,
    'rational_style': rational_style,
    'intuitive_style': intuitive_style,
    'verification_behavior': verification_behavior,
    'misinformation_susceptibility': misinfo_susceptibility,
    'age': age,
    'education_years': education_years,
    'gender': gender
})

# Quick sanity output: shape, first rows, and per-column summary statistics
print(f"✓ Created synthetic dataset: {df_raw.shape[0]} subjects, {df_raw.shape[1]} features")
print(f"\nFirst 5 rows:")
print(df_raw.head())
print(f"\nBasic statistics:")
print(df_raw.describe().round(2))

## 3. Validate Data Quality & Thesis Alignment

In [None]:
# Data-quality report, part 1: count missing values per column and list only
# the columns that actually have any (or "None" when the data is complete).
missing_summary = df_raw.isnull().sum()
print("Missing data:")
if missing_summary.sum() > 0:
    print(missing_summary[missing_summary > 0])
else:
    print("None")

# Data-quality report, part 2: observed range, mean and SD of the eight
# behavioral scales, to be compared by eye with their nominal scale bounds.
print("\nFeature ranges (should match expected scales):")
print("=" * 80)

# Columns 1-8 sit between subject_id (column 0) and the demographic columns.
scale_block = df_raw.iloc[:, 1:9]
scale_checks = pd.DataFrame({
    'Feature': scale_block.columns,
    'Min': scale_block.min(),
    'Max': scale_block.max(),
    'Mean': scale_block.mean().round(2),
    'SD': scale_block.std().round(2),
})
print(scale_checks.to_string(index=False))

## 4. Validate Expected Correlations

Check that the simulated data reproduce the relationships expected from the behavioral science literature.

In [None]:
# Pearson correlation of every behavioral scale with the target variable,
# printed as a table and collected in target_corr for later use.

# Core cognitive features (the target itself is included and skipped below)
cognitive_features = [
    'crt_score',
    'nfc_score',
    'conspiracy_mentality',
    'bs_receptivity',
    'rational_style',
    'intuitive_style',
    'verification_behavior',
    'misinformation_susceptibility',
]

# Correlation with target (misinformation susceptibility)
target = 'misinformation_susceptibility'
target_corr = []

print("\nTable 1: Correlations with Misinformation Susceptibility")
print("=" * 80)
print(f"{'Feature':<30} {'Pearson r':>15} {'Direction':>15}")
print("-" * 80)

for feature in cognitive_features:
    if feature == target:
        continue  # a variable's correlation with itself is not informative

    r, p = pearsonr(df_raw[feature], df_raw[target])

    # Label by sign once |r| exceeds 0.2; anything in between counts as weak.
    if r < -0.2:
        direction = "Protective"
    elif r > 0.2:
        direction = "Risk"
    else:
        direction = "Weak"

    print(f"{feature:<30} {r:>15.3f} {direction:>15}")
    target_corr.append({'Feature': feature, 'Correlation': r})

print("\nExpected patterns (from behavioral science):")
print("  Protective factors: CRT, NFC, Rational style, Verification behavior (negative r)")
print("  Risk factors: Conspiracy mentality, BS receptivity, Intuitive style (positive r)")

## 5. Create Composite Behavioral Scores

In [None]:
# Create composite scores based on theory.
# Each raw scale is first mapped onto 0-1 using its nominal bounds, the
# composites are averaged from those unit-scaled parts, and finally every
# composite is min-max rescaled so that it spans exactly [0, 1] in this sample.


def _min_max(series):
    """Rescale *series* linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)


# 1. ANALYTICAL THINKING COMPOSITE
# High CRT + High NFC + High Rational Style + High Verification Behavior
analytical_raw = (
    (df_raw['crt_score'] / 3) +  # 0-3 scale -> 0-1
    ((df_raw['nfc_score'] - 1) / 4) +  # 1-5 scale -> 0-1
    ((df_raw['rational_style'] - 1) / 4) +  # 1-5 scale -> 0-1
    (df_raw['verification_behavior'] / 5)  # 0-5 scale -> 0-1
) / 4  # Average of four 0-1 parts

# 2. CONSPIRATORIAL THINKING COMPOSITE
# High CM + High BS receptivity + Low Verification
conspiratorial_raw = (
    ((df_raw['conspiracy_mentality'] - 1) / 4) +  # 1-5 scale -> 0-1
    (df_raw['bs_receptivity'] / 5) +  # 0-5 scale -> 0-1
    (1 - (df_raw['verification_behavior'] / 5))  # Inverse: low verification
) / 3  # Average of three 0-1 parts

# 3. COGNITIVE INTEGRITY COMPOSITE
# High Analytical - High Conspiratorial, taken on the un-rescaled composites.
# NOTE: verification_behavior enters both composites (directly and inverted),
# so it is counted twice in this difference.
integrity_raw = analytical_raw - conspiratorial_raw

# Normalize composite scores to 0-1 range (sample min -> 0, sample max -> 1)
df_raw['analytical_thinking'] = _min_max(analytical_raw)
df_raw['conspiratorial_thinking'] = _min_max(conspiratorial_raw)
df_raw['cognitive_integrity'] = _min_max(integrity_raw)

# Column name -> heading used in the printed summary
composite_labels = [
    ('analytical_thinking', 'Analytical Thinking (0-1 scale)'),
    ('conspiratorial_thinking', 'Conspiratorial Thinking (0-1 scale)'),
    ('cognitive_integrity', 'Cognitive Integrity (combined, 0-1 scale)'),
]

print("\nTable 2: Composite Behavioral Scores")
print("="*80)
for composite, label in composite_labels:
    print(f"\n{label}:")
    print(f"  Mean: {df_raw[composite].mean():.3f}")
    print(f"  SD: {df_raw[composite].std():.3f}")

# Validate composites correlate with target
print(f"\n\nComposite Score Validation:")
print("="*80)
for composite, _ in composite_labels:
    r, p = pearsonr(df_raw[composite], df_raw['misinformation_susceptibility'])
    print(f"{composite}: r = {r:.3f}, p = {p:.2e}")

## 6. Feature Standardization for Modeling

## 7. Create Train/Test Splits