In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
import datetime
import numpy as np

In [2]:
def get_working_dir():
    return Path.cwd()

processed_dir = get_working_dir() / 'data' / 'processed'
os.makedirs(processed_dir, exist_ok=True)

results_dir = get_working_dir() / 'plots' / 'mist'
os.makedirs(results_dir, exist_ok=True)

df_t = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
df_m = pd.read_csv(get_working_dir() /  'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
df_m_t = pd.read_csv(get_working_dir() /  'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')
dataset_mist = pd.read_csv(get_working_dir() / 'data' / 'dataset_mist.csv', encoding='utf-8')

# V, R, F, d and n values for the three studies

In [4]:
# Define a function to compute the metrics.
def compute_metrics(df):
    # Identify claim columns (assumes columns starting with "claim_"
    # and that the first four are fake claims and the last four are real claims)
    claim_cols = [col for col in df.columns if col.startswith("claim_")]
    fake_cols = claim_cols[:4]
    real_cols = claim_cols[4:8]
    
    metrics = pd.DataFrame()
    metrics["prolific_id"] = df["prolific_id"]
    # f: Count of fake claims correctly identified ("Fake News")
    metrics["f"] = df[fake_cols].apply(lambda row: sum(row == "Fake News"), axis=1)
    # r: Count of real claims correctly identified ("Real News")
    metrics["r"] = df[real_cols].apply(lambda row: sum(row == "Real News"), axis=1)
    # v: Overall veracity discernment = f + r
    metrics["v"] = metrics["f"] + metrics["r"]
    # d: Distrust bias = number of real claims misclassified as "Fake News"
    metrics["d"] = df[real_cols].apply(lambda row: sum(row == "Fake News"), axis=1)
    # n: Naïvité = number of fake claims misclassified as "Real News"
    metrics["n"] = df[fake_cols].apply(lambda row: sum(row == "Real News"), axis=1)
    
    return metrics

# Compute metrics for each study.
metrics_t = compute_metrics(df_t)
metrics_m = compute_metrics(df_m)
metrics_m_t = compute_metrics(df_m_t)

# Add a study identifier.
metrics_t["study"] = "Text"
metrics_m["study"] = "Memes"
metrics_m_t["study"] = "Memes+Context"

# Combine all metrics into one DataFrame.
metrics_all = pd.concat([metrics_t, metrics_m, metrics_m_t], ignore_index=True)

# List of metrics for plotting.
metric_list = ["v", "r", "f", "d", "n"]

# Function to compute mean and standard error for a given dataframe and metric list.
def compute_summary_stats(df, metrics):
    summary = {}
    for m in metrics:
        mean_val = df[m].mean()
        sem_val = df[m].std() / np.sqrt(df[m].count())
        summary[m] = (mean_val, sem_val)
    return summary

# -----------------------------
# 1. Overall Summary: Bar plot for all participants (across all studies)
overall_stats = compute_summary_stats(metrics_all, metric_list)
overall_means = [overall_stats[m][0] for m in metric_list]
overall_sems  = [overall_stats[m][1] for m in metric_list]

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(metric_list, overall_means, yerr=overall_sems, capsize=5)
ax.set_xlabel("Metric")
ax.set_ylabel("Mean Score")
ax.set_title("Overall Mean Scores with Standard Error (All Studies)")
fig.tight_layout()
fig.savefig(results_dir / "overall_barplot.png")
plt.close(fig)

# -----------------------------
# 2. Study-Specific Bar Plots: One bar plot per study
for study in metrics_all["study"].unique():
    df_study = metrics_all[metrics_all["study"] == study]
    stats = compute_summary_stats(df_study, metric_list)
    means = [stats[m][0] for m in metric_list]
    sems = [stats[m][1] for m in metric_list]
    
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(metric_list, means, yerr=sems, capsize=5)
    ax.set_xlabel("Metric")
    ax.set_ylabel("Mean Score")
    ax.set_title(f"Mean Scores with Standard Error ({study} Study)")
    fig.tight_layout()
    filename = f"barplot_{study.lower().replace(' ', '_')}.png"
    fig.savefig(results_dir / filename)
    plt.close(fig)

# -----------------------------
# 3. Comparison Across Studies: Bar plots for each metric grouped by study.
# We'll create one figure with five subplots (one for each metric).
study_order = ["Text", "Memes", "Memes+Context"]
fig, axs = plt.subplots(1, len(metric_list), figsize=(20, 5))
for i, m in enumerate(metric_list):
    means = []
    sems = []
    for study in study_order:
        df_temp = metrics_all[metrics_all["study"] == study]
        mean_val = df_temp[m].mean()
        sem_val = df_temp[m].std() / np.sqrt(df_temp[m].count())
        means.append(mean_val)
        sems.append(sem_val)
    axs[i].bar(study_order, means, yerr=sems, capsize=5)
    axs[i].set_title(f"{m} (by Study)")
    axs[i].set_xlabel("Study")
    axs[i].set_ylabel("Mean Score")
fig.suptitle("Comparison of Mean Scores Across Studies for Each Metric")
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.savefig(results_dir / "comparison_barplots.png")
plt.close(fig)

# CFA Realiability

In [5]:
from semopy import Model, calc_stats

# --- Step 1: Load the three CSV files for MIST‑8 data ---
df_t   = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
df_m   = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
df_m_t = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')

# --- Step 2: Recode claim responses into binary scores ---
# Define the fake and real claim columns.
# (Assumes the first four claims are fake and the last four are real.)
fake_cols = ['claim_stock_manipulation', 'claim_left_wing_salary_lie', 
             'claim_vaccine_toxins', 'claim_government_disease_spread']
real_cols = ['claim_eu_positive_attitudes', 'claim_hyatt_small_bottles', 
             'claim_republicans_trump_conduct', 'claim_climate_worry_age_gap']

def recode_mist8(df):
    # For fake items: correct response is "Fake News"
    for col in fake_cols:
        df[col] = df[col].apply(lambda x: 1 if x.strip().lower() == "fake news" else 0)
    # For real items: correct response is "Real News"
    for col in real_cols:
        df[col] = df[col].apply(lambda x: 1 if x.strip().lower() == "real news" else 0)
    return df

df_t   = recode_mist8(df_t)
df_m   = recode_mist8(df_m)
df_m_t = recode_mist8(df_m_t)

# --- Step 3: Combine the data to form the overall sample ---
df_overall = pd.concat([df_t, df_m, df_m_t], ignore_index=True)

# Define the list of MIST‑8 item columns.
mist8_cols = fake_cols + real_cols

# Remove any cases with missing values on the mist‑8 items.
df_overall_complete = df_overall.dropna(subset=mist8_cols)

# --- Step 4: Define the Two-Factor CFA Model ---
# Factor F represents fake news detection and R represents real news detection.
model_desc = """
F =~ claim_stock_manipulation + claim_left_wing_salary_lie + claim_vaccine_toxins + claim_government_disease_spread
R =~ claim_eu_positive_attitudes + claim_hyatt_small_bottles + claim_republicans_trump_conduct + claim_climate_worry_age_gap
"""

# --- Step 5: Fit the CFA Model using semopy ---
model = Model(model_desc)
model.fit(df_overall_complete)

# --- Step 6: Calculate and Print Fit Statistics ---
stats   = calc_stats(model)
chi2    = stats.loc["Value", "chi2"]
dof     = stats.loc["Value", "DoF"]
cfi     = stats.loc["Value", "CFI"]
rmsea   = stats.loc["Value", "RMSEA"]
cmin_df = chi2 / dof

print("Baseline CFA Model Fit Indices for Mist-8 Overall Sample:")
print(f"  Chi-square: {chi2:.2f}")
print(f"  Degrees of Freedom: {dof:.0f}")
print(f"  CMIN/df: {cmin_df:.2f}")
print(f"  CFI: {cfi:.2f}")
print(f"  RMSEA: {rmsea:.3f}")

# --- Step 7: Save the plots to a text file ---
results_dir = get_working_dir() / 'plots' / 'cfa'
results_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = results_dir / f"mist8_cfa_results_overall_{timestamp}.txt"

with open(output_file, "w") as f:
    f.write("Baseline CFA Model Fit Indices for Mist-8 Overall Sample:\n")
    f.write(f"  Chi-square: {chi2:.2f}\n")
    f.write(f"  Degrees of Freedom: {dof:.0f}\n")
    f.write(f"  CMIN/df: {cmin_df:.2f}\n")
    f.write(f"  CFI: {cfi:.2f}\n")
    f.write(f"  RMSEA: {rmsea:.3f}\n")


Baseline CFA Model Fit Indices for Mist-8 Overall Sample:
  Chi-square: 28.96
  Degrees of Freedom: 19
  CMIN/df: 1.52
  CFI: 0.95
  RMSEA: 0.041


# Data validity

In [7]:
import pingouin as pg  # Install via: pip install pingouin


# --- Step 1: Load the three CSV files for MIST‑8 data ---
df_t   = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
df_m   = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
df_m_t = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')

# --- Step 2: Recode claim responses into binary scores ---
# Define columns: (Assumes the first four are fake items and the last four are real items)
fake_cols = ['claim_stock_manipulation', 'claim_left_wing_salary_lie', 
             'claim_vaccine_toxins', 'claim_government_disease_spread']
real_cols = ['claim_eu_positive_attitudes', 'claim_hyatt_small_bottles', 
             'claim_republicans_trump_conduct', 'claim_climate_worry_age_gap']
mist8_cols = fake_cols + real_cols

def recode_mist8(df):
    # For fake news items: correct response is "Fake News"
    for col in fake_cols:
        df[col] = df[col].apply(lambda x: 1 if isinstance(x, str) and x.strip().lower() == "fake news" else 0)
    # For real news items: correct response is "Real News"
    for col in real_cols:
        df[col] = df[col].apply(lambda x: 1 if isinstance(x, str) and x.strip().lower() == "real news" else 0)
    return df

df_t   = recode_mist8(df_t)
df_m   = recode_mist8(df_m)
df_m_t = recode_mist8(df_m_t)

# --- Step 3: Combine the data to form the overall MIST‑8 sample ---
df_mist8 = pd.concat([df_t, df_m, df_m_t], ignore_index=True)
df_mist8 = df_mist8.dropna(subset=mist8_cols)  # Remove cases with missing responses

# --- Step 4: Compute overall MIST‑8 scale score ---
# (Typically, the MIST‑8 score is the sum of correct responses across the 8 items.)
df_mist8['mist8_score'] = df_mist8[mist8_cols].sum(axis=1)

# --- Step 5: Compute Reliability (Cronbach's alpha) for the 8-item scale ---
alpha_full, ci_full = pg.cronbach_alpha(data=df_mist8[mist8_cols])

# --- Step 6: Compute Descriptive Statistics for the MIST‑8 Scale Score ---
mean_score = df_mist8['mist8_score'].mean()
sd_score   = df_mist8['mist8_score'].std()

# --- Step 7: Print the Results ---
print("MIST-8 Full Scale Reliability:")
print(f"  Cronbach's alpha: {alpha_full:.2f}")
print(f"  95% CI: [{ci_full[0]:.2f}, {ci_full[1]:.2f}]")
print(f"  Mean (score): {mean_score:.2f}")
print(f"  SD (score): {sd_score:.2f}")

# --- (Optional) Compute Reliability for Fake and Real Items Separately ---
alpha_fake, ci_fake = pg.cronbach_alpha(data=df_mist8[fake_cols])
alpha_real, ci_real = pg.cronbach_alpha(data=df_mist8[real_cols])
mean_fake = df_mist8[fake_cols].sum(axis=1).mean()
sd_fake   = df_mist8[fake_cols].sum(axis=1).std()
mean_real = df_mist8[real_cols].sum(axis=1).mean()
sd_real   = df_mist8[real_cols].sum(axis=1).std()

print("\nMIST-8 Fake Items:")
print(f"  Cronbach's alpha: {alpha_fake:.2f}")
print(f"  95% CI: [{ci_fake[0]:.2f}, {ci_fake[1]:.2f}]")
print(f"  Mean (score): {mean_fake:.2f}")
print(f"  SD (score): {sd_fake:.2f}")

print("\nMIST-8 Real Items:")
print(f"  Cronbach's alpha: {alpha_real:.2f}")
print(f"  95% CI: [{ci_real[0]:.2f}, {ci_real[1]:.2f}]")
print(f"  Mean (score): {mean_real:.2f}")
print(f"  SD (score): {sd_real:.2f}")

# --- Step 8: Save the Results to a Text File ---
results_dir = get_working_dir() / 'plots' / 'mist'
results_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = results_dir / f"mist8_reliability_results_{timestamp}.txt"

with open(output_file, "w") as f:
    f.write("MIST-8 Full Scale Reliability:\n")
    f.write(f"  Cronbach's alpha: {alpha_full:.2f}\n")
    f.write(f"  95% CI: [{ci_full[0]:.2f}, {ci_full[1]:.2f}]\n")
    f.write(f"  Mean (score): {mean_score:.2f}\n")
    f.write(f"  SD (score): {sd_score:.2f}\n\n")
    
    f.write("MIST-8 Fake Items:\n")
    f.write(f"  Cronbach's alpha: {alpha_fake:.2f}\n")
    f.write(f"  95% CI: [{ci_fake[0]:.2f}, {ci_fake[1]:.2f}]\n")
    f.write(f"  Mean (score): {mean_fake:.2f}\n")
    f.write(f"  SD (score): {sd_fake:.2f}\n\n")
    
    f.write("MIST-8 Real Items:\n")
    f.write(f"  Cronbach's alpha: {alpha_real:.2f}\n")
    f.write(f"  95% CI: [{ci_real[0]:.2f}, {ci_real[1]:.2f}]\n")
    f.write(f"  Mean (score): {mean_real:.2f}\n")
    f.write(f"  SD (score): {sd_real:.2f}\n")


MIST-8 Full Scale Reliability:
  Cronbach's alpha: 0.61
  95% CI: [0.54, 0.67]
  Mean (score): 6.07
  SD (score): 1.74

MIST-8 Fake Items:
  Cronbach's alpha: 0.40
  95% CI: [0.28, 0.50]
  Mean (score): 2.83
  SD (score): 1.07

MIST-8 Real Items:
  Cronbach's alpha: 0.47
  95% CI: [0.36, 0.56]
  Mean (score): 3.23
  SD (score): 0.96


# Correlations between demographic factors and mist results

In [9]:
# # ----- Step 1: Load and combine Demographics Data -----
# df_t_dem = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_demographics.csv', encoding='utf-8')
# df_m_dem = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', encoding='utf-8')
# df_mt_dem = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_demographics.csv', encoding='utf-8')
# 
# # Combine the three demographics files
# df_dem = pd.concat([df_t_dem, df_m_dem, df_mt_dem], ignore_index=True)
# 
# # Selected demographic variables
# selected_dem = ['age_group', 'political_orientation', 'engagement_with_political_content']
# 
# # Filter out rows where any of these variables is "I prefer not to answer"
# for var in selected_dem:
#     df_dem = df_dem[df_dem[var] != "I prefer not to answer"]
# 
# # ----- Step 2: Load MIST Claims Data and compute v -----
# def compute_v_metric(df):
#     # Assumes columns starting with "claim_" are in order:
#     # first 4 fake items and next 4 real items.
#     claim_cols = [col for col in df.columns if col.startswith("claim_")]
#     fake_cols = claim_cols[:4]
#     real_cols = claim_cols[4:8]
#     # For fake items, correct response is "Fake News"
#     f_score = df[fake_cols].apply(lambda row: sum(1 for x in row 
#                                                   if isinstance(x, str) and x.strip().lower() == "fake news"),
#                                    axis=1)
#     # For real items, correct response is "Real News"
#     r_score = df[real_cols].apply(lambda row: sum(1 for x in row 
#                                                   if isinstance(x, str) and x.strip().lower() == "real news"),
#                                    axis=1)
#     # Overall veracity discernment v = f + r
#     return f_score + r_score
# 
# # Load the three MIST claims files
# df_t_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
# df_m_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
# df_mt_claims = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')
# 
# # Compute the v metric for each study
# df_t_claims['v']  = compute_v_metric(df_t_claims)
# df_m_claims['v']  = compute_v_metric(df_m_claims)
# df_mt_claims['v'] = compute_v_metric(df_mt_claims)
# 
# # Combine the v metrics (only keeping prolific_id and v)
# df_claims_all = pd.concat([df_t_claims[['prolific_id', 'v']],
#                            df_m_claims[['prolific_id', 'v']],
#                            df_mt_claims[['prolific_id', 'v']]], ignore_index=True)
# 
# # ----- Step 3: Merge Demographics with MIST Metrics -----
# df_merged = pd.merge(df_claims_all, df_dem, on="prolific_id", how="inner")
# 
# # ----- Step 4: Define functions for correlation ratio and one-way ANOVA -----
# def correlation_ratio(categories, measurements):
#     """Compute correlation ratio (η) between a categorical and a numeric variable."""
#     categories = np.array(categories)
#     measurements = np.array(measurements)
#     # Factorize the categories
#     fcat, _ = pd.factorize(categories)
#     cat_num = np.unique(fcat)
#     overall_mean = np.mean(measurements)
#     numerator = sum([len(measurements[fcat == cat]) * (np.mean(measurements[fcat == cat]) - overall_mean)**2 
#                      for cat in cat_num])
#     denominator = sum((measurements - overall_mean)**2)
#     return np.sqrt(numerator/denominator) if denominator != 0 else 0.0
# 
# def anova_pvalue(categories, measurements):
#     """Perform one-way ANOVA and return F statistic and p-value."""
#     groups = [measurements[categories == group] for group in np.unique(categories)]
#     f_stat, p_val = f_oneway(*groups)
#     return f_stat, p_val
# 
# # ----- Step 5: Compare each Demographic Variable with v and print significance -----
# for dem in selected_dem:
#     cat_values = df_merged[dem].values
#     num_values = df_merged['v'].values
#     eta = correlation_ratio(cat_values, num_values)
#     f_stat, p_val = anova_pvalue(cat_values, num_values)
#     significance = "Statistically Significant" if p_val < 0.05 else "Not Statistically Significant"
#     print(f"For demographic '{dem}' vs. v:")
#     print(f"  Correlation Ratio (η): {eta:.3f}")
#     print(f"  ANOVA F-statistic: {f_stat:.3f}, p-value: {p_val:.3e} --> {significance}")
#     print("-"*50)

For demographic 'age_group' vs. v:
  Correlation Ratio (η): 0.208
  ANOVA F-statistic: 4.596, p-value: 3.662e-03 --> Statistically Significant
--------------------------------------------------
For demographic 'political_orientation' vs. v:
  Correlation Ratio (η): 0.417
  ANOVA F-statistic: 15.899, p-value: 7.943e-12 --> Statistically Significant
--------------------------------------------------
For demographic 'engagement_with_political_content' vs. v:
  Correlation Ratio (η): 0.173
  ANOVA F-statistic: 2.334, p-value: 5.568e-02 --> Not Statistically Significant
--------------------------------------------------


# Correlation between veracity discernment and age group by converting age group to numeric

In [28]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Function to compute veracity score from a claims file.
# Assumes the file has 8 claim columns (other than prolific_id), 
# where the first 4 should be "Fake News" and the last 4 "Real News".
def compute_veracity_score(df_claims):
    # Exclude the 'prolific_id' column.
    cols = [col for col in df_claims.columns if col != "prolific_id"]
    # Define expected responses.
    expected = ["Fake News"] * 4 + ["Real News"] * 4
    scores = []
    for idx, row in df_claims.iterrows():
        score = 0
        for col, exp in zip(cols, expected):
            if isinstance(row[col], str) and row[col].strip() == exp:
                score += 1
        scores.append(score)
    return pd.Series(scores, index=df_claims.index)

def process_veracity_age_correlation(claims_filename, demo_filename, label):
    # Load the claims file and compute veracity score.
    claims_path = get_working_dir() / 'data' / 'processed' / claims_filename
    df_claims = pd.read_csv(claims_path, encoding='utf-8', dtype={'prolific_id': str})
    df_claims["veracity_score"] = compute_veracity_score(df_claims)
    
    # Load the demographics file.
    demo_path = get_working_dir() / 'data' / 'processed' / demo_filename
    df_demo = pd.read_csv(demo_path, encoding='utf-8', dtype={'prolific_id': str})
    
    # Filter out rows where age_group is "I prefer not to answer".
    df_demo = df_demo[df_demo['age_group'] != "I prefer not to answer"]
    
    # Map age_group to an ordinal numeric scale.
    # Adjust the mapping if your labels differ.
    age_mapping = {
        "18-25 years old": 1,
        "26-35 years old": 2,
        "36-50 years old": 3,
        "Over 50 years old": 4
    }
    df_demo["age_numeric"] = df_demo["age_group"].str.strip().map(age_mapping)
    
    # Merge the two datasets on prolific_id.
    df_merged = pd.merge(
        df_claims[['prolific_id', 'veracity_score']],
        df_demo[['prolific_id', 'age_numeric']],
        on="prolific_id", how="inner"
    )
    
    # Drop rows with missing values.
    valid_df = df_merged.dropna(subset=["veracity_score", "age_numeric"])
    
    if len(valid_df) < 2:
        print(f"{label}: Insufficient data to compute correlation (n = {len(valid_df)}).")
    else:
        # Compute Spearman correlation (suitable since age_numeric is ordinal).
        corr, p_val = spearmanr(valid_df["veracity_score"], valid_df["age_numeric"])
        print(f"{label}:")
        print(f"  Spearman correlation between veracity discernment and age group: r = {corr:.3f}, p = {p_val:.3f}")
    print("-------\n")

# Example usage:
# For Text Explanation (adjust file names as needed)
process_veracity_age_correlation("t_claims.csv", "t_demographics.csv", "Text Explanation")
# For Meme Explanation
process_veracity_age_correlation("m_claims.csv", "m_demographics.csv", "Meme Explanation")
# For Meme+Context Explanation
process_veracity_age_correlation("m_t_claims.csv", "m_t_demographics.csv", "Meme+Context Explanation")

Text Explanation:
  Spearman correlation between veracity discernment and age group: r = 0.158, p = 0.100
-------

Meme Explanation:
  Spearman correlation between veracity discernment and age group: r = 0.397, p = 0.000
-------

Meme+Context Explanation:
  Spearman correlation between veracity discernment and age group: r = 0.118, p = 0.237
-------



# Correlation beteen veracity discernment and political orientation by converting the latter to numeric

In [29]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Function to compute veracity score from a claims file.
# Assumes that aside from the 'prolific_id' column, the file contains 8 columns
# in order where the first 4 should be answered "Fake News" and the last 4 "Real News".
def compute_veracity_score(df_claims):
    # Exclude the 'prolific_id' column.
    cols = [col for col in df_claims.columns if col != "prolific_id"]
    # Define the expected answers.
    expected = ["Fake News"] * 4 + ["Real News"] * 4
    scores = []
    for idx, row in df_claims.iterrows():
        score = 0
        for col, exp in zip(cols, expected):
            if isinstance(row[col], str) and row[col].strip() == exp:
                score += 1
        scores.append(score)
    return pd.Series(scores, index=df_claims.index)

def process_veracity_pol_correlation(claims_filename, demo_filename, label):
    # Load the claims file and compute veracity score.
    claims_path = get_working_dir() / 'data' / 'processed' / claims_filename
    df_claims = pd.read_csv(claims_path, encoding='utf-8', dtype={'prolific_id': str})
    df_claims["veracity_score"] = compute_veracity_score(df_claims)
    
    # Load the demographics file.
    demo_path = get_working_dir() / 'data' / 'processed' / demo_filename
    df_demo = pd.read_csv(demo_path, encoding='utf-8', dtype={'prolific_id': str})
    
    # Filter out rows where political_orientation is "I prefer not to answer".
    df_demo = df_demo[df_demo['political_orientation'] != "I prefer not to answer"]
    
    # Map political_orientation into an ordinal numeric scale.
    pol_mapping = {
        "Very Liberal": 1,
        "Moderately Liberal": 2,
        "Moderate": 3,
        "Moderately Conservative": 4,
        "Very Conservative": 5
    }
    df_demo["pol_numeric"] = df_demo["political_orientation"].str.strip().map(pol_mapping)
    
    # Merge the claims and demographics data on prolific_id.
    df_merged = pd.merge(
        df_claims[['prolific_id', 'veracity_score']],
        df_demo[['prolific_id', 'pol_numeric']],
        on="prolific_id", how="inner"
    )
    
    # Drop rows with missing values.
    valid_df = df_merged.dropna(subset=["veracity_score", "pol_numeric"])
    
    if len(valid_df) < 2:
        print(f"{label}: Insufficient data to compute correlation (n = {len(valid_df)}).")
    else:
        # Compute Spearman correlation.
        corr, p_val = spearmanr(valid_df["veracity_score"], valid_df["pol_numeric"])
        print(f"{label}:")
        print(f"  Spearman correlation between veracity discernment and political orientation: r = {corr:.3f}, p = {p_val:.3f}")
    print("-------\n")

# Process for each explanation type.
# Adjust the file names if necessary.
process_veracity_pol_correlation("t_claims.csv", "t_demographics.csv", "Text Explanation")
process_veracity_pol_correlation("m_claims.csv", "m_demographics.csv", "Meme Explanation")
process_veracity_pol_correlation("m_t_claims.csv", "m_t_demographics.csv", "Meme+Context Explanation")

Text Explanation:
  Spearman correlation between veracity discernment and political orientation: r = -0.405, p = 0.000
-------

Meme Explanation:
  Spearman correlation between veracity discernment and political orientation: r = -0.277, p = 0.005
-------

Meme+Context Explanation:
  Spearman correlation between veracity discernment and political orientation: r = -0.451, p = 0.000
-------



# Corr between veracity discernment and political orientation numeric across all explanation types

In [32]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

def compute_veracity_score(df_claims):
    # Exclude the 'prolific_id' column.
    cols = [col for col in df_claims.columns if col != "prolific_id"]
    # For the first 4 claims, the correct answer is "Fake News"; for the last 4, "Real News".
    expected = ["Fake News"] * 4 + ["Real News"] * 4
    scores = []
    for idx, row in df_claims.iterrows():
        score = 0
        for col, exp in zip(cols, expected):
            if isinstance(row[col], str) and row[col].strip() == exp:
                score += 1
        scores.append(score)
    return pd.Series(scores, index=df_claims.index)

# Define a list of file pairs (claims file, demographics file) for each explanation type.
file_pairs = [
    ("t_claims.csv", "t_demographics.csv"),
    ("m_claims.csv", "m_demographics.csv"),
    ("m_t_claims.csv", "m_t_demographics.csv")
]

# List to hold merged data for each explanation type.
list_dfs = []

for claims_filename, demo_filename in file_pairs:
    # Load claims file and compute veracity score.
    claims_path = get_working_dir() / 'data' / 'processed' / claims_filename
    df_claims = pd.read_csv(claims_path, encoding='utf-8', dtype={'prolific_id': str})
    df_claims["veracity_score"] = compute_veracity_score(df_claims)
    
    # Load corresponding demographics file.
    demo_path = get_working_dir() / 'data' / 'processed' / demo_filename
    df_demo = pd.read_csv(demo_path, encoding='utf-8', dtype={'prolific_id': str})
    
    # Filter out rows where political_orientation is "I prefer not to answer".
    df_demo = df_demo[df_demo["political_orientation"] != "I prefer not to answer"]
    
    # Merge on prolific_id.
    df_merged = pd.merge(
        df_claims[['prolific_id', 'veracity_score']],
        df_demo[['prolific_id', 'political_orientation']],
        on="prolific_id", how="inner"
    )
    list_dfs.append(df_merged)

# Concatenate all merged datasets.
df_all = pd.concat(list_dfs, ignore_index=True)

# Map political_orientation to an ordinal numeric scale.
pol_mapping = {
    "Very Liberal": 1,
    "Moderately Liberal": 2,
    "Moderate": 3,
    "Moderately Conservative": 4,
    "Very Conservative": 5
}
df_all["pol_numeric"] = df_all["political_orientation"].str.strip().map(pol_mapping)

# Drop rows with missing values in veracity_score or pol_numeric.
df_all = df_all.dropna(subset=["veracity_score", "pol_numeric"])

if len(df_all) < 2:
    print("Insufficient data to compute correlation (n =", len(df_all), ").")
else:
    corr, p_val = pearsonr(df_all["veracity_score"], df_all["pol_numeric"])
    print("Pearson correlation between veracity discernment and political orientation (across all explanation types):")
    print(f"  r = {corr:.3f}, p = {p_val:.20f}")

Pearson correlation between veracity discernment and political orientation (across all explanation types):
  r = -0.402, p = 0.00000000000021298597


# Correlation between veracity discernment and age group numeric colapsed to all explanation types

In [35]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

def compute_veracity_score(df_claims):
    # Exclude 'prolific_id'
    cols = [col for col in df_claims.columns if col != "prolific_id"]
    # Expected answers: first 4 claims should be "Fake News" and last 4 "Real News".
    expected = ["Fake News"] * 4 + ["Real News"] * 4
    scores = []
    for idx, row in df_claims.iterrows():
        score = 0
        for col, exp in zip(cols, expected):
            if isinstance(row[col], str) and row[col].strip() == exp:
                score += 1
        scores.append(score)
    return pd.Series(scores, index=df_claims.index)

def process_file_pair(claims_filename, demo_filename):
    # Load claims file and compute veracity score.
    claims_path = get_working_dir() / 'data' / 'processed' / claims_filename
    df_claims = pd.read_csv(claims_path, encoding='utf-8', dtype={'prolific_id': str})
    df_claims["veracity_score"] = compute_veracity_score(df_claims)
    
    # Load demographics file.
    demo_path = get_working_dir() / 'data' / 'processed' / demo_filename
    df_demo = pd.read_csv(demo_path, encoding='utf-8', dtype={'prolific_id': str})
    # Filter out rows where age_group is "I prefer not to answer"
    df_demo = df_demo[df_demo['age_group'] != "I prefer not to answer"]
    # Map age_group to an ordinal numeric scale.
    age_mapping = {
        "18-25 years old": 1,
        "26-35 years old": 2,
        "36-50 years old": 3,
        "Over 50 years old": 4
    }
    df_demo["age_numeric"] = df_demo["age_group"].str.strip().map(age_mapping)
    
    # Merge on prolific_id.
    df_merged = pd.merge(
        df_claims[['prolific_id', 'veracity_score']],
        df_demo[['prolific_id', 'age_numeric']],
        on="prolific_id", how="inner"
    )
    return df_merged

# Define file pairs for each explanation type.
file_pairs = [
    ("t_claims.csv", "t_demographics.csv"),
    ("m_claims.csv", "m_demographics.csv"),
    ("m_t_claims.csv", "m_t_demographics.csv")
]

# Process each pair and collect the merged data.
list_dfs = []
for claims_file, demo_file in file_pairs:
    df_merged = process_file_pair(claims_file, demo_file)
    list_dfs.append(df_merged)

# Concatenate all merged data into one dataframe.
df_all = pd.concat(list_dfs, ignore_index=True)
df_all = df_all.dropna(subset=["veracity_score", "age_numeric"])

if len(df_all) < 2:
    print("Insufficient data to compute correlation (n = {})".format(len(df_all)))
else:
    corr, p_val = spearmanr(df_all["veracity_score"], df_all["age_numeric"])
    print("Spearman correlation between veracity discernment and age group (across all explanation types):")
    print(f"  r = {corr:.3f}, p = {p_val:.12f}")

Spearman correlation between veracity discernment and age group (across all explanation types):
  r = 0.202, p = 0.000336090161


# Correlation between veracity discernment and engagement with political content numeric colapse for all explanation types

In [36]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Function to compute veracity score from a claims file.
def compute_veracity_score(df_claims):
    # Exclude the 'prolific_id' column.
    cols = [col for col in df_claims.columns if col != "prolific_id"]
    # Expected answers: first 4 should be "Fake News" and last 4 "Real News".
    expected = ["Fake News"] * 4 + ["Real News"] * 4
    scores = []
    for idx, row in df_claims.iterrows():
        score = 0
        for col, exp in zip(cols, expected):
            if isinstance(row[col], str) and row[col].strip() == exp:
                score += 1
        scores.append(score)
    return pd.Series(scores, index=df_claims.index)

# Process one file pair (claims and demographics) and return the merged DataFrame.
def process_file_pair_engagement(claims_filename, demo_filename):
    # Load the claims file and compute veracity score.
    claims_path = get_working_dir() / 'data' / 'processed' / claims_filename
    df_claims = pd.read_csv(claims_path, encoding='utf-8', dtype={'prolific_id': str})
    df_claims["veracity_score"] = compute_veracity_score(df_claims)
    
    # Load the demographics file.
    demo_path = get_working_dir() / 'data' / 'processed' / demo_filename
    df_demo = pd.read_csv(demo_path, encoding='utf-8', dtype={'prolific_id': str})
    
    # Filter out rows where engagement_with_political_content is "I prefer not to answer".
    df_demo = df_demo[df_demo["engagement_with_political_content"] != "I prefer not to answer"]
    
    # Map engagement_with_political_content to an ordinal numeric scale.
    engagement_mapping = {
        "Never": 1,
        "Rarely": 2,
        "Sometimes": 3,
        "Often": 4,
        "Very Frequently": 5
    }
    df_demo["engagement_numeric"] = df_demo["engagement_with_political_content"].str.strip().map(engagement_mapping)
    
    # Merge on prolific_id.
    df_merged = pd.merge(
        df_claims[['prolific_id', 'veracity_score']],
        df_demo[['prolific_id', 'engagement_numeric']],
        on="prolific_id", how="inner"
    )
    return df_merged

# Define file pairs for each explanation type.
file_pairs = [
    ("t_claims.csv", "t_demographics.csv"),
    ("m_claims.csv", "m_demographics.csv"),
    ("m_t_claims.csv", "m_t_demographics.csv")
]

list_dfs = []
for claims_file, demo_file in file_pairs:
    df_merged = process_file_pair_engagement(claims_file, demo_file)
    list_dfs.append(df_merged)

# Concatenate all merged data into one DataFrame.
df_all = pd.concat(list_dfs, ignore_index=True)
df_all = df_all.dropna(subset=["veracity_score", "engagement_numeric"])

if len(df_all) < 2:
    print("Insufficient data to compute correlation (n =", len(df_all), ").")
else:
    # Compute Spearman correlation between veracity score and political engagement.
    corr, p_val = spearmanr(df_all["veracity_score"], df_all["engagement_numeric"])
    print("Spearman correlation between veracity discernment and political engagement (across all explanation types):")
    print(f"  r = {corr:.3f}, p = {p_val:.3f}")

Spearman correlation between veracity discernment and political engagement (across all explanation types):
  r = -0.020, p = 0.721


# Correlation between veracity discernment and education level numeric for all explanation types

In [37]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Function to compute veracity score from a claims file.
# Assumes that aside from 'prolific_id', the file contains 8 claim columns,
# where the first 4 should be "Fake News" and the last 4 "Real News".
def compute_veracity_score(df_claims):
    # Exclude 'prolific_id'
    cols = [col for col in df_claims.columns if col != "prolific_id"]
    # Define expected responses.
    expected = ["Fake News"] * 4 + ["Real News"] * 4
    scores = []
    for idx, row in df_claims.iterrows():
        score = 0
        for col, exp in zip(cols, expected):
            if isinstance(row[col], str) and row[col].strip() == exp:
                score += 1
        scores.append(score)
    return pd.Series(scores, index=df_claims.index)

# Process one file pair (claims file and demographics file) and return merged DataFrame.
def process_file_pair_education(claims_filename, demo_filename):
    # Load the claims file and compute veracity score.
    claims_path = get_working_dir() / 'data' / 'processed' / claims_filename
    df_claims = pd.read_csv(claims_path, encoding='utf-8', dtype={'prolific_id': str})
    df_claims["veracity_score"] = compute_veracity_score(df_claims)
    
    # Load the demographics file.
    demo_path = get_working_dir() / 'data' / 'processed' / demo_filename
    df_demo = pd.read_csv(demo_path, encoding='utf-8', dtype={'prolific_id': str})
    
    # Filter out rows where education_level is "I prefer not to answer".
    df_demo = df_demo[df_demo["education_level"] != "I prefer not to answer"]
    
    # Map education_level to an ordinal numeric scale.
    education_mapping = {
        "Elementary education": 1,
        "High school diploma or equivalent": 2,
        "Bachelor's Degree": 3,
        "Master's Degree": 4,
        "Doctoral degree (PhD)": 5
    }
    df_demo["education_numeric"] = df_demo["education_level"].str.strip().map(education_mapping)
    
    # Merge the claims and demographics data on prolific_id.
    df_merged = pd.merge(
        df_claims[['prolific_id', 'veracity_score']],
        df_demo[['prolific_id', 'education_numeric']],
        on="prolific_id", how="inner"
    )
    return df_merged

# Define file pairs for each explanation type.
file_pairs = [
    ("t_claims.csv", "t_demographics.csv"),
    ("m_claims.csv", "m_demographics.csv"),
    ("m_t_claims.csv", "m_t_demographics.csv")
]

# Process each pair and combine them.
list_dfs = []
for claims_file, demo_file in file_pairs:
    df_merged = process_file_pair_education(claims_file, demo_file)
    list_dfs.append(df_merged)

# Concatenate all merged data into one DataFrame.
df_all = pd.concat(list_dfs, ignore_index=True)
df_all = df_all.dropna(subset=["veracity_score", "education_numeric"])

if len(df_all) < 2:
    print("Insufficient data to compute correlation (n =", len(df_all), ").")
else:
    corr, p_val = spearmanr(df_all["veracity_score"], df_all["education_numeric"])
    print("Spearman correlation between veracity discernment and education level (across all explanation types):")
    print(f"  r = {corr:.3f}, p = {p_val:.3f}")

Spearman correlation between veracity discernment and education level (across all explanation types):
  r = 0.119, p = 0.165


# Correlation between veracity discernment and meme familiarity numeric

In [38]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Function to compute veracity score from a claims file.
# Assumes that, aside from 'prolific_id', the file contains 8 claim columns,
# where the first 4 should be answered as "Fake News" and the last 4 as "Real News".
def compute_veracity_score(df_claims):
    cols = [col for col in df_claims.columns if col != "prolific_id"]
    expected = ["Fake News"] * 4 + ["Real News"] * 4
    scores = []
    for idx, row in df_claims.iterrows():
        score = 0
        for col, exp in zip(cols, expected):
            if isinstance(row[col], str) and row[col].strip() == exp:
                score += 1
        scores.append(score)
    return pd.Series(scores, index=df_claims.index)

# Process one file pair: load claims file and demographics file, compute veracity score,
# filter out rows with "I prefer not to answer" for meme culture familiarity,
# and map meme culture familiarity to a numeric ordinal scale.
def process_file_pair_meme_culture(claims_filename, demo_filename):
    # Load claims file.
    claims_path = get_working_dir() / 'data' / 'processed' / claims_filename
    df_claims = pd.read_csv(claims_path, encoding='utf-8', dtype={'prolific_id': str})
    df_claims["veracity_score"] = compute_veracity_score(df_claims)
    
    # Load demographics file.
    demo_path = get_working_dir() / 'data' / 'processed' / demo_filename
    df_demo = pd.read_csv(demo_path, encoding='utf-8', dtype={'prolific_id': str})
    
    # Filter out rows where meme_culture_familiarity is "I prefer not to answer".
    df_demo = df_demo[df_demo["meme_culture_familiarity"] != "I prefer not to answer"]
    
    # Map meme_culture_familiarity to an ordinal numeric scale.
    meme_culture_mapping = {
        "Not familiar at all (I rarely understand meme references)": 1,
        "Slightly familiar (I understand basic, widely-known memes)": 2,
        "Moderately familiar (I understand most popular memes and their variations)": 3,
        "Very familiar (I understand complex meme references and their evolution)": 4,
        "Extremely familiar (I actively follow meme trends and their cultural context)": 5
    }
    df_demo["meme_culture_numeric"] = df_demo["meme_culture_familiarity"].str.strip().map(meme_culture_mapping)
    
    # Merge the two datasets on prolific_id.
    df_merged = pd.merge(
        df_claims[['prolific_id', 'veracity_score']],
        df_demo[['prolific_id', 'meme_culture_numeric']],
        on="prolific_id", how="inner"
    )
    return df_merged

# Define file pairs for the meme-based explanation types.
file_pairs = [
    ("m_claims.csv", "m_demographics.csv"),
    ("m_t_claims.csv", "m_t_demographics.csv")
]

# Process each file pair and collect merged data.
merged_dfs = []
for claims_file, demo_file in file_pairs:
    df_merged = process_file_pair_meme_culture(claims_file, demo_file)
    merged_dfs.append(df_merged)

# Concatenate merged data from both explanation types.
df_all = pd.concat(merged_dfs, ignore_index=True)
df_all = df_all.dropna(subset=["veracity_score", "meme_culture_numeric"])

if len(df_all) < 2:
    print("Insufficient data to compute correlation (n =", len(df_all), ").")
else:
    corr, p_val = spearmanr(df_all["veracity_score"], df_all["meme_culture_numeric"])
    print("Spearman correlation between veracity discernment and meme culture familiarity (collapsed across meme-based explanation types):")
    print(f"  r = {corr:.3f}, p = {p_val:.3f}")


Spearman correlation between veracity discernment and meme culture familiarity (collapsed across meme-based explanation types):
  r = -0.076, p = 0.280


# Checking how v changes with age_group values and political_orientation values

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

def get_working_dir():
    return Path.cwd()

results_dir = get_working_dir() / 'plots' / 'mist'
results_dir.mkdir(parents=True, exist_ok=True)

# ----- Step 1: Load Demographics Data -----
df_t_dem  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_demographics.csv', encoding='utf-8')
df_m_dem  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', encoding='utf-8')
df_mt_dem = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_demographics.csv', encoding='utf-8')

# Concatenate demographics and clean whitespace
df_dem = pd.concat([df_t_dem, df_m_dem, df_mt_dem], ignore_index=True)
for col in ['age_group', 'political_orientation', 'engagement_with_political_content']:
    df_dem[col] = df_dem[col].astype(str).str.strip()

# Filter out rows with "I prefer not to answer"
selected_dem = ['age_group', 'political_orientation', 'engagement_with_political_content']
for var in selected_dem:
    df_dem = df_dem[df_dem[var] != "I prefer not to answer"]

# ----- Step 2: Load MIST Claims Data and Compute v -----
def compute_v_metric(df):
    # Assumes columns starting with "claim_" are ordered: first 4 fake, next 4 real.
    claim_cols = [col for col in df.columns if col.startswith("claim_")]
    fake_cols = claim_cols[:4]
    real_cols = claim_cols[4:8]
    f_score = df[fake_cols].apply(lambda row: sum(1 for x in row 
                                                  if isinstance(x, str) and x.strip().lower() == "fake news"),
                                   axis=1)
    r_score = df[real_cols].apply(lambda row: sum(1 for x in row 
                                                  if isinstance(x, str) and x.strip().lower() == "real news"),
                                   axis=1)
    return f_score + r_score

df_t_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
df_m_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
df_mt_claims = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')

df_t_claims['v']  = compute_v_metric(df_t_claims)
df_m_claims['v']  = compute_v_metric(df_m_claims)
df_mt_claims['v'] = compute_v_metric(df_mt_claims)

# Retain only prolific_id and v, and then combine.
df_claims = pd.concat([df_t_claims[['prolific_id', 'v']],
                       df_m_claims[['prolific_id', 'v']],
                       df_mt_claims[['prolific_id', 'v']]],
                      ignore_index=True)

# ----- Step 3: Merge Demographics with Claims Data -----
df_merged = pd.merge(df_claims, df_dem, on="prolific_id", how="inner")

# Clean up political_orientation:
# Replace "Very Conservative" with "Conservative" to match the desired order.
df_merged['political_orientation'] = df_merged['political_orientation']

# Print unique values for diagnostic purposes.
print("Unique political_orientation values after merge:")
print(df_merged["political_orientation"].unique())

# ----- Step 4: Plot v by Age Group -----
age_stats = df_merged.groupby('age_group')['v'].agg(['mean', 'std', 'count'])
age_stats['sem'] = age_stats['std'] / np.sqrt(age_stats['count'])
print("\nDescriptive Statistics for v by Age Group:")
print(age_stats)

fig, ax = plt.subplots(figsize=(8,6))
ax.bar(age_stats.index, age_stats['mean'], yerr=age_stats['sem'], capsize=5)
ax.set_xlabel("Age Group")
ax.set_ylabel("Mean of Veracity Discernment")
ax.set_title("Mean of Veracity Discernment by Age Group")
plt.tight_layout()
fig.savefig(results_dir / 'v_by_age_group.png')
plt.close(fig)

# ----- Step 5: Plot v by Political Orientation -----
# Define the desired order.
ordered_pol = ["Very Liberal", "Moderately Liberal", "Moderate", "Moderately Conservative", "Very Conservative"]

# Convert political_orientation to a categorical variable with specified order.
df_merged['political_orientation'] = pd.Categorical(df_merged['political_orientation'],
                                                    categories=ordered_pol,
                                                    ordered=True)

pol_stats = df_merged.groupby('political_orientation', observed=True)['v'].agg(['mean', 'std', 'count'])
pol_stats['sem'] = pol_stats['std'] / np.sqrt(pol_stats['count'])
# Reindex to desired order.
pol_stats = pol_stats.loc[ordered_pol]
print("\nDescriptive Statistics for v by Political Orientation:")
print(pol_stats)

# Abbreviation mapping for display.
abbr_dict = {"Very Liberal": "VL", 
             "Moderately Liberal": "ML", 
             "Moderate": "Mod", 
             "Moderately Conservative": "MC", 
             "Very Conservative": "VC"}

fig, ax = plt.subplots(figsize=(8,6))
x_pos = np.arange(len(pol_stats.index))
ax.bar(x_pos, pol_stats['mean'], yerr=pol_stats['sem'], capsize=5)
ax.set_xticks(x_pos)
ax.set_xticklabels([abbr_dict[label] for label in pol_stats.index])
ax.set_xlabel("Political Orientation")
ax.set_ylabel("Mean of Veracity Discernment")
ax.set_title("Mean of Veracity Discernment by Political Orientation")
plt.tight_layout()
fig.savefig(results_dir / 'v_by_political_orientation.png')
plt.close(fig)

Unique political_orientation values after merge:
['Moderate' 'Moderately Conservative' 'Moderately Liberal'
 'Very Conservative' 'Very Liberal']

Descriptive Statistics for v by Age Group:
                       mean       std  count       sem
age_group                                             
18-25 years old    5.815789  1.798727     38  0.291792
26-35 years old    5.678571  1.757330     84  0.191740
36-50 years old    6.059406  1.826591    101  0.181753
Over 50 years old  6.611765  1.456464     85  0.157976

Descriptive Statistics for v by Political Orientation:
                             mean       std  count       sem
political_orientation                                       
Very Liberal             6.962963  0.889413     54  0.121034
Moderately Liberal       6.481481  1.475730     81  0.163970
Moderate                 6.311475  1.698440     61  0.217463
Moderately Conservative  5.482353  1.931023     85  0.209449
Very Conservative        4.444444  1.577079     27  0.30350

# Veracity discernment per age and political orientation

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def get_working_dir():
    return Path.cwd()

results_dir = get_working_dir() / 'plots' / 'mist'
results_dir.mkdir(parents=True, exist_ok=True)

# ----- Build df_merged -----
# Load demographics data
df_t_dem  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_demographics.csv', encoding='utf-8')
df_m_dem  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', encoding='utf-8')
df_mt_dem = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_demographics.csv', encoding='utf-8')
df_dem = pd.concat([df_t_dem, df_m_dem, df_mt_dem], ignore_index=True)
for col in ['age_group', 'political_orientation', 'engagement_with_political_content']:
    df_dem[col] = df_dem[col].astype(str).str.strip()
# Filter out "I prefer not to answer" rows for selected demographics.
for var in ['age_group', 'political_orientation', 'engagement_with_political_content']:
    df_dem = df_dem[df_dem[var] != "I prefer not to answer"]

# Load MIST claims data and compute overall veracity score (v)
def compute_v_metric(df):
    # Assumes columns starting with "claim_" are arranged so that the first 4 are fake items
    # and the next 4 are real items.
    claim_cols = [col for col in df.columns if col.startswith("claim_")]
    fake_cols = claim_cols[:4]
    real_cols = claim_cols[4:8]
    f_score = df[fake_cols].apply(lambda row: sum(1 for x in row 
                                                  if isinstance(x, str) and x.strip().lower() == "fake news"), axis=1)
    r_score = df[real_cols].apply(lambda row: sum(1 for x in row 
                                                  if isinstance(x, str) and x.strip().lower() == "real news"), axis=1)
    return f_score + r_score

df_t_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
df_m_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
df_mt_claims = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')

df_t_claims['v']  = compute_v_metric(df_t_claims)
df_m_claims['v']  = compute_v_metric(df_m_claims)
df_mt_claims['v'] = compute_v_metric(df_mt_claims)

df_claims = pd.concat([df_t_claims[['prolific_id', 'v']],
                       df_m_claims[['prolific_id', 'v']],
                       df_mt_claims[['prolific_id', 'v']]],
                      ignore_index=True)

# Merge the claims data with demographics on prolific_id
df_merged = pd.merge(df_claims, df_dem, on="prolific_id", how="inner")

# Clean up political_orientation.
# In your data, you have "Very Conservative"; for our desired order, we want "Conservative".
df_merged['political_orientation'] = df_merged['political_orientation']

# ----- Define desired order for age_group and political_orientation -----
age_order = ["18-25 years old", "26-35 years old", "36-50 years old", "Over 50 years old"]
pol_order = ["Very Liberal", "Moderately Liberal", "Moderate", "Moderately Conservative", "Very Conservative"]

df_merged['age_group'] = pd.Categorical(df_merged['age_group'], categories=age_order, ordered=True)
df_merged['political_orientation'] = pd.Categorical(df_merged['political_orientation'], 
                                                    categories=pol_order, ordered=True)

# ----- Step 1: Compute subgroup summary statistics for v -----
subgroup_stats = df_merged.groupby(['age_group', 'political_orientation'])['v'].agg(['mean', 'std', 'count']).reset_index()
subgroup_stats['sem'] = subgroup_stats['std'] / np.sqrt(subgroup_stats['count'])
print("Veracity discernment (v) by Age Group and Political Orientation:")
print(subgroup_stats)

# ----- Step 2: Plot using seaborn for better visualization -----
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
# Create a bar plot with age_group on the x-axis and political_orientation as the hue.
ax = sns.barplot(data=df_merged, x="age_group", y="v", hue="political_orientation", 
                 order=age_order, hue_order=pol_order, ci="sd")
ax.set_xlabel("Age Group")
ax.set_ylabel("Mean Veracity Discernment (v)")
ax.set_title("Mean Veracity Discernment by Age Group and Political Orientation")
# Optionally, abbreviate the political_orientation in the legend.
abbr_dict = {"Very Liberal": "VL", "Moderately Liberal": "ML", "Moderate": "Mod", 
             "Moderately Conservative": "MC", "Very Conservative": "C"}
new_labels = [abbr_dict.get(label.get_text(), label.get_text()) for label in ax.get_legend().get_texts()]
for text, new in zip(ax.get_legend().get_texts(), new_labels):
    text.set_text(new)
ax.legend(title="Political Orientation", loc="upper right")
plt.tight_layout()
plt.savefig(results_dir / "v_by_age_and_pol.png")
plt.close()


Veracity discernment (v) by Age Group and Political Orientation:
            age_group    political_orientation      mean       std  count  \
0     18-25 years old             Very Liberal  7.100000  0.737865     10   
1     18-25 years old       Moderately Liberal  6.428571  1.272418      7   
2     18-25 years old                 Moderate  6.285714  1.704336      7   
3     18-25 years old  Moderately Conservative  4.454545  1.863525     11   
4     18-25 years old        Very Conservative  4.000000  1.000000      3   
5     26-35 years old             Very Liberal  6.785714  0.801784     14   
6     26-35 years old       Moderately Liberal  6.217391  1.650249     23   
7     26-35 years old                 Moderate  5.500000  1.689065     18   
8     26-35 years old  Moderately Conservative  5.400000  1.729009     20   
9     26-35 years old        Very Conservative  3.555556  1.424001      9   
10    36-50 years old             Very Liberal  7.066667  1.032796     15   
11    36-50

  subgroup_stats = df_merged.groupby(['age_group', 'political_orientation'])['v'].agg(['mean', 'std', 'count']).reset_index()

The `ci` parameter is deprecated. Use `errorbar='sd'` for the same effect.

  ax = sns.barplot(data=df_merged, x="age_group", y="v", hue="political_orientation",


# Correlation between affect change and veracity discernment

In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import pearsonr

def get_working_dir():
    return Path.cwd()

results_dir = get_working_dir() / 'plots' / 'mist'
results_dir.mkdir(parents=True, exist_ok=True)

# ============================
# Step 1: Load and Process Affect Data
# ============================
# Assume the affect CSV files include columns for both pre and post affect measures.
# For Positive Affect (PA): pre_active, pre_determined, pre_attentive, pre_inspired, pre_alert
# For Negative Affect (NA): pre_afraid, pre_nervous, pre_upset, pre_hostile, pre_ashamed
# Similarly for post_ variables.

# Mapping for ordinal responses
affect_mapping = {
    "Not at all": 1,
    "Slightly": 2,
    "Moderately": 3,
    "Very": 4,
    "Extremely": 5,
    "I prefer not to answer": np.nan
}

def map_affect(df):
    # Process both pre and post columns (selecting columns that start with "pre_" or "post_")
    affect_cols = [col for col in df.columns if col.startswith("pre_") or col.startswith("post_")]
    for col in affect_cols:
        df[col] = df[col].map(affect_mapping)
    return df

# Load the three affect files.
df_t_affect   = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_affect.csv', encoding='utf-8')
df_m_affect   = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_affect.csv', encoding='utf-8')
df_mt_affect  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_affect.csv', encoding='utf-8')

# Map ordinal responses to numeric values.
df_t_affect  = map_affect(df_t_affect)
df_m_affect  = map_affect(df_m_affect)
df_mt_affect = map_affect(df_mt_affect)

# Define the lists of items.
pa_items = ['active', 'determined', 'attentive', 'inspired', 'alert']
na_items = ['afraid', 'nervous', 'upset', 'hostile', 'ashamed']

def compute_affect_change(df):
    # For each scale, assume columns are named as pre_<item> and post_<item>
    pa_pre_cols = [f"pre_{item}" for item in pa_items]
    pa_post_cols = [f"post_{item}" for item in pa_items]
    na_pre_cols = [f"pre_{item}" for item in na_items]
    na_post_cols = [f"post_{item}" for item in na_items]
    
    # Compute composite scores (mean) for pre and post.
    df['PA_pre'] = df[pa_pre_cols].mean(axis=1)
    df['PA_post'] = df[pa_post_cols].mean(axis=1)
    df['NA_pre'] = df[na_pre_cols].mean(axis=1)
    df['NA_post'] = df[na_post_cols].mean(axis=1)
    
    # Compute change scores (post - pre)
    df['PA_change'] = df['PA_post'] - df['PA_pre']
    df['NA_change'] = df['NA_post'] - df['NA_pre']
    
    # Optionally, drop rows with NaN in the computed change scores.
    df = df.dropna(subset=['PA_change', 'NA_change'])
    # Retain only the unique identifier and the change scores.
    return df[['prolific_id', 'PA_change', 'NA_change']]

# Compute affect change for each file.
df_t_affect_change  = compute_affect_change(df_t_affect)
df_m_affect_change  = compute_affect_change(df_m_affect)
df_mt_affect_change = compute_affect_change(df_mt_affect)

# Combine affect change data.
df_affect_change = pd.concat([df_t_affect_change, df_m_affect_change, df_mt_affect_change], ignore_index=True)
# If a participant appears more than once, you might average their scores:
df_affect_change = df_affect_change.groupby('prolific_id', as_index=False).mean()

# ============================
# Step 2: Load and Process MIST Claims Data (Compute v)
# ============================
def compute_v_metric(df):
    # Assumes columns starting with "claim_" are ordered: first 4 fake items, next 4 real items.
    claim_cols = [col for col in df.columns if col.startswith("claim_")]
    fake_cols = claim_cols[:4]
    real_cols = claim_cols[4:8]
    f_score = df[fake_cols].apply(lambda row: sum(1 for x in row if isinstance(x, str) and x.strip().lower() == "fake news"), axis=1)
    r_score = df[real_cols].apply(lambda row: sum(1 for x in row if isinstance(x, str) and x.strip().lower() == "real news"), axis=1)
    return f_score + r_score

df_t_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
df_m_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
df_mt_claims = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')

df_t_claims['v']  = compute_v_metric(df_t_claims)
df_m_claims['v']  = compute_v_metric(df_m_claims)
df_mt_claims['v'] = compute_v_metric(df_mt_claims)

df_claims = pd.concat([df_t_claims[['prolific_id', 'v']],
                       df_m_claims[['prolific_id', 'v']],
                       df_mt_claims[['prolific_id', 'v']]],
                      ignore_index=True)
# In case participants appear in multiple files, average their v scores.
df_claims = df_claims.groupby('prolific_id', as_index=False).mean()

# ============================
# Step 3: Merge Affect Change with MIST v
# ============================
df_corr = pd.merge(df_affect_change, df_claims, on="prolific_id", how="inner")
print("Merged dataset for correlation:")
print(df_corr.head())


# ============================
# Step 4: Run Correlations
# ============================
# Pearson correlation between PA_change and v
r_pa, p_pa = pearsonr(df_corr['PA_change'], df_corr['v'])
# Pearson correlation between NA_change and v
r_na, p_na = pearsonr(df_corr['NA_change'], df_corr['v'])

print("\nCorrelation between Positive Affect Change and v:")
print(f"  Pearson r = {r_pa:.3f}, p-value = {p_pa:.3e}")

print("\nCorrelation between Negative Affect Change and v:")
print(f"  Pearson r = {r_na:.3f}, p-value = {p_na:.3e}")

Merged dataset for correlation:
                prolific_id  PA_change  NA_change    v
0  5484620efdf99b0379939c6a        0.0        0.0  7.0
1  56bfcce79f7a1e0005fdca9e       -0.8        0.0  7.0
2  56c984eb10a82f0006ffd111       -0.6        0.0  7.0
3  57321c8ec63b5c000f367bb2        0.4        0.6  6.0
4  5751f576a9de4b0006e557b9       -1.0       -0.4  7.0

Correlation between Positive Affect Change and v:
  Pearson r = -0.026, p-value = 6.453e-01

Correlation between Negative Affect Change and v:
  Pearson r = 0.073, p-value = 1.973e-01


# Comparison with MIST Norm Values

In [27]:
import pandas as pd
import numpy as np
from pathlib import Path

def get_working_dir():
    return Path.cwd()

results_dir = get_working_dir() / 'plots' / 'mist'
results_dir.mkdir(parents=True, exist_ok=True)

# ===============================
# Step 1: Load Norms Data from Excel (for reference)
# ===============================
excel_path = get_working_dir() / 'data' / 'MIST8 - US Norm Tables - v2020.10.16.xlsx'
norms_df = pd.read_excel(excel_path, sheet_name="Norms")
age_norms_df = pd.read_excel(excel_path, sheet_name="Age")

print("US Norms Table (Overall):")
print(norms_df.head())
print("\nUS Age Norms Table:")
print(age_norms_df.head())

# ===============================
# Step 2: Load and Process MIST Claims Data
# ===============================
def compute_f_r(df):
    # Assumes the columns starting with "claim_" are ordered:
    # first 4 = fake items, next 4 = real items.
    claim_cols = [col for col in df.columns if col.startswith("claim_")]
    fake_cols = claim_cols[:4]
    real_cols = claim_cols[4:8]
    f_score = df[fake_cols].apply(lambda row: sum(1 for x in row 
                                                  if isinstance(x, str) and x.strip().lower() == "fake news"),
                                   axis=1)
    r_score = df[real_cols].apply(lambda row: sum(1 for x in row 
                                                  if isinstance(x, str) and x.strip().lower() == "real news"),
                                   axis=1)
    return f_score, r_score

# Load the three claims files.
df_t_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
df_m_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
df_mt_claims = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')

# Compute f, r and v for each file.
f_t, r_t = compute_f_r(df_t_claims)
f_m, r_m = compute_f_r(df_m_claims)
f_mt, r_mt = compute_f_r(df_mt_claims)

df_t_claims['f']  = f_t
df_t_claims['r']  = r_t
df_t_claims['v']  = f_t + r_t

df_m_claims['f']  = f_m
df_m_claims['r']  = r_m
df_m_claims['v']  = f_m + r_m

df_mt_claims['f'] = f_mt
df_mt_claims['r'] = r_mt
df_mt_claims['v'] = f_mt + r_mt

# Combine all claims data (average in case of duplicates).
df_claims = pd.concat([
    df_t_claims[['prolific_id', 'f', 'r', 'v']],
    df_m_claims[['prolific_id', 'f', 'r', 'v']],
    df_mt_claims[['prolific_id', 'f', 'r', 'v']]
], ignore_index=True)
df_claims = df_claims.groupby('prolific_id', as_index=False).mean()

# ===============================
# Step 3: Compute Overall Sample Percentiles for v, f, and r
# ===============================
v_percentiles = np.percentile(df_claims['v'], [5,10,25,50,75,90,95])
f_percentiles = np.percentile(df_claims['f'], [5,10,25,50,75,90,95])
r_percentiles = np.percentile(df_claims['r'], [5,10,25,50,75,90,95])

overall_results = pd.DataFrame({
    "Measure": ["v", "f", "r"],
    "5th": [v_percentiles[0], f_percentiles[0], r_percentiles[0]],
    "10th": [v_percentiles[1], f_percentiles[1], r_percentiles[1]],
    "25th": [v_percentiles[2], f_percentiles[2], r_percentiles[2]],
    "50th": [v_percentiles[3], f_percentiles[3], r_percentiles[3]],
    "75th": [v_percentiles[4], f_percentiles[4], r_percentiles[4]],
    "90th": [v_percentiles[5], f_percentiles[5], r_percentiles[5]],
    "95th": [v_percentiles[6], f_percentiles[6], r_percentiles[6]]
})
print("Overall Sample Percentiles:")
print(overall_results)

overall_results.to_csv(results_dir / "overall_percentiles.csv", index=False)

# ===============================
# Step 4: Compute Age Group Percentiles for v
# ===============================
# Load demographics data and extract age_group.
df_t_dem  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_demographics.csv', encoding='utf-8')
df_m_dem  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', encoding='utf-8')
df_mt_dem = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_demographics.csv', encoding='utf-8')
df_dem = pd.concat([df_t_dem, df_m_dem, df_mt_dem], ignore_index=True)
df_dem['age_group'] = df_dem['age_group'].astype(str).str.strip()
# Filter out rows with "I prefer not to answer"
df_dem = df_dem[df_dem['age_group'] != "I prefer not to answer"]

# Merge with claims data on prolific_id.
df_merged = pd.merge(df_claims, df_dem[['prolific_id', 'age_group']], on="prolific_id", how="inner")

# Our sample age groups may differ from the norm categories. 
# For demonstration, we compute percentiles for each age group in our sample.
age_results = []
age_groups = df_merged['age_group'].unique()
for age in sorted(age_groups):
    group = df_merged[df_merged['age_group'] == age]
    v_pct = np.percentile(group['v'], [5,10,25,50,75,90,95])
    age_results.append({
         "age_group": age,
         "5th": v_pct[0],
         "10th": v_pct[1],
         "25th": v_pct[2],
         "50th": v_pct[3],
         "75th": v_pct[4],
         "90th": v_pct[5],
         "95th": v_pct[6]
    })
age_results_df = pd.DataFrame(age_results)
print("\nAge Group Percentiles for v:")
print(age_results_df)

age_results_df.to_csv(results_dir / "age_group_percentiles.csv", index=False)

# ===============================
# Step 5: Manual Comparison Instructions
# ===============================
print("\nPlease compare the CSV files 'overall_percentiles.csv' and 'age_group_percentiles.csv' saved in the directory:")
print(results_dir)
print("with the published norm values from the Norms and Age sheets of the provided Excel file.")


US Norms Table (Overall):
  General (N = 3,479) - Format A  Unnamed: 1  Unnamed: 2 Unnamed: 3  \
0                    Discernment         NaN         NaN  Fake News   
1                          Score  Percentile         NaN      Score   
2                              1           1         NaN          1   
3                              2           5         NaN          2   
4                              3          12         NaN          3   

   Unnamed: 4  Unnamed: 5 Unnamed: 6  Unnamed: 7  
0         NaN         NaN  Real News         NaN  
1  Percentile         NaN      Score  Percentile  
2          15         NaN          1          16  
3          33         NaN          2          38  
4          63         NaN          3          69  

US Age Norms Table:
  18-25: Emerging Adulthood (n = 390)  Unnamed: 1  Unnamed: 2 Unnamed: 3  \
0                         Discernment         NaN         NaN  Fake News   
1                               Score  Percentile         NaN      S

# Veracity dscernment political orientation and education level

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import datetime

def get_working_dir():
    return Path.cwd()

# Create plots directory for this analysis.
results_dir = get_working_dir() / 'plots' / 'mist'
results_dir.mkdir(parents=True, exist_ok=True)

# ----- Load and Merge Demographics Data -----
# Load demographics files.
df_t_dem  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_demographics.csv', encoding='utf-8')
df_m_dem  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', encoding='utf-8')
df_mt_dem = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_demographics.csv', encoding='utf-8')
df_dem = pd.concat([df_t_dem, df_m_dem, df_mt_dem], ignore_index=True)

# Ensure string type and strip whitespace for selected columns.
for col in ['age_group', 'political_orientation', 'engagement_with_political_content', 'education_level']:
    df_dem[col] = df_dem[col].astype(str).str.strip()

# Filter out "I prefer not to answer" responses for select variables.
for var in ['age_group', 'political_orientation', 'engagement_with_political_content']:
    df_dem = df_dem[df_dem[var] != "I prefer not to answer"]

# ----- Load Claims Data and Compute Veracity Metric (v) -----
def compute_v_metric(df):
    # Assumes columns starting with "claim_" are arranged so that the first 4 are fake items and the next 4 are real items.
    claim_cols = [col for col in df.columns if col.startswith("claim_")]
    fake_cols = claim_cols[:4]
    real_cols = claim_cols[4:8]
    f_score = df[fake_cols].apply(lambda row: sum(1 for x in row 
                                                  if isinstance(x, str) and x.strip().lower() == "fake news"), axis=1)
    r_score = df[real_cols].apply(lambda row: sum(1 for x in row 
                                                  if isinstance(x, str) and x.strip().lower() == "real news"), axis=1)
    return f_score + r_score

df_t_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8')
df_m_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8')
df_mt_claims = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8')

df_t_claims['v']  = compute_v_metric(df_t_claims)
df_m_claims['v']  = compute_v_metric(df_m_claims)
df_mt_claims['v'] = compute_v_metric(df_mt_claims)

df_claims = pd.concat([
    df_t_claims[['prolific_id', 'v']],
    df_m_claims[['prolific_id', 'v']],
    df_mt_claims[['prolific_id', 'v']]
], ignore_index=True)

# Merge the claims data with demographics on prolific_id.
df_merged = pd.merge(df_claims, df_dem, on="prolific_id", how="inner")

# ----- Prepare Categorical Variables -----
# Define the desired orders.
pol_order = ["Very Liberal", "Moderately Liberal", "Moderate", "Moderately Conservative", "Very Conservative"]
edu_order = ["Elementary Education", "High School", "Bachelor's", "Master's", "Phd"]

df_merged['political_orientation'] = pd.Categorical(df_merged['political_orientation'], categories=pol_order, ordered=True)
df_merged['education_level'] = pd.Categorical(df_merged['education_level'], categories=edu_order, ordered=True)

# ----- Step 1: Compute Subgroup Summary Statistics for v -----
subgroup_stats = df_merged.groupby(['political_orientation', 'education_level'])['v'] \
                          .agg(['mean', 'std', 'count']).reset_index()
subgroup_stats['sem'] = subgroup_stats['std'] / np.sqrt(subgroup_stats['count'])
print("Veracity discernment (v) by Political Orientation and Education Level:")
print(subgroup_stats)

# ----- Step 2: Visualize the Data with a Bar Plot -----
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))

# Create a bar plot with political_orientation on x and education_level as the hue.
ax = sns.barplot(data=df_merged, x="political_orientation", y="v", hue="education_level", 
                 order=pol_order, hue_order=edu_order, ci="sd")
ax.set_xlabel("Political Orientation")
ax.set_ylabel("Mean Veracity Discernment (v)")
ax.set_title("Mean Veracity Discernment by Political Orientation and Education Level")

# Optionally, abbreviate education level labels for the legend.
edu_abbr = {"Elementary Education": "Elem", "High School": "HS", "Bachelor's": "BA", "Master's": "MA", "Phd": "PhD"}
new_labels = [edu_abbr.get(label.get_text(), label.get_text()) for label in ax.get_legend().get_texts()]
for text, new in zip(ax.get_legend().get_texts(), new_labels):
    text.set_text(new)
ax.legend(title="Education Level", loc="upper right")

plt.tight_layout()
plt.savefig(results_dir / "v_by_pol_and_edu.png")
plt.close()

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# ---------------------------
# 1. Load and Aggregate Claims Data
# ---------------------------
# Load each file from the 'data/processed' directory.
df_t_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 't_claims.csv', encoding='utf-8', dtype={'prolific_id': str})
df_m_claims  = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_claims.csv', encoding='utf-8', dtype={'prolific_id': str})
df_mt_claims = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_t_claims.csv', encoding='utf-8', dtype={'prolific_id': str})

# Concatenate all data frames.
df_claims = pd.concat([df_t_claims, df_m_claims, df_mt_claims], ignore_index=True)
print("Combined claims data shape:", df_claims.shape)
print("Head of combined claims data:")
print(df_claims.head())

# ---------------------------
# 2. Identify MIST-8 Item Columns and Score Them
# ---------------------------
# Fetch all columns starting with "claim_"
claim_cols = [col for col in df_claims.columns if col.startswith("claim_")]

# Assuming the first four are fake items and the next four are real items.
fake_cols = claim_cols[:4]
real_cols = claim_cols[4:8]

print("\nFake item columns:", fake_cols)
print("Real item columns:", real_cols)

# Define a scoring function: for fake items, 1 if response is "fake news"; for real items, 1 if response is "real news".
def score_item(response, correct_answer):
    if isinstance(response, str) and response.strip().lower() == correct_answer:
        return 1
    else:
        return 0

# Build a new DataFrame X with the 8 binary scores.
X = pd.DataFrame(index=df_claims.index)
for col in fake_cols:
    X[col] = df_claims[col].apply(lambda x: score_item(x, "fake news"))
for col in real_cols:
    X[col] = df_claims[col].apply(lambda x: score_item(x, "real news"))

print("\nFirst 5 rows of binary scored MIST-8 items:")
print(X.head())
print("\nDescriptive statistics for each item:")
print(X.describe())

# ---------------------------
# 3. Compute McDonald’s Omega
# ---------------------------
# Step 1: Compute the correlation (or covariance) matrix of the 8 items.
R = X.corr()
print("\nItem correlation matrix (R):")
print(R)

# Step 2: Perform eigenvalue decomposition of the correlation matrix.
eigenvalues, eigenvectors = np.linalg.eig(R)
print("\nEigenvalues of R:")
print(eigenvalues)

# Identify the largest eigenvalue and corresponding eigenvector.
i_max = np.argmax(eigenvalues.real)
r1 = eigenvalues.real[i_max]
v1 = eigenvectors[:, i_max].real
print(f"\nDominant eigenvalue (r1): {r1}")
print("Corresponding eigenvector (v1):")
print(v1)

# Step 3: Estimate factor loadings.
lambda_vec = np.sqrt(r1) * v1
print("\nEstimated factor loadings (lambda_vec):")
print(lambda_vec)

# Compute unique variances (psi_i = 1 - lambda_i^2 for standardized items).
unique_variances = 1 - lambda_vec**2
print("\nUnique variances for items (psi_i):")
print(unique_variances)

# Total common variance is the square of the sum of loadings.
common_variance = (lambda_vec.sum())**2
# Total variance = common variance + sum of unique variances.
total_variance = common_variance + unique_variances.sum()
# McDonald's Omega
omega_total = common_variance / total_variance

print(f"\nMcDonald's Omega for the aggregated MIST-8 scale: {omega_total:.3f}")

# ---------------------------
# 4. Alternative Omega Calculation (Optional)
# ---------------------------
# Using an alternative formula that uses the number of items (k).
k = X.shape[1]
omega_alternative = (lambda_vec.sum()**2) / ((lambda_vec.sum()**2) + (k - (lambda_vec**2).sum()))
print(f"McDonald's Omega (alternative formula) for the aggregated MIST-8 scale: {omega_alternative:.3f}")
