In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
import datetime
from scipy.stats import chi2_contingency
import numpy as np

In [None]:
def get_working_dir():
    return Path.cwd()

processed_dir = get_working_dir() / 'data' / 'processed'
os.makedirs(processed_dir, exist_ok=True)

df = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv')

In [None]:
processed_dir = get_working_dir() / 'data' / 'processed'
os.makedirs(processed_dir, exist_ok=True)

# Create plots/demographics directory
results_dir = get_working_dir() / 'plots' / 'creative_explanations'
os.makedirs(results_dir, exist_ok=True)

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Define output directory and create it if needed.
results_dir = get_working_dir() / 'plots' / 'creative_explanations'
os.makedirs(results_dir, exist_ok=True)

# Load the CSV file.
df = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv')

# Debug: print value counts of the original column.
print("Original value counts for 'creative_explanations_likeability':")
print(df['creative_explanations_likeability'].value_counts(), "\n")

# If the column is object type, map the responses to numeric values.
if df['creative_explanations_likeability'].dtype == object:
    likeability_mapping = {
        "Strongly dislike": 1,
        "Somewhat dislike": 2,
        "Neither like nor dislike": 3,
        "Somewhat like": 4,
        "Strongly like": 5
    }
    df['creative_explanations_likeability_numeric'] = df['creative_explanations_likeability'].str.strip().map(likeability_mapping)
else:
    df['creative_explanations_likeability_numeric'] = pd.to_numeric(df['creative_explanations_likeability'], errors='coerce')

# Debug: print value counts of the numeric column.
print("Value counts for 'creative_explanations_likeability_numeric':")
print(df['creative_explanations_likeability_numeric'].value_counts(), "\n")

# Define bin edges such that each rating 1–5 falls into its own bin.
bins = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]

plt.figure(figsize=(8,6))
plt.hist(df['creative_explanations_likeability_numeric'].dropna(), bins=bins, color='skyblue', edgecolor='black')
plt.title('Distribution of Creative Explanations Likeability')
plt.xlabel('Likeability Rating')
plt.ylabel('Count')
# Set x-ticks at the center of each bin: 1,2,3,4,5.
tick_positions = [1, 2, 3, 4, 5]
tick_labels = ["Strongly dislike", "Somewhat dislike", "Neither like nor dislike", "Somewhat like", "Strongly like"]
plt.xticks(tick_positions, tick_labels, rotation=45)
plt.tight_layout()
plt.savefig(results_dir / 'creative_explanations_likeability.png')
plt.close()


Original value counts for 'creative_explanations_likeability':
creative_explanations_likeability
Somewhat like               32
Strongly like               23
Neither like nor dislike    21
Somewhat dislike            13
Strongly dislike            12
Name: count, dtype: int64 

Value counts for 'creative_explanations_likeability_numeric':
creative_explanations_likeability_numeric
4    32
5    23
3    21
2    13
1    12
Name: count, dtype: int64 



# Correlation between CE likeability and age group

In [21]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Define output directory (if you wish to save any plots later)
results_dir = get_working_dir() / 'plots' / 'creative_explanations'
os.makedirs(results_dir, exist_ok=True)

# Load the creative preferences file.
df_creative = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv')

# Load the demographics file (for creative preferences, we assume it's m_demographics.csv).
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv')

# Merge the two files on prolific_id (forcing prolific_id to be string if needed).
df_creative['prolific_id'] = df_creative['prolific_id'].astype(str)
df_demo['prolific_id'] = df_demo['prolific_id'].astype(str)
df = pd.merge(df_creative, df_demo[['prolific_id', 'age_group']], on='prolific_id', how='left')

# Filter out rows where age_group is "I prefer not to answer".
df = df[df['age_group'] != "I prefer not to answer"]

# Map creative_explanations_likeability responses to numeric values.
# Expected responses: "Strongly dislike", "Somewhat dislike", "Neither like nor dislike", "Somewhat like", "Strongly like"
likeability_mapping = {
    "Strongly dislike": 1,
    "Somewhat dislike": 2,
    "Neither like nor dislike": 3,
    "Somewhat like": 4,
    "Strongly like": 5
}
df["likeability_numeric"] = df["creative_explanations_likeability"].str.strip().map(likeability_mapping)

# Map age_group to an ordinal numeric scale.
# Expected age group options: "18-25 years old", "26-35 years old", "36-50 years old", "Over 50 years old"
age_mapping = {
    "18-25 years old": 1,
    "26-35 years old": 2,
    "36-50 years old": 3,
    "Over 50 years old": 4
}
df["age_numeric"] = df["age_group"].map(age_mapping)

# Drop rows with missing values in key measures.
valid_df = df.dropna(subset=["likeability_numeric", "age_numeric"])

# Check if there are sufficient data points.
if len(valid_df) < 2:
    print(f"Insufficient data to compute correlation (n = {len(valid_df)}).")
else:
    # Compute Spearman correlation (suitable for ordinal data).
    corr, p_val = spearmanr(valid_df["likeability_numeric"], valid_df["age_numeric"])
    print("Correlation between creative explanations likeability and age group:")
    print(f"  Spearman correlation: r = {corr:.3f}, p = {p_val:.3f}")


Correlation between creative explanations likeability and age group:
  Spearman correlation: r = -0.115, p = 0.250


# Correlation between ce likeability and level of education

In [22]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Load the creative preferences file.
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})

# Load the demographics file that contains the education level.
# Adjust the file name if necessary (e.g., "m_creative_demographics.csv").
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge the two files on prolific_id.
df = pd.merge(df_pref, df_demo[['prolific_id', 'education_level']], on='prolific_id', how='inner')

# Filter out rows where education_level is "I prefer not to answer".
df = df[df['education_level'] != "I prefer not to answer"]

# Map creative_explanations_likeability responses to numeric values.
likeability_mapping = {
    "Strongly dislike": 1,
    "Somewhat dislike": 2,
    "Neither like nor dislike": 3,
    "Somewhat like": 4,
    "Strongly like": 5
}
df['creative_explanations_likeability_numeric'] = df['creative_explanations_likeability'].str.strip().map(likeability_mapping)

# Map education_level to an ordinal numeric scale.
education_mapping = {
    "Elementary education": 1,
    "High school diploma or equivalent": 2,
    "Bachelor's Degree": 3,
    "Master's Degree": 4,
    "Doctoral degree (PhD)": 5
}
df['education_numeric'] = df['education_level'].str.strip().map(education_mapping)

# Drop rows with missing values in the key columns.
df = df.dropna(subset=['creative_explanations_likeability_numeric', 'education_numeric'])

if len(df) < 2:
    print("Insufficient data to compute correlation (n =", len(df), ").")
else:
    corr, p_val = pearsonr(df['creative_explanations_likeability_numeric'], df['education_numeric'])
    print("Pearson correlation between creative explanations likeability and education level:")
    print(f"  r = {corr:.3f}, p = {p_val:.3f}")


Pearson correlation between creative explanations likeability and education level:
  r = 0.139, p = 0.375


# Correlation between ce likeability and political orientation

In [23]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Load creative preferences file
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})

# Load demographics file (assumed to contain "political_orientation")
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge the two datasets on prolific_id
df = pd.merge(df_pref, df_demo[['prolific_id', 'political_orientation']], on='prolific_id', how='inner')

# Filter out rows where political_orientation is "I prefer not to answer"
df = df[df['political_orientation'] != "I prefer not to answer"]

# Map creative_explanations_likeability to numeric values.
likeability_mapping = {
    "Strongly dislike": 1,
    "Somewhat dislike": 2,
    "Neither like nor dislike": 3,
    "Somewhat like": 4,
    "Strongly like": 5
}
df['creative_explanations_likeability_numeric'] = df['creative_explanations_likeability'].str.strip().map(likeability_mapping)

# Map political_orientation to an ordinal numeric scale.
pol_mapping = {
    "Very Liberal": 1,
    "Moderately Liberal": 2,
    "Moderate": 3,
    "Moderately Conservative": 4,
    "Very Conservative": 5
}
df['pol_numeric'] = df['political_orientation'].str.strip().map(pol_mapping)

# Drop rows with missing values in key columns.
df = df.dropna(subset=['creative_explanations_likeability_numeric', 'pol_numeric'])

if len(df) < 2:
    print("Insufficient data to compute correlation (n =", len(df), ").")
else:
    corr, p_val = pearsonr(df['creative_explanations_likeability_numeric'], df['pol_numeric'])
    print("Pearson correlation between creative explanations likeability and political orientation:")
    print(f"  r = {corr:.3f}, p = {p_val:.3f}")


Pearson correlation between creative explanations likeability and political orientation:
  r = 0.004, p = 0.965


# Correlation between ce likeability and political engagement

In [25]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Load creative preferences file.
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})

# Load demographics file (assumed to contain "engagement_with_political_content").
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge the two datasets on prolific_id.
df = pd.merge(df_pref, df_demo[['prolific_id', 'engagement_with_political_content']], on='prolific_id', how='inner')

# Filter out rows where engagement_with_political_content is "I prefer not to answer".
df = df[df['engagement_with_political_content'] != "I prefer not to answer"]

# Map creative_explanations_likeability to numeric values.
likeability_mapping = {
    "Strongly dislike": 1,
    "Somewhat dislike": 2,
    "Neither like nor dislike": 3,
    "Somewhat like": 4,
    "Strongly like": 5
}
df['creative_explanations_likeability_numeric'] = df['creative_explanations_likeability'].str.strip().map(likeability_mapping)

# Map engagement_with_political_content to an ordinal numeric scale.
engagement_mapping = {
    "Never": 1,
    "Rarely": 2,
    "Sometimes": 3,
    "Often": 4,
    "Very Frequently": 5
}
df['engagement_numeric'] = df['engagement_with_political_content'].str.strip().map(engagement_mapping)

# Drop rows with missing values in key columns.
df = df.dropna(subset=['creative_explanations_likeability_numeric', 'engagement_numeric'])

if len(df) < 2:
    print("Insufficient data to compute correlation (n =", len(df), ").")
else:
    # Compute Pearson correlation.
    corr, p_val = pearsonr(df['creative_explanations_likeability_numeric'], df['engagement_numeric'])
    print("Pearson correlation between creative explanations likeability and political engagement:")
    print(f"  r = {corr:.3f}, p = {p_val:.3f}")

Pearson correlation between creative explanations likeability and political engagement:
  r = -0.040, p = 0.694


# Correlation between ce likeability and meme familiarity

In [26]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Load the creative preferences file.
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})

# Load the corresponding demographics file (assumed to contain the meme_culture_familiarity column).
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge the two datasets on prolific_id.
df = pd.merge(df_pref, df_demo[['prolific_id', 'meme_culture_familiarity']], on='prolific_id', how='inner')

# Filter out rows where meme_culture_familiarity is "I prefer not to answer".
df = df[df['meme_culture_familiarity'] != "I prefer not to answer"]

# Map creative_explanations_likeability to numeric values.
likeability_mapping = {
    "Strongly dislike": 1,
    "Somewhat dislike": 2,
    "Neither like nor dislike": 3,
    "Somewhat like": 4,
    "Strongly like": 5
}
df['creative_explanations_likeability_numeric'] = df['creative_explanations_likeability'].str.strip().map(likeability_mapping)

# Map meme_culture_familiarity to a numeric ordinal scale.
meme_culture_mapping = {
    "Not familiar at all (I rarely understand meme references)": 1,
    "Slightly familiar (I understand basic, widely-known memes)": 2,
    "Moderately familiar (I understand most popular memes and their variations)": 3,
    "Very familiar (I understand complex meme references and their evolution)": 4,
    "Extremely familiar (I actively follow meme trends and their cultural context)": 5
}
df['meme_culture_numeric'] = df['meme_culture_familiarity'].str.strip().map(meme_culture_mapping)

# Drop rows with missing values in key columns.
df = df.dropna(subset=['creative_explanations_likeability_numeric', 'meme_culture_numeric'])

if len(df) < 2:
    print("Insufficient data to compute correlation (n =", len(df), ").")
else:
    corr, p_val = pearsonr(df['creative_explanations_likeability_numeric'], df['meme_culture_numeric'])
    print("Pearson correlation between creative explanations likeability and meme culture familiarity:")
    print(f"  r = {corr:.3f}, p = {p_val:.3f}")


Pearson correlation between creative explanations likeability and meme culture familiarity:
  r = -0.105, p = 0.298


# Correlation between ce likeability and veracity discernment

In [27]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Mapping for creative_explanations_likeability responses.
likeability_mapping = {
    "Strongly dislike": 1,
    "Somewhat dislike": 2,
    "Neither like nor dislike": 3,
    "Somewhat like": 4,
    "Strongly like": 5
}

# Compute veracity score from the claims file.
# For the first 4 claim columns the correct answer is "Fake News"
# and for the last 4 the correct answer is "Real News".
def compute_veracity_score(df_claims):
    # Exclude the "prolific_id" column.
    cols = [col for col in df_claims.columns if col != "prolific_id"]
    expected = ["Fake News"] * 4 + ["Real News"] * 4
    scores = []
    for idx, row in df_claims.iterrows():
        score = 0
        for col, exp in zip(cols, expected):
            if isinstance(row[col], str) and row[col].strip() == exp:
                score += 1
        scores.append(score)
    return pd.Series(scores, index=df_claims.index)

def process_likeability_veracity_correlation(pref_filename, claims_filename, label):
    # Load creative preferences file and veracity (claims) file.
    df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / pref_filename, dtype={'prolific_id': str})
    df_claims = pd.read_csv(get_working_dir() / 'data' / 'processed' / claims_filename, dtype={'prolific_id': str})
    
    # Map creative_explanations_likeability to numeric values.
    df_pref['creative_explanations_likeability_numeric'] = df_pref['creative_explanations_likeability'].str.strip().map(likeability_mapping)
    
    # Compute veracity score.
    df_claims["veracity_score"] = compute_veracity_score(df_claims)
    
    # Merge the two datasets on prolific_id.
    df_merged = pd.merge(
        df_pref[['prolific_id', 'creative_explanations_likeability_numeric']],
        df_claims[['prolific_id', 'veracity_score']],
        on="prolific_id", how="inner"
    )
    
    # Drop rows with missing values.
    valid_df = df_merged.dropna(subset=["creative_explanations_likeability_numeric", "veracity_score"])
    
    if len(valid_df) < 2:
        print(f"{label}: Insufficient data to compute correlation (n = {len(valid_df)}).")
    else:
        corr, p_val = pearsonr(valid_df['creative_explanations_likeability_numeric'], valid_df['veracity_score'])
        print(f"{label}:")
        print(f"  Pearson correlation between creative explanations likeability and veracity discernment: r = {corr:.3f}, p = {p_val:.3f}")
    print("-------\n")

# Process for the creative explanations.
# Adjust the file names as needed. Here we assume the creative preferences file is "m_creative_preferences.csv"
# and the corresponding veracity discernment file is "m_claims.csv".
process_likeability_veracity_correlation("m_creative_preferences.csv", "m_claims.csv", "Meme Creative Explanations")


Meme Creative Explanations:
  Pearson correlation between creative explanations likeability and veracity discernment: r = -0.176, p = 0.078
-------



# Correlation between ce type and age group

In [29]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Load creative preferences and demographics files.
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge on prolific_id.
df = pd.merge(df_pref, df_demo[['prolific_id', 'age_group']], on='prolific_id', how='inner')

# Filter out rows where age_group is "I prefer not to answer".
df = df[df['age_group'] != "I prefer not to answer"]

# Map age_group to an ordinal numeric scale.
age_mapping = {
    "18-25 years old": 1,
    "26-35 years old": 2,
    "36-50 years old": 3,
    "Over 50 years old": 4
}
df['age_numeric'] = df['age_group'].map(age_mapping)

# Drop rows with missing values in age_numeric.
df = df.dropna(subset=['age_numeric'])

# Compute the overall mean of age_numeric.
overall_mean = df['age_numeric'].mean()

# Compute total sum of squares.
ss_total = np.sum((df['age_numeric'] - overall_mean)**2)

# Compute between-group sum of squares.
groups = df.groupby('most_effective_creative_explanation')
ss_between = sum([len(group) * ((group['age_numeric'].mean() - overall_mean)**2) for name, group in groups])

# Compute the correlation ratio (eta).
eta = np.sqrt(ss_between / ss_total)

# Also perform one-way ANOVA to obtain a p-value.
group_values = [group['age_numeric'].values for name, group in groups]
f_stat, p_val = f_oneway(*group_values)

print("Association between 'most_effective_creative_explanation' and age group:")
print(f"  Correlation ratio (η): {eta:.3f}")
print(f"  One-way ANOVA F-statistic: {f_stat:.3f}, p-value: {p_val:.3f}")


Association between 'most_effective_creative_explanation' and age group:
  Correlation ratio (η): 0.202
  One-way ANOVA F-statistic: 2.080, p-value: 0.130


# Correlation between ce type and level of education

In [30]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Load creative preferences and demographics files.
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge the two datasets on prolific_id.
df = pd.merge(df_pref, df_demo[['prolific_id', 'education_level']], on='prolific_id', how='inner')

# Filter out rows where education_level is "I prefer not to answer".
df = df[df['education_level'] != "I prefer not to answer"]

# Map education levels to an ordinal numeric scale.
education_mapping = {
    "Elementary education": 1,
    "High school diploma or equivalent": 2,
    "Bachelor's Degree": 3,
    "Master's Degree": 4,
    "Doctoral degree (PhD)": 5
}
df['education_numeric'] = df['education_level'].str.strip().map(education_mapping)

# Drop rows with missing education_numeric.
df = df.dropna(subset=['education_numeric'])

# Compute the overall mean of education_numeric.
overall_mean = df['education_numeric'].mean()

# Compute total sum of squares.
ss_total = np.sum((df['education_numeric'] - overall_mean)**2)

# Group by creative explanation type.
groups = df.groupby('most_effective_creative_explanation')

# Compute the between-group sum of squares.
ss_between = sum([len(group) * ((group['education_numeric'].mean() - overall_mean)**2) 
                  for name, group in groups])

# Compute the correlation ratio (eta).
eta = np.sqrt(ss_between / ss_total)

# Also perform one-way ANOVA to obtain a p-value.
group_values = [group['education_numeric'].values for name, group in groups]
f_stat, p_val = f_oneway(*group_values)

print("Association between creative explanation type and education level:")
print(f"  Correlation ratio (η): {eta:.3f}")
print(f"  One-way ANOVA F-statistic: {f_stat:.3f}, p-value: {p_val:.3f}")


Association between creative explanation type and education level:
  Correlation ratio (η): 0.224
  One-way ANOVA F-statistic: 1.056, p-value: 0.357


# Plots for ce most effective type and demographic factors

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Define output directory and create it if it doesn't exist.
results_dir = get_working_dir() / 'plots' / 'creative_explanations'
os.makedirs(results_dir, exist_ok=True)

# Load creative preferences and demographics files.
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge the datasets on prolific_id.
df = pd.merge(df_pref, df_demo, on='prolific_id', how='inner')

# List of demographic variables to plot against "most_effective_creative_explanation".
dem_vars = [
    "age_group", 
    "political_orientation", 
    "education_level", 
    "engagement_with_political_content", 
    "meme_culture_familiarity"
]

# Loop over each demographic variable and create a bar plot.
for var in dem_vars:
    # Filter out rows where the demographic response is "I prefer not to answer".
    df_filtered = df[df[var] != "I prefer not to answer"]
    
    # Create a crosstab (frequency table) of creative explanation type by the demographic variable.
    ct = pd.crosstab(df_filtered[var], df_filtered["most_effective_creative_explanation"])
    
    # Plot the frequency table as a grouped bar chart.
    ax = ct.plot(kind='bar', figsize=(10,6), edgecolor='black')
    plt.title(f"Most Effective Creative Explanation by {var.replace('_', ' ').title()}")
    plt.xlabel(var.replace('_', ' ').title())
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.legend(title="Most Effective CE", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    
    # Save the plot.
    plt.savefig(results_dir / f"most_effective_creative_explanation_by_{var}.png")
    plt.close()


# Correlation between ce likeability and affect change

In [35]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import os
from pathlib import Path

def get_working_dir():
    return Path.cwd()

# Mapping for creative_explanations_likeability responses.
likeability_mapping = {
    "Strongly dislike": 1,
    "Somewhat dislike": 2,
    "Neither like nor dislike": 3,
    "Somewhat like": 4,
    "Strongly like": 5
}

# Mapping for affect responses.
affect_mapping = {
    "Not at all": 1,
    "Slightly": 2,
    "Moderately": 3,
    "Very": 4,
    "Extremely": 5
}

# Compute affect changes from the affect file.
# Positive affect items: alert, inspired, determined, attentive, active.
# Negative affect items: upset, hostile, ashamed, nervous, afraid.
def compute_affect_changes(df_affect):
    positive_items = ["alert", "inspired", "determined", "attentive", "active"]
    negative_items = ["upset", "hostile", "ashamed", "nervous", "afraid"]
    
    # Map all affect responses using affect_mapping.
    for col in df_affect.columns:
        if df_affect[col].dtype == object:
            df_affect[col] = df_affect[col].str.strip().map(affect_mapping)
    
    # Compute positive affect change: (post - pre) for each positive item.
    pos_diffs = []
    for item in positive_items:
        pre_col = f"pre_{item}"
        post_col = f"post_{item}"
        if pre_col in df_affect.columns and post_col in df_affect.columns:
            pos_diffs.append(df_affect[post_col] - df_affect[pre_col])
    if pos_diffs:
        pos_change = pd.concat(pos_diffs, axis=1).mean(axis=1)
    else:
        pos_change = pd.Series(np.nan, index=df_affect.index)
    
    # Compute negative affect change: (post - pre) for each negative item.
    neg_diffs = []
    for item in negative_items:
        pre_col = f"pre_{item}"
        post_col = f"post_{item}"
        if pre_col in df_affect.columns and post_col in df_affect.columns:
            neg_diffs.append(df_affect[post_col] - df_affect[pre_col])
    if neg_diffs:
        neg_change = pd.concat(neg_diffs, axis=1).mean(axis=1)
    else:
        neg_change = pd.Series(np.nan, index=df_affect.index)
    
    return pos_change, neg_change

def process_likeability_affect_correlation(pref_filename, affect_filename, label):
    # Load creative preferences file.
    df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / pref_filename, dtype={'prolific_id': str})
    # Map creative_explanations_likeability to numeric values.
    df_pref['creative_explanations_likeability_numeric'] = df_pref['creative_explanations_likeability'].str.strip().map(likeability_mapping)
    
    # Load creative affect file.
    df_affect = pd.read_csv(get_working_dir() / 'data' / 'processed' / affect_filename, dtype={'prolific_id': str})
    pos_change, neg_change = compute_affect_changes(df_affect)
    df_affect["positive_affect_change"] = pos_change
    df_affect["negative_affect_change"] = neg_change
    
    # Merge the two datasets on prolific_id.
    df_merged = pd.DataFrame({
        'prolific_id': df_pref['prolific_id'],  # Use the valid prolific_ids
        'creative_explanations_likeability_numeric': df_pref['creative_explanations_likeability_numeric'],
        'positive_affect_change': df_affect['positive_affect_change'],
        'negative_affect_change': df_affect['negative_affect_change']
    })
    
    # Drop rows with missing values in key columns.
    valid_df = df_merged.dropna(subset=['creative_explanations_likeability_numeric', 'positive_affect_change', 'negative_affect_change'])
    
    if len(valid_df) < 2:
        print(f"{label}: Insufficient data to compute correlation (n = {len(valid_df)}).")
    else:
        corr_pos, p_pos = pearsonr(valid_df['creative_explanations_likeability_numeric'], valid_df['positive_affect_change'])
        corr_neg, p_neg = pearsonr(valid_df['creative_explanations_likeability_numeric'], valid_df['negative_affect_change'])
        print(f"{label}:")
        print(f"  Pearson correlation between CE likeability and positive affect change: r = {corr_pos:.3f}, p = {p_pos:.3f}")
        print(f"  Pearson correlation between CE likeability and negative affect change: r = {corr_neg:.3f}, p = {p_neg:.3f}")
    print("-------\n")

# Process for creative explanations.
# Adjust file names if needed. Here we assume:
# - Creative preferences are in "m_creative_preferences.csv"
# - The corresponding creative affect file is "m_creative_affect.csv"
process_likeability_affect_correlation("m_creative_preferences.csv", "m_affect.csv", "Creative Explanations")


Creative Explanations:
  Pearson correlation between CE likeability and positive affect change: r = -0.090, p = 0.368
  Pearson correlation between CE likeability and negative affect change: r = 0.036, p = 0.721
-------



In [2]:
import pandas as pd
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Define output directory (if later you want to save textual reports).
results_dir = get_working_dir() / 'plots' / 'creative_explanations'
results_dir.mkdir(parents=True, exist_ok=True)

# Load creative preferences and demographics files.
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge the datasets on prolific_id.
df = pd.merge(df_pref, df_demo, on='prolific_id', how='inner')

# List of demographic variables to analyze.
dem_vars = [
    "age_group", 
    "political_orientation", 
    "education_level", 
    "engagement_with_political_content", 
    "meme_culture_familiarity"
]

print("Most Effective Creative Explanation Type by Demographic Categories:\n")

# Loop over each demographic variable.
for var in dem_vars:
    print(f"=== {var.replace('_', ' ').title()} ===")
    # Filter out rows where the demographic response is "I prefer not to answer".
    df_filtered = df[df[var] != "I prefer not to answer"]
    
    # Create a frequency table (crosstab) of creative explanation type by the demographic variable.
    ct = pd.crosstab(df_filtered[var], df_filtered["most_effective_creative_explanation"])
    
    # For each level of the demographic variable, find the most frequent creative explanation.
    for level in ct.index:
        # Find the creative explanation type with the highest count.
        most_effective = ct.loc[level].idxmax()
        count = ct.loc[level].max()
        total = ct.loc[level].sum()
        perc = 100 * count / total if total > 0 else 0
        print(f"  {level}: {most_effective} ({count} out of {total} responses, {perc:.1f}%)")
    print()  # newline for spacing between demographics


Most Effective Creative Explanation Type by Demographic Categories:

=== Age Group ===
  18-25 years old: The meme (6 out of 8 responses, 75.0%)
  26-35 years old: The meme (21 out of 37 responses, 56.8%)
  36-50 years old: The meme (23 out of 31 responses, 74.2%)
  Over 50 years old: The poem (11 out of 25 responses, 44.0%)

=== Political Orientation ===
  Moderate: The meme (17 out of 24 responses, 70.8%)
  Moderately Conservative: The meme (14 out of 28 responses, 50.0%)
  Moderately Liberal: The meme (13 out of 23 responses, 56.5%)
  Very Conservative: The meme (3 out of 7 responses, 42.9%)
  Very Liberal: The meme (12 out of 19 responses, 63.2%)

=== Education Level ===
  Bachelor's degree: The meme (24 out of 40 responses, 60.0%)
  Doctoral degree (PhD): The meme (4 out of 7 responses, 57.1%)
  High school diploma or equivalent: The meme (22 out of 36 responses, 61.1%)
  Master's degree: The meme (7 out of 16 responses, 43.8%)

=== Engagement With Political Content ===
  Never: T

In [3]:
import pandas as pd
from pathlib import Path
import os

def get_working_dir():
    return Path.cwd()

# Define an output directory (if needed for future uses).
results_dir = get_working_dir() / 'plots' / 'creative_explanations'
results_dir.mkdir(parents=True, exist_ok=True)

# Load creative preferences and demographics files.
df_pref = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_creative_preferences.csv', dtype={'prolific_id': str})
df_demo = pd.read_csv(get_working_dir() / 'data' / 'processed' / 'm_demographics.csv', dtype={'prolific_id': str})

# Merge the datasets on prolific_id. (Demographics are merged even if not used in this analysis.)
df = pd.merge(df_pref, df_demo, on='prolific_id', how='inner')

# Count the frequency of each creative explanation option.
creative_counts = df['most_effective_creative_explanation'].value_counts()
total = creative_counts.sum()

print("Frequency counts for each creative explanation option:")
for option, count in creative_counts.items():
    percentage = 100 * count / total
    print(f"  {option}: {count} responses ({percentage:.1f}%)")


Frequency counts for each creative explanation option:
  The meme: 59 responses (58.4%)
  The poem: 25 responses (24.8%)
  The joke: 17 responses (16.8%)
