In [4]:
import pandas as pd
from pathlib import Path
import os

In [5]:
def get_working_dir():
    """Return the kernel's current working directory as a ``Path``.

    Every data and output location in this notebook is built relative to it.
    """
    current_dir = os.getcwd()
    return Path(current_dir)

In [6]:
def read_rejected_ids(file_path):
    """Read rejected participant IDs from a plain-text file, one ID per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline or an empty separator line never becomes an empty-string
    "ID" (which would inflate the reported count).

    Args:
        file_path: Location of the text file (str or Path).

    Returns:
        list[str]: The IDs in file order. An empty list is returned, and a
        warning printed, when the file does not exist.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 too and drops a leading byte-order
        # mark, which would otherwise be glued to the first ID and keep it
        # from matching the survey data.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            stripped_lines = (line.strip() for line in f)
            return [entry for entry in stripped_lines if entry]
    except FileNotFoundError:
        print(f"Warning: Rejected IDs file not found at {file_path}")
        return []

def check_duplicates(df, dataset_name, id_column=None):
    """Report participant IDs that occur on more than one row.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the printed message.
        id_column: Name of the column holding the participant ID. When
            omitted, the notebook-level ``prolific_id_column`` is used, so
            existing two-argument calls behave exactly as before.

    Returns:
        list: The ID value of every row whose ID is shared with at least one
        other row (an ID present on k rows appears k times). Empty when all
        IDs are unique.
    """
    if id_column is None:
        # Backward-compatible fallback; requires the global to be defined
        # before the first two-argument call.
        id_column = prolific_id_column

    is_repeated = df.duplicated(subset=[id_column], keep=False)
    duplicate_ids = df.loc[is_repeated, id_column].tolist()
    if duplicate_ids:
        print(f"Found duplicate IDs in {dataset_name} dataset: {duplicate_ids}")
    return duplicate_ids

In [7]:
# Resolve the project folders once and make sure the output folder exists.
data_dir = get_working_dir() / 'data'
processed_dir = data_dir / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)

# Raw survey exports, one file per dataset (t, m, m_t).
df_t = pd.read_csv(data_dir / 't.csv', encoding='utf-8')
df_m = pd.read_csv(data_dir / 'm.csv', encoding='utf-8')
df_m_t = pd.read_csv(data_dir / 'm_t.csv', encoding='utf-8')

# IDs of participants to exclude, one list per dataset.
rejected_t = read_rejected_ids(data_dir / 'rejected_ids_t.txt')
rejected_m = read_rejected_ids(data_dir / 'rejected_ids_m.txt')
rejected_m_t = read_rejected_ids(data_dir / 'rejected_ids_m_t.txt')

/Users/sergiopinto/Desktop/MemeFact/meta_study


In [8]:
# Summarise what was loaded: frame sizes first, then rejected-ID counts.
for label, frame in (("t", df_t), ("m", df_m), ("m_t", df_m_t)):
    n_rows, n_cols = frame.shape
    print(f"Dataset {label}: {n_rows} rows, {n_cols} columns")

for label, ids in (("t", rejected_t), ("m", rejected_m), ("m_t", rejected_m_t)):
    print(f"Rejected IDs - {label}: {len(ids)}")

Dataset t: 118 rows, 132 columns
Dataset m: 110 rows, 136 columns
Dataset m_t: 107 rows, 133 columns
Rejected IDs - t: 8
Rejected IDs - m: 9
Rejected IDs - m_t: 5


In [9]:
prolific_id_column = "Please enter your prolific ID."

# Flag repeated participant IDs in the raw data (before any filtering)
duplicate_t = check_duplicates(df_t, "t")
duplicate_m = check_duplicates(df_m, "m")
duplicate_m_t = check_duplicates(df_m_t, "m_t")

# Keep only the rows whose ID is not on the matching rejection list
keep_t = ~df_t[prolific_id_column].isin(rejected_t)
keep_m = ~df_m[prolific_id_column].isin(rejected_m)
keep_m_t = ~df_m_t[prolific_id_column].isin(rejected_m_t)
df_t_clean = df_t.loc[keep_t]
df_m_clean = df_m.loc[keep_m]
df_m_t_clean = df_m_t.loc[keep_m_t]

# Where the raw data had repeated IDs, keep the first remaining row per ID
if duplicate_t:
    df_t_clean = df_t_clean.drop_duplicates(subset=[prolific_id_column], keep='first')
if duplicate_m:
    df_m_clean = df_m_clean.drop_duplicates(subset=[prolific_id_column], keep='first')
if duplicate_m_t:
    df_m_t_clean = df_m_t_clean.drop_duplicates(subset=[prolific_id_column], keep='first')

# Report the size of each cleaned dataset
print("\nAfter removing rejected IDs and duplicates:")
for label, cleaned in (("t", df_t_clean), ("m", df_m_clean), ("m_t", df_m_t_clean)):
    n_rows, n_cols = cleaned.shape
    print(f"Dataset {label}: {n_rows} rows, {n_cols} columns")

# Write the cleaned datasets to the processed directory
for cleaned, filename in (
    (df_t_clean, 't_phase1.csv'),
    (df_m_clean, 'm_phase1.csv'),
    (df_m_t_clean, 'm_t_phase1.csv'),
):
    cleaned.to_csv(processed_dir / filename, index=False, encoding='utf-8')


After removing rejected IDs and duplicates:
Dataset t: 110 rows, 132 columns
Dataset m: 101 rows, 136 columns
Dataset m_t: 102 rows, 133 columns

Checking for missing values in key columns:
t dataset missing values: {'Please enter your prolific ID.': 0, 'Please indicate your age group.': 0}
m dataset missing values: {'Please enter your prolific ID.': 0, 'Please indicate your age group.': 0}
m_t dataset missing values: {'Please enter your prolific ID.': 0, 'Please indicate your age group.': 0}


In [14]:
print("Loading Phase 1 processed datasets...")
# Re-read the Phase 1 outputs from disk, in the order t, m, m_t.
df_t, df_m, df_m_t = [
    pd.read_csv(processed_dir / filename, encoding='utf-8')
    for filename in ('t_phase1.csv', 'm_phase1.csv', 'm_t_phase1.csv')
]

# Original survey question -> short analysis name (same for all datasets).
# Key order matters: the demographic column lists below are derived from it.
rename_map = {
    "Please enter your prolific ID.": "prolific_id",
    "Please indicate your age group.": "age_group",
    "Please indicate your citizenship status in the United States.": "citizenship_status",
    "Please rate your proficiency in English language comprehension.": "english_proficiency",
    "Please indicate your highest completed level of education.": "education_level",
    "Please indicate your political orientation.\n\n'Moderately Liberal' means someone who supports gradual social progress, a mix of government intervention and market freedom, and strong individual rights. \n\n'Moderately Conservative' means someone who emphasizes tradition, limited government, free markets, and cautious, gradual change.": "political_orientation",
    "Please indicate how frequently you interact with political content on social media platforms.\n\nInteracting includes any form of engagement such as liking/reacting, sharing (to your stories), reposting, commenting, or messaging content to others.\n\nPolitical content includes news stories, videos, memes, posts, stories, tweets, or any other material containing political claims, statements, or discussions about policies, elections, political figures, or social issues with political implications.": "engagement_with_political_content",
    "How would you rate your familiarity with internet meme culture, specifically your ability to understand the context and meaning of image macro memes?": "meme_culture_familiarity",
}

# Housekeeping columns dropped from the Phase 2 files
columns_to_remove = [
    "I hereby confirm that I have read the Data Collection & Privacy Information and consent to take part in this study by selecting the 'I agree' option below:",
    "Timestamp",
    "Please select 'Often' to show you are paying attention to this question.",
    "Please provide your email address if you'd like to be contacted for future studies.",
]

# Demographic questions under their original (pre-rename) names.
# m and m_t use every question in rename_map; the last one (meme familiarity)
# is left out of the list used for t.
m_mt_demographic_columns = list(rename_map)
demographic_columns = m_mt_demographic_columns[:-1]

# Function to process datasets for phase 2
def process_phase2(df, dataset_name, demographic_cols):
    """Split a Phase 1 dataset into a demographics file and a Phase 2 file.

    Two CSV files are written to the notebook-level ``processed_dir``:
    ``<dataset_name>_demographics.csv`` (the requested demographic columns)
    and ``<dataset_name>_phase2.csv`` (the full data minus the columns listed
    in the notebook-level ``columns_to_remove``). In both files the columns
    are renamed according to the notebook-level ``rename_map``.

    Args:
        df: Phase 1 DataFrame, still using the original question texts as
            column names. It is not modified.
        dataset_name: Dataset label used in messages and file names.
        demographic_cols: Original names of the columns to copy into the
            demographics file.

    Returns:
        tuple: ``(demo_df, phase2_df)`` - the renamed demographics frame and
        the renamed Phase 2 frame.

    Raises:
        KeyError: If a name in ``demographic_cols`` is not a column of ``df``.
    """
    print(f"\nProcessing {dataset_name} dataset for Phase 2...")

    # Always honour the caller-supplied column list. Previously any dataset
    # other than 'm'/'m_t' silently used the global `demographic_columns`
    # instead; that global is rebound to the renamed column names later in
    # the notebook, so the result depended on which cells had already run.
    demo_df = df[list(demographic_cols)].rename(columns=rename_map)

    # Save demographics file with renamed prolific_id
    demo_file_path = processed_dir / f"{dataset_name}_demographics.csv"
    demo_df.to_csv(demo_file_path, index=False, encoding='utf-8')
    print(f"  Demographics file saved to {demo_file_path}")

    # errors='ignore' skips housekeeping columns this dataset does not have.
    # drop() and rename() both return new frames, so `df` stays untouched.
    df_copy = (
        df.drop(columns=columns_to_remove, errors='ignore')
          .rename(columns=rename_map)
    )

    # Save Phase 2 processed file (with prolific_id included)
    phase2_file_path = processed_dir / f"{dataset_name}_phase2.csv"
    df_copy.to_csv(phase2_file_path, index=False, encoding='utf-8')
    print(f"  Phase 2 processed file saved to {phase2_file_path}")

    return demo_df, df_copy

# Run Phase 2 for every dataset (t has no meme-familiarity question)
t_demo, df_t_phase2 = process_phase2(df_t, 't', demographic_columns)
m_demo, df_m_phase2 = process_phase2(df_m, 'm', m_mt_demographic_columns)
m_t_demo, df_m_t_phase2 = process_phase2(df_m_t, 'm_t', m_mt_demographic_columns)

# Check that every demographics frame carries the renamed columns
print("\nVerifying column renaming in demographic files:")
demographic_renamed_columns = [
    "prolific_id",
    "age_group",
    "citizenship_status",
    "english_proficiency",
    "education_level",
    "political_orientation",
    "engagement_with_political_content",
]

for dataset_name, df in [("t", t_demo), ("m", m_demo), ("m_t", m_t_demo)]:
    print(f"\n{dataset_name}_demographics.csv:")

    # m and m_t additionally carry the meme familiarity column
    expected_columns = list(demographic_renamed_columns)
    if dataset_name in ['m', 'm_t']:
        expected_columns.append("meme_culture_familiarity")

    for col in expected_columns:
        print(f"  ✓ {col}" if col in df.columns else f"  ✗ {col} not found")

# Check that the participant ID survived in the Phase 2 frames
print("\nVerifying prolific_id is retained in phase 2 files:")
for dataset_name, df in [("t", df_t_phase2), ("m", df_m_phase2), ("m_t", df_m_t_phase2)]:
    has_id = "prolific_id" in df.columns
    print(
        f"  ✓ {dataset_name}_phase2.csv has prolific_id column"
        if has_id
        else f"  ✗ {dataset_name}_phase2.csv missing prolific_id column"
    )

# Report how many columns each dataset lost
print("\nColumns removed from each dataset:")
for dataset_name, original, processed in [
    ("t", df_t, df_t_phase2),
    ("m", df_m, df_m_phase2),
    ("m_t", df_m_t, df_m_t_phase2),
]:
    removed_count = original.shape[1] - processed.shape[1]
    print(f"  {dataset_name}: {removed_count} columns removed")

print("\nPhase 2 processing completed successfully.")

Loading Phase 1 processed datasets...

Processing t dataset for Phase 2...
  Demographics file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/t_demographics.csv
  Phase 2 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/t_phase2.csv

Processing m dataset for Phase 2...
  Demographics file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_demographics.csv
  Phase 2 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_phase2.csv

Processing m_t dataset for Phase 2...
  Demographics file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_t_demographics.csv
  Phase 2 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_t_phase2.csv

Verifying column renaming in demographic files:

t_demographics.csv:
  ✓ prolific_id
  ✓ age_group
  ✓ citizenship_status
  ✓ english_proficiency
  ✓ education_level
  ✓ political_orienta

In [15]:
# Re-read the Phase 2 outputs from disk, in the order t, m, m_t
df_t, df_m, df_m_t = [
    pd.read_csv(processed_dir / filename, encoding='utf-8')
    for filename in ('t_phase2.csv', 'm_phase2.csv', 'm_t_phase2.csv')
]

# Demographics files written by Phase 2
df_t_demo, df_m_demo, df_m_t_demo = [
    pd.read_csv(processed_dir / filename, encoding='utf-8')
    for filename in ('t_demographics.csv', 'm_demographics.csv', 'm_t_demographics.csv')
]

# Renamed demographic columns to strip from the main data (prolific_id stays).
# Note: this rebinds `demographic_columns`, which held the original question
# texts in the Phase 2 cell.
demographic_columns = [
    "age_group", "citizenship_status", "english_proficiency",
    "education_level", "political_orientation",
    "engagement_with_political_content",
]

# Extra demographic column that only exists in the m and m_t datasets
meme_column = ["meme_culture_familiarity"]

# Emotion words used to build the pre_/post_ affect column names
emotions = [
    'upset', 'hostile', 'alert', 'ashamed', 'inspired',
    'nervous', 'determined', 'attentive', 'afraid', 'active',
]

# Function to process datasets for phase 3
def process_phase3(df, dataset_name, demo_columns, meme_column):
    """Move the affect items into their own file and strip them from the data.

    Columns whose name contains "normally feel" are collected as pre-affect
    items; the remaining columns whose name contains "prior to the study" are
    collected as post-affect items (both checks are case-insensitive). They
    are written, together with ``prolific_id``, to
    ``<dataset_name>_affect.csv`` with names ``pre_<emotion>`` /
    ``post_<emotion>`` for every word of the notebook-level ``emotions`` list
    found after "feel " in the column name. The Phase 3 file
    ``<dataset_name>_phase3.csv`` holds the data without the affect columns
    and without the columns in ``demo_columns`` (plus ``meme_column`` for the
    'm' and 'm_t' datasets). Both files go to the notebook-level
    ``processed_dir``.

    Args:
        df: Phase 2 DataFrame; it is not modified.
        dataset_name: Dataset label used in messages and file names.
        demo_columns: Demographic column names to remove.
        meme_column: List holding the meme-familiarity column name.

    Returns:
        tuple: ``(affect_df, df_phase3)``.
    """
    print(f"\nProcessing {dataset_name} dataset for Phase 3...")

    df_copy = df.copy()

    # Classify columns by the wording of the question
    pre_affect_cols = [
        col for col in df_copy.columns if "normally feel" in col.lower()
    ]
    post_affect_cols = [
        col for col in df_copy.columns
        if "normally feel" not in col.lower() and "prior to the study" in col.lower()
    ]

    print(f"  Found {len(pre_affect_cols)} pre-affect columns and {len(post_affect_cols)} post-affect columns")

    # Affect items plus the participant ID go to a separate frame
    affect_df = df_copy[['prolific_id'] + pre_affect_cols + post_affect_cols].copy()

    # Build one old-name -> new-name mapping. setdefault keeps the first
    # emotion (in `emotions` order) that matches a given column.
    new_names = {}
    for prefix, group in (("pre", pre_affect_cols), ("post", post_affect_cols)):
        for emotion in emotions:
            needle = f"feel {emotion}"
            for col in group:
                if needle in col.lower():
                    new_names.setdefault(col, f"{prefix}_{emotion}")
    affect_df = affect_df.rename(columns=new_names)

    # Save affect data to separate file
    affect_file_path = processed_dir / f"{dataset_name}_affect.csv"
    affect_df.to_csv(affect_file_path, index=False, encoding='utf-8')
    print(f"  Affect data saved to {affect_file_path}")

    # Everything that must leave the main dataset (prolific_id is kept)
    unwanted = set(demo_columns)
    if dataset_name in ['m', 'm_t'] and meme_column[0] in df_copy.columns:
        unwanted.update(meme_column)
    unwanted.update(pre_affect_cols)
    unwanted.update(post_affect_cols)

    # Only drop names that are really present in the frame
    present_unwanted = [col for col in df_copy.columns if col in unwanted]
    df_phase3 = df_copy.drop(columns=present_unwanted)

    # Save Phase 3 processed file
    phase3_file_path = processed_dir / f"{dataset_name}_phase3.csv"
    df_phase3.to_csv(phase3_file_path, index=False, encoding='utf-8')
    print(f"  Phase 3 processed file saved to {phase3_file_path}")

    return affect_df, df_phase3

# Run Phase 3 for every dataset
t_affect, df_t_phase3 = process_phase3(df_t, 't', demographic_columns, meme_column)
m_affect, df_m_phase3 = process_phase3(df_m, 'm', demographic_columns, meme_column)
m_t_affect, df_m_t_phase3 = process_phase3(df_m_t, 'm_t', demographic_columns, meme_column)

# Check that the affect frames carry the standard column names
print("\nVerifying column renaming in affect files:")
expected_affect_columns = ['prolific_id']
expected_affect_columns += [f"pre_{e}" for e in emotions]
expected_affect_columns += [f"post_{e}" for e in emotions]

for dataset_name, df in [("t", t_affect), ("m", m_affect), ("m_t", m_t_affect)]:
    print(f"\n{dataset_name}_affect.csv columns check:")

    # Split the expected names into present and absent ones
    found_columns = [col for col in expected_affect_columns if col in df.columns]
    missing_columns = [col for col in expected_affect_columns if col not in df.columns]

    print(f"  Found {len(found_columns)} of {len(expected_affect_columns)} expected columns")
    if not missing_columns:
        print(f"  All expected columns are present")
    else:
        print(f"  Missing columns: {missing_columns}")

# Check that affect and demographic columns are gone from the Phase 3 frames
print("\nVerifying removal of affect and demographic columns in phase 3 files:")
for dataset_name, df in [("t", df_t_phase3), ("m", df_m_phase3), ("m_t", df_m_t_phase3)]:
    print(f"\n{dataset_name}_phase3.csv:")

    # The participant ID must still be there
    print(
        f"  ✓ prolific_id column is present"
        if "prolific_id" in df.columns
        else f"  ✗ prolific_id column is missing"
    )

    # A leftover affect column mentions an emotion word and one of the two
    # question phrasings
    affect_columns_present = [
        col for col in df.columns
        if any(emotion in col.lower() for emotion in emotions)
        and any(prefix in col.lower() for prefix in ["normally feel", "prior to the study"])
    ]

    if not affect_columns_present:
        print(f"  ✓ All affect columns have been removed")
    else:
        print(f"  ✗ Found {len(affect_columns_present)} affect columns that should have been removed")
        print(f"    {affect_columns_present[:5]}{'...' if len(affect_columns_present) > 5 else ''}")

    # No demographic column may remain
    demo_columns_present = [col for col in demographic_columns if col in df.columns]
    if not demo_columns_present:
        print(f"  ✓ All demographic columns have been removed")
    else:
        print(f"  ✗ Found demographic columns that should have been removed: {demo_columns_present}")

print("\nPhase 3 processing completed successfully.")


Processing t dataset for Phase 3...
  Found 10 pre-affect columns and 10 post-affect columns
  Affect data saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/t_affect.csv
  Phase 3 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/t_phase3.csv

Processing m dataset for Phase 3...
  Found 10 pre-affect columns and 10 post-affect columns
  Affect data saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_affect.csv
  Phase 3 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_phase3.csv

Processing m_t dataset for Phase 3...
  Found 10 pre-affect columns and 10 post-affect columns
  Affect data saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_t_affect.csv
  Phase 3 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_t_phase3.csv

Verifying column renaming in affect files:

t_affect.csv columns check:
  Found 21 of 21 e

In [17]:
# Re-read the Phase 3 outputs from disk, in the order t, m, m_t
df_t, df_m, df_m_t = [
    pd.read_csv(processed_dir / filename, encoding='utf-8')
    for filename in ('t_phase3.csv', 'm_phase3.csv', 'm_t_phase3.csv')
]

# Claim text as used in the survey -> column name used in the claims files
claim_mapping = {
    "Government officials have manipulated stock prices to hide scandals.":
        "claim_stock_manipulation",
    "New study: Left-wingers are more likely to lie to get a higher salary.":
        "claim_left_wing_salary_lie",
    "Certain vaccines are loaded with dangerous chemicals and toxins.":
        "claim_vaccine_toxins",
    "The government is knowingly spreading disease through the airwaves and food supply.":
        "claim_government_disease_spread",
    "Attitudes toward the European Union are largely positive, both within Europe and outside it.":
        "claim_eu_positive_attitudes",
    "Hyatt will remove small bottles from hotel bathrooms by 2021.":
        "claim_hyatt_small_bottles",
    "Republicans divided in views of Trump's conduct, democrats are broadly critical.":
        "claim_republicans_trump_conduct",
    "Global warming age gap: Younger Americans most worried.":
        "claim_climate_worry_age_gap",
}

# Function to find the best matching column name
def find_matching_column(columns, claim_text):
    """Find the column that holds the answers for a given claim text.

    Strategies are tried from most to least precise:

    1. Exact equality with ``claim_text``.
    2. Equality after normalising typographic quotes, non-breaking spaces and
       runs of whitespace. This covers exports that store ``Trump’s`` where
       the mapping says ``Trump's``.
    3. The part of ``claim_text`` before the first ``:`` (the whole text when
       there is no colon) occurs inside the column name.
    4. Keyword fallback for claims mentioning "Trump" or "warming": the first
       column containing that keyword.

    Args:
        columns: Iterable of column names (e.g. ``df.columns``).
        claim_text: Claim wording to look for.

    Returns:
        The first matching column name, or ``None`` when nothing matches.
    """
    candidates = list(columns)

    punctuation = str.maketrans({
        "\u2018": "'", "\u2019": "'",   # curly single quotes / apostrophes
        "\u201c": '"', "\u201d": '"',   # curly double quotes
        "\u00a0": " ",                  # non-breaking space
    })

    def normalize(text):
        # ASCII punctuation, single spaces, no leading/trailing whitespace
        return " ".join(str(text).translate(punctuation).split())

    # 1. Exact match
    for col in candidates:
        if col == claim_text:
            return col

    # 2. Same text once typographic differences are ignored. Tried before the
    #    looser strategies so that a curly apostrophe cannot push the lookup
    #    into the keyword fallback, which may pick an unrelated column.
    wanted = normalize(claim_text)
    if wanted:
        for col in candidates:
            if normalize(col) == wanted:
                return col

    # 3. Partial match on the text before the first colon. An empty prefix
    #    would be "contained" in every column, so it is skipped.
    prefix = claim_text.split(':')[0]
    if prefix:
        for col in candidates:
            if prefix in col:
                return col

    # 4. Keyword fallbacks for specific claims
    for keyword in ("Trump", "warming"):
        if keyword in claim_text:
            for col in candidates:
                if keyword in col:
                    return col

    return None

# Function to process datasets for phase 4
def process_phase4(df, dataset_name, claim_mapping):
    """Move the claim-rating columns into their own file.

    For every entry of ``claim_mapping`` the matching column is looked up,
    first by exact name and then through ``find_matching_column``, and copied
    under its new name into a claims frame that also holds ``prolific_id``.
    The matched columns are then dropped from the main data. Two files are
    written to the notebook-level ``processed_dir``:
    ``<dataset_name>_claims.csv`` and ``<dataset_name>_phase4.csv``.

    Args:
        df: Phase 3 DataFrame; it is not modified.
        dataset_name: Dataset label used in messages and file names.
        claim_mapping: Dict mapping claim text to the new column name.

    Returns:
        tuple: ``(claims_df, df_phase4, not_found_claims)`` where
        ``not_found_claims`` lists the claim texts without a matching column.
    """
    print(f"\nProcessing {dataset_name} dataset for Phase 4...")

    # Work on a copy so the caller's frame stays untouched
    df_copy = df.copy()

    # Column count, handy when a claim cannot be located
    print(f"  Available columns in dataset: {len(df_copy.columns)}")

    # The claims frame starts with the participant ID only
    claims_df = df_copy[['prolific_id']].copy()

    extracted_claims = []
    not_found_claims = []

    for original_name, new_name in claim_mapping.items():
        if original_name in df_copy.columns:
            # Direct hit on the exact claim text
            claims_df[new_name] = df_copy[original_name]
            extracted_claims.append(original_name)
            continue

        # Fall back to the fuzzy lookup
        match = find_matching_column(df_copy.columns, original_name)
        if match:
            claims_df[new_name] = df_copy[match]
            extracted_claims.append(match)
            print(f"  Matched '{original_name}' to column '{match}'")
        else:
            not_found_claims.append(original_name)

    print(f"  Found and extracted {len(extracted_claims)} of {len(claim_mapping)} claim columns")

    if not_found_claims:
        print(f"  Could not find columns for these claims:")
        for claim in not_found_claims:
            print(f"    - {claim}")

    # Show the start of the first value of each extracted column as a sanity
    # check. A dataset without rows has no first value; indexing it with
    # iloc[0] would raise IndexError, so that case is reported instead.
    has_rows = len(df_copy) > 0
    for col in extracted_claims:
        if col in df_copy.columns:
            if has_rows:
                first_value = str(df_copy[col].iloc[0])
                print(f"  Column '{col}' found with first value: {first_value[:30]}...")
            else:
                print(f"  Column '{col}' found (dataset has no rows)")

    # Remove claim columns from main dataset
    df_phase4 = df_copy.drop(columns=extracted_claims)

    # Save claims data to separate file
    claims_file_path = processed_dir / f"{dataset_name}_claims.csv"
    claims_df.to_csv(claims_file_path, index=False, encoding='utf-8')
    print(f"  Claims data saved to {claims_file_path}")

    # Save Phase 4 processed file
    phase4_file_path = processed_dir / f"{dataset_name}_phase4.csv"
    df_phase4.to_csv(phase4_file_path, index=False, encoding='utf-8')
    print(f"  Phase 4 processed file saved to {phase4_file_path}")

    return claims_df, df_phase4, not_found_claims

# Run Phase 4 for every dataset
t_claims, df_t_phase4, t_not_found = process_phase4(df_t, 't', claim_mapping)
m_claims, df_m_phase4, m_not_found = process_phase4(df_m, 'm', claim_mapping)
m_t_claims, df_m_t_phase4, m_t_not_found = process_phase4(df_m_t, 'm_t', claim_mapping)

# Check that every claims frame carries all renamed claim columns
print("\nVerifying claim extraction in claim files:")
expected_claim_columns = ['prolific_id', *claim_mapping.values()]

for dataset_name, df in [("t", t_claims), ("m", m_claims), ("m_t", m_t_claims)]:
    print(f"\n{dataset_name}_claims.csv columns check:")

    # Split the expected names into present and absent ones
    found_columns = [col for col in expected_claim_columns if col in df.columns]
    missing_columns = [col for col in expected_claim_columns if col not in df.columns]

    # prolific_id is not a claim, hence the "- 1" on both counts
    print(f"  Found {len(found_columns) - 1} of {len(expected_claim_columns) - 1} expected claim columns")

    if not missing_columns:
        print(f"  All expected columns are present")
    else:
        print(f"  Missing columns: {missing_columns}")

# Check that no claim column is left in the Phase 4 frames
print("\nVerifying removal of claim columns in phase 4 files:")
for dataset_name, df, not_found in [
    ("t", df_t_phase4, t_not_found),
    ("m", df_m_phase4, m_not_found),
    ("m_t", df_m_t_phase4, m_t_not_found),
]:
    print(f"\n{dataset_name}_phase4.csv:")

    # The participant ID must still be there
    print(
        f"  ✓ prolific_id column is present"
        if "prolific_id" in df.columns
        else f"  ✗ prolific_id column is missing"
    )

    # Substring heuristic: any column whose name contains one of these words
    # is listed for manual review
    claim_words = ['government', 'vaccine', 'trump', 'republican', 'democrat', 'left-wing', 'eu', 'hyatt', 'warming']
    remaining_claim_cols = [
        col for col in df.columns
        if any(word.lower() in col.lower() for word in claim_words)
    ]

    if not remaining_claim_cols:
        print(f"  ✓ No obvious claim-related columns remain")
    else:
        print(f"  ⚠ Found {len(remaining_claim_cols)} possible claim-related columns that might need review:")
        for col in remaining_claim_cols[:5]:
            print(f"    - {col}")
        extra_count = len(remaining_claim_cols) - 5
        if extra_count > 0:
            print(f"    ... and {extra_count} more")

# Final summary
print("\nPhase 4 processing completed successfully.")
print("\nSummary of files created:")
print("  - t_claims.csv, m_claims.csv, m_t_claims.csv: Contain prolific_id and renamed claim columns")
print("  - t_phase4.csv, m_phase4.csv, m_t_phase4.csv: Contain remaining data with claim columns removed")


Processing t dataset for Phase 4...
  Available columns in dataset: 102
  Matched 'Republicans divided in views of Trump's conduct, democrats are broadly critical.' to column 'Republicans divided in views of Trump’s conduct, democrats are broadly critical.'
  Found and extracted 8 of 8 claim columns
  Column 'Government officials have manipulated stock prices to hide scandals.' found with first value: Real News...
  Column 'New study: Left-wingers are more likely to lie to get a higher salary.' found with first value: Fake News...
  Column 'Certain vaccines are loaded with dangerous chemicals and toxins.' found with first value: Fake News...
  Column 'The government is knowingly spreading disease through the airwaves and food supply.' found with first value: Fake News...
  Column 'Attitudes toward the European Union are largely positive, both within Europe and outside it.' found with first value: Real News...
  Column 'Hyatt will remove small bottles from hotel bathrooms by 2021.' fou

In [20]:
import re

# Folder that receives the extracted free-text feedback
feedback_dir = get_working_dir() / 'plots' / 'feedback'
feedback_dir.mkdir(parents=True, exist_ok=True)

# Re-read the Phase 4 outputs from disk, in the order t, m, m_t
df_t, df_m, df_m_t = [
    pd.read_csv(processed_dir / filename, encoding='utf-8')
    for filename in ('t_phase4.csv', 'm_phase4.csv', 'm_t_phase4.csv')
]

# Regular expressions that recognise the three kinds of feedback columns
# by their column name
optional_feedback_pattern = r"Optional.*observations.*suggestions.*concerns.*claim.*explanation"
survey_feedback_pattern = r"Do you have any comments or suggestions about this survey\?"
creative_explanation_pattern = r"Optional.*creative explanation format.*effective"

# Function to extract and write feedback to text files
def extract_feedback(df, dataset_name, feedback_dir):
    """Write the free-text feedback of one dataset to text files and drop it.

    Columns are classified by a case-insensitive search of the notebook-level
    regexes ``optional_feedback_pattern``, ``survey_feedback_pattern`` and
    ``creative_explanation_pattern`` in the column name.

    Files written:
      * ``<feedback_dir>/<dataset_name>_optional_feedback.txt`` - one section
        per optional per-claim feedback column (columns sorted by name).
      * ``<feedback_dir>/feedback_survey.txt`` - shared by all datasets. Any
        section previously written for this dataset is replaced, so running
        the cell again does not duplicate it.
      * ``<feedback_dir>/<dataset_name>_creative_explanation_feedback.txt`` -
        only when a creative-explanation column exists. The name is derived
        from the dataset so that two datasets can never overwrite each
        other's file (for 'm' this is the same file name as before).
      * ``<processed_dir>/<dataset_name>_phase5.csv`` - the data without the
        feedback columns.

    Args:
        df: Phase 4 DataFrame containing a ``prolific_id`` column; it is not
            modified.
        dataset_name: Dataset label used in messages, headers and file names.
        feedback_dir: ``Path`` of the folder for the feedback text files.

    Returns:
        tuple: ``(df_phase5, removed_columns)`` - the reduced frame and the
        list of feedback columns that were dropped.
    """
    print(f"\nExtracting feedback from {dataset_name} dataset...")

    # Work on a copy so the caller's frame stays untouched
    df_copy = df.copy()

    # Line of '=' that closes every section in the text files
    separator = "\n" + "=" * 50 + "\n\n"

    def format_entries(column, empty_message):
        """Render every non-missing answer of `column` as a participant block."""
        answered = df_copy.loc[df_copy[column].notna(), ['prolific_id', column]]
        if answered.empty:
            return empty_message
        return "".join(
            f"Participant {participant}:\n{answer}\n\n"
            for participant, answer in zip(answered['prolific_id'], answered[column])
        )

    # Classify the columns; a column is assigned to the first pattern it matches
    optional_feedback_cols = []
    survey_feedback_col = None
    creative_explanation_col = None

    for col in df_copy.columns:
        if re.search(optional_feedback_pattern, col, re.IGNORECASE):
            optional_feedback_cols.append(col)
        elif re.search(survey_feedback_pattern, col, re.IGNORECASE):
            survey_feedback_col = col
        elif re.search(creative_explanation_pattern, col, re.IGNORECASE):
            creative_explanation_col = col

    # Sort optional feedback columns (assuming they are numbered sequentially)
    optional_feedback_cols.sort()

    print(f"  Found {len(optional_feedback_cols)} optional feedback columns")
    if survey_feedback_col:
        print(f"  Found survey feedback column: {survey_feedback_col}")
    if creative_explanation_col:
        print(f"  Found creative explanation column: {creative_explanation_col}")

    # --- Optional per-claim feedback -------------------------------------
    optional_feedback_path = feedback_dir / f"{dataset_name}_optional_feedback.txt"
    with open(optional_feedback_path, 'w', encoding='utf-8') as f:
        f.write(f"===== Optional Feedback for {dataset_name.upper()} Study =====\n\n")

        for i, col in enumerate(optional_feedback_cols):
            f.write(f"===== CLAIM {i+1} FEEDBACK =====\n\n")
            f.write(format_entries(col, "No feedback provided for this claim.\n\n"))
            f.write(separator)

    print(f"  Optional feedback saved to {optional_feedback_path}")

    removed_columns = optional_feedback_cols.copy()

    # --- Survey feedback (one file shared by all datasets) ---------------
    if survey_feedback_col:
        survey_feedback_path = feedback_dir / "feedback_survey.txt"
        section_header = f"===== {dataset_name.upper()} STUDY FEEDBACK =====\n\n"

        if os.path.isfile(survey_feedback_path):
            with open(survey_feedback_path, 'r', encoding='utf-8') as f:
                existing = f.read()
        else:
            existing = "===== SURVEY FEEDBACK FROM ALL STUDIES =====\n\n"

        # Remove every section an earlier run wrote for this dataset. A
        # section runs from its header to the next separator line; this
        # assumes no answer contains such a 50-character '=' line itself.
        start = existing.find(section_header)
        while start != -1:
            end = existing.find(separator, start)
            end = len(existing) if end == -1 else end + len(separator)
            existing = existing[:start] + existing[end:]
            start = existing.find(section_header)

        section = (
            section_header
            + format_entries(survey_feedback_col, "No survey feedback provided.\n\n")
            + separator
        )
        with open(survey_feedback_path, 'w', encoding='utf-8') as f:
            f.write(existing + section)

        print(f"  Survey feedback appended to {survey_feedback_path}")
        removed_columns.append(survey_feedback_col)

    # --- Creative explanation feedback -----------------------------------
    if creative_explanation_col:
        creative_feedback_path = feedback_dir / f"{dataset_name}_creative_explanation_feedback.txt"

        with open(creative_feedback_path, 'w', encoding='utf-8') as f:
            f.write("===== CREATIVE EXPLANATION FORMAT FEEDBACK =====\n\n")
            f.write(format_entries(
                creative_explanation_col,
                "No creative explanation feedback provided.\n\n",
            ))

        print(f"  Creative explanation feedback saved to {creative_feedback_path}")
        removed_columns.append(creative_explanation_col)

    # Create Phase 5 file by removing feedback columns
    df_phase5 = df_copy.drop(columns=removed_columns)

    # Save Phase 5 processed file
    phase5_file_path = processed_dir / f"{dataset_name}_phase5.csv"
    df_phase5.to_csv(phase5_file_path, index=False, encoding='utf-8')
    print(f"  Phase 5 processed file saved to {phase5_file_path}")

    return df_phase5, removed_columns

# Run the Phase 5 extraction once per study. The call order is kept as-is because
# every call appends its section to the shared survey feedback file.
df_t_phase5, t_removed = extract_feedback(df_t, 't', feedback_dir)
df_m_phase5, m_removed = extract_feedback(df_m, 'm', feedback_dir)
df_m_t_phase5, m_t_removed = extract_feedback(df_m_t, 'm_t', feedback_dir)

# Sanity check: prolific_id must survive, and no column name may still look like feedback
print("\nVerifying removal of feedback columns in Phase 5 files:")
for dataset_name, df, removed in (
    ("t", df_t_phase5, t_removed),
    ("m", df_m_phase5, m_removed),
    ("m_t", df_m_t_phase5, m_t_removed),
):
    print(f"\n{dataset_name}_phase5.csv:")

    # The participant identifier has to remain in the Phase 5 frame
    if "prolific_id" not in df.columns:
        print(f"  ✗ prolific_id column is missing")
    else:
        print(f"  ✓ prolific_id column is present")

    # Any column still matching one of the three feedback patterns is suspicious
    remaining_feedback_cols = [
        col
        for col in df.columns
        if any(
            re.search(pattern, col, re.IGNORECASE)
            for pattern in (
                optional_feedback_pattern,
                survey_feedback_pattern,
                creative_explanation_pattern,
            )
        )
    ]

    if not remaining_feedback_cols:
        print(f"  ✓ All feedback columns have been removed")
    else:
        print(f"  ⚠ Found {len(remaining_feedback_cols)} possible feedback columns that might need review:")
        for col in remaining_feedback_cols:
            print(f"    - {col}")

    print(f"  Removed {len(removed)} feedback columns total")

# Closing summary of everything this phase wrote
for line in (
    "\nPhase 5 processing completed successfully.",
    "\nSummary of files created:",
    "  - t_phase5.csv, m_phase5.csv, m_t_phase5.csv: Contain remaining data with feedback columns removed",
    "\nFeedback files created:",
    "  - t_optional_feedback.txt: Optional feedback for Text study",
    "  - m_optional_feedback.txt: Optional feedback for Meme study",
    "  - m_t_optional_feedback.txt: Optional feedback for Meme+Context study",
    "  - feedback_survey.txt: Combined survey feedback from all studies",
    "  - m_creative_explanation_feedback.txt: Creative explanation format feedback from Meme study",
):
    print(line)


Extracting feedback from t dataset...
  Found 10 optional feedback columns
  Found survey feedback column: Do you have any comments or suggestions about this survey? Your feedback will help us improve future studies.
  Optional feedback saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/results/feedback/t_optional_feedback.txt
  Survey feedback appended to /Users/sergiopinto/Desktop/MemeFact/meta_study/results/feedback/feedback_survey.txt
  Phase 5 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/t_phase5.csv

Extracting feedback from m dataset...
  Found 10 optional feedback columns
  Found survey feedback column: Do you have any comments or suggestions about this survey? Your feedback will help us improve future studies.
  Found creative explanation column: Optional: What do you think makes a creative explanation format effective for explaining claims?
  Optional feedback saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/results/feedba

In [23]:
# Reload the Phase 5 CSVs from processed_dir as the input for Phase 6.
# NOTE: this rebinds df_t / df_m / df_m_t, so from here on those names refer to
# the Phase 5 data rather than to whatever they held in earlier cells.
df_t = pd.read_csv(processed_dir / 't_phase5.csv', encoding='utf-8')
df_m = pd.read_csv(processed_dir / 'm_phase5.csv', encoding='utf-8')
df_m_t = pd.read_csv(processed_dir / 'm_t_phase5.csv', encoding='utf-8')

# Function to process datasets for phase 6
def process_phase6(df, dataset_name, output_dir=None, n_claims=10):
    """Rename per-claim question columns to ``claim<N>_<measure>`` and save the Phase 6 CSV.

    The repeated survey questions are told apart by the numeric ``.N`` suffix at the
    end of the column name (presumably added by pandas when reading duplicated CSV
    headers -- confirm against the raw export). A column without suffix counts as
    suffix 0. For the 0-based claim index ``i``:

    * pre-correction accuracy / confidence / engagement use suffix ``2 * i``
    * post-correction accuracy / confidence / engagement use suffix ``2 * i + 1``
    * explainability / credibility of the correction use suffix ``i``

    Attention check columns are dropped. All remaining columns are kept in their
    original order, followed by the renamed claim columns grouped per claim.

    Args:
        df: Phase 5 dataframe; it is not modified.
        dataset_name: Study identifier. ``'m_t'`` switches to the
            "explanation and the context" wording of the correction questions; it is
            also used for the output file name ``<dataset_name>_phase6.csv``.
        output_dir: Directory to write the CSV to. Defaults to the notebook-level
            ``processed_dir``.
        n_claims: Number of claims to look for (default 10).

    Returns:
        Tuple ``(df_final, summary)`` where ``summary`` holds the counts
        ``'renamed'``, ``'removed'`` and ``'claims_processed'``.
    """
    print(f"\nProcessing {dataset_name} dataset for Phase 6...")

    # Question texts shared by all studies
    accuracy_pattern = r"How accurate do you think the claim is\?(\.[\d]+)?"
    confidence_pattern = r"How confident are you in your accuracy assessment of the claim\?(\.[\d]+)?"
    engagement_pattern = r"If you were to encounter the claim on social media, how likely would you be to interact with content containing it\?(\.[\d]+)?"

    # Correction assessment questions are worded differently in the Meme+Context study
    if dataset_name == 'm_t':
        explainability_pattern = r"How well do the explanation and the context convey their stance on the claim\?(\.[\d]+)?"
        credibility_pattern = r"How credible do you find the explanation and the context\?(\.[\d]+)?"
    else:
        explainability_pattern = r"How well does the explanation convey its stance on the claim\?(\.[\d]+)?"
        credibility_pattern = r"How credible do you find the explanation\?(\.[\d]+)?"

    attention_check_pattern = r"How carefully are you reading this question\?(\.[\d]+)?"

    def find_columns(pattern):
        """Return the columns of ``df`` whose name matches ``pattern`` (case-insensitive)."""
        return [col for col in df.columns if re.search(pattern, col, re.IGNORECASE)]

    accuracy_cols = find_columns(accuracy_pattern)
    confidence_cols = find_columns(confidence_pattern)
    engagement_cols = find_columns(engagement_pattern)
    explainability_cols = find_columns(explainability_pattern)
    credibility_cols = find_columns(credibility_pattern)
    attention_check_cols = find_columns(attention_check_pattern)

    print(f"  Found {len(accuracy_cols)} accuracy columns")
    print(f"  Found {len(confidence_cols)} confidence columns")
    print(f"  Found {len(engagement_cols)} engagement columns")
    print(f"  Found {len(explainability_cols)} explainability columns")
    print(f"  Found {len(credibility_cols)} credibility columns")
    print(f"  Found {len(attention_check_cols)} attention check columns to remove")

    def get_suffix_num(col_name):
        """Return the trailing ``.N`` number of a column name, or 0 when there is none."""
        match = re.search(r"\.(\d+)$", col_name)
        return int(match.group(1)) if match else 0

    def first_with_suffix(columns, suffix):
        """Return the first column carrying ``suffix``, or None when there is none."""
        return next((col for col in columns if get_suffix_num(col) == suffix), None)

    # Group the source columns per claim. The first claim needs no special case:
    # its suffixes (0, 1 and 0) are what the general formulas yield for index 0.
    claim_columns = []
    for claim_idx in range(n_claims):
        suffix_pre = 2 * claim_idx
        suffix_post = suffix_pre + 1
        candidates = {
            'pre_accuracy': first_with_suffix(accuracy_cols, suffix_pre),
            'pre_confidence': first_with_suffix(confidence_cols, suffix_pre),
            'pre_engagement': first_with_suffix(engagement_cols, suffix_pre),
            'post_accuracy': first_with_suffix(accuracy_cols, suffix_post),
            'post_confidence': first_with_suffix(confidence_cols, suffix_post),
            'post_engagement': first_with_suffix(engagement_cols, suffix_post),
            'correction_explainability': first_with_suffix(explainability_cols, claim_idx),
            'correction_credibility': first_with_suffix(credibility_cols, claim_idx),
        }
        claim_group = {col_type: col for col_type, col in candidates.items() if col is not None}

        # Only keep claims for which at least one column was found
        if claim_group:
            claim_columns.append((claim_idx + 1, claim_group))

    rename_mapping = {
        col_name: f"claim{claim_num}_{col_type}"
        for claim_num, columns in claim_columns
        for col_type, col_name in columns.items()
    }

    # rename() and drop() both return new frames, so the caller's df stays untouched
    df_without_attention = df.rename(columns=rename_mapping).drop(columns=attention_check_cols)

    # Every column that was NOT produced by the renaming is carried over unchanged.
    # Selecting by the 'claim' name prefix instead would silently discard unrelated
    # columns that merely start with "claim" (and all claim columns of a frame that
    # was already processed once).
    renamed_cols = set(rename_mapping.values())
    non_claim_cols = [col for col in df_without_attention.columns if col not in renamed_cols]

    # Desired order of the measures within each claim
    claim_col_order = (
        'pre_accuracy',
        'pre_confidence',
        'pre_engagement',
        'post_accuracy',
        'post_confidence',
        'post_engagement',
        'correction_explainability',
        'correction_credibility',
    )
    ordered_cols = non_claim_cols + [
        f"claim{claim_num}_{col_type}"
        for claim_num, columns in claim_columns
        for col_type in claim_col_order
        if col_type in columns
    ]
    df_final = df_without_attention[ordered_cols]

    # Save the processed file (falls back to the notebook-level processed_dir)
    target_dir = processed_dir if output_dir is None else Path(output_dir)
    phase6_file_path = target_dir / f"{dataset_name}_phase6.csv"
    df_final.to_csv(phase6_file_path, index=False, encoding='utf-8')
    print(f"  Phase 6 processed file saved to {phase6_file_path}")

    return df_final, {
        'renamed': len(rename_mapping),
        'removed': len(attention_check_cols),
        'claims_processed': len(claim_columns)
    }

# Process each dataset
df_t_phase6, t_summary = process_phase6(df_t, 't')
df_m_phase6, m_summary = process_phase6(df_m, 'm')
df_m_t_phase6, m_t_summary = process_phase6(df_m_t, 'm_t')

# Final summary
print("\nPhase 6 processing completed successfully.")
print("\nSummary of changes:")
print(f"  t dataset: {t_summary['renamed']} columns renamed, {t_summary['removed']} columns removed, {t_summary['claims_processed']} claims processed")
print(f"  m dataset: {m_summary['renamed']} columns renamed, {m_summary['removed']} columns removed, {m_summary['claims_processed']} claims processed")
print(f"  m_t dataset: {m_t_summary['renamed']} columns renamed, {m_t_summary['removed']} columns removed, {m_t_summary['claims_processed']} claims processed")

print("\nColumn order in final files:")
print("  1. Non-claim columns (including prolific_id)")
print("  2. For each claim (1 through 10), in order:")
print("     - Pre-assessment columns (pre_accuracy, pre_confidence, pre_engagement)")
print("     - Post-assessment columns (post_accuracy, post_confidence, post_engagement)")
print("     - Correction assessment columns (correction_explainability, correction_credibility)")

# Verify final column structure
if df_t_phase6.shape[0] > 0:
    # Map each claim<N>_<measure> column to its claim number. The anchored match skips
    # columns that merely start with "claim" but carry no number, which would otherwise
    # make the number extraction fail with an AttributeError on a None match.
    claim_col_nums = {}
    for col in df_t_phase6.columns:
        claim_match = re.match(r'claim(\d+)_', col)
        if claim_match:
            claim_col_nums[col] = int(claim_match.group(1))

    claim_cols = list(claim_col_nums)
    if claim_cols:
        print("\nExample of column structure in t_phase6.csv:")
        claim_nums = sorted(set(claim_col_nums.values()))
        for claim_num in claim_nums[:2]:  # Show first two claims
            cols = [col for col, num in claim_col_nums.items() if num == claim_num]
            print(f"\nClaim {claim_num} columns:")
            for col in cols:
                print(f"  - {col}")
        if len(claim_nums) > 2:
            print(f"  ...and {len(claim_nums)-2} more claims")


Processing t dataset for Phase 6...
  Found 20 accuracy columns
  Found 20 confidence columns
  Found 20 engagement columns
  Found 10 explainability columns
  Found 10 credibility columns
  Found 2 attention check columns to remove
  Phase 6 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/t_phase6.csv

Processing m dataset for Phase 6...
  Found 20 accuracy columns
  Found 20 confidence columns
  Found 20 engagement columns
  Found 10 explainability columns
  Found 10 credibility columns
  Found 2 attention check columns to remove
  Phase 6 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_phase6.csv

Processing m_t dataset for Phase 6...
  Found 20 accuracy columns
  Found 20 confidence columns
  Found 20 engagement columns
  Found 10 explainability columns
  Found 10 credibility columns
  Found 2 attention check columns to remove
  Phase 6 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_stu

In [24]:
# Phase 7 starts from the Phase 6 output of the 'm' dataset; this rebinds df_m.
df_m = pd.read_csv(processed_dir / 'm_phase6.csv', encoding='utf-8')

# Regex patterns identifying the two creative explanation preference columns.
# process_phase7 reads them as globals and applies them with a case-insensitive
# re.search, so a pattern only has to match part of the column name.
most_effective_pattern = r"Which of these creative explanations do you find most effective at explaining this specific claim\?"
creative_likeability_pattern = r"On a scale from 1 to 5, how much would you like to see more fact-checking content that uses creative formats"

# Process the dataset for phase 7
def process_phase7(df, dataset_name='m'):
    """Split the creative explanation preference columns off into their own CSV.

    Columns are located with the notebook-level ``most_effective_pattern`` and
    ``creative_likeability_pattern`` (case-insensitive search). When several columns
    match the same pattern, the last one wins. Two files are written to the
    notebook-level ``processed_dir``:

    * ``<dataset_name>_creative_preferences.csv`` -- ``prolific_id`` plus the
      preference columns that were found, under short names
    * ``<dataset_name>_phase7.csv`` -- the input without those columns

    Args:
        df: Phase 6 dataframe; must contain a ``prolific_id`` column.
        dataset_name: Prefix used for the two output file names.

    Returns:
        Tuple ``(df_phase7, creative_prefs_df, n_removed)``.
    """
    print(f"\nProcessing {dataset_name} dataset for Phase 7...")

    working = df.copy()

    # A column counts as "likeability" only if it did not already match "most effective"
    effective_hits = []
    likeability_hits = []
    for name in working.columns:
        if re.search(most_effective_pattern, name, re.IGNORECASE):
            effective_hits.append(name)
        elif re.search(creative_likeability_pattern, name, re.IGNORECASE):
            likeability_hits.append(name)

    # The last match per pattern is the one that is extracted
    most_effective_col = effective_hits[-1] if effective_hits else None
    creative_likeability_col = likeability_hits[-1] if likeability_hits else None

    # The preferences table always carries the participant id
    creative_prefs_df = working[['prolific_id']].copy()
    removed_columns = []

    extraction_plan = (
        (most_effective_col, 'most_effective_creative_explanation', 'most effective creative explanation'),
        (creative_likeability_col, 'creative_explanations_likeability', 'creative explanations likeability'),
    )
    for source_col, short_name, label in extraction_plan:
        if not source_col:
            print(f"  Warning: Could not find '{label}' column")
            continue
        print(f"  Found '{label}' column: {source_col}")
        creative_prefs_df[short_name] = working[source_col]
        removed_columns.append(source_col)

    # Whatever was extracted is removed from the main table
    df_phase7 = working.drop(columns=removed_columns)

    # Write the preferences first, then the reduced Phase 7 table
    creative_prefs_path = processed_dir / f"{dataset_name}_creative_preferences.csv"
    creative_prefs_df.to_csv(creative_prefs_path, index=False, encoding='utf-8')
    print(f"  Creative preferences saved to {creative_prefs_path}")

    phase7_file_path = processed_dir / f"{dataset_name}_phase7.csv"
    df_phase7.to_csv(phase7_file_path, index=False, encoding='utf-8')
    print(f"  Phase 7 processed file saved to {phase7_file_path}")

    return df_phase7, creative_prefs_df, len(removed_columns)

# Run Phase 7 on the 'm' dataset
df_m_phase7, df_m_creative_prefs, columns_removed = process_phase7(df_m)

# Report what was produced
print("\nPhase 7 processing completed successfully.")
print(f"\nRemoved {columns_removed} creative preference columns from 'm_phase6.csv'")
print("Created files:")
print("  - m_phase7.csv: Contains all columns from m_phase6.csv except creative preference columns")
print("  - m_creative_preferences.csv: Contains prolific_id and renamed creative preference columns")

# Inspect the preferences table: its shape plus up to five distinct values per column
if df_m_creative_prefs.shape[0] > 0:
    print("\nStructure of m_creative_preferences.csv:")
    print(f"  Rows: {df_m_creative_prefs.shape[0]}")
    print(f"  Columns: {df_m_creative_prefs.columns.tolist()}")

    for pref_col in ('most_effective_creative_explanation', 'creative_explanations_likeability'):
        # A preference column is only present when process_phase7 found its source
        if pref_col not in df_m_creative_prefs.columns:
            continue
        unique_values = df_m_creative_prefs[pref_col].unique()
        if len(unique_values) == 0:
            continue
        print(f"\nSample unique values for '{pref_col}':")
        for val in unique_values[:5]:
            print(f"  - {val}")
        if len(unique_values) > 5:
            print(f"  ...and {len(unique_values)-5} more unique values")


Processing m dataset for Phase 7...
  Found 'most effective creative explanation' column: Which of these creative explanations do you find most effective at explaining this specific claim?
  Found 'creative explanations likeability' column: On a scale from 1 to 5, how much would you like to see more fact-checking content that uses creative formats (like poems, jokes, or memes) instead of traditional text-based explanations?
  Creative preferences saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_creative_preferences.csv
  Phase 7 processed file saved to /Users/sergiopinto/Desktop/MemeFact/meta_study/data/processed/m_phase7.csv

Phase 7 processing completed successfully.

Removed 2 creative preference columns from 'm_phase6.csv'
Created files:
  - m_phase7.csv: Contains all columns from m_phase6.csv except creative preference columns
  - m_creative_preferences.csv: Contains prolific_id and renamed creative preference columns

Structure of m_creative_preferences.c