This file includes the analysis conducted on the pornography consumption survey data.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib.backends.backend_pdf import PdfPages
import statsmodels.api as sm
from statsmodels.formula.api import ols

## Load in and Clean the Data

In [None]:
# Load the raw survey export, then keep only rows from consenting participants.
data = pd.read_csv("")  # File path can be added here for confidentiality.

# A row is kept when its Status is not "Survey Preview" (non-participant rows)
# and the participant explicitly agreed to take part in the research.
data = data[
    (data['Status'] != "Survey Preview")
    & (data['Consent'] == "I agree to participate in the research")
]

In [None]:
# Clean the data: drop metadata columns, mark blanks as missing, rename questions.

# Metadata and click-tracking columns that the analysis does not use
columns_to_drop = [
    "UserLanguage", "StartDate", "EndDate", "Status", "DistributionChannel",
    "Q_RecaptchaScore",
    "Q37_Click.Count", "Q38_Click.Count", "Q39_Click.Count",
    "Q37_Last.Click", "Q38_Last.Click", "Q39_Last.Click",
]
data = data.drop(columns=columns_to_drop)

# Empty-string entries become NaN so later missing-value handling sees them
data = data.replace("", np.nan)

# Map raw question codes to descriptive column names.
# NOTE(review): later cells read columns such as 'freq_masturbation', 'freq_porn',
# 'freq_sex', 'speak_sex' and 'comfort_scenario_Sam', none of which appear among
# the target names below -- confirm which of these names the analysis should use.
rename_dict = {
    # Demographics
    'Q1': 'age',
    'Q2': 'ethnicity',
    'Q3_1': 'religious_affiliation',
    'Q4': 'education_completed',
    'Q5': 'education_region',
    'Q6': 'alcohol_per_week',
    'Q7': 'gender_identity',
    # Pre-test and group timing
    'Q40_1': 'pre_test_feelings',
    'Q37_First.Click': 'control_group_start',
    'Q37_Page.Submit': 'control_group_end',
    'Q38_First.Click': 'info_group_start',
    'Q38_Page.Submit': 'info_group_end',
    'Q39_First.Click': 'pre_test_start',
    'Q39_Page.Submit': 'pre_test_end',
    'Q36_1': 'info_group_feelings',
    # Scenario responses
    'Q10': 'discomfort_scenario_A',
    'Q11': 'discomfort_scenario_B',
    'Q12': 'discomfort_scenario_C',
    'Q13': 'discomfort_scenario_D',
    'Q14': 'discomfort_scenario_E',
    'Q15': 'discomfort_scenario_F',
    'Q16': 'unclear_scenario_G',
    # Personal history and behaviour
    'Q16.1': 'num_partners',
    'Q17_1': 'sexual_orientation',
    'Q17_1.1': 'frequency_of_sex',
    'Q18': 'age_first_exposure',
    'Q19_1': 'incident_report',
    'Q20': 'relationship_status',
    'Q21_1': 'frequency_of_activity_A',
    'Q22_1': 'frequency_of_activity_B',
    'Q23_1': 'activity_A_without_activity_B',
    # Attitudes and comfort discussing the topics
    'Q24_1': 'comfort_discussing_topic_A',
    'Q25_1': 'discomfort_discussing_topic_A',
    'Q26_1': 'never_discussed_activity_A',
    'Q27_1': 'comfort_discussing_activity_B',
    'Q28_1': 'activity_B_with_partner',
    'Q29_1': 'unaware_activity_B_made',
    'Q30_1': 'guilt_feelings',
    'Q31_1': 'appeal_level',
    'Q32_1': 'awareness_of_creation',
    'Q33_1': 'education_about_topic',
    'Q34_1': 'friends_comfortable',
    'Q35_1': 'people_comfortable',
}

# Apply renaming
data = data.rename(columns=rename_dict)

## Look at summary stats for the sample

In [None]:
# Summarise the sample and write every summary table, one after another, to a
# single CSV file.

# Create an empty dictionary to store all the results
results = {}


def _counts_to_frame(counts, label_column, value_column):
    """Convert a value_counts() Series into a two-column DataFrame.

    The category labels go in ``label_column`` and the counts or percentages in
    ``value_column``. Both names are set explicitly: the column names a bare
    reset_index() produces changed in pandas 2.0, so renaming 'index' and the
    source column name afterwards labels the wrong column on newer versions.
    """
    return counts.rename_axis(label_column).reset_index(name=value_column)


def _numeric_summary(series, label):
    """Return mean, median, std, min and max of ``series`` keyed by statistic name."""
    return {
        f'Mean {label}': series.mean(),
        f'Median {label}': series.median(),
        f'Standard Deviation of {label}': series.std(),
        f'Minimum {label}': series.min(),
        f'Maximum {label}': series.max(),
    }


# Gender distribution (raw counts)
gender_counts = data['gender_identity'].value_counts()
results['Gender Distribution'] = gender_counts

# Race distribution with percentage, largest group first
race_counts = data['ethnicity'].value_counts(normalize=True) * 100
race_counts = race_counts.sort_values(ascending=False)
results['Race Distribution (%)'] = race_counts

# Age distribution: replace "24 or older" with 24 and convert to numeric.
# The result is assigned back to the column; replace(..., inplace=True) on
# data['age'] is chained assignment and does not reliably update `data`.
data['age'] = data['age'].replace("24 or older", 24)
data['age'] = pd.to_numeric(data['age'], errors='coerce')

# Summary statistics for age
age_summary = _numeric_summary(data['age'], 'Age')
results.update(age_summary)

# Religious affiliation summary statistics
data['religious_affiliation'] = pd.to_numeric(data['religious_affiliation'], errors='coerce')
religious_summary = _numeric_summary(data['religious_affiliation'], 'Religious Affiliation')
results.update(religious_summary)

# College completion distribution
college_completed_counts = data['education_completed'].value_counts(normalize=True) * 100
results['College Completed Distribution (%)'] = college_completed_counts

# College region distribution
college_region_counts = data['education_region'].value_counts(normalize=True) * 100
results['College Region Distribution (%)'] = college_region_counts

# Filter data for participants who completed the survey
data_complete = data[data['Progress'] == 100]

# Sexual orientation distribution (completed surveys only)
sexuality_counts = data_complete['sexual_orientation'].value_counts(normalize=True) * 100
results['Sexual Orientation Distribution (%)'] = sexuality_counts

# Distribution of missing values in 'sexual_orientation' column by gender
na_sexuality = data_complete[data_complete['sexual_orientation'].isna()]
na_sexuality_gender_counts = na_sexuality['gender_identity'].value_counts(normalize=True) * 100
results['Missing Sexuality by Gender Distribution (%)'] = na_sexuality_gender_counts

# Assault history by gender (computed on all rows, not only completed surveys)
assault_female = data[data['gender_identity'] == 'Cisgender female']['incident_report'].value_counts(normalize=True) * 100
assault_male = data[data['gender_identity'] == 'Cisgender male']['incident_report'].value_counts(normalize=True) * 100
results['Assault History (Female %)'] = assault_female
results['Assault History (Male %)'] = assault_male

# The tables have different lengths, so each one becomes its own DataFrame and
# they are written sequentially to the same CSV.
gender_df = _counts_to_frame(gender_counts, 'Gender', 'Count')
race_df = _counts_to_frame(race_counts, 'Race', 'Percentage')
age_stats = pd.DataFrame({
    'Statistic': list(age_summary),
    'Value': list(age_summary.values()),
})
religious_stats = pd.DataFrame({
    'Statistic': list(religious_summary),
    'Value': list(religious_summary.values()),
})
college_completed_df = _counts_to_frame(college_completed_counts, 'Education Completed', 'Percentage')
college_region_df = _counts_to_frame(college_region_counts, 'Education Region', 'Percentage')
sexuality_df = _counts_to_frame(sexuality_counts, 'Sexual Orientation', 'Percentage')
na_sexuality_gender_df = _counts_to_frame(na_sexuality_gender_counts, 'Gender', 'Missing Sexuality (%)')
assault_female_df = _counts_to_frame(assault_female, 'Incident Report', 'Percentage (Female)')
assault_male_df = _counts_to_frame(assault_male, 'Incident Report', 'Percentage (Male)')

# Write all DataFrames to the same file. newline='' lets the CSV writer control
# line endings itself, which avoids blank lines between rows on Windows.
with open('survey_summary.csv', 'w', newline='') as f:
    for _table in (
        gender_df,
        race_df,
        age_stats,
        religious_stats,
        college_completed_df,
        college_region_df,
        sexuality_df,
        na_sexuality_gender_df,
        assault_female_df,
        assault_male_df,
    ):
        _table.to_csv(f, index=False)

print("All results saved to 'survey_summary.csv'.")


In [None]:
#trends in sex-related variables
# NOTE(review): this cell reads 'freq_masturbation', 'freq_porn', 'freq_sex' and
# 'mast_no_porn'; none of these are target names in the column renaming earlier
# in the file -- confirm the columns exist under these names before running.

# Create an empty dictionary to store all results
results = {}


def _z_test_against_female_mean(male_values, female_values):
    """Compare the male sample mean with the female sample mean.

    Returns ``(male_mean, male_se, female_mean, z, p)``. ``z`` is the difference
    in means divided by the standard error of the male mean, and ``p`` is the
    two-sided p-value under the standard normal distribution.

    NOTE(review): the female mean is used as a fixed reference value, so the
    female sample's own sampling error is not part of the denominator. Confirm
    this one-sample form is intended rather than a two-sample test.
    """
    male_mean = male_values.mean()
    male_se = male_values.std() / np.sqrt(len(male_values))
    female_mean = female_values.mean()
    z = (male_mean - female_mean) / male_se
    p = 2 * (1 - stats.norm.cdf(abs(z)))
    return male_mean, male_se, female_mean, z, p


# Subset participants who did not report masturbation frequency
# (taken before numeric coercion, so only genuinely empty entries count)
data_no_mast = data[data['freq_masturbation'].isna()]

# Proportion of each gender with no entry in 'freq_masturbation'.
# Axis and value names are set explicitly so the output columns are labelled
# correctly regardless of how the pandas version names value_counts() results.
prop_na_by_gender = data_no_mast['gender_identity'].value_counts(normalize=True) * 100
prop_na_by_gender_df = prop_na_by_gender.rename_axis('Gender').reset_index(name='Percentage of No Entry')
prop_na_by_gender_df.to_csv('proportion_no_entry_freq_masturbation_by_gender.csv', index=False)

# Convert frequency columns to numeric BEFORE the gender subsets are copied
# below; converting 'freq_sex' afterwards would leave the copies unconverted
# and the per-gender correlations would run on the raw column.
data['freq_masturbation'] = pd.to_numeric(data['freq_masturbation'], errors='coerce')
data['freq_sex'] = pd.to_numeric(data['freq_sex'], errors='coerce')
# NOTE(review): 'freq_porn' is never coerced to numeric in this cell although
# means and correlations are computed on it -- confirm it is read as numeric.

# Calculate average frequency of masturbation by gender
avg_freq_by_gender = data.groupby('gender_identity')['freq_masturbation'].mean()
avg_freq_by_gender_df = pd.DataFrame(avg_freq_by_gender).reset_index().rename(columns={'freq_masturbation': 'Average Frequency'})
avg_freq_by_gender_df.to_csv('average_freq_masturbation_by_gender.csv', index=False)

# Subset data by gender
data_female = data[data['gender_identity'] == 'Cisgender female'].copy()
data_male = data[data['gender_identity'] == 'Cisgender male'].copy()

# Remove rows with missing 'freq_masturbation' values
data_male = data_male.dropna(subset=['freq_masturbation'])
data_female = data_female.dropna(subset=['freq_masturbation'])

# Male vs. female masturbation frequency
(sample_mean_male, se_male, sample_mean_female,
 z_score, p_value) = _z_test_against_female_mean(
    data_male['freq_masturbation'], data_female['freq_masturbation'])

z_mast_df = pd.DataFrame({
    'Comparison': ['Masturbation Frequency'],
    'Z-Score': [z_score],
    'P-Value': [p_value]
})
z_mast_df.to_csv('z_score_p_value_masturbation.csv', index=False)

# Create 'in_relationship' column: 1 for 'Yes', NaN when unanswered, 0 otherwise
data['in_relationship'] = np.where(data['relationship_status'] == 'Yes', 1,
                                   np.where(pd.isna(data['relationship_status']), np.nan, 0))

# Calculate average frequency of masturbation by relationship status and gender
avg_freq_by_rel_gender = data.groupby(['in_relationship', 'gender_identity'])['freq_masturbation'].mean()
avg_freq_by_rel_gender_df = pd.DataFrame(avg_freq_by_rel_gender).reset_index().rename(columns={'freq_masturbation': 'Average Frequency'})
avg_freq_by_rel_gender_df.to_csv('average_freq_by_relationship_status_and_gender.csv', index=False)

# Percentage of participants at each 'freq_porn' value
freq_porn_distribution = data['freq_porn'].value_counts(normalize=True) * 100
freq_porn_distribution_df = freq_porn_distribution.rename_axis('Porn Frequency').reset_index(name='Percentage')
freq_porn_distribution_df.to_csv('porn_frequency_distribution.csv', index=False)

# Remove rows with missing 'freq_porn'
data_male = data_male.dropna(subset=['freq_porn'])
data_female = data_female.dropna(subset=['freq_porn'])

# Male vs. female porn frequency
(sample_mean_porn_male, se_porn_male, sample_mean_porn_female,
 z_score_porn, p_value_porn) = _z_test_against_female_mean(
    data_male['freq_porn'], data_female['freq_porn'])

z_porn_df = pd.DataFrame({
    'Comparison': ['Porn Frequency'],
    'Z-Score': [z_score_porn],
    'P-Value': [p_value_porn]
})
z_porn_df.to_csv('z_score_p_value_porn.csv', index=False)

# Male correlation between 'freq_sex' and 'freq_masturbation'
male_corr = data_male[['freq_sex', 'freq_masturbation']].dropna().corr().loc['freq_sex', 'freq_masturbation']
male_corr_df = pd.DataFrame({'Male Correlation (freq_sex and freq_masturbation)': [male_corr]})
male_corr_df.to_csv('male_correlation_freq_sex_freq_masturbation.csv', index=False)

# Female correlation between 'freq_sex' and 'freq_porn'
female_corr = data_female[['freq_sex', 'freq_porn']].dropna().corr().loc['freq_sex', 'freq_porn']
female_corr_df = pd.DataFrame({'Female Correlation (freq_sex and freq_porn)': [female_corr]})
female_corr_df.to_csv('female_correlation_freq_sex_freq_porn.csv', index=False)

# Correlation between 'freq_porn' and 'freq_masturbation' across all participants
data_complete_porn = data.dropna(subset=['freq_porn', 'freq_masturbation'])
porn_mast_corr = data_complete_porn[['freq_porn', 'freq_masturbation']].corr().loc['freq_porn', 'freq_masturbation']
porn_mast_corr_df = pd.DataFrame({'Correlation between freq_porn and freq_masturbation': [porn_mast_corr]})
porn_mast_corr_df.to_csv('correlation_freq_porn_freq_masturbation.csv', index=False)

# Analyze 'mast_no_porn' column. The subset is copied so the numeric conversion
# writes to an independent frame rather than to a slice of `data`.
data_complete_mast_no_porn = data.dropna(subset=['mast_no_porn']).copy()
data_complete_mast_no_porn['mast_no_porn'] = pd.to_numeric(data_complete_mast_no_porn['mast_no_porn'], errors='coerce')
mast_no_porn_summary_df = data_complete_mast_no_porn['mast_no_porn'].describe()
# Keep the index: it holds the statistic names (count, mean, std, ...), without
# which the saved numbers are unlabelled.
mast_no_porn_summary_df.to_csv('mast_no_porn_summary.csv')

print("All results saved to CSV files.")


In [None]:
# Distributions, per-gender summaries and male-vs-female z-scores for the
# attitude questions. Each result is written to its own CSV file.


def _coerce_and_subset(column):
    """Coerce ``data[column]`` to numeric in place and return the rows where it is present."""
    data[column] = pd.to_numeric(data[column], errors='coerce')
    return data.dropna(subset=[column])


def _percent_distribution(frame, column):
    """Return the percentage of rows in ``frame`` at each value of ``column``."""
    return frame[column].value_counts(normalize=True) * 100


def _split_cis_gender(frame):
    """Return the (cisgender male, cisgender female) row subsets of ``frame``."""
    males = frame[frame['gender_identity'] == 'Cisgender male']
    females = frame[frame['gender_identity'] == 'Cisgender female']
    return males, females


def _gender_z_test(male_frame, female_frame, column):
    """Return (male_mean, male_se, female_mean, z, p) for ``column``.

    z is the male-minus-female difference in means divided by the standard
    error of the male mean; p is the two-sided normal p-value.
    """
    male_mean = male_frame[column].mean()
    male_se = male_frame[column].std() / np.sqrt(len(male_frame))
    female_mean = female_frame[column].mean()
    z = (male_mean - female_mean) / male_se
    p = 2 * (1 - stats.norm.cdf(abs(z)))
    return male_mean, male_se, female_mean, z, p


# --- Age when participants first viewed adult material ---
age_viewed_distribution = data['age_first_exposure'].value_counts(normalize=True) * 100
age_viewed_distribution.sort_values(ascending=False).to_csv('age_viewed_distribution.csv')

# --- 'speak_sex' ---
data_speak_sex = _coerce_and_subset('speak_sex')
speak_sex_distribution = _percent_distribution(data_speak_sex, 'speak_sex')
speak_sex_distribution.to_csv('speak_sex_distribution.csv')

# --- 'speak_porn' ---
data_speak_porn = _coerce_and_subset('speak_porn')
speak_porn_distribution = _percent_distribution(data_speak_porn, 'speak_porn')
speak_porn_distribution.to_csv('speak_porn_distribution.csv')

# --- 'porn_partner', overall and by gender ---
data_porn_partner = _coerce_and_subset('porn_partner')
porn_partner_distribution = _percent_distribution(data_porn_partner, 'porn_partner')
porn_partner_distribution.to_csv('porn_partner_distribution.csv')

data_male_partner, data_female_partner = _split_cis_gender(data_porn_partner)
data_male_partner['porn_partner'].describe().to_csv('porn_partner_summary_male.csv')
data_female_partner['porn_partner'].describe().to_csv('porn_partner_summary_female.csv')

# --- 'never_speak_mast' ---
data_never_speak_mast = _coerce_and_subset('never_speak_mast')
never_speak_mast_distribution = _percent_distribution(data_never_speak_mast, 'never_speak_mast')
never_speak_mast_distribution.to_csv('never_speak_mast_distribution.csv')

# --- 'educ_sex', overall, by gender, and z-score ---
data_educ_sex = _coerce_and_subset('educ_sex')
educ_sex_distribution = _percent_distribution(data_educ_sex, 'educ_sex')
educ_sex_distribution.to_csv('educ_sex_distribution.csv')

data_male_educ, data_female_educ = _split_cis_gender(data_educ_sex)
data_male_educ['educ_sex'].describe().to_csv('educ_sex_summary_male.csv')
data_female_educ['educ_sex'].describe().to_csv('educ_sex_summary_female.csv')

(sample_mean_male_educ, se_male_educ, sample_mean_female_educ,
 z_score_educ, p_value_educ) = _gender_z_test(data_male_educ, data_female_educ, 'educ_sex')

z_score_educ_df = pd.DataFrame({'Z-Score': [z_score_educ], 'P-Value': [p_value_educ]})
z_score_educ_df.to_csv('z_score_p_value_educ_sex.csv')

# --- Trends in finding pornography appealing ('appeals') ---
data_appeals = _coerce_and_subset('appeals')
appeals_distribution = _percent_distribution(data_appeals, 'appeals')
appeals_distribution.sort_values(ascending=False).to_csv('appeals_distribution.csv')

data_male_appeals, data_female_appeals = _split_cis_gender(data_appeals)
data_male_appeals['appeals'].describe().to_csv('appeals_summary_male.csv')
data_female_appeals['appeals'].describe().to_csv('appeals_summary_female.csv')

(sample_mean_male_appeals, se_male_appeals, sample_mean_female_appeals,
 z_score_appeals, p_value_appeals) = _gender_z_test(data_male_appeals, data_female_appeals, 'appeals')

z_score_appeals_df = pd.DataFrame({'Z-Score': [z_score_appeals], 'P-Value': [p_value_appeals]})
z_score_appeals_df.to_csv('z_score_p_value_appeals.csv')

# --- Knowledge and attitudes around pornography ('dont_know_porn_made') ---
data_dont_know = _coerce_and_subset('dont_know_porn_made')
dont_know_distribution = _percent_distribution(data_dont_know, 'dont_know_porn_made')
dont_know_distribution.to_csv('dont_know_porn_made_distribution.csv')

data_male_dont_know, data_female_dont_know = _split_cis_gender(data_dont_know)
data_male_dont_know['dont_know_porn_made'].describe().to_csv('dont_know_porn_made_summary_male.csv')
data_female_dont_know['dont_know_porn_made'].describe().to_csv('dont_know_porn_made_summary_female.csv')

(sample_mean_male_dont_know, se_male_dont_know, sample_mean_female_dont_know,
 z_score_dont_know, p_value_dont_know) = _gender_z_test(
    data_male_dont_know, data_female_dont_know, 'dont_know_porn_made')

z_score_dont_know_df = pd.DataFrame({'Z-Score': [z_score_dont_know], 'P-Value': [p_value_dont_know]})
z_score_dont_know_df.to_csv('z_score_p_value_dont_know_porn_made.csv')

# --- Guilt feelings around pornography ('guilt_feelings') ---
data_guilt = _coerce_and_subset('guilt_feelings')
guilt_distribution = _percent_distribution(data_guilt, 'guilt_feelings')
guilt_distribution.to_csv('guilt_feelings_distribution.csv')

data_male_guilt, data_female_guilt = _split_cis_gender(data_guilt)
data_male_guilt['guilt_feelings'].describe().to_csv('guilt_feelings_summary_male.csv')
data_female_guilt['guilt_feelings'].describe().to_csv('guilt_feelings_summary_female.csv')

(sample_mean_male_guilt, se_male_guilt, sample_mean_female_guilt,
 z_score_guilt, p_value_guilt) = _gender_z_test(data_male_guilt, data_female_guilt, 'guilt_feelings')

z_score_guilt_df = pd.DataFrame({'Z-Score': [z_score_guilt], 'P-Value': [p_value_guilt]})
z_score_guilt_df.to_csv('z_score_p_value_guilt_feelings.csv')

# --- Uncomfortable speaking about sex ('uncomfy_speak_sex') ---
data_uncomfy_speak_sex = _coerce_and_subset('uncomfy_speak_sex')
uncomfy_speak_sex_distribution = _percent_distribution(data_uncomfy_speak_sex, 'uncomfy_speak_sex')
uncomfy_speak_sex_distribution.to_csv('uncomfy_speak_sex_distribution.csv')

# --- Friends and people comfortable scores ---
# Both columns are coerced first; rows missing either one are then excluded.
data['friends_comfortable'] = pd.to_numeric(data['friends_comfortable'], errors='coerce')
data['people_comfortable'] = pd.to_numeric(data['people_comfortable'], errors='coerce')

data_friends_ppl_okay = data.dropna(subset=['friends_comfortable', 'people_comfortable'])
friends_distribution = _percent_distribution(data_friends_ppl_okay, 'friends_comfortable')
people_distribution = _percent_distribution(data_friends_ppl_okay, 'people_comfortable')

friends_distribution.to_csv('friends_comfortable_distribution.csv')
people_distribution.to_csv('people_comfortable_distribution.csv')

# Correlation between 'friends_comfortable' and 'people_comfortable'
correlation_friends_ppl_okay = (
    data_friends_ppl_okay[['friends_comfortable', 'people_comfortable']]
    .corr()
    .loc['friends_comfortable', 'people_comfortable']
)
pd.DataFrame({'Correlation': [correlation_friends_ppl_okay]}).to_csv('correlation_friends_people_comfortable.csv')


## Comfort Level Overall

In the experiment, participants in the control and treatment groups describe their comfort with a variety of sexually-explicit vignettes. Some vignettes describe consensual scenarios, while others are non-consensual or unclear. In the following section, I plot the overall comfort level for each scenario, grouped by consent level.

In [None]:
# Aggregate comfort level in non-consensual scenarios.

# Define a function to plot comfort levels and save to PDF
def plot_comfort_levels(column, title, filename):
    """Draw a bar chart of response counts for one scenario and save it as a PDF.

    Args:
        column: Column of the module-level ``data`` frame to tally.
        title: Title placed above the chart.
        filename: Output path without extension; ".pdf" is appended.
    """
    comfort_order = [
        "Very uncomfortable",
        "Uncomfortable",
        "Neither comfortable nor uncomfortable",
        "Comfortable",
        "Very comfortable",
    ]
    # Tally responses in scale order; levels nobody chose get a count of 0
    tallies = data[column].value_counts().reindex(comfort_order, fill_value=0)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x=tallies.index, y=tallies.values, ax=ax)
    plt.xticks(rotation=45)
    ax.set_title(title)
    ax.set_xlabel("Comfort Level")
    ax.set_ylabel("Number of Responses")
    fig.tight_layout()

    # Write the chart to disk, report the path, and release the figure
    pdf_filename = f"{filename}.pdf"
    fig.savefig(pdf_filename)
    print(f"Saved: {pdf_filename}")
    plt.close(fig)

# (column, plot title, output filename) for each non-consensual scenario
scenarios = [
    (
        f'discomfort_scenario_{letter}',
        f'Aggregate Comfort Level with Non-Consensual Scenario {letter}',
        f'non_consent_scenario_{letter}_comfort_level',
    )
    for letter in ('A', 'F', 'E')
]

# Produce and save one PDF per scenario
for column, title, filename in scenarios:
    plot_comfort_levels(column, title, filename)

# Combined plot of comfort levels in non-consensual scenarios

# Import GridSpec for arranging plots in a grid
from matplotlib.gridspec import GridSpec

# Create a function to plot multiple scenarios in one figure
def combined_plot_scenarios():
    """Plot the three non-consensual scenarios side by side and save them as one PDF.

    Reads the module-level ``data`` frame, writes
    "combined_non_consent_scenarios.pdf", and shows the figure.
    """
    comfort_order = [
        "Very uncomfortable",
        "Uncomfortable",
        "Neither comfortable nor uncomfortable",
        "Comfortable",
        "Very comfortable",
    ]
    # (column, panel title, y-axis label); only the first panel labels its y-axis
    panels = [
        ('discomfort_scenario_A', 'a. Scenario A', 'Number of Responses'),
        ('discomfort_scenario_F', 'b. Scenario F', ''),
        ('discomfort_scenario_E', 'c. Scenario E', ''),
    ]

    fig = plt.figure(figsize=(18, 6))
    grid = GridSpec(1, 3, figure=fig)

    for position, (column, panel_title, y_label) in enumerate(panels):
        ax = fig.add_subplot(grid[0, position])
        # Tally responses in scale order; unused levels get a count of 0
        tallies = data[column].value_counts().reindex(comfort_order, fill_value=0)
        sns.barplot(x=tallies.index, y=tallies.values, ax=ax)
        ax.set_title(panel_title)
        ax.set_xlabel('Comfort Level')
        ax.set_ylabel(y_label)
        ax.set_xticklabels(tallies.index, rotation=45)

    # Add a title to the overall figure
    fig.suptitle("Figure 2: Comfort with Non-Consensual Scenarios", fontsize=16, fontweight='bold')

    # Leave the top 5% of the canvas free for the figure title, then save
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig("combined_non_consent_scenarios.pdf")
    print("Saved: combined_non_consent_scenarios.pdf")
    plt.show()

# Generate and save the combined plot
combined_plot_scenarios()

#repeat plotting for consensual scenarios.

# Function to create bar plot for each scenario
def plot_comfort_scenario(data, column, title, ax):
    """Draw response counts for one scenario as a bar chart on ``ax``.

    Args:
        data: DataFrame holding the responses.
        column: Column of ``data`` to tally.
        title: Title for this axis.
        ax: Matplotlib axis to draw on.
    """
    # Tally responses in scale order; levels nobody chose get a count of 0
    tallies = data[column].value_counts().reindex(
        [
            "Very uncomfortable",
            "Uncomfortable",
            "Neither comfortable nor uncomfortable",
            "Comfortable",
            "Very comfortable",
        ],
        fill_value=0,
    )

    sns.barplot(x=tallies.index, y=tallies.values, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Comfort Level')
    ax.set_ylabel('Number of Responses')
    ax.set_xticklabels(tallies.index, rotation=45)

# One row of three panels, one per consensual scenario
fig = plt.figure(figsize=(18, 6))
grid = GridSpec(1, 3, figure=fig)

ax1 = fig.add_subplot(grid[0, 0])
ax2 = fig.add_subplot(grid[0, 1])
ax3 = fig.add_subplot(grid[0, 2])

# Sam, Dan and Rebecca, left to right
for _axis, _name in zip((ax1, ax2, ax3), ('Sam', 'Dan', 'Rebecca')):
    plot_comfort_scenario(
        data,
        f'comfort_scenario_{_name}',
        f'Aggregate Comfort Level with Consensual Scenario ({_name})',
        _axis,
    )

# Overall figure title
fig.suptitle("Figure 3: Comfort with Consensual Scenarios", fontsize=16, fontweight='bold')

# Leave the top 5% of the canvas free for the figure title, then save as PDF
fig.tight_layout(rect=[0, 0, 1, 0.95])
fig.savefig("combined_consent_scenarios.pdf")
plt.show()

#recreate above plot with small tweaks

# Keep only the rows that have a response for each consensual scenario
data_filtered_sam = data[data['comfort_scenario_Sam'].notna()]
data_filtered_dan = data[data['comfort_scenario_Dan'].notna()]
data_filtered_rebecca = data[data['comfort_scenario_Rebecca'].notna()]

# Function to create filtered bar plots
def plot_filtered_scenario(data, column, title, ax):
    """Draw response counts for one scenario as black bars with abbreviated tick labels.

    Args:
        data: DataFrame holding the responses.
        column: Column of ``data`` to tally.
        title: Title for this axis.
        ax: Matplotlib axis to draw on.
    """
    # (tick abbreviation, full response label), in scale order
    levels = (
        ('VU', "Very uncomfortable"),
        ('U', "Uncomfortable"),
        ('N', "Neither comfortable nor uncomfortable"),
        ('C', "Comfortable"),
        ('VC', "Very comfortable"),
    )
    full_labels = [label for _, label in levels]
    short_labels = [abbreviation for abbreviation, _ in levels]

    # Tally responses in scale order; levels nobody chose get a count of 0
    tallies = data[column].value_counts().reindex(full_labels, fill_value=0)

    sns.barplot(x=tallies.index, y=tallies.values, ax=ax, color="black")
    ax.set_title(title)
    ax.set_xticklabels(short_labels, rotation=45)
    ax.set_ylabel('Number of Responses')

# Build a three-panel figure of the consensual scenarios (rows with missing
# responses already removed) and save it as a PDF.

# Set up the figure and GridSpec for arranging plots
fig = plt.figure(figsize=(18, 6))
grid = GridSpec(1, 3, figure=fig)

# Plot for Scenario Sam
ax1 = fig.add_subplot(grid[0, 0])
plot_filtered_scenario(data_filtered_sam, 'comfort_scenario_Sam', 'a. Sam', ax1)

# Plot for Scenario Dan (y-axis label shown on the first panel only)
ax2 = fig.add_subplot(grid[0, 1])
plot_filtered_scenario(data_filtered_dan, 'comfort_scenario_Dan', 'b. Dan', ax2)
ax2.set_ylabel('')

# Plot for Scenario Rebecca
ax3 = fig.add_subplot(grid[0, 2])
plot_filtered_scenario(data_filtered_rebecca, 'comfort_scenario_Rebecca', 'c. Rebecca', ax3)
ax3.set_ylabel('')

# Set overall layout and title
plt.tight_layout()
fig.suptitle("Comfort with Consensual Scenarios", fontsize=16, fontweight='bold', y=1.02)
plt.subplots_adjust(top=0.88)

# Save the plot as a PDF. The title is anchored at y=1.02, above the top edge
# of the canvas, so bbox_inches="tight" is needed to expand the saved area
# around it; without it the title is cut off in the file.
plt.savefig("filtered_consent_scenarios.pdf", bbox_inches="tight")
plt.show()

#redo plotting where consent is unclear

# Keep only the rows that have a response for the unclear-consent scenario
data_filtered_max = data[data['unclear_scenario_G'].notna()]

# Function to create bar plots for specific scenarios
def plot_unclear_scenario(data, column, ax):
    """Draw a black bar chart of response counts for one comfort question.

    Parameters
    ----------
    data : DataFrame containing the survey responses.
    column : name of the column whose answers are tallied.
    ax : matplotlib axis the bars are drawn on (the caller sets the title).
    """
    # Answer levels, least to most comfortable, paired with their tick abbreviations
    levels = [
        ("Very uncomfortable", 'VU'),
        ("Uncomfortable", 'U'),
        ("Neither comfortable nor uncomfortable", 'N'),
        ("Comfortable", 'C'),
        ("Very comfortable", 'VC'),
    ]
    full_names = [name for name, _ in levels]
    short_names = [abbr for _, abbr in levels]

    # Count each level; levels with no responses are kept with a count of 0
    tally = data[column].value_counts().reindex(full_names, fill_value=0)

    sns.barplot(x=tally.index, y=tally.values, ax=ax, color="black")
    ax.set_xticklabels(short_names, rotation=45)
    ax.set_ylabel('Number of Responses')

# Two-panel figure: the unclear-consent scenario next to the Rebecca scenario
fig = plt.figure(figsize=(12, 6))
grid = GridSpec(1, 2, figure=fig)

# (responses, column, panel title) for each panel, left to right
panels = [
    (data_filtered_max, 'unclear_scenario_G', "Unclear Scenario"),
    (data_filtered_rebecca, 'comfort_scenario_Rebecca', "Rebecca"),
]
axes = [fig.add_subplot(grid[0, position]) for position in range(len(panels))]
for axis, (responses, response_column, panel_title) in zip(axes, panels):
    plot_unclear_scenario(responses, response_column, axis)
    axis.set_title(panel_title)
ax1, ax2 = axes

# Left panel: no x-axis label, explicit y-axis label
ax1.set_xlabel(None)
ax1.set_ylabel("Number of Responses")

# Right panel shares the y-axis meaning, so its label is removed
ax2.set_ylabel(None)

# Tighten the layout, leaving head-room at the top
plt.tight_layout()
plt.subplots_adjust(top=0.88)

# Save the combined plot as a PDF
plt.savefig("unclear_and_rebecca_scenarios.pdf")
plt.show()


## Diving Further into Responses to Individual Vignettes
Some vignettes describe situations involving heterosexual partners, while others describe situations involving homosexual partners. Participants' level of comfort may be related to whether the scenario described aligns with their sexual preference. Here, I investigate differences in comfort level between gay and straight women.

In [None]:
# Compare comfort with scenario F between two groups of cisgender women,
# split on the numeric sexual-orientation item.

# Filter for Cisgender female participants
data_female = data[data['gender_identity'] == 'Cisgender female'].copy()

# Convert sexual_orientation to numeric (non-numeric answers become NaN)
data_female['sexual_orientation'] = pd.to_numeric(data_female['sexual_orientation'], errors='coerce')

# Create a lookup dictionary to map comfort levels to numeric values
comfort_lookup = {
    "Very uncomfortable": 1,
    "Uncomfortable": 2,
    "Neither comfortable nor uncomfortable": 3,
    "Comfortable": 4,
    "Very comfortable": 5
}

# Use the lookup table to replace string values with numeric values for specific columns
data_female['discomfort_scenario_F'] = data_female['discomfort_scenario_F'].map(comfort_lookup)
data_female['discomfort_scenario_E'] = data_female['discomfort_scenario_E'].map(comfort_lookup)

# Only respondents who answered scenario F enter the comparison
answered_f = data_female.dropna(subset=['discomfort_scenario_F'])

# Subset participants into two groups based on sexual orientation
# NOTE(review): the cut-off of 3 is taken from the original code; confirm it against the survey's answer scale.
group_homosexual = answered_f[answered_f['sexual_orientation'] >= 3].copy()
group_heterosexual = answered_f[answered_f['sexual_orientation'] < 3].copy()

# Calculate summary statistics for both groups
summary_homosexual = group_homosexual['discomfort_scenario_F'].describe()
summary_heterosexual = group_heterosexual['discomfort_scenario_F'].describe()

# Sample mean and standard error (sd / sqrt(n)) for each group
sample_mean_homosexual = group_homosexual['discomfort_scenario_F'].mean()
se_homosexual = group_homosexual['discomfort_scenario_F'].sem()
mean_heterosexual = group_heterosexual['discomfort_scenario_F'].mean()
se_heterosexual = group_heterosexual['discomfort_scenario_F'].sem()

# Two-sample z-test for the difference in means.
# Both group means are estimated from samples, so the standard error of the
# difference combines both groups' standard errors; dividing by one group's SE
# alone treats the other mean as a known constant and overstates significance.
se_difference = np.sqrt(se_homosexual ** 2 + se_heterosexual ** 2)
z_score = (sample_mean_homosexual - mean_heterosexual) / se_difference
p_value = 2 * stats.norm.sf(abs(z_score))  # two-sided; sf avoids precision loss of 1 - cdf

# Create a DataFrame to store the results (the test statistic is reported once, on the first row)
results = pd.DataFrame({
    'Group': ['Homosexual', 'Heterosexual'],
    'Mean_Discomfort_Scenario_F': [sample_mean_homosexual, mean_heterosexual],
    'Standard_Error': [se_homosexual, se_heterosexual],
    'Z-Score': [z_score, np.nan],
    'P-Value': [p_value, np.nan]
})

# Combine the per-group summary statistics side by side
summary_stats = pd.concat([summary_homosexual, summary_heterosexual], axis=1)
summary_stats.columns = ['Homosexual', 'Heterosexual']

# Save the results to CSV files; empty cells are written as 'N/A' so the columns stay numeric in memory
summary_stats.to_csv('summary_statistics.csv')
results.to_csv('results_comparison.csv', index=False, na_rep='N/A')

print("Summary statistics and comparison results saved as CSV files.")


Relatedly, participants may feel uncomfortable reading about scenarios that don't align with their sexual preference. To address this concern, I restrict the sample to participants who completed the experiment and then compare the number of questions skipped between straight and gay women.

In [None]:
# Keep only participants whose survey progress is 100%
# The progress column is referred to as both 'Progress' and 'survey_progress' in
# this notebook, so use whichever is present, and coerce it to numeric because
# it may still hold strings at this point.
progress_col = 'survey_progress' if 'survey_progress' in data.columns else 'Progress'
progress = pd.to_numeric(data[progress_col], errors='coerce')
data_complete = data[progress == 100]

# Subset for Cisgender female participants
data_female_complete = data_complete[data_complete['gender_identity'] == 'Cisgender female']

# Subset for homosexual (sexual_orientation >= 3) and heterosexual (sexual_orientation < 3) groups.
# `data['sexual_orientation']` has not been converted to numeric yet in this cell,
# so coerce it locally before comparing against the numeric cut-off.
orientation = pd.to_numeric(data_female_complete['sexual_orientation'], errors='coerce')
group_homosexual = data_female_complete[orientation >= 3]
group_heterosexual = data_female_complete[orientation < 3]

# List of columns to check for missing values (NaNs)
cols_to_check = ["discomfort_scenario_A", "discomfort_scenario_F", "discomfort_scenario_E", 
                 "discomfort_scenario_B", "discomfort_scenario_C", "discomfort_scenario_D", "unclear_scenario_G"]

# Proportion of skipped (NaN) answers per question in each group
prop_na_homosexual = group_homosexual[cols_to_check].isna().mean()
prop_na_heterosexual = group_heterosexual[cols_to_check].isna().mean()

# Create a DataFrame to compare the proportions
prop_na_df = pd.DataFrame({
    'category': cols_to_check,
    'homosexual_prop_na': prop_na_homosexual.values,
    'heterosexual_prop_na': prop_na_heterosexual.values
})

# Save the proportions to a CSV file
prop_na_df.to_csv('non_answering_rates.csv', index=False)

print("Proportions of non-answering participants saved to 'non_answering_rates.csv'.")


## Comparing the Control and Treatment Groups

In [None]:
#identify groups

# A participant belongs to the condition whose page timer recorded a first click.
# np.select takes the first matching condition, so 'pressure' wins over 'info'
# (same precedence as assigning 'info' first and 'pressure' second); everyone
# else is 'control'.
# This replaces np.where(..., 'info', <NaN column>): mixing a string with a float
# column makes NumPy build a string array in which NaN becomes the text 'nan',
# so a later fillna('control') never fires and the control group is mislabelled.
in_pressure_group = data['pre_test_start'].notna()
in_info_group = data['info_group_start'].notna()
data['group_number'] = np.select(
    [in_pressure_group, in_info_group],
    ['pressure', 'info'],
    default='control'
)

# Verify the assignment
print(data['group_number'].value_counts())

# Convert end and start times to numeric by replacing decimal commas with dots.
# astype(str) lets this also work when a column was already parsed as numeric;
# anything unparseable (including missing values) becomes NaN.
columns_to_convert = ['info_group_end', 'info_group_start', 'pre_test_end', 'pre_test_start']
for column in columns_to_convert:
    data[column] = pd.to_numeric(
        data[column].astype(str).str.replace(",", ".", regex=False),
        errors='coerce'
    )

# Verify the conversions
print(data[['info_group_end', 'info_group_start', 'pre_test_end', 'pre_test_start']].dtypes)


In [None]:
#average comfort, not by group

# Comfort answers scored from 1 (least comfortable) to 5 (most comfortable)
comfort_map = dict(zip(
    ['Very uncomfortable', 'Uncomfortable', 'Neither comfortable nor uncomfortable',
     'Comfortable', 'Very comfortable'],
    range(1, 6)
))

# Scenario columns: all seven, plus the consensual and non-consensual subsets
columns_to_convert = ['non_chris', 'comfort_scenario_Sam', 'comfort_scenario_Dan', 
                      'comfort_scenario_Rebecca', 'non_jes_disc', 'non_beth', 'unclear_scenario_G']
_consensual_cols = ['comfort_scenario_Sam', 'comfort_scenario_Dan', 'comfort_scenario_Rebecca']
_nonconsensual_cols = ['non_chris', 'non_jes_disc', 'non_beth']

# Turn the answer text into numeric scores
for column in columns_to_convert:
    data[column] = data[column].replace(comfort_map).astype(float)

# Per-participant mean, then median, for the aggregate / consensual / non-consensual sets.
# Skipped questions are ignored (skipna=True). Means are added before medians so
# the new columns appear in the same order as before.
_scenario_sets = {'agg': columns_to_convert, 'con': _consensual_cols, 'non': _nonconsensual_cols}
for _prefix, _cols in _scenario_sets.items():
    data[f'{_prefix}_mean'] = data[_cols].mean(axis=1, skipna=True)
for _prefix, _cols in _scenario_sets.items():
    data[f'{_prefix}_med'] = data[_cols].median(axis=1, skipna=True)

# Long format (one row per participant x scenario type) for the boxplot
avg_columns = ['agg_mean', 'con_mean', 'non_mean']
avg_df = data[avg_columns]
data_long = avg_df.melt(var_name='variable', value_name='value')

# Human-readable names for the x-axis
_readable = {'agg_mean': 'Aggregate', 'con_mean': 'Consensual', 'non_mean': 'Non-Consensual'}
data_long['variable'] = data_long['variable'].map(_readable)

# Boxplot of average comfort by scenario type
plt.figure(figsize=(8, 6))
sns.boxplot(data=data_long, x='variable', y='value')
plt.title("Average Level of Comfort by Scenario Type", fontsize=12, fontweight='bold')
plt.xlabel("Scenario Type", fontsize=10)
plt.ylabel("Average Level of Comfort", fontsize=10)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.grid(False)  # no grid lines
plt.tight_layout()

# Save the plot
plt.savefig("average_comfort_boxplot.pdf")
plt.show()

## Examine Survey-Taking Characteristics

In [None]:
import pandas as pd
import numpy as np

# Convert the survey duration (seconds) to numeric; unparseable values become NaN
data['duration_seconds'] = pd.to_numeric(data['duration_seconds'], errors='coerce')

# Filter the data for participants who finished the survey
data_complete = data[data['survey_completed'] == True]

# Upper outlier cut-off for duration among completers: mean + 3 standard deviations
mean_val = data_complete['duration_seconds'].mean()
sd_val = data_complete['duration_seconds'].std()
outlier_threshold = mean_val + 3 * sd_val

# Exclude outliers from the completers.
# The threshold is computed on completers, so it is applied to completers as
# well (matching the female-only analysis below) rather than to all of `data`,
# which would mix partial responses back into the summary.
data_no_outliers = data_complete[data_complete['duration_seconds'] <= outlier_threshold]

# Summary statistics for 'duration_seconds' after removing outliers
summary_no_outliers = data_no_outliers['duration_seconds'].describe()
summary_no_outliers.to_csv('summary_duration_seconds_no_outliers.csv')

# Analyze time spent by gender (female participants who finished)
data_female_complete = data_complete[data_complete['gender_identity'] == 'Cisgender female']

mean_val_female = data_female_complete['duration_seconds'].mean()
sd_val_female = data_female_complete['duration_seconds'].std()
outlier_threshold_female = mean_val_female + 3 * sd_val_female

# Filter data for females to exclude outliers
data_female_no_outliers = data_female_complete[data_female_complete['duration_seconds'] <= outlier_threshold_female]

# Summary statistics for 'duration_seconds' for females after removing outliers
summary_female_no_outliers = data_female_no_outliers['duration_seconds'].describe()
summary_female_no_outliers.to_csv('summary_duration_seconds_female_no_outliers.csv')

# Convert survey progress to numeric
data['survey_progress'] = pd.to_numeric(data['survey_progress'], errors='coerce')

# Analyze progress for all female participants (finished or not)
data_female_progress = data[data['gender_identity'] == 'Cisgender female']

# Summary of 'survey_progress' for females
summary_female_progress = data_female_progress['survey_progress'].describe()
summary_female_progress.to_csv('summary_survey_progress_female.csv')

print("All survey-taking characteristics saved to CSV files.")


In [None]:
#convert column types for later analysis

# Items whose answers are numeric codes; unparseable entries become NaN
columns_to_convert = [
    'guilt_feelings', 'speak_sex', 'freq_porn', 'religious_affiliation', 'people_comfortable', 'friends_comfortable', 
    'incident_report', 'freq_masturbation', 'mast_no_activity', 'uncomfy_speak_sex', 'never_speak_activity', 
    'speak_porn', 'porn_partner', 'dont_know_creation', 'appeals', 'knowledge_creation', 'educ_sex', 
    'friends_comfortable', 'people_comfortable'
]

# Lookup tables for the categorical answers, defined up front.
# With .map, any answer missing from a table becomes NaN.
_education_codes = {
    'Less than one year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4, '5+ years': 5
}
_age_codes = {
    '24 or older': 24, '23': 23, '22': 22, '21': 21, '20': 20, '19': 19, '18': 18
}
_partner_codes = {
    '0': 0, '1-2': 1, '3-4': 2, '5-6': 3, '6 or more': 4
}
_gender_codes = {
    'Cisgender female': 0, 'Cisgender male': 1
}
_alcohol_codes = {
    '0 times per week': 0, '1-2 times per week': 1, '3-4 times per week': 2, '5-6 times per week': 3, '7 or more times per week': 4
}
_relationship_codes = {
    'Yes': 1, 'No': 0, 'Maybe': 0
}
_first_exposure_codes = {
    '9-11': 1, '12-13': 2, '14-15': 3, '16-17': 4, '18+': 5, 'N/A': 0
}

# Numeric-coded items
for column in columns_to_convert:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Years of college completed
data['education_completed'] = data['education_completed'].map(_education_codes).astype(float)

# Sexual orientation is already a numeric code
data['sexual_orientation'] = pd.to_numeric(data['sexual_orientation'], errors='coerce')

# Age: .replace (not .map) so values outside the table are left as they are before the float cast
data['age'] = data['age'].replace(_age_codes).astype(float)

# Number of partners (ordinal buckets)
data['num_partners'] = data['num_partners'].map(_partner_codes).astype(float)

# Gender: 0 = cisgender female, 1 = cisgender male; every other answer becomes NaN
data['gender_identity'] = data['gender_identity'].map(_gender_codes).astype(float)

# Alcohol consumption per week (ordinal buckets)
data['alc_per_week'] = data['alc_per_week'].map(_alcohol_codes).astype(float)

# Relationship status: only 'Yes' counts as 1
data['relationship_status'] = data['relationship_status'].map(_relationship_codes).astype(float)

# Frequency of sex is already a numeric code
data['freq_sex'] = pd.to_numeric(data['freq_sex'], errors='coerce')

# Age first viewed adult material (ordinal buckets)
data['age_first_exposure'] = data['age_first_exposure'].map(_first_exposure_codes).astype(float)

# Verify that columns were correctly converted
print(data.dtypes)

In [None]:
#create a version of the df that only includes rows where participants reported the average frequency with which they consume pornography

# Keep rows where 'freq_porn' is not NaN.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a view of `data` (avoids pandas' SettingWithCopyWarning).
data_filtered = data.dropna(subset=['freq_porn']).copy()

# Binary 'watched_porn': 0 when 'freq_porn' is coded 0 or 1, otherwise 1
data_filtered['watched_porn'] = np.where(data_filtered['freq_porn'].isin([0, 1]), 0, 1)

# Binary 'race_white': 1 when ethnicity is 'White / Caucasian', otherwise 0
data_filtered['race_white'] = np.where(data_filtered['ethnicity'] == 'White / Caucasian', 1, 0)

# Verify the new columns
print(data_filtered[['watched_porn', 'race_white']].head())


## Comparison with Prior Study
This study was partially inspired by a 2017 paper by Cooper and Klein, where they investigate the effect of social variables on pornography consumption. Here, I verify that the results of this survey align with their main findings.

In [None]:
# Model A: logistic regression of 'watched_porn' on demographic predictors
X = data_filtered[['gender_identity', 'age', 'race_white', 'religious_affiliation', 'education_completed', 'sexual_orientation', 'alc_per_week']]
X = sm.add_constant(X)  # Add a constant term for the intercept
y = data_filtered['watched_porn']

# Fit the logistic regression model (equivalent to glm with binomial family in R).
# missing='drop' removes rows with a NaN in any predictor or in the outcome, as
# R's glm does by default; statsmodels' own default does no NaN handling.
modelA = sm.Logit(y, X, missing='drop').fit()

# Save the model summary to a text file
with open('logistic_regression_summary.txt', 'w') as f:
    f.write(modelA.summary().as_text())

# Null deviance, its degrees of freedom, and the number of observations used.
# `llnull` is the log-likelihood of the intercept-only model, so the deviance
# is -2 * llnull; the null model estimates one parameter, leaving n - 1
# degrees of freedom.
n_obs = int(modelA.nobs)
null_deviance = -2 * modelA.llnull
df_null = n_obs - 1

# Save null deviance, degrees of freedom, and number of observations into a CSV
summary_stats = pd.DataFrame({
    'Statistic': ['Null Deviance', 'Degrees of Freedom (Null)', 'Number of Observations'],
    'Value': [null_deviance, df_null, n_obs]
})

summary_stats.to_csv('logistic_regression_summary_stats.csv', index=False)

print("Logistic regression summary saved to 'logistic_regression_summary.txt' and statistics saved to 'logistic_regression_summary_stats.csv'.")

In [None]:
# Model A: coefficient table, odds ratios and Pearson chi-squared
X = data_filtered[['gender_identity', 'age', 'race_white', 'religious_affiliation', 'education_completed', 'sexual_orientation', 'alc_per_week']]
X = sm.add_constant(X)  # Add a constant term for the intercept
y = data_filtered['watched_porn']

# Fit the logistic regression model; rows with missing values are dropped
# (statsmodels' default does no NaN handling)
modelA = sm.Logit(y, X, missing='drop').fit()

# Extract coefficients and standard errors
coefficients = modelA.params
std_errors = modelA.bse

# Odds ratios are the exponentiated coefficients
odds_ratios = np.exp(coefficients)

# Delta-method standard error of an odds ratio: exp(b) * se(b)
odds_ratios_std_errors = odds_ratios * std_errors

# Extract p-values from the model
p_values = modelA.pvalues

# One row per model term; the index holds the term names
result = pd.DataFrame({
    'Coefficient': coefficients,
    'Std_Error': std_errors,
    'Odds_Ratio': odds_ratios,
    'Odds_Ratio_Std_Error': odds_ratios_std_errors,
    'P_Value': p_values
})

# Response residuals (observed - predicted probability) and predicted probabilities.
# For statsmodels discrete models `fittedvalues` is the linear predictor (log-odds),
# not a probability, so the probabilities must come from predict().
residuals = modelA.resid_response
fitted_values = modelA.predict()

# Pearson chi-squared: squared residuals scaled by the binomial variance p * (1 - p)
chi_squared = np.sum((residuals ** 2) / (fitted_values * (1 - fitted_values)))
df = len(coefficients) - 1  # number of predictors, excluding the intercept
nobs = len(residuals)

# Add chi-squared, degrees of freedom, and number of observations to a separate DataFrame
chi_squared_df = pd.DataFrame({
    'Chi_Squared': [chi_squared],
    'Degrees_Freedom': [df],
    'N_Obs': [nobs]
})

# Save the results; the index is written out because it is the only place the
# predictor names are stored
result.to_csv('logistic_regression_modelA_results.csv', index_label='Variable')

# Save chi-squared information to a separate CSV
chi_squared_df.to_csv('logistic_regression_modelA_chi_squared.csv', index=False)

print("Logistic regression results saved to 'logistic_regression_modelA_results.csv' and chi-squared information saved to 'logistic_regression_modelA_chi_squared.csv'.")

In [None]:
# Model B: Model A's demographic predictors plus sexual-history predictors
X_B = data_filtered[['gender_identity', 'age', 'race_white', 'religious_affiliation', 'education_completed', 
                     'sexual_orientation', 'alc_per_week', 'num_partners', 'relationship_status', 'freq_sex', 
                     'age_first_exposure', 'freq_masturbation']]
X_B = sm.add_constant(X_B)  # Add a constant term for the intercept
y_B = data_filtered['watched_porn']

# Fit the logistic regression model (Model B); rows with missing values are
# dropped (statsmodels' default does no NaN handling)
modelB = sm.Logit(y_B, X_B, missing='drop').fit()

# Save the model summary to a text file
with open('logistic_regression_modelB_summary.txt', 'w') as f:
    f.write(modelB.summary().as_text())

# Number of observations actually used in the fit
n_obs_B = int(modelB.nobs)

# Null deviance is -2 * (log-likelihood of the intercept-only model), on n - 1
# degrees of freedom; `llnull` on its own is only the log-likelihood
null_deviance_B = -2 * modelB.llnull
df_null_B = n_obs_B - 1

# Save null deviance, degrees of freedom, and number of observations to a CSV
summary_stats_B = pd.DataFrame({
    'Statistic': ['Null Deviance', 'Degrees of Freedom (Null)', 'Number of Observations'],
    'Value': [null_deviance_B, df_null_B, n_obs_B]
})

summary_stats_B.to_csv('logistic_regression_modelB_summary_stats.csv', index=False)

# Extract coefficients, standard errors, and p-values
coefficients_B = modelB.params
std_errors_B = modelB.bse
p_values_B = modelB.pvalues

# Odds ratios are the exponentiated coefficients
odds_ratios_B = np.exp(coefficients_B)

# Delta-method standard error of an odds ratio: exp(b) * se(b)
odds_ratios_std_errors_B = odds_ratios_B * std_errors_B

# One row per model term; the index holds the term names
result_B = pd.DataFrame({
    'Coefficient': coefficients_B,
    'Std_Error': std_errors_B,
    'Odds_Ratio': odds_ratios_B,
    'Odds_Ratio_Std_Error': odds_ratios_std_errors_B,
    'P_Value': p_values_B
})

# Response residuals and predicted probabilities.
# `fittedvalues` is the linear predictor (log-odds) for statsmodels discrete
# models, so probabilities are taken from predict() instead.
residuals_B = modelB.resid_response
fitted_values_B = modelB.predict()

# Pearson chi-squared: squared residuals scaled by the binomial variance p * (1 - p)
chi_squared_B = np.sum((residuals_B ** 2) / (fitted_values_B * (1 - fitted_values_B)))
df_B = len(coefficients_B) - 1  # number of predictors, excluding the intercept
nobs_B = len(residuals_B)

# Model-level statistics are repeated on every row of the coefficient table
result_B['Chi_Squared'] = chi_squared_B
result_B['Degrees_Freedom'] = df_B
result_B['N_Obs'] = nobs_B

# Save the results; the index is written out because it holds the predictor names
result_B.to_csv('logistic_regression_modelB_results.csv', index_label='Variable')

print("Logistic regression results saved to 'logistic_regression_modelB_results.csv' and summary statistics saved to 'logistic_regression_modelB_summary_stats.csv'.")

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Model C: Model B's predictors plus the items on talking about sex/porn and
# perceived comfort of friends and other people
X_C = data_filtered[['gender_identity', 'age', 'race_white', 'religious_affiliation', 'education_completed', 
                     'sexual_orientation', 'alc_per_week', 'num_partners', 'relationship_status', 'freq_sex', 
                     'age_first_exposure', 'freq_masturbation', 'speak_porn', 'speak_sex', 
                     'friends_comfortable', 'people_comfortable']]
X_C = sm.add_constant(X_C)  # Add a constant term for the intercept
y_C = data_filtered['watched_porn']

# Fit the logistic regression model (Model C); rows with missing values are
# dropped (statsmodels' default does no NaN handling)
modelC = sm.Logit(y_C, X_C, missing='drop').fit()

# Save the model summary to a text file
with open('logistic_regression_modelC_summary.txt', 'w') as f:
    f.write(modelC.summary().as_text())

# Number of observations actually used in the fit
n_obs_C = int(modelC.nobs)

# Null deviance is -2 * (log-likelihood of the intercept-only model), on n - 1
# degrees of freedom; `llnull` on its own is only the log-likelihood
null_deviance_C = -2 * modelC.llnull
df_null_C = n_obs_C - 1

# Save null deviance, degrees of freedom, and number of observations to a CSV
summary_stats_C = pd.DataFrame({
    'Statistic': ['Null Deviance', 'Degrees of Freedom (Null)', 'Number of Observations'],
    'Value': [null_deviance_C, df_null_C, n_obs_C]
})

summary_stats_C.to_csv('logistic_regression_modelC_summary_stats.csv', index=False)

# Extract coefficients, standard errors, and p-values
coefficients_C = modelC.params
std_errors_C = modelC.bse
p_values_C = modelC.pvalues

# Odds ratios are the exponentiated coefficients
odds_ratios_C = np.exp(coefficients_C)

# Delta-method standard error of an odds ratio: exp(b) * se(b)
odds_ratios_std_errors_C = odds_ratios_C * std_errors_C

# One row per model term; the index holds the term names
result_C = pd.DataFrame({
    'Coefficient': coefficients_C,
    'Std_Error': std_errors_C,
    'Odds_Ratio': odds_ratios_C,
    'Odds_Ratio_Std_Error': odds_ratios_std_errors_C,
    'P_Value': p_values_C
})

# Response residuals and predicted probabilities.
# `fittedvalues` is the linear predictor (log-odds) for statsmodels discrete
# models, so probabilities are taken from predict() instead.
residuals_C = modelC.resid_response
fitted_values_C = modelC.predict()

# Pearson chi-squared: squared residuals scaled by the binomial variance p * (1 - p)
chi_squared_C = np.sum((residuals_C ** 2) / (fitted_values_C * (1 - fitted_values_C)))
df_C = len(coefficients_C) - 1  # number of predictors, excluding the intercept
nobs_C = len(residuals_C)

# Model-level statistics are repeated on every row of the coefficient table
result_C['Chi_Squared'] = chi_squared_C
result_C['Degrees_Freedom'] = df_C
result_C['N_Obs'] = nobs_C

# Save the results; the index is written out because it holds the predictor names
result_C.to_csv('logistic_regression_modelC_results.csv', index_label='Variable')

print("Logistic regression results saved to 'logistic_regression_modelC_results.csv' and summary statistics saved to 'logistic_regression_modelC_summary_stats.csv'.")


## Other Sources of Discomfort
In the experiment, participants are asked to rate their level of comfort. However, there may be other elements of the survey-taking process that affect comfort levels. It is useful to understand trends around discomfort more broadly in order to validate that trends in discomfort are related to the treatments.

In [None]:
#did men often stop taking the survey after reading the vignettes? (sign of discomfort)

# Get the index of the 'unclear_scenario_G' column
unc_max_col = data.columns.get_loc('unclear_scenario_G')

# Select 'gender_identity' plus every column AFTER 'unclear_scenario_G'.
# 'gender_identity' is skipped in the tail so it can never be selected twice
# (a duplicated label would turn the comparisons below into DataFrames).
later_cols = [col for col in data.columns[unc_max_col + 1:] if col != 'gender_identity']
selected_cols = ['gender_identity'] + later_cols

# Subset the data frame with the selected columns
# NOTE: this rebinds `data_filtered`, replacing the frame used for the regressions.
data_filtered = data[selected_cols]

# Split the data into male and female subsets (0 = female, 1 = male)
female_df = data_filtered[data_filtered['gender_identity'] == 0]
male_df = data_filtered[data_filtered['gender_identity'] == 1]

# Calculate the number of missing values in each row for males and females
female_na_count = female_df.isna().sum(axis=1)
male_na_count = male_df.isna().sum(axis=1)

# Calculate and print the mean number of missing values for each gender
mean_female_na = female_na_count.mean()
mean_male_na = male_na_count.mean()

print(f"Mean number of missing values (Female): {mean_female_na}")
print(f"Mean number of missing values (Male): {mean_male_na}")

#now same Q, but by group
# Subset the dataset to only include males
males = data_filtered[data_filtered['gender_identity'] == 1]

# Males in the "info" and "control" groups only; .copy() so that adding a
# column below does not write into a view of `data_filtered`
male_info_control = males[males['group_number'].isin(['info', 'control'])].copy()

# Create a new column called "num_nas" that counts the number of NAs in each row
male_info_control['num_nas'] = male_info_control.isna().sum(axis=1)

# Calculate the mean number of NAs for males in the "info" group and "control" group
mean_nas_info = male_info_control.loc[male_info_control['group_number'] == 'info', 'num_nas'].mean()
mean_nas_control = male_info_control.loc[male_info_control['group_number'] == 'control', 'num_nas'].mean()

# Print the results
print(f"Mean number of NAs for males in the 'info' group: {mean_nas_info}")
print(f"Mean number of NAs for males in the 'control' group: {mean_nas_control}")


In [None]:
#look at trends in time spent on conditions - check to see if they actually read it, and separating out effects for those who did read it

# Create new columns for the amount of time spent reading the condition
# NOTE(review): control_time_spent is deliberately left empty here, so mean_control below is NaN.
data['control_time_spent'] = np.nan
data['info_time_spent'] = data['info_group_end'] - data['info_group_start']
data['pre_time_spent'] = data['pre_test_end'] - data['pre_test_start']

# Calculate mean of control_time_spent for group "control"
mean_control = data.groupby('group_number')['control_time_spent'].mean().loc['control']

# Calculate mean of info_time_spent for group "info"
mean_info = data.groupby('group_number')['info_time_spent'].mean().loc['info']

# Calculate mean of pre_time_spent for group "pressure"
mean_pressure = data.groupby('group_number')['pre_time_spent'].mean().loc['pressure']

# Convert 'info_feels_pos' and 'pre_feels_pos' to numeric.
# Done before the female subset is taken so that subset carries the numeric columns.
data['info_feels_pos'] = pd.to_numeric(data['info_feels_pos'], errors='coerce')
data['pre_feels_pos'] = pd.to_numeric(data['pre_feels_pos'], errors='coerce')

# Filter data to include only female participants
data_female = data[data['gender_identity'] == 0]

# Summary statistics for 'info_feels_pos'
print("Summary of 'info_feels_pos':\n", data['info_feels_pos'].describe())

# Remove participants who spent too little time on their condition page:
# under 30 seconds for 'pressure', under 20 seconds for 'info'.
# Both masks are built on `data` and combined before a single filter, so the
# mask and the frame always share the same index. Rows with a missing time
# compare as False and are therefore kept.
skimmed_pressure = (data['group_number'] == 'pressure') & (data['pre_time_spent'] < 30)
skimmed_info = (data['group_number'] == 'info') & (data['info_time_spent'] < 20)
data_filtered = data[~(skimmed_pressure | skimmed_info)]

# Verify the filtering
print(f"Rows remaining after filtering: {len(data_filtered)}")


In [None]:
# did men stop taking the survey after porn vignettes (BY group)

# Get the index of the 'unclear_scenario_G' column
unc_max_col = data.columns.get_loc('unclear_scenario_G')

# Select 'gender_identity' plus every column after 'unclear_scenario_G'
selected_cols = ['gender_identity'] + data.columns[unc_max_col + 1:].tolist()

# Subset the data with the selected columns
data_filtered = data[selected_cols]

# Subset the dataset to only include males.
# gender_identity is coded 0 = cisgender female, 1 = cisgender male, so males
# are == 1; filtering on 0 reported the female figures under a "males" label.
males = data_filtered[data_filtered['gender_identity'] == 1]

# Males in the "info" and "control" groups only; .copy() so that adding a
# column below does not write into a view of `data_filtered`
male_info_control = males[males['group_number'].isin(['info', 'control'])].copy()

# Create a new column called "num_nas" that counts the number of NAs in each row
male_info_control['num_nas'] = male_info_control.isna().sum(axis=1)

# Calculate the mean number of NAs for males in the "info" group and "control" group
mean_nas_info = male_info_control.loc[male_info_control['group_number'] == 'info', 'num_nas'].mean()
mean_nas_control = male_info_control.loc[male_info_control['group_number'] == 'control', 'num_nas'].mean()

# Print the results
print(f"Mean number of NAs for males in the 'info' group: {mean_nas_info}")
print(f"Mean number of NAs for males in the 'control' group: {mean_nas_control}")


## Final Versions of Figures for the Final Report

In [None]:
# Set up the theme for plots
sns.set(style="whitegrid")

# Comfort columns to summarise and the labels shown on the x-axis
COMFORT_LABELS = {
    'agg_mean': 'Aggregate',
    'con_mean': 'Consensual',
    'non_mean': 'Non-Consensual',
    'unclear_scenario_G': 'Unclear'
}
# Palette keys must match the values that actually appear in the hue column
TREATMENT_PALETTE = {'control': '#CCF0E6', 'treatment': '#9DC3E6'}
GENDER_PALETTE = {'female': '#FAC8CD', 'male': '#9DC3E6'}
# Assumes seaborn's default total width (0.8) for one group of dodged bars
BAR_GROUP_WIDTH = 0.8


def _comfort_summary(frame, group_col):
    """Return (long-format comfort data, mean/SE per group_col x scenario type).

    Missing comfort values are dropped before summarising, so the standard
    error is sd / sqrt(number of actual answers).
    """
    long_df = frame[[group_col] + list(COMFORT_LABELS)].melt(
        id_vars=group_col, var_name='variable', value_name='value')
    long_df['variable'] = long_df['variable'].replace(COMFORT_LABELS)
    long_df = long_df.dropna(subset=['value'])
    summary = long_df.groupby([group_col, 'variable']).agg(
        mean_value=('value', 'mean'),
        se=('value', 'sem')
    ).reset_index()
    return long_df, summary


def _grouped_barplot(summary, group_col, palette, title, filename):
    """Save a dodged bar plot of mean comfort with +/- 1 SE error bars on each bar."""
    present_variables = set(summary['variable'])
    present_groups = set(summary[group_col])
    variable_order = [label for label in COMFORT_LABELS.values() if label in present_variables]
    group_order = [level for level in palette if level in present_groups]
    if not variable_order or not group_order:
        print(f"No data to plot for '{title}'; skipping {filename}.")
        return

    # Only levels that have a colour are drawn
    plotted = summary[summary[group_col].isin(group_order)]

    plt.figure(figsize=(10, 6))
    # ci=None (as in the original call): each bar is a single pre-computed mean
    ax = sns.barplot(x='variable', y='mean_value', hue=group_col, data=plotted,
                     order=variable_order, hue_order=group_order,
                     palette=palette, dodge=True, ci=None)

    # Put each error bar on its own dodged bar: category centre plus the
    # offset of that hue level within the group of bars
    bar_width = BAR_GROUP_WIDTH / len(group_order)
    for _, row in plotted.iterrows():
        centre = variable_order.index(row['variable'])
        offset = (group_order.index(row[group_col]) - (len(group_order) - 1) / 2) * bar_width
        ax.errorbar(x=centre + offset, y=row['mean_value'],
                    yerr=row['se'], fmt='none', color='black', capsize=5)

    ax.set_title(title)
    ax.set_xlabel('Scenario Type')
    ax.set_ylabel('Average Level of Comfort')
    plt.savefig(filename)
    plt.close()


# Create new treatment variable: both 'info' and 'pressure' count as 'treatment'
data['treatment'] = data['group_number'].replace({'info': 'treatment', 'pressure': 'treatment', 'control': 'control'})

# Long-format data and summary statistics (mean, standard error) for all participants
data_long, data_long_summary = _comfort_summary(data, 'treatment')

# Save summary statistics to CSV
data_long_summary.to_csv('summary_statistics_all_participants.csv', index=False)

# Plotting: Boxplot for average comfort by scenario type, split by treatment
plt.figure(figsize=(10, 6))
sns.boxplot(x='variable', y='value', hue='treatment', data=data_long, palette='muted')
plt.title('Level of Comfort with Scenarios for all Participants')
plt.xlabel('Scenario Type')
plt.ylabel('Average Level of Comfort')
plt.savefig('comfort_scenarios_boxplot_all_participants.png')
plt.close()

# Bar plot with error bars for mean values
_grouped_barplot(data_long_summary, 'treatment', TREATMENT_PALETTE,
                 'Average Level of Comfort by Treatment and Scenario Type',
                 'comfort_by_treatment_and_scenario_barplot.png')

# Create plots for male and female participants separately
for gender, gender_name in [(0, 'Female'), (1, 'Male')]:
    gender_data = data[data['gender_identity'] == gender]
    data_long_gender, data_long_gender_summary = _comfort_summary(gender_data, 'treatment')

    # Save summary statistics to CSV for each gender
    data_long_gender_summary.to_csv(f'summary_statistics_{gender_name.lower()}.csv', index=False)

    # Bar plot for each gender
    _grouped_barplot(data_long_gender_summary, 'treatment', TREATMENT_PALETTE,
                     f'Average Level of Comfort by Scenario Type - {gender_name}',
                     f'comfort_by_scenario_{gender_name.lower()}.png')

# Overall plot comparing male and female participants across all groups.
# The summary is rebuilt grouped by gender: the treatment summary holds
# 'control'/'treatment' labels, so it cannot be relabelled into genders.
gender_frame = data[data['gender_identity'].isin([0, 1])].copy()
gender_frame['gender'] = gender_frame['gender_identity'].map({0: 'female', 1: 'male'})
_, gender_summary = _comfort_summary(gender_frame, 'gender')

_grouped_barplot(gender_summary, 'gender', GENDER_PALETTE,
                 'Average Level of Comfort by Gender',
                 'comfort_by_gender_barplot.png')


In [None]:
# Function to prepare data for plotting
def prepare_data(data, group_filter):
    """Summarise comfort scores by gender for one experimental group.

    Args:
        data: Participant-level DataFrame with the columns 'group_number',
            'gender_identity', 'con_mean', 'non_mean' and
            'unclear_scenario_G'.
        group_filter: Value of 'group_number' to keep (the callers pass
            'control', 'info' or 'pressure').

    Returns:
        DataFrame with one row per (treatment, variable) pair and the columns
        'treatment' (the gender label), 'variable' (scenario type),
        'mean_value' and 'se' (standard error of the mean).

    Side effects:
        Writes the summary to 'summary_statistics_<group_filter>.csv' in the
        current working directory.
    """
    # Subset the data by group (control, info, pressure)
    data_group = data[data['group_number'] == group_filter].copy()

    # Gender plays the role of the plotting "treatment" (hue) here
    data_group['treatment'] = data_group['gender_identity'].replace({0: 'female', 1: 'male'})

    # Reshape to long format: one row per participant and scenario type
    score_columns = ['treatment', 'con_mean', 'non_mean', 'unclear_scenario_G']
    data_long = data_group[score_columns].melt(
        id_vars='treatment', var_name='variable', value_name='value'
    )

    # Rename the variable for clarity
    data_long['variable'] = data_long['variable'].replace({
        'con_mean': 'Consensual',
        'non_mean': 'Non-Consensual',
        'unclear_scenario_G': 'Unclear'
    })

    # 'sem' is std(ddof=1) / sqrt(number of non-missing values). Dividing by
    # len(x) instead would count missing scores in the denominator and
    # understate the standard error whenever a participant skipped a scenario.
    data_long_summary = data_long.groupby(['treatment', 'variable']).agg(
        mean_value=('value', 'mean'),
        se=('value', 'sem')
    ).reset_index()

    # Save summary statistics to CSV for each group
    data_long_summary.to_csv(f'summary_statistics_{group_filter}.csv', index=False)

    return data_long_summary

# Build the per-gender summary table for each experimental group, in the
# order control, info, pressure
data_control, data_info, data_pressure = (
    prepare_data(data, group_label)
    for group_label in ('control', 'info', 'pressure')
)

# Function to create bar plots with error bars
def plot_bar(data, title, ax):
    """Draw a grouped bar chart of mean comfort with standard-error bars.

    Args:
        data: Summary table with one row per (treatment, variable) pair and
            the columns 'treatment', 'variable', 'mean_value' and 'se'.
        title: Title of the panel.
        ax: Matplotlib axes to draw on.
    """
    palette = {'female': '#FAC8CD', 'male': '#9DC3E6'}

    # Fix the category and hue order (order of first appearance, which is
    # also what seaborn uses for string columns) so each bar's x position can
    # be computed below instead of being guessed from the row index.
    scenario_order = list(dict.fromkeys(data['variable']))
    hue_order = list(dict.fromkeys(data['treatment']))

    sns.barplot(x='variable', y='mean_value', hue='treatment', data=data, dodge=True,
                order=scenario_order, hue_order=hue_order, palette=palette, ax=ax)

    # With dodge=True seaborn splits a band of total width 0.8 (its default
    # bar width), centred on the category tick, evenly among the hue levels.
    # Placing each error bar at the centre of its own dodged bar keeps the
    # bars of different genders from being stacked on the tick itself.
    group_width = 0.8
    n_hues = max(len(hue_order), 1)  # guard against an empty table
    bar_width = group_width / n_hues
    for _, row in data.iterrows():
        tick = scenario_order.index(row['variable'])
        hue_rank = hue_order.index(row['treatment'])
        x_pos = tick + (hue_rank - (n_hues - 1) / 2) * bar_width
        ax.errorbar(x=x_pos, y=row['mean_value'],
                    yerr=row['se'], fmt='none', color='black', capsize=5)

    ax.set_title(title)
    ax.set_xlabel('Scenario Type')
    ax.set_ylabel('Mean Comfort')
    ax.tick_params(axis='x', rotation=45)

# One row of three panels, one panel per experimental group
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Pair each group's summary table with its panel title, left to right
panels = (
    (data_control, 'a. Control'),
    (data_info, 'b. Information'),
    (data_pressure, 'c. Social Pressure'),
)
for panel_ax, (panel_data, panel_title) in zip(axes, panels):
    plot_bar(panel_data, panel_title, panel_ax)

# Tidy the spacing, write the figure to disk and release it
fig.tight_layout()
fig.savefig('comfort_by_scenario_type.png')
plt.close(fig)

print("Summary statistics saved as CSV files and plots saved as 'comfort_by_scenario_type.png'.")

## Regression Analysis

In [None]:
# 1. Recode the gender codes 0/1 into the labels 'female'/'male'
# NOTE(review): the heading of this step used to say it also combines cis and
# trans participants. The replace below only relabels the values 0 and 1 and
# leaves every other value unchanged - confirm that the combining is done in
# an earlier cleaning step.
data['gender_identity'] = data['gender_identity'].replace({0: 'female', 1: 'male'})

# 2. Linear regression models with interaction terms
# Helper that fits one model and stores its summary on disk
def run_regression(formula, data, model_name):
    """Fit an OLS model from a formula and save its summary as text.

    Args:
        formula: Model formula passed to ``ols``.
        data: DataFrame holding the variables named in the formula.
        model_name: Prefix of the output file; the summary is written to
            '<model_name>_summary.txt'.

    Returns:
        The fitted model returned by ``ols(...).fit()``.
    """
    fitted = ols(formula, data=data).fit()
    summary_path = f'{model_name}_summary.txt'
    with open(summary_path, 'w') as summary_file:
        summary_file.write(fitted.summary().as_text())
    return fitted

# All six models share the same right-hand side; only the outcome and the
# sample differ.
regression_rhs = (
    'treatment + guilt_feelings + treatment*guilt_feelings + people_comfortable + '
    'friends_comfortable + freq_porn + speak_sex + religious_affiliation + '
    'gender_identity + treatment*gender_identity'
)

# Models 1-4: aggregate, consensual, non-consensual and unclear-scenario outcomes
agg_model = run_regression(f'agg_mean ~ {regression_rhs}', data, 'agg_mean_regression')
con_model = run_regression(f'con_mean ~ {regression_rhs}', data, 'con_mean_regression')
non_model = run_regression(f'non_mean ~ {regression_rhs}', data, 'non_mean_regression')
unclear_model = run_regression(f'unclear_scenario_G ~ {regression_rhs}', data, 'unclear_mean_regression')

# 3. Same aggregate model on the low-comfort and high-comfort subsamples
# NOTE(review): both thresholds are inclusive, so participants with
# agg_mean == 3 are part of the low AND the high subsample - confirm this
# overlap is intended.
data_low = data[data['agg_mean'].le(3)]
agg_low_model = run_regression(f'agg_mean ~ {regression_rhs}', data_low, 'agg_low_mean_regression')

data_high = data[data['agg_mean'].ge(3)]
agg_high_model = run_regression(f'agg_mean ~ {regression_rhs}', data_high, 'agg_high_mean_regression')

# 4. Mean comparisons between the experimental groups (control, info, pressure)
# Subset the data based on group_number
data_control = data[data['group_number'] == "control"]
data_info = data[data['group_number'] == "info"]
data_pressure = data[data['group_number'] == "pressure"]

# Calculate mean for agg_mean across groups (control, info, pressure)
mean_control = data_control['agg_mean'].mean()
mean_info = data_info['agg_mean'].mean()
mean_pressure = data_pressure['agg_mean'].mean()

# Save the means to a CSV file
means_df = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control, mean_info, mean_pressure]
})
means_df.to_csv('mean_comparisons.csv', index=False)

# Standard error, z-score and p-value for the 'info' group.
# Series.sem() divides the sample standard deviation by the square root of
# the number of NON-missing scores. Using len(data_info) would also count
# participants without an agg_mean and understate the standard error.
# NOTE(review): the z-score scales the difference by the treated group's
# standard error only; the control mean is treated as a fixed reference value.
se_info = data_info['agg_mean'].sem()
z_score_info = (mean_info - mean_control) / se_info
p_value_info = 2 * (1 - stats.norm.cdf(abs(z_score_info)))

# Standard error, z-score and p-value for the 'pressure' group
se_pressure = data_pressure['agg_mean'].sem()
z_score_pressure = (mean_pressure - mean_control) / se_pressure
p_value_pressure = 2 * (1 - stats.norm.cdf(abs(z_score_pressure)))

# Save the z-scores and p-values to a CSV file
z_scores_df = pd.DataFrame({
    'Comparison': ['Info vs Control', 'Pressure vs Control'],
    'Z-Score': [z_score_info, z_score_pressure],
    'P-Value': [p_value_info, p_value_pressure]
})
z_scores_df.to_csv('z_scores_p_values.csv', index=False)

print("Regression summaries saved as text files and statistical results saved as CSV files.")

## Significance of treatments by sex

In [None]:
#for men

# Split the male participants by experimental group (control, info, pressure)
data_male_control, data_male_info, data_male_pressure = (
    data_male[data_male['group_number'] == arm]
    for arm in ('control', 'info', 'pressure')
)

# Keep only the rows that have an agg_mean value
data_male_control_agg, data_male_info_agg, data_male_pressure_agg = (
    arm_rows.dropna(subset=['agg_mean'])
    for arm_rows in (data_male_control, data_male_info, data_male_pressure)
)

# Group means of agg_mean
mean_control_agg, mean_info_agg, mean_pressure_agg = (
    arm_rows['agg_mean'].mean()
    for arm_rows in (data_male_control_agg, data_male_info_agg, data_male_pressure_agg)
)

# Z-test, info vs. control: difference of the means scaled by the standard
# error of the info group
sample_mean_info = data_male_info_agg['agg_mean'].mean()
se_info = data_male_info_agg['agg_mean'].std() / np.sqrt(data_male_info_agg.shape[0])
z_score_info = (sample_mean_info - mean_control_agg) / se_info
p_value_info = 2 * (1 - stats.norm.cdf(abs(z_score_info)))

# Z-test, pressure vs. control, built the same way
sample_mean_pressure = data_male_pressure_agg['agg_mean'].mean()
se_pressure = data_male_pressure_agg['agg_mean'].std() / np.sqrt(data_male_pressure_agg.shape[0])
z_score_pressure = (sample_mean_pressure - mean_control_agg) / se_pressure
p_value_pressure = 2 * (1 - stats.norm.cdf(abs(z_score_pressure)))

# One row per group; the control row carries no test statistics
agg_mean_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_agg, mean_info_agg, mean_pressure_agg],
    'Z-Score': [np.nan, z_score_info, z_score_pressure],
    'P-Value': [np.nan, p_value_info, p_value_pressure],
})
agg_mean_results.to_csv('agg_mean_male_comparisons.csv', index=False)

# Male participants with a con_mean value, per experimental group
data_male_control_con, data_male_info_con, data_male_pressure_con = (
    arm_rows.dropna(subset=['con_mean'])
    for arm_rows in (data_male_control, data_male_info, data_male_pressure)
)

# Group means of con_mean
mean_control_con, mean_info_con, mean_pressure_con = (
    arm_rows['con_mean'].mean()
    for arm_rows in (data_male_control_con, data_male_info_con, data_male_pressure_con)
)

# Z-test, info vs. control, scaled by the info group's standard error
sample_mean_con_info = data_male_info_con['con_mean'].mean()
se_con_info = data_male_info_con['con_mean'].std() / np.sqrt(data_male_info_con.shape[0])
z_score_con_info = (sample_mean_con_info - mean_control_con) / se_con_info
p_value_con_info = 2 * (1 - stats.norm.cdf(abs(z_score_con_info)))

# Z-test, pressure vs. control, scaled by the pressure group's standard error
sample_mean_con_pressure = data_male_pressure_con['con_mean'].mean()
se_con_pressure = data_male_pressure_con['con_mean'].std() / np.sqrt(data_male_pressure_con.shape[0])
z_score_con_pressure = (sample_mean_con_pressure - mean_control_con) / se_con_pressure
p_value_con_pressure = 2 * (1 - stats.norm.cdf(abs(z_score_con_pressure)))

# One row per group; the control row carries no test statistics
con_mean_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_con, mean_info_con, mean_pressure_con],
    'Z-Score': [np.nan, z_score_con_info, z_score_con_pressure],
    'P-Value': [np.nan, p_value_con_info, p_value_con_pressure],
})
con_mean_results.to_csv('con_mean_male_comparisons.csv', index=False)

print("Results saved to 'agg_mean_male_comparisons.csv' and 'con_mean_male_comparisons.csv'.")

#same stats, for non-consensual scenarios only

# Split the male participants by experimental group (control, info, pressure)
data_male_control, data_male_info, data_male_pressure = (
    data_male[data_male['group_number'] == arm]
    for arm in ('control', 'info', 'pressure')
)

# Keep only the rows that have a non_mean value
data_male_control_non_mean, data_male_info_non_mean, data_male_pressure_non_mean = (
    arm_rows.dropna(subset=['non_mean'])
    for arm_rows in (data_male_control, data_male_info, data_male_pressure)
)

# Group means of non_mean
mean_control_non_mean, mean_info_non_mean, mean_pressure_non_mean = (
    arm_rows['non_mean'].mean()
    for arm_rows in (data_male_control_non_mean, data_male_info_non_mean, data_male_pressure_non_mean)
)

# Z-test, info vs. control, scaled by the info group's standard error
sample_mean_info = data_male_info_non_mean['non_mean'].mean()
se_info = data_male_info_non_mean['non_mean'].std() / np.sqrt(data_male_info_non_mean.shape[0])
z_score_info = (sample_mean_info - mean_control_non_mean) / se_info
p_value_info = 2 * (1 - stats.norm.cdf(abs(z_score_info)))

# Z-test, pressure vs. control, scaled by the pressure group's standard error
sample_mean_pressure = data_male_pressure_non_mean['non_mean'].mean()
se_pressure = data_male_pressure_non_mean['non_mean'].std() / np.sqrt(data_male_pressure_non_mean.shape[0])
z_score_pressure = (sample_mean_pressure - mean_control_non_mean) / se_pressure
p_value_pressure = 2 * (1 - stats.norm.cdf(abs(z_score_pressure)))

# One row per group; the control row carries no test statistics
non_mean_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_non_mean, mean_info_non_mean, mean_pressure_non_mean],
    'Z-Score': [np.nan, z_score_info, z_score_pressure],
    'P-Value': [np.nan, p_value_info, p_value_pressure],
})
non_mean_results.to_csv('non_mean_male_comparisons.csv', index=False)

# Cross-measure z-test: the pressure group's non_mean is compared with the
# control group's con_mean, scaled by the pressure group's standard error
sample_mean_pressure = data_male_pressure_non_mean['non_mean'].mean()
se_pressure = data_male_pressure_non_mean['non_mean'].std() / np.sqrt(len(data_male_pressure_non_mean))

# The control rows filtered on con_mean are stored in data_male_control_con.
# The name data_male_control_con_mean is not defined anywhere in the visible
# analysis, so referencing it stopped the cell with a NameError.
control_con_reference = data_male_control_con['con_mean'].mean()
z_score_con_pressure = (sample_mean_pressure - control_con_reference) / se_pressure
p_value_con_pressure = 2 * (1 - stats.norm.cdf(abs(z_score_con_pressure)))

# Save pressure vs control con_mean comparison to a CSV
pressure_vs_control_con_mean = pd.DataFrame({
    'Comparison': ['Pressure non_mean vs Control con_mean'],
    'Z-Score': [z_score_con_pressure],
    'P-Value': [p_value_con_pressure]
})
pressure_vs_control_con_mean.to_csv('pressure_vs_control_con_mean.csv', index=False)

# Same comparison for the unclear-consent measure (unc_max)
# Keep only the rows that have an unc_max value
data_male_control_unc_max, data_male_info_unc_max, data_male_pressure_unc_max = (
    arm_rows.dropna(subset=['unc_max'])
    for arm_rows in (data_male_control, data_male_info, data_male_pressure)
)

# Group means of unc_max
mean_control_unc_max, mean_info_unc_max, mean_pressure_unc_max = (
    arm_rows['unc_max'].mean()
    for arm_rows in (data_male_control_unc_max, data_male_info_unc_max, data_male_pressure_unc_max)
)

# Z-test, info vs. control, scaled by the info group's standard error
sample_mean_info = data_male_info_unc_max['unc_max'].mean()
se_info = data_male_info_unc_max['unc_max'].std() / np.sqrt(data_male_info_unc_max.shape[0])
z_score_info = (sample_mean_info - mean_control_unc_max) / se_info
p_value_info = 2 * (1 - stats.norm.cdf(abs(z_score_info)))

# Z-test, pressure vs. control, scaled by the pressure group's standard error
sample_mean_pressure = data_male_pressure_unc_max['unc_max'].mean()
se_pressure = data_male_pressure_unc_max['unc_max'].std() / np.sqrt(data_male_pressure_unc_max.shape[0])
z_score_pressure = (sample_mean_pressure - mean_control_unc_max) / se_pressure
p_value_pressure = 2 * (1 - stats.norm.cdf(abs(z_score_pressure)))

# One row per group; the control row carries no test statistics
unc_max_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_unc_max, mean_info_unc_max, mean_pressure_unc_max],
    'Z-Score': [np.nan, z_score_info, z_score_pressure],
    'P-Value': [np.nan, p_value_info, p_value_pressure],
})
unc_max_results.to_csv('unc_max_male_comparisons.csv', index=False)

print("Results saved to 'non_mean_male_comparisons.csv', 'pressure_vs_control_con_mean.csv', and 'unc_max_male_comparisons.csv'.")


In [None]:
#for women

# Function to calculate z-scores and p-values
def calculate_z_test(info_mean, control_mean, se):
    """Return the z-score and two-sided p-value for a difference of means.

    Args:
        info_mean: Mean of the group being tested.
        control_mean: Mean it is compared against.
        se: Standard error used to scale the difference.

    Returns:
        Tuple ``(z_score, p_value)`` where ``p_value`` is the two-sided
        probability under the standard normal distribution.
    """
    z_score = (info_mean - control_mean) / se
    # The survival function gives the upper-tail probability directly.
    # Computing it as 1 - cdf rounds to exactly 0.0 once |z| is large.
    p_value = 2 * stats.norm.sf(abs(z_score))
    return z_score, p_value

### 1. Agg Mean Analysis for Females

# Female participants with an agg_mean value, per experimental group
data_female_control_agg, data_female_info_agg, data_female_pressure_agg = (
    arm_rows.dropna(subset=['agg_mean'])
    for arm_rows in (data_female_control, data_female_info, data_female_pressure)
)

# Group means of agg_mean
mean_control_agg, mean_info_agg, mean_pressure_agg = (
    arm_rows['agg_mean'].mean()
    for arm_rows in (data_female_control_agg, data_female_info_agg, data_female_pressure_agg)
)

# Info vs. control, scaled by the info group's standard error
se_info = data_female_info_agg['agg_mean'].std() / np.sqrt(data_female_info_agg.shape[0])
z_score_info, p_value_info = calculate_z_test(mean_info_agg, mean_control_agg, se_info)

# Pressure vs. control, scaled by the pressure group's standard error
se_pressure = data_female_pressure_agg['agg_mean'].std() / np.sqrt(data_female_pressure_agg.shape[0])
z_score_pressure, p_value_pressure = calculate_z_test(mean_pressure_agg, mean_control_agg, se_pressure)

# One row per group; the control row carries no test statistics
agg_mean_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_agg, mean_info_agg, mean_pressure_agg],
    'Z-Score': [np.nan, z_score_info, z_score_pressure],
    'P-Value': [np.nan, p_value_info, p_value_pressure],
})
agg_mean_results.to_csv('agg_mean_female_comparisons.csv', index=False)

### 2. Con Mean Analysis for Females

# Female participants with a con_mean value, per experimental group
data_female_control_con, data_female_info_con, data_female_pressure_con = (
    arm_rows.dropna(subset=['con_mean'])
    for arm_rows in (data_female_control, data_female_info, data_female_pressure)
)

# Group means of con_mean
mean_control_con, mean_info_con, mean_pressure_con = (
    arm_rows['con_mean'].mean()
    for arm_rows in (data_female_control_con, data_female_info_con, data_female_pressure_con)
)

# Info vs. control, scaled by the info group's standard error
se_info_con = data_female_info_con['con_mean'].std() / np.sqrt(data_female_info_con.shape[0])
z_score_info_con, p_value_info_con = calculate_z_test(mean_info_con, mean_control_con, se_info_con)

# Pressure vs. control, scaled by the pressure group's standard error
se_pressure_con = data_female_pressure_con['con_mean'].std() / np.sqrt(data_female_pressure_con.shape[0])
z_score_pressure_con, p_value_pressure_con = calculate_z_test(mean_pressure_con, mean_control_con, se_pressure_con)

# One row per group; the control row carries no test statistics
con_mean_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_con, mean_info_con, mean_pressure_con],
    'Z-Score': [np.nan, z_score_info_con, z_score_pressure_con],
    'P-Value': [np.nan, p_value_info_con, p_value_pressure_con],
})
con_mean_results.to_csv('con_mean_female_comparisons.csv', index=False)

### 3. Non Mean Analysis for Females

# Female participants with a non_mean value, per experimental group
data_female_control_non, data_female_info_non, data_female_pressure_non = (
    arm_rows.dropna(subset=['non_mean'])
    for arm_rows in (data_female_control, data_female_info, data_female_pressure)
)

# Group means of non_mean
mean_control_non, mean_info_non, mean_pressure_non = (
    arm_rows['non_mean'].mean()
    for arm_rows in (data_female_control_non, data_female_info_non, data_female_pressure_non)
)

# Info vs. control, scaled by the info group's standard error
se_info_non = data_female_info_non['non_mean'].std() / np.sqrt(data_female_info_non.shape[0])
z_score_info_non, p_value_info_non = calculate_z_test(mean_info_non, mean_control_non, se_info_non)

# Pressure vs. control, scaled by the pressure group's standard error
se_pressure_non = data_female_pressure_non['non_mean'].std() / np.sqrt(data_female_pressure_non.shape[0])
z_score_pressure_non, p_value_pressure_non = calculate_z_test(mean_pressure_non, mean_control_non, se_pressure_non)

# One row per group; the control row carries no test statistics
non_mean_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_non, mean_info_non, mean_pressure_non],
    'Z-Score': [np.nan, z_score_info_non, z_score_pressure_non],
    'P-Value': [np.nan, p_value_info_non, p_value_pressure_non],
})
non_mean_results.to_csv('non_mean_female_comparisons.csv', index=False)

### 4. Unclear Mean Analysis for Females

# Female participants with an unc_max value, per experimental group
data_female_control_unc_max, data_female_info_unc_max, data_female_pressure_unc_max = (
    arm_rows.dropna(subset=['unc_max'])
    for arm_rows in (data_female_control, data_female_info, data_female_pressure)
)

# Group means of unc_max
mean_control_unc_max, mean_info_unc_max, mean_pressure_unc_max = (
    arm_rows['unc_max'].mean()
    for arm_rows in (data_female_control_unc_max, data_female_info_unc_max, data_female_pressure_unc_max)
)

# Info vs. control, scaled by the info group's standard error
se_info_unc_max = data_female_info_unc_max['unc_max'].std() / np.sqrt(data_female_info_unc_max.shape[0])
z_score_info_unc_max, p_value_info_unc_max = calculate_z_test(mean_info_unc_max, mean_control_unc_max, se_info_unc_max)

# Pressure vs. control, scaled by the pressure group's standard error
se_pressure_unc_max = data_female_pressure_unc_max['unc_max'].std() / np.sqrt(data_female_pressure_unc_max.shape[0])
z_score_pressure_unc_max, p_value_pressure_unc_max = calculate_z_test(mean_pressure_unc_max, mean_control_unc_max, se_pressure_unc_max)

# One row per group; the control row carries no test statistics
unc_max_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_unc_max, mean_info_unc_max, mean_pressure_unc_max],
    'Z-Score': [np.nan, z_score_info_unc_max, z_score_pressure_unc_max],
    'P-Value': [np.nan, p_value_info_unc_max, p_value_pressure_unc_max],
})
unc_max_results.to_csv('unc_max_female_comparisons.csv', index=False)

print("Results saved to CSV files for all analyses.")

## Analysis depending on consumption outside the experiment

Participants' level of comfort is likely to be affected by how frequently they consume pornography outside the experiment, so we analyze low-frequency and high-frequency consumers separately here.

In [None]:
# Z-Test Function
def calculate_z_test(group1_mean, group2_mean, se):
    """Return the z-score and two-sided p-value for a difference of means.

    Args:
        group1_mean: Mean of the group being tested.
        group2_mean: Mean it is compared against.
        se: Standard error used to scale the difference.

    Returns:
        Tuple ``(z_score, p_value)`` where ``p_value`` is the two-sided
        probability under the standard normal distribution.
    """
    z_score = (group1_mean - group2_mean) / se
    # The survival function gives the upper-tail probability directly.
    # Computing it as 1 - cdf rounds to exactly 0.0 once |z| is large.
    p_value = 2 * stats.norm.sf(abs(z_score))
    return z_score, p_value

### 1. Split into low and high frequency for 'freq_porn'
# Only participants who answered freq_porn can be classified
data_freq = data.dropna(subset=['freq_porn'])

# Low frequency: freq_porn of 2 or less; high frequency: 3 or more
low_freq_mask = data_freq['freq_porn'] <= 2
high_freq_mask = data_freq['freq_porn'] >= 3
data_low_freq = data_freq.loc[low_freq_mask].copy()
data_high_freq = data_freq.loc[high_freq_mask].copy()

# Keep a copy of both subsamples on disk
data_low_freq.to_csv('low_freq_porn.csv', index=False)
data_high_freq.to_csv('high_freq_porn.csv', index=False)

### 2. Mean comparison for agg_mean (low freq_porn only)

# Low-frequency participants with an agg_mean value, per experimental group
data_low_control, data_low_info, data_low_pressure = (
    data_low_freq[data_low_freq['group_number'] == arm].dropna(subset=['agg_mean'])
    for arm in ('control', 'info', 'pressure')
)

# Group means of agg_mean
mean_control_agg, mean_info_agg, mean_pressure_agg = (
    arm_rows['agg_mean'].mean()
    for arm_rows in (data_low_control, data_low_info, data_low_pressure)
)

# Info vs. control, scaled by the info group's standard error
se_info = data_low_info['agg_mean'].std() / np.sqrt(data_low_info.shape[0])
z_score_info, p_value_info = calculate_z_test(mean_info_agg, mean_control_agg, se_info)

# Pressure vs. control, scaled by the pressure group's standard error
se_pressure = data_low_pressure['agg_mean'].std() / np.sqrt(data_low_pressure.shape[0])
z_score_pressure, p_value_pressure = calculate_z_test(mean_pressure_agg, mean_control_agg, se_pressure)

# One row per group; the control row carries no test statistics
agg_mean_low_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_agg, mean_info_agg, mean_pressure_agg],
    'Z-Score': [np.nan, z_score_info, z_score_pressure],
    'P-Value': [np.nan, p_value_info, p_value_pressure],
})
agg_mean_low_results.to_csv('agg_mean_low_freq_porn_comparisons.csv', index=False)

### 3. Mean comparison for unc_max (low freq_porn only)

# Start from the low-frequency group frames (already restricted to rows with
# an agg_mean value) and additionally require an unc_max value
data_low_control_unc, data_low_info_unc, data_low_pressure_unc = (
    arm_rows.dropna(subset=['unc_max'])
    for arm_rows in (data_low_control, data_low_info, data_low_pressure)
)

# Group means of unc_max
mean_control_unc, mean_info_unc, mean_pressure_unc = (
    arm_rows['unc_max'].mean()
    for arm_rows in (data_low_control_unc, data_low_info_unc, data_low_pressure_unc)
)

# Info vs. control, scaled by the info group's standard error
se_info_unc = data_low_info_unc['unc_max'].std() / np.sqrt(data_low_info_unc.shape[0])
z_score_info_unc, p_value_info_unc = calculate_z_test(mean_info_unc, mean_control_unc, se_info_unc)

# Pressure vs. control, scaled by the pressure group's standard error
se_pressure_unc = data_low_pressure_unc['unc_max'].std() / np.sqrt(data_low_pressure_unc.shape[0])
z_score_pressure_unc, p_value_pressure_unc = calculate_z_test(mean_pressure_unc, mean_control_unc, se_pressure_unc)

# One row per group; the control row carries no test statistics
unc_max_low_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_unc, mean_info_unc, mean_pressure_unc],
    'Z-Score': [np.nan, z_score_info_unc, z_score_pressure_unc],
    'P-Value': [np.nan, p_value_info_unc, p_value_pressure_unc],
})
unc_max_low_results.to_csv('unc_max_low_freq_porn_comparisons.csv', index=False)

### 4. Mean comparison for agg_mean (high freq_porn only)

# High-frequency participants with an agg_mean value, per experimental group
data_high_control, data_high_info, data_high_pressure = (
    data_high_freq[data_high_freq['group_number'] == arm].dropna(subset=['agg_mean'])
    for arm in ('control', 'info', 'pressure')
)

# Group means of agg_mean
mean_control_agg_high, mean_info_agg_high, mean_pressure_agg_high = (
    arm_rows['agg_mean'].mean()
    for arm_rows in (data_high_control, data_high_info, data_high_pressure)
)

# Info vs. control, scaled by the info group's standard error
se_info_high = data_high_info['agg_mean'].std() / np.sqrt(data_high_info.shape[0])
z_score_info_high, p_value_info_high = calculate_z_test(mean_info_agg_high, mean_control_agg_high, se_info_high)

# Pressure vs. control, scaled by the pressure group's standard error
se_pressure_high = data_high_pressure['agg_mean'].std() / np.sqrt(data_high_pressure.shape[0])
z_score_pressure_high, p_value_pressure_high = calculate_z_test(mean_pressure_agg_high, mean_control_agg_high, se_pressure_high)

# One row per group; the control row carries no test statistics
agg_mean_high_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Mean': [mean_control_agg_high, mean_info_agg_high, mean_pressure_agg_high],
    'Z-Score': [np.nan, z_score_info_high, z_score_pressure_high],
    'P-Value': [np.nan, p_value_info_high, p_value_pressure_high],
})
agg_mean_high_results.to_csv('agg_mean_high_freq_porn_comparisons.csv', index=False)

### 5. Z-tests for con_mean and non_mean (high freq_porn only)

# High-frequency group frames, additionally restricted to rows with a con_mean value
data_high_control_con, data_high_info_con, data_high_pressure_con = (
    arm_rows.dropna(subset=['con_mean'])
    for arm_rows in (data_high_control, data_high_info, data_high_pressure)
)

# Info vs. control, scaled by the info group's standard error
se_info_con_high = data_high_info_con['con_mean'].std() / np.sqrt(data_high_info_con.shape[0])
z_score_info_con_high, p_value_info_con_high = calculate_z_test(
    data_high_info_con['con_mean'].mean(),
    data_high_control_con['con_mean'].mean(),
    se_info_con_high,
)

# Pressure vs. control, scaled by the pressure group's standard error
se_pressure_con_high = data_high_pressure_con['con_mean'].std() / np.sqrt(data_high_pressure_con.shape[0])
z_score_pressure_con_high, p_value_pressure_con_high = calculate_z_test(
    data_high_pressure_con['con_mean'].mean(),
    data_high_control_con['con_mean'].mean(),
    se_pressure_con_high,
)

# Test statistics only (no Mean column); the control row is left empty
con_mean_high_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Z-Score': [np.nan, z_score_info_con_high, z_score_pressure_con_high],
    'P-Value': [np.nan, p_value_info_con_high, p_value_pressure_con_high],
})
con_mean_high_results.to_csv('con_mean_high_freq_porn_comparisons.csv', index=False)

# Repeat the same Z-tests for non_mean (high freq_porn)
# NOTE(review): the original heading also announced unc_max, but no unc_max
# test for the high-frequency subsample follows in this section.
data_high_control_non = data_high_control.dropna(subset=['non_mean'])
data_high_info_non = data_high_info.dropna(subset=['non_mean'])
# 'subset' must be passed as a keyword argument; written as a call,
# subset([...]), it raised a NameError and aborted the cell
data_high_pressure_non = data_high_pressure.dropna(subset=['non_mean'])

# Z-test for non_mean between info and control groups
se_info_non_high = data_high_info_non['non_mean'].std() / np.sqrt(len(data_high_info_non))
z_score_info_non_high, p_value_info_non_high = calculate_z_test(data_high_info_non['non_mean'].mean(), data_high_control_non['non_mean'].mean(), se_info_non_high)

# Z-test for non_mean between pressure and control groups
se_pressure_non_high = data_high_pressure_non['non_mean'].std() / np.sqrt(len(data_high_pressure_non))
z_score_pressure_non_high, p_value_pressure_non_high = calculate_z_test(data_high_pressure_non['non_mean'].mean(), data_high_control_non['non_mean'].mean(), se_pressure_non_high)

# Save non_mean high freq_porn results to a CSV
non_mean_high_results = pd.DataFrame({
    'Group': ['Control', 'Info', 'Pressure'],
    'Z-Score': [np.nan, z_score_info_non_high, z_score_pressure_non_high],
    'P-Value': [np.nan, p_value_info_non_high, p_value_pressure_non_high]
})
non_mean_high_results.to_csv('non_mean_high_freq_porn_comparisons.csv', index=False)

### 6. Linear Regression Comparisons for agg_mean and freq_porn

# The same specification is fitted separately within each experimental group
group_formula = 'agg_mean ~ gender_identity + freq_porn'

# Control group
regression_control = sm.OLS.from_formula(group_formula, data=data_control).fit()
control_summary_text = regression_control.summary().as_text()
with open('regression_control_summary.txt', 'w') as f:
    f.write(control_summary_text)

# Info group
regression_info = sm.OLS.from_formula(group_formula, data=data_info).fit()
info_summary_text = regression_info.summary().as_text()
with open('regression_info_summary.txt', 'w') as f:
    f.write(info_summary_text)

# Pressure group
regression_pressure = sm.OLS.from_formula(group_formula, data=data_pressure).fit()
pressure_summary_text = regression_pressure.summary().as_text()
with open('regression_pressure_summary.txt', 'w') as f:
    f.write(pressure_summary_text)

print("All results saved to CSV and text files.")

In [None]:
# 1. Linear regression to test if gender differences are fully explained by freq_porn

# Both models of the F-test below must be estimated on exactly the same rows,
# so restrict the data to complete cases of every variable involved first.
# For the full model this removes the same rows the formula interface would
# drop on its own, so its estimates are unchanged.
gender_test_columns = ['agg_mean', 'treatment', 'gender_identity', 'freq_porn']
data_gender_test = data.dropna(subset=gender_test_columns)

# Full model: treatment, gender and freq_porn
model = ols('agg_mean ~ treatment + gender_identity + freq_porn', data=data_gender_test).fit()

# Save the model summary to a text file
with open('linear_regression_gender_freq_porn_summary.txt', 'w') as f:
    f.write(model.summary().as_text())

# Test the significance of the gender coefficient after controlling for freq_porn.
# The restricted model drops gender_identity and keeps everything else, so it
# is nested in the full model and the F-test isolates the gender term.
# (The former comparison model, 'treatment + freq_porn + gender_identity:freq_porn',
# was not nested in the full model and had the same number of parameters,
# which leaves the F-test with zero degrees of freedom.)
model_controlled = ols('agg_mean ~ treatment + freq_porn', data=data_gender_test).fit()
# anova_lm expects the models ordered from restricted to full
anova_results = sm.stats.anova_lm(model_controlled, model)

# Save ANOVA results to a CSV file
anova_results.to_csv('anova_gender_freq_porn.csv')

# 2. Regress agg_mean on freq_porn separately for male and female participants
sex_formula = 'agg_mean ~ freq_porn'

# For males
model_male = ols(sex_formula, data=data_male).fit()
male_summary_text = model_male.summary().as_text()
with open('linear_regression_male_freq_porn_summary.txt', 'w') as f:
    f.write(male_summary_text)

# For females
model_female = ols(sex_formula, data=data_female).fit()
female_summary_text = model_female.summary().as_text()
with open('linear_regression_female_freq_porn_summary.txt', 'w') as f:
    f.write(female_summary_text)

# 3. Double difference model: Interaction between guilt and treatment, for males and females

def _high_guilt_flag(guilt_feelings):
    """Return 1.0 where guilt_feelings > 2, 0.0 where <= 2, NaN where missing."""
    # A plain np.where(x <= 2, 0, 1) sends missing answers to the else-branch
    # and codes them as 1; keeping them missing lets the regression drop them.
    return (guilt_feelings > 2).astype(float).where(guilt_feelings.notna())


def _treated_flag(group_number):
    """Return 1.0 for 'info'/'pressure', 0.0 for other groups, NaN where missing."""
    # Participants without a group assignment stay missing instead of being
    # counted as untreated.
    return group_number.isin(['info', 'pressure']).astype(float).where(group_number.notna())


# Create new 'guilt' and 'treat_yes' columns based on thresholds
data_male['guilt'] = _high_guilt_flag(data_male['guilt_feelings'])
data_female['guilt'] = _high_guilt_flag(data_female['guilt_feelings'])
data_male['treat_yes'] = _treated_flag(data_male['group_number'])
data_female['treat_yes'] = _treated_flag(data_female['group_number'])

# Run linear regression models with interaction terms
model_male_double = ols('agg_mean ~ treat_yes + guilt + treat_yes*guilt', data=data_male).fit()
with open('double_diff_male_summary.txt', 'w') as f:
    f.write(model_male_double.summary().as_text())

model_female_double = ols('agg_mean ~ treat_yes + guilt + treat_yes*guilt', data=data_female).fit()
with open('double_diff_female_summary.txt', 'w') as f:
    f.write(model_female_double.summary().as_text())

# 4. Interaction between guilt and group_number for men
model_male_guilt_group = ols('agg_mean ~ guilt * group_number', data=data_male).fit()
with open('interaction_guilt_group_male_summary.txt', 'w') as f:
    f.write(model_male_guilt_group.summary().as_text())

# 5. Triple difference model to test interactions between guilt, treatment, and gender

# 'guilt' is 1 for guilt_feelings above 2 and 0 otherwise; 'treat_yes' is 1
# for the info and pressure groups and 0 otherwise. Missing answers and
# missing group assignments are kept as NaN so the regression drops those
# rows. A plain np.where would silently code a missing guilt answer as 1 and
# a missing group as 0.
data['guilt'] = (data['guilt_feelings'] > 2).astype(float).where(data['guilt_feelings'].notna())
data['treat_yes'] = (
    data['group_number'].isin(['info', 'pressure']).astype(float).where(data['group_number'].notna())
)

# Linear regression model with triple interaction terms
model_triple_diff = ols('agg_mean ~ guilt + treat_yes + guilt*treat_yes + gender_identity + gender_identity*treat_yes + treat_yes*guilt*gender_identity', data=data).fit()
with open('triple_diff_summary.txt', 'w') as f:
    f.write(model_triple_diff.summary().as_text())

# 6. Correlation of freq_porn with the non_mean and unc_max comfort measures
# Each value is read off the 2x2 correlation matrix of the measure and freq_porn
corr_with_freq_porn = {
    measure: data[[measure, 'freq_porn']].corr().loc[measure, 'freq_porn']
    for measure in ('non_mean', 'unc_max')
}
non_mean_correlation = corr_with_freq_porn['non_mean']
unc_max_correlation = corr_with_freq_porn['unc_max']

# One row per measure, written to disk
correlation_results = pd.DataFrame({
    'Variable': list(corr_with_freq_porn),
    'Correlation with freq_porn': list(corr_with_freq_porn.values()),
})
correlation_results.to_csv('freq_porn_correlations.csv', index=False)

print("All results saved to CSV and text files.")