In [3]:
# Import seaborn and apply its plotting styles
import seaborn as sns
sns.set(font_scale=2, style="white")

# import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as style
# set plotting size parameter
plt.rcParams['figure.figsize'] = (17, 7)

# import pandas & numpy library
import pandas as pd
import numpy as np

# statsmodels & patsy
import patsy
import statsmodels.api as sm

import re
import csv

Clean Data

In [12]:
# Load the raw survey export
data = pd.read_csv('164_data.csv')

# Discard every column the analysis does not use
data = data.drop(
    columns=[
        'EndDate', 'SC0', 'Duration (in seconds)', 'Status', 'IPAddress', 'Progress',
        'Finished', 'RecordedDate', 'ResponseId',
        'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
        'ExternalReference', 'LocationLatitude', 'LocationLongitude',
        'DistributionChannel', 'UserLanguage',
        'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18',
        'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28', 'Q29',
        'Q32_1', 'Q32_2', 'Q32_3', 'Q32_4', 'Q32_5', 'Q32_6', 'Q32_7', 'Q32_8',
        'Q32_9', 'Q32_10', 'Q32_11', 'Q32_12', 'Q32_13', 'Q32_14', 'Q32_15', 'Q32_16',
    ]
)

# Rename the remaining columns positionally: the list must stay in the same
# order (and have the same length) as the columns left after the drop above.
# Prefixes: Q_ = questionnaire answer, C_/E_ = the same timing field recorded
# in two alternative column sets (merged later).
data.columns = [
    'StartDate',
    'Q_Eat', 'Q_Sleep', 'Q_Comfort', 'Q_Exercise',
    'C_First Click', 'C_Last Click', 'C_Page Submit', 'C_Click Count',
    'E_First Click', 'E_Last Click', 'E_Page Submit', 'E_Click Count',
    'Q_Age', 'Q_Gender', 'Q_Ethn', 'Q_Income', 'Q_FinSec',
    'PSS', 'Worry', 'SIMS', 'IM', 'IR', 'ER', 'AM',
    'KeyResponses', 'PlayerScore',
]

# Skip the first data row (presumably a second header/description row in the
# export -- confirm against the CSV)
data = data.iloc[1:]

# Label each participant's group: a missing 'C_Page Submit' value marks the row
# as Experimental, anything else as Control
data = data.assign(
    Group=data['C_Page Submit'].isna().apply(
        lambda is_missing: 'Experimental' if is_missing else 'Control'
    )
)

In [13]:
# Combine column repeats
def combine_CE(data):
    """Merge every ``C_<label>`` / ``E_<label>`` column pair into one ``<label>`` column.

    For each label that has both a ``C_`` and an ``E_`` column, the merged
    column takes the ``C_`` value where it is present and falls back to the
    ``E_`` value where the ``C_`` value is missing. The two source columns are
    then dropped and the merged column is appended at the end of the frame.
    Pairs are processed in sorted label order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain ``C_``/``E_`` prefixed columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the paired columns merged. Prefixed columns without a
        partner, and all other columns, are left untouched.
    """
    # Only the leading two characters identify the prefix, so labels that
    # themselves contain underscores stay intact and a column named just "C"
    # or "E" (no underscore) is ignored instead of raising IndexError.
    labels = sorted({
        name[2:]
        for name in data.columns
        if isinstance(name, str) and name[:2] in ('C_', 'E_')
    })

    for label in labels:
        c_col = f"C_{label}"
        e_col = f"E_{label}"
        if c_col in data.columns and e_col in data.columns:
            data = data.assign(**{label: data[c_col].fillna(data[e_col])})
            data = data.drop(columns=[c_col, e_col])

    return data

# Merge each C_/E_ column pair of the cleaned frame into a single column
data = combine_CE(data)

In [14]:
# Separate the demographic answers from the columns kept for analysis
demographic_cols = ['Q_Age', 'Q_Gender', 'Q_Ethn', 'Q_Income', 'Q_FinSec']
demographic_info = data[demographic_cols]

# Index.difference returns the remaining labels sorted, so the analysis frame
# ends up in alphabetical column order; the click-timing columns are not needed
analysis_cols = data.columns.difference(demographic_cols)
data = data[analysis_cols].drop(columns=['First Click', 'Last Click'])

In [17]:
# Convert numeric columns to floats for handling
def float_convert(value):
    """Return ``value`` as a float, or NaN when it cannot be converted.

    Parameters
    ----------
    value : object
        Anything ``float()`` accepts (numbers, numeric strings, ...).

    Returns
    -------
    float
        ``float(value)``, or ``numpy.nan`` if the conversion fails.
    """
    try:
        return float(value)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated errors.
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Columns that must hold numbers; entries float_convert cannot parse become NaN
num_cols = ['PSS', 'Worry', 'SIMS', 'IM', 'IR', 'ER', 'AM', 'PlayerScore', 'Page Submit']
# NOTE: the loop variable `col` stays bound at module level after this loop ends
for col in num_cols:
    data[col] = data[col].apply(float_convert)

In [18]:
# Self-determination index: SDI = 2*IM + IR - ER - 2*AM.
# The four component columns are dropped once they have been combined.
data = data.assign(
    SDI=lambda df: 2 * df["IM"] + df["IR"] - df["ER"] - 2 * df["AM"]
).drop(columns=['IM', 'IR', 'ER', 'AM'])

In [20]:
# Numeric columns used by the plots below. Every entry must be a column of
# `data`; the former 'Score' entry was not, which made the plotting loops fail.
a_cols = ['PSS', 'Worry', 'SIMS', 'SDI', 'PlayerScore', 'Page Submit']
# Show the cleaned analysis frame
data


Unnamed: 0,Click Count,Group,KeyResponses,PSS,Page Submit,PlayerScore,Q_Comfort,Q_Eat,Q_Exercise,Q_Sleep,SIMS,StartDate,Worry,SDI
1,0,Experimental,,13.0,651.983,750.0,Somewhat comfortable,≤ 2 hours ago,1-2 days a week,5-7 hours,43.0,3/7/25 15:06,31.0,0.0
2,1,Control,,28.0,46.312,0.0,Somewhat uncomfortable,≤ 2 hours ago,1-2 days a week,5-7 hours,55.0,3/7/25 15:52,33.0,12.0
3,0,Control,i,10.0,312.924,600.0,Somewhat comfortable,3-5 hours ago,3-5 days a week,5-7 hours,64.0,3/7/25 17:40,14.0,30.0
4,0,Experimental,,18.0,610.575,1050.0,Somewhat uncomfortable,≤ 2 hours ago,6-7 days a week,5-7 hours,67.0,3/8/25 00:01,37.0,3.0
5,1,Experimental,,32.0,612.827,650.0,Very uncomfortable,≤ 2 hours ago,3-5 days a week,2-4 hours,43.0,3/8/25 17:51,50.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,0,Control,,19.0,620.362,650.0,Very comfortable,≤ 2 hours ago,3-5 days a week,5-7 hours,58.0,3/13/25 14:09,38.0,27.0
104,8,Experimental,"i,i,u,i,u",40.0,78.401,0.0,Very uncomfortable,3-5 hours ago,0 days a week,2-4 hours,16.0,3/13/25 15:17,50.0,0.0
105,0,Experimental,i,20.0,535.578,650.0,Somewhat uncomfortable,≤ 2 hours ago,1-2 days a week,5-7 hours,61.0,3/13/25 15:31,26.0,21.0
106,1,Control,,18.0,22.556,0.0,Neither comfortable nor uncomfortable,3-5 hours ago,1-2 days a week,5-7 hours,58.0,3/13/25 16:57,28.0,36.0


In [None]:
def btwn_groups_plots(data, subplots, fig_size, cols=None, stat="count"):
    """Draw one histogram (with KDE overlay) per column, coloured by ``Group``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Group`` column and every column in ``cols``.
    subplots : sequence
        Its first two items are the grid's row and column counts; any further
        items are ignored.
    fig_size : sequence
        Figure size passed to ``plt.subplots``.
    cols : sequence of str, optional
        Columns to plot. Defaults to the module-level ``a_cols``.
    stat : str, optional
        Histogram statistic forwarded to ``sns.histplot``; it is also used as
        the y-axis label so the label always matches what is drawn.

    Raises
    ------
    ValueError
        If the grid has fewer axes than there are columns to plot.
    """
    if cols is None:
        cols = a_cols
    v, h = subplots[:2]
    fig, axes = plt.subplots(v, h, figsize=fig_size)
    label_size = 9

    # plt.subplots returns a bare Axes for a 1x1 grid and an array otherwise
    axes = [axes] if v * h == 1 else list(axes.flatten())
    if len(cols) > len(axes):
        raise ValueError(
            f"{v}x{h} grid has {len(axes)} axes but {len(cols)} columns were requested"
        )

    for ax, col in zip(axes, cols):
        sns.histplot(data=data, x=col, ax=ax, hue='Group', kde=True,
                     stat=stat, common_norm=True)

        ax.set_title(col, fontsize=label_size)
        ax.set_xlabel(col, fontsize=label_size)
        ax.set_ylabel(stat.capitalize(), fontsize=label_size)
        ax.tick_params(axis="both", labelsize=label_size)

    # Remove exactly the axes that were never drawn on, however many there are
    for unused in axes[len(cols):]:
        fig.delaxes(unused)
    plt.tight_layout()

# Two panels per row, with enough rows for every plotted column
btwn_groups_plots(data, [(len(a_cols) + 1) // 2, 2, 1], [15, 30])

In [None]:
# Per-group subsets of the analysis frame
group_labels = data['Group']
E_data = data.loc[group_labels == 'Experimental']
C_data = data.loc[group_labels == 'Control']

def within_groups_plots(data, subplots, fig_size, cols=None,
                        y_cols=('PlayerScore', 'Page Submit')):
    """Scatter each outcome column against each analysis column, coloured by ``Group``.

    One panel is drawn per (outcome, column) pair, outcomes in the order given
    by ``y_cols`` and columns in the order given by ``cols``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Group`` column plus every column in ``cols`` and
        ``y_cols``.
    subplots : sequence
        Its first two items are the grid's row and column counts; any further
        items are ignored.
    fig_size : sequence
        Figure size passed to ``plt.subplots``.
    cols : sequence of str, optional
        Columns for the x axis. Defaults to the module-level ``a_cols``.
    y_cols : sequence of str, optional
        Outcome columns for the y axis.

    Raises
    ------
    ValueError
        If the grid has fewer axes than there are panels to draw.
    """
    if cols is None:
        cols = a_cols
    v, h = subplots[:2]
    fig, axes = plt.subplots(v, h, figsize=fig_size)
    label_size = 9

    # plt.subplots returns a bare Axes for a 1x1 grid and an array otherwise
    axes = [axes] if v * h == 1 else list(axes.flatten())

    pairs = [(y_col, x_col) for y_col in y_cols for x_col in cols]
    if len(pairs) > len(axes):
        raise ValueError(
            f"{v}x{h} grid has {len(axes)} axes but {len(pairs)} panels were requested"
        )

    for ax, (y_col, x_col) in zip(axes, pairs):
        sns.scatterplot(data=data, x=x_col, y=y_col, hue="Group", ax=ax)
        ax.set_title(f"{y_col} vs. {x_col}", fontsize=label_size)
        ax.set_xlabel(x_col, fontsize=label_size)
        ax.set_ylabel(y_col, fontsize=label_size)
        ax.tick_params(axis="both", labelsize=label_size)
        ax.legend(fontsize=label_size)

    # Remove only the axes that were never drawn on; a fixed "last two" removal
    # can delete a populated panel or leave blank ones behind
    for unused in axes[len(pairs):]:
        fig.delaxes(unused)
    plt.tight_layout()

within_groups_plots(data, [5, 3, 1], [15, 30])

In [None]:
# Spearman rank correlations within each group. The frames still carry text
# columns (survey answers, timestamps, group label); corr() raises on those
# under pandas >= 2.0, so restrict to the numeric columns explicitly.
spearman_E = E_data.select_dtypes(include='number').corr(method='spearman')
spearman_C = C_data.select_dtypes(include='number').corr(method='spearman')




In [None]:
# Spearman correlation matrix for the Experimental group
spearman_E


In [None]:
# Spearman correlation matrix for the Control group
spearman_C

In [None]:
# Per-group means of the numeric measures. numeric_only=True keeps the text
# columns out of the aggregation (pandas >= 2.0 raises on them by default).
means = data.groupby(by="Group").mean(numeric_only=True)

def plot_means(data):
    """Plot the per-group mean of PlayerScore and Page Submit side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Group means with one row per group. ``Group`` may be the index (as
        produced by ``groupby``) or a regular column.
    """
    # Make `Group` addressable as a column regardless of how the means were built
    frame = data if "Group" in data.columns else data.reset_index()

    fig, axes = plt.subplots(1, 2, figsize=(20, 10))
    label_size = 9

    for ax, y_col in zip(axes, ['PlayerScore', 'Page Submit']):
        # Draw each outcome on its own axes, from the frame that was passed in
        sns.lineplot(data=frame, x="Group", y=y_col, ax=ax)
        ax.set_title(f"{y_col} Means", fontsize=label_size)
        ax.set_xlabel("Group", fontsize=label_size)
        ax.set_ylabel(y_col, fontsize=label_size)
        ax.tick_params(axis="both", labelsize=label_size)
    plt.tight_layout()

plot_means(means)


Demographic Information

In [None]:
# Gender
# The demographic answers live in `demographic_info`; they were removed from
# `data` when the frame was split.
labelsize = 9
gender_counts = demographic_info["Q_Gender"].value_counts()
fig, ax = plt.subplots()
ax.pie(gender_counts, labels=gender_counts.index, startangle=180, autopct='%1.1f%%', textprops={"fontsize": 10})
ax.set_title("Participant Gender", fontsize=labelsize);

In [None]:
# Age
# The demographic answers live in `demographic_info`; they were removed from
# `data` when the frame was split.
labelsize = 9
age_counts = demographic_info["Q_Age"].value_counts()
fig, ax = plt.subplots()
ax.pie(age_counts, labels=age_counts.index, startangle=180, autopct='%1.1f%%', textprops={"fontsize": 10})
ax.set_title("Participant Age", fontsize=labelsize);

In [None]:
# Ethnicity
# The demographic answers live in `demographic_info`; they were removed from
# `data` when the frame was split.
labelsize = 9
ethnicity_counts = demographic_info["Q_Ethn"].value_counts()
fig, ax = plt.subplots()
ax.pie(ethnicity_counts, labels=ethnicity_counts.index, startangle=180, autopct='%1.1f%%', textprops={"fontsize": 10})
ax.set_title("Participant Ethnicity", fontsize=labelsize);

In [None]:
# Income
# The demographic answers live in `demographic_info`; they were removed from
# `data` when the frame was split.
labelsize = 9
income_counts = demographic_info["Q_Income"].value_counts()
fig, ax = plt.subplots()
ax.pie(income_counts, labels=income_counts.index, startangle=180, autopct='%1.1f%%', textprops={"fontsize": 10})
ax.set_title("Participant Income", fontsize=labelsize);

In [None]:
# E vs C quitting time
# TODO: the comparison itself is not implemented yet; this cell only displays the frame
data