## Analysis for the manuscript: data analysis

This analysis is based on the replication data for the Daoust experiment, fielded in 2020. It covers: <br>
1) checking for randomization of treatment conditions <br>
2) experimental data analysis <br>
3) three-way interactions

In [7]:
import numpy as np
import pandas as pd
import utility as util

In [3]:
# Load the replication data set into a DataFrame (relative path).
df_rep = pd.read_csv('../output/df_rep.csv')

In [4]:
# (rows, columns) of the loaded data
df_rep.shape

(4545, 9)

In [5]:
# Column names available in the loaded data
df_rep.columns

Index(['id', 'condition', 'visit', 'over', 'outdoors', 'sex', 'marital',
       'age_group', 'education'],
      dtype='object')

### 1) checking for randomization of treatment conditions

In [10]:
# Demographic variables whose distributions are compared across conditions.
democols = [
    'sex',
    'marital',
    'age_group',
    'education',
]

# One cross-tab per demographic variable; the chi-squared test is switched off.
for demo in democols:
    util.crosstab_chisq(demo, 'condition', df_rep, chisqtest=False)

condition,A,B
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,54.1,54.4
2.0,45.9,45.6
Total n,2269.0,2276.0


-----

condition,A,B
marital,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,65.1,66.7
2.0,9.2,7.6
3.0,14.1,14.9
4.0,1.0,1.0
5.0,10.6,9.9
Total n,2269.0,2276.0


-----

condition,A,B
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
2.0,2.5,1.4
3.0,4.8,3.6
4.0,5.9,8.4
5.0,14.5,15.1
6.0,29.0,29.4
7.0,43.3,42.2
Total n,2269.0,2276.0


-----

condition,A,B
education,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.8,1.5
2.0,16.7,16.6
3.0,38.6,37.2
4.0,26.4,27.9
5.0,17.5,16.8
Total n,2269.0,2276.0


-----

### 2) experimental data analysis

In [12]:
# Replace the numeric response codes in the outcome columns with labelled
# strings so that the cross-tabs are readable.
SD_cols = ['visit', 'over', 'outdoors']

# Explicit code -> label pairs: keeping each code next to its label avoids the
# silent misalignment that two parallel lists invite.
response_labels = {
    1: "1a. Yes",
    2: "2. No",
    8: "8. Unsure",
    11: "1b. Only when necessary/occasionally",
}

for col in SD_cols:
    # Operate on the Series (single brackets) rather than a one-column
    # DataFrame; values not listed in the mapping are left unchanged.
    df_rep[col] = df_rep[col].replace(response_labels)

In [13]:
# Unrecoded response distributions by condition (chi-squared test switched off)
for outcome in SD_cols:
    util.crosstab_chisq(outcome, 'condition', df_rep, chisqtest=False)

condition,A,B
visit,Unnamed: 1_level_1,Unnamed: 2_level_1
1a. Yes,61.1,62.0
1b. Only when necessary/occasionally,0.0,9.5
2. No,38.6,28.3
8. Unsure,0.3,0.2
Total n,2269.0,2276.0


-----

condition,A,B
over,Unnamed: 1_level_1,Unnamed: 2_level_1
1a. Yes,63.9,67.0
1b. Only when necessary/occasionally,0.0,10.6
2. No,35.6,21.7
8. Unsure,0.5,0.6
Total n,2269.0,2276.0


-----

condition,A,B
outdoors,Unnamed: 1_level_1,Unnamed: 2_level_1
1a. Yes,67.7,68.1
1b. Only when necessary/occasionally,0.0,9.3
2. No,31.9,22.1
8. Unsure,0.5,0.4
Total n,2269.0,2276.0


-----

In [28]:
# Collapse the four response labels into two: 1a, 1b, and 8 all count as
# "1. Yes"; only an explicit "No" stays "2. No".
combine_three = dict.fromkeys(
    ["1a. Yes", "1b. Only when necessary/occasionally", "8. Unsure"],
    "1. Yes",
)
combine_three["2. No"] = "2. No"

# Store each recode in a new '<column>_r' column, leaving the labelled
# original column in place.
for outcome in SD_cols:
    df_rep[f'{outcome}_r'] = df_rep[outcome].map(combine_three)

# use recoded_cols for further analyses
recoded_cols = [f'{outcome}_r' for outcome in SD_cols]

In [30]:
# Cross-tabulate each recoded outcome against condition, this time with the
# chi-squared test switched on.
for outcome_r in recoded_cols:
    util.crosstab_chisq(outcome_r, 'condition', df_rep, chisqtest=True)

condition,A,B
visit_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,61.4,71.7
2. No,38.6,28.3
Total n,2269.0,2276.0


*Chi-squared statistic = 53.3, degree of freedom = 1, p = 0.0*

-----

condition,A,B
over_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,64.4,78.3
2. No,35.6,21.7
Total n,2269.0,2276.0


*Chi-squared statistic = 105.5, degree of freedom = 1, p = 0.0*

-----

condition,A,B
outdoors_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,68.1,77.9
2. No,31.9,22.1
Total n,2269.0,2276.0


*Chi-squared statistic = 54.0, degree of freedom = 1, p = 0.0*

-----