## Analysis for the manuscript: prepare data

This analysis is based on the replication data of the Daoust experiment, fielded in 2020. <br>
1) checking for randomization of treatment conditions <br>
2) experimental data analysis <br>
3) three-way interactions

In [1]:
import numpy as np
import pandas as pd

import warnings
# NOTE(review): with no category or module argument this silences every warning for
# the rest of the session, pandas warnings included — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# load the replication experiment data; the path is relative to the working directory
rep_exp = pd.read_csv('../input/rep_exp.csv')

In [3]:
# columns: keep only the analysis variables and give them readable names
# (source column -> analysis name; dict order is the resulting column order)
source_to_analysis_name = {
    'Finished': 'Finished',
    'Q1.3': 'visit_A',
    'Q1.4': 'over_A',
    'Q1.5': 'outdoors_A',
    'Q2.3': 'visit_B',
    'Q2.4': 'over_B',
    'Q2.5': 'outdoors_B',
    'Q3.1': 'blame_individual',
    'Q3.2': 'blame_social',
    'Q5.1': 'sex',
    'Q5.2': 'marital',
    'Q5.3': 'age_group',
    'Q5.4': 'education',
}

rep_exp = rep_exp[list(source_to_analysis_name)].rename(columns=source_to_analysis_name)

In [4]:
# rows: remove breakoffs, i.e. keep only the rows where Finished equals 1
is_finished = rep_exp['Finished'].eq(1)
rep_exp = rep_exp[is_finished]

In [5]:
# sanity check: (rows, columns) of rep_exp at this point
rep_exp.shape

(4686, 13)

In [6]:
# rows: tabulate every demographic variable, counting NaN as its own category
# at analysis stage: consider whether to 1) remove missing demographics, 2) recode demographics
demographic_cols = ['sex', 'marital', 'age_group', 'education']
separator = "................"

for demographic in demographic_cols:
    frequencies = rep_exp[demographic].value_counts(dropna=False)
    print(frequencies.sort_index())
    print(separator)

1.0    2488
2.0    2109
NaN      89
Name: sex, dtype: int64
................
1.0    3029
2.0     391
3.0     665
4.0      44
5.0     476
NaN      81
Name: marital, dtype: int64
................
1.0      10
2.0      89
3.0     195
4.0     329
5.0     680
6.0    1342
7.0    1966
NaN      75
Name: age_group, dtype: int64
................
1.0      55
2.0     767
3.0    1746
4.0    1253
5.0     800
NaN      65
Name: education, dtype: int64
................


In [7]:
# rows: remove those that are age<18, i.e. rows whose age_group code equals 1
# (rows with a missing age_group are not matched by this filter and stay in)
in_youngest_group = rep_exp['age_group'].eq(1)
rep_exp = rep_exp[~in_youngest_group]

In [8]:
# rows: keep only respondents with a value in every demographic column
has_all_demographics = rep_exp[demographic_cols].notna().all(axis=1)
rep_exp = rep_exp[has_all_demographics]

In [9]:
# sanity check: (rows, columns) of rep_exp at this point
rep_exp.shape

(4545, 13)

In [10]:
# renumber the rows from 0, then append a 1-based identifier column "id"
rep_exp = rep_exp.reset_index(drop=True)
rep_exp = rep_exp.assign(id=range(1, len(rep_exp) + 1))

In [11]:
# extract the replication dataset: identifier, both sets of outcome items, demographics
# .copy() makes df_rep an independent frame: columns are assigned on df_rep afterwards,
# and assigning into a column selection of rep_exp is what pandas flags with
# SettingWithCopyWarning.
df_rep = rep_exp[[
    'id',
    'visit_A', 'over_A', 'outdoors_A',
    'visit_B', 'over_B', 'outdoors_B',
    'sex', 'marital', 'age_group', 'education',
]].copy()

In [12]:
# recoding for consistency: after this cell the A and B items share the codes
# 2 = no, 8 = unsure, 11 = only when necessary (B items only)
A_cols = ['visit_A', 'over_A', 'outdoors_A']
B_cols = ['visit_B', 'over_B', 'outdoors_B']

# A items: 3 -> 8
df_rep[A_cols] = df_rep[A_cols].replace(3, 8)  # unsure=8

# B items: the steps run in sequence, and the order matters — code 2 has to be
# moved to 11 before code 3 is moved onto 2, otherwise the two would be merged.
df_rep[B_cols] = (
    df_rep[B_cols]
    .replace(4, 8)    # unsure=8
    .replace(2, 11)   # only when necessary=11
    .replace(3, 2)    # no=2
)

In [13]:
# assign conditions: a row is labelled by the first of visit_A / visit_B that is
# not missing; rows where both are missing get the string 'NAN'
grouping = np.array(['A', 'B'])

# Vectorised instead of iterating row by row: np.select evaluates the conditions in
# order, so 'A' wins when both items are answered — the same "first non-missing
# column" rule, without building a Series per row or relying on IndexError.
answered = [
    df_rep['visit_A'].notna().to_numpy(),
    df_rep['visit_B'].notna().to_numpy(),
]

# .tolist() keeps row_groupings a plain list with one label per row of df_rep
row_groupings = np.select(answered, list(grouping), default='NAN').tolist()

In [14]:
# attach the per-row condition labels held in row_groupings as a new column
df_rep['condition'] = row_groupings

In [15]:
# number of rows per condition label (NaN would be listed too), sorted by label
df_rep['condition'].value_counts(dropna=False).sort_index()

A    2269
B    2276
Name: condition, dtype: int64

In [16]:
# preview the first rows to eyeball the condition column against the A/B items
df_rep.head()

Unnamed: 0,id,visit_A,over_A,outdoors_A,visit_B,over_B,outdoors_B,sex,marital,age_group,education,condition
0,1,,,,2.0,2.0,1.0,1.0,3.0,7.0,5.0,B
1,2,2.0,1.0,2.0,,,,1.0,1.0,7.0,5.0,A
2,3,2.0,2.0,2.0,,,,1.0,1.0,7.0,3.0,A
3,4,2.0,2.0,1.0,,,,1.0,3.0,7.0,3.0,A
4,5,,,,2.0,2.0,2.0,1.0,1.0,7.0,5.0,B


In [17]:
# combine two sets of variables into one: for each outcome, missing values count
# as 0 and the A and B items are summed row-wise, then stored as integers
for outcome in ('visit', 'over', 'outdoors'):
    item_pair = df_rep[[f'{outcome}_A', f'{outcome}_B']]
    combined = item_pair.fillna(0).sum(axis=1)
    df_rep[outcome] = combined.astype(int)

In [18]:
# keep the identifier, condition, combined outcomes and demographics, in this order
df_rep = df_rep.loc[:, [
    'id', 'condition',
    'visit', 'over', 'outdoors',
    'sex', 'marital', 'age_group', 'education',
]]

In [19]:
# preview the first rows of the reduced frame
df_rep.head()

Unnamed: 0,id,condition,visit,over,outdoors,sex,marital,age_group,education
0,1,B,2,2,1,1.0,3.0,7.0,5.0
1,2,A,2,1,2,1.0,1.0,7.0,5.0
2,3,A,2,2,2,1.0,1.0,7.0,3.0
3,4,A,2,2,1,1.0,3.0,7.0,3.0
4,5,B,2,2,2,1.0,1.0,7.0,5.0


In [20]:
# sanity check: (rows, columns) of df_rep at this point
df_rep.shape

(4545, 9)

In [21]:
# rows: tabulate every combined experimental variable, counting NaN as its own category
experiment_cols = ['visit', 'over', 'outdoors']
separator = "................"

for outcome_col in experiment_cols:
    frequencies = df_rep[outcome_col].value_counts(dropna=False)
    print(frequencies.sort_index())
    print(separator)

1     2797
2     1521
8       11
11     216
Name: visit, dtype: int64
................
1     2977
2     1302
8       25
11     241
Name: over, dtype: int64
................
1     3086
2     1227
8       20
11     212
Name: outdoors, dtype: int64
................


In [22]:
# save the prepared dataset; index=False leaves the pandas row index out of the file
df_rep.to_csv('../output/df_rep.csv', index=False)