## Prepare data for the extension study

In [30]:
import numpy as np
import pandas as pd
import utility as util

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the experiment data frame from the shared output directory.
ext_df = pd.read_csv(filepath_or_buffer='../output/SD_experiment_df.csv')

In [7]:
# Renumber the rows from 0 and attach a 1-based "id" column in one chained step.
ext_df = (
    ext_df
    .reset_index(drop=True)
    .assign(id=lambda frame: range(1, len(frame) + 1))
)

In [9]:
# Keep only the columns used below: identifier, treatment condition,
# the four experiment items, and the four demographic variables.
ext_df = ext_df[[
    'id', 'condition',
    # experiment items
    'gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside',
    # demographics
    'gender', 'marital', 'age_group', 'education',
]]

In [12]:
# Keep only respondents with a value for every demographic variable.
democols = ['gender', 'marital', 'age_group', 'education']

# A row survives only if none of its demographic cells is missing.
ext_df = ext_df[ext_df[democols].notna().all(axis=1)]

In [18]:
# Exclude respondents whose marital status is coded 8 (reported "unsure").
ext_df = ext_df[ext_df['marital'].ne(8)]

In [28]:
# Display (n_rows, n_columns) of the current frame.
# NOTE(review): the recorded output lists 14 columns although only 10 are selected
# earlier in the notebook, so it was apparently captured after the four `_r` recode
# columns had been added (out-of-order execution) -- re-run top to bottom to confirm.
ext_df.shape

(627, 14)

### 1) checking for randomization of treatment conditions

In [29]:
# Frequency table (missing values included) for each raw demographic variable.
democols = ['gender', 'marital', 'age_group', 'education']

for col in democols:
    # One print call per variable: the sorted counts followed by a separator line.
    print(ext_df[col].value_counts(dropna=False).sort_index(),
          "................",
          sep="\n")

1.0    369
2.0    258
Name: gender, dtype: int64
................
1.0    465
7.0    162
Name: marital, dtype: int64
................
2.0     17
3.0     42
4.0     63
5.0    106
6.0    212
7.0    187
Name: age_group, dtype: int64
................
4.0      6
5.0     88
6.0    247
7.0    162
8.0    124
Name: education, dtype: int64
................


In [24]:
# Recode the demographic variables for consistency.
# Each mapping sends a raw code to its recoded value; codes that are not listed
# become NaN. The cast keeps the recoded columns as float64.

# gender_r: 1 = man, 2 = woman
ext_df['gender_r'] = ext_df['gender'].map({1: 2, 2: 1}).astype('float64')

# marital_r: 1 = married, 2 = not married
ext_df['marital_r'] = ext_df['marital'].map({1: 1, 7: 2}).astype('float64')

# age_group_r: 1 = 19-55, 2 = 56-65, 3 = 66+
ext_df['age_group_r'] = (
    ext_df['age_group']
    .map({2: 1, 3: 1, 4: 1, 5: 1, 6: 2, 7: 3})
    .astype('float64')
)

# education_r: 1 = less than college, 2 = college or more
ext_df['education_r'] = (
    ext_df['education']
    .map({4: 1, 5: 1, 6: 1, 7: 2, 8: 2})
    .astype('float64')
)

In [26]:
# Sanity check: frequency table (missing values included) for each recoded variable.
democols_r = ['gender_r', 'marital_r', 'age_group_r', 'education_r']

for col in democols_r:
    # One print call per variable: the sorted counts followed by a separator line.
    print(ext_df[col].value_counts(dropna=False).sort_index(),
          "................",
          sep="\n")

1.0    258
2.0    369
Name: gender_r, dtype: int64
................
1.0    465
2.0    162
Name: marital_r, dtype: int64
................
1.0    228
2.0    212
3.0    187
Name: age_group_r, dtype: int64
................
1.0    341
2.0    286
Name: education_r, dtype: int64
................


In [31]:
# Randomization check: cross-tabulate each recoded demographic against the
# treatment condition.
# NOTE(review): util.crosstab_chisq is defined outside this notebook; judging by the
# output below it shows per-condition percentages with a "Total n" row and, with
# chisqtest=True, a chi-squared statistic, degrees of freedom and p-value -- confirm
# against the helper's source.
for col in democols_r:
    util.crosstab_chisq(col, 'condition', ext_df, chisqtest=True)

condition,A,B,C,Control
gender_r,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,44.8,42.7,41.0,36.2
2.0,55.2,57.3,59.0,63.7
Total n,154.0,157.0,156.0,160.0


*Chi-squared statistic = 2.6, degree of freedom = 3, p = 0.46*

-----

condition,A,B,C,Control
marital_r,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,72.7,77.7,78.8,67.5
2.0,27.3,22.3,21.2,32.5
Total n,154.0,157.0,156.0,160.0


*Chi-squared statistic = 6.7, degree of freedom = 3, p = 0.083*

-----

condition,A,B,C,Control
age_group_r,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,34.4,42.0,37.2,31.9
2.0,32.5,31.2,31.4,40.0
3.0,33.1,26.8,31.4,28.1
Total n,154.0,157.0,156.0,160.0


*Chi-squared statistic = 6.3, degree of freedom = 6, p = 0.392*

-----

condition,A,B,C,Control
education_r,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,54.5,49.0,57.7,56.2
2.0,45.5,51.0,42.3,43.8
Total n,154.0,157.0,156.0,160.0


*Chi-squared statistic = 2.7, degree of freedom = 3, p = 0.437*

-----

### 2) experimental data analysis

In [33]:
# Look for missing values in the experiment items (counts include NaN).
expcols = ['gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside']

for col in expcols:
    # One print call per item: the sorted counts followed by a separator line.
    print(ext_df[col].value_counts(dropna=False).sort_index(),
          "................",
          sep="\n")

1    327
2     59
3    238
4      3
Name: gone_to_friend, dtype: int64
................
1    309
2     56
3    258
4      4
Name: had_visitors, dtype: int64
................
1    384
2     99
3    132
4     12
Name: had_close_contact, dtype: int64
................
1    498
2     27
3     95
4      7
Name: gone_outside, dtype: int64
................


In [34]:
# Swap the numeric response codes for display labels (same labels for every item).
# Values other than 1-4 are left untouched by replace().
for col in expcols:
    ext_df[col] = ext_df[col].replace({1: "1a. Yes/Yes for any reason",
                                       2: "1b. Yes but only when neccessary",
                                       3: "2. No",
                                       4: "3. Unsure"})

In [35]:
# Raw (un-collapsed) response distributions of each experiment item by condition.
# NOTE(review): util.crosstab_chisq is defined outside this notebook; judging by the
# output below it shows per-condition percentages with a "Total n" row, and
# chisqtest=False appears to suppress the chi-squared summary -- confirm against the
# helper's source.
for col in expcols:
    util.crosstab_chisq(col, 'condition', ext_df, chisqtest=False)

condition,A,B,C,Control
gone_to_friend,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1a. Yes/Yes for any reason,57.8,49.0,40.4,61.3
1b. Yes but only when neccessary,0.0,15.9,21.8,0.0
2. No,40.9,35.0,37.2,38.8
3. Unsure,1.3,0.0,0.6,0.0
Total n,154.0,157.0,156.0,160.0


-----

condition,A,B,C,Control
had_visitors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1a. Yes/Yes for any reason,56.5,45.2,44.2,51.2
1b. Yes but only when neccessary,0.0,15.9,19.9,0.0
2. No,42.9,38.2,34.6,48.8
3. Unsure,0.6,0.6,1.3,0.0
Total n,154.0,157.0,156.0,160.0


-----

condition,A,B,C,Control
had_close_contact,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1a. Yes/Yes for any reason,79.9,49.0,44.2,71.9
1b. Yes but only when neccessary,0.0,29.9,33.3,0.0
2. No,19.5,20.4,19.9,24.4
3. Unsure,0.6,0.6,2.6,3.8
Total n,154.0,157.0,156.0,160.0


-----

condition,A,B,C,Control
gone_outside,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1a. Yes/Yes for any reason,85.7,77.7,73.7,80.6
1b. Yes but only when neccessary,0.0,8.3,9.0,0.0
2. No,13.6,14.0,16.0,16.9
3. Unsure,0.6,0.0,1.3,2.5
Total n,154.0,157.0,156.0,160.0


-----

In [36]:
# Collapse the labelled responses into a binary outcome:
# "1a", "1b" and "3. Unsure" are combined as "1. Yes"; "2. No" stays as it is.
#
# The keys must match the labels assigned in the numeric -> character cell exactly.
# Series.map returns NaN for any value without a matching key, so the previous
# "8. Unsure" key (a label that never occurs in these columns) silently turned every
# "3. Unsure" respondent into a missing value instead of "1. Yes".
combine_three = {"1a. Yes/Yes for any reason": "1. Yes",
                 "1b. Yes but only when neccessary": "1. Yes",
                 "2. No": "2. No",
                 "3. Unsure": "1. Yes"}

for col in expcols:
    ext_df[f'{col}_r'] = ext_df[col].map(combine_three)

# Names of the collapsed columns, derived from expcols so the two lists cannot drift apart.
recoded_cols = [f'{col}_r' for col in expcols]  # use recoded_cols for further analyses