## Social desirability experiment: replicating Daoust findings - subgroup analysis

Experiment specifications: <br>
* Daoust experiment (variables: visit, over, outdoors):
  + Have you done? Yes/No/Unsure <br>
  + Some people XX have you done? Yes/Only when neccessary/No/Unsure <br><br>
  
* SD benchmark (variables: blame):
  + Individuals are more to blame? Agree/disagree <br>
  + Social conditions are more to blame? Agree/disagree <br>

In [18]:
import numpy as np
import pandas as pd

import utility as util

In [19]:
df_dst = pd.read_csv('../output/df_dst.csv')
df_benchmark = pd.read_csv('../output/df_benchmark.csv')

#### recode demographics

In [20]:
df_dst.columns

Index(['id', 'condition', 'visit', 'over', 'outdoors', 'sex', 'marital',
       'age_group', 'education'],
      dtype='object')

In [21]:
demographic_cols = ['sex', 'marital', 'age_group', 'education']

In [22]:
for col in demographic_cols:
    print(df_dst[col].value_counts(dropna=False).sort_index())
    print('>>>>>>>>>>')

1.0    2464
2.0    2084
NaN      80
Name: sex, dtype: int64
>>>>>>>>>>
1.0    3008
2.0     384
3.0     657
4.0      44
5.0     462
NaN      73
Name: marital, dtype: int64
>>>>>>>>>>
2.0      86
3.0     191
4.0     325
5.0     674
6.0    1335
7.0    1948
NaN      69
Name: age_group, dtype: int64
>>>>>>>>>>
1.0      51
2.0     756
3.0    1729
4.0    1245
5.0     788
NaN      59
Name: education, dtype: int64
>>>>>>>>>>


In [23]:
sex_map = {1: '1. male',
           2: '2. female'}

marital_map = {1: '1. married',
               2: '2. unmarried',
               3: '2. unmarried',
               4: '2. unmarried',
               5: '2. unmarried'}

age_g_map = {2: '18-34',
             3: '18-34',
             4: '35-54', 
             5: '35-54',
             6: '55+',
             7: '55+'}

education_map ={1: '1. Less than college',
                2: '1. Less than college',
                3: '1. Less than college',
                4: '2. College+',
                5: '2. College+'}

In [24]:
df_dst['sex_r'] = df_dst['sex'].map(sex_map)
df_dst['marital_r'] = df_dst['marital'].map(marital_map)
df_dst['age_group_r'] = df_dst['age_group'].map(age_g_map)
df_dst['education_r'] = df_dst['education'].map(education_map)

df_benchmark['sex_r'] = df_benchmark['sex'].map(sex_map)
df_benchmark['marital_r'] = df_benchmark['marital'].map(marital_map)
df_benchmark['age_group_r'] = df_benchmark['age_group'].map(age_g_map)
df_benchmark['education_r'] = df_benchmark['education'].map(education_map)

In [25]:
recoded_demo_cols = ['sex_r', 'marital_r', 'age_group_r', 'education_r']

In [26]:
for col in recoded_demo_cols:
    print(df_dst[col].value_counts(dropna=False).sort_index())
    print('>>>>>>>>>>')

1. male      2464
2. female    2084
NaN            80
Name: sex_r, dtype: int64
>>>>>>>>>>
1. married      3008
2. unmarried    1547
NaN               73
Name: marital_r, dtype: int64
>>>>>>>>>>
18-34     277
35-54     999
55+      3283
NaN        69
Name: age_group_r, dtype: int64
>>>>>>>>>>
1. Less than college    2536
2. College+             2033
NaN                       59
Name: education_r, dtype: int64
>>>>>>>>>>


In [27]:
df_dst.groupby('sex_r').mean()

Unnamed: 0_level_0,id,visit,over,outdoors,sex,marital,age_group,education
sex_r,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1. male,2288.085633,3.337662,3.012175,2.727679,1.0,1.804967,5.891738,3.381515
2. female,2343.664107,4.177063,3.757198,3.775912,2.0,1.809065,5.9889,3.485549


In [28]:
SD_cols = ['visit', 'over', 'outdoors']

for col in SD_cols:
    df_dst[col] = df_dst[[col]].replace([1, 2, 9], ["1. Yes", "2. Only when necessary/occasionally", "9. No"])

In [29]:
# combine top two response options into "yes"
combine_top_two = {"1. Yes": "1. Yes", 
                   "2. Only when necessary/occasionally": "1. Yes", 
                   "9. No": "9. No"}

In [30]:
for col in SD_cols:
    df_dst[f'{col}_r'] = df_dst[col].map(combine_top_two)

In [31]:
recoded_cols = ['visit_r', 'over_r', 'outdoors_r']

In [32]:
woman_df = df_dst.loc[df_dst['sex'] == 1]
man_df = df_dst.loc[df_dst['sex'] == 2]

In [33]:
woman_df.head()

Unnamed: 0,id,condition,visit,over,outdoors,sex,marital,age_group,education,sex_r,marital_r,age_group_r,education_r,visit_r,over_r,outdoors_r
0,1,B,9. No,9. No,1. Yes,1.0,3.0,7.0,5.0,1. male,2. unmarried,55+,2. College+,9. No,9. No,1. Yes
1,2,A,9. No,1. Yes,9. No,1.0,1.0,7.0,5.0,1. male,1. married,55+,2. College+,9. No,1. Yes,9. No
2,3,A,9. No,9. No,9. No,1.0,1.0,7.0,3.0,1. male,1. married,55+,1. Less than college,9. No,9. No,9. No
3,4,A,9. No,9. No,1. Yes,1.0,3.0,7.0,3.0,1. male,2. unmarried,55+,1. Less than college,9. No,9. No,1. Yes
4,5,B,9. No,9. No,9. No,1.0,1.0,7.0,5.0,1. male,1. married,55+,2. College+,9. No,9. No,9. No


In [34]:
for col in recoded_cols:
    util.crosstab_chisq(col, 'condition', woman_df)

condition,A,B
visit_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,65.7,76.9
9. No,34.3,23.1
Total n,1224.0,1240.0


*Chi-squared statistic = 37.0, degree of freedom = 1, p = 0.0*

-----

condition,A,B
over_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,68.4,82.4
9. No,31.6,17.6
Total n,1224.0,1240.0


*Chi-squared statistic = 64.8, degree of freedom = 1, p = 0.0*

-----

condition,A,B
outdoors_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,74.3,83.5
9. No,25.7,16.5
Total n,1224.0,1240.0


*Chi-squared statistic = 30.3, degree of freedom = 1, p = 0.0*

-----

In [35]:
for col in recoded_cols:
    util.crosstab_chisq(col, 'condition', man_df)

condition,A,B
visit_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,56.5,65.4
9. No,43.5,34.6
Total n,1045.0,1039.0


*Chi-squared statistic = 17.3, degree of freedom = 1, p = 0.0*

-----

condition,A,B
over_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,59.5,73.1
9. No,40.5,26.9
Total n,1045.0,1039.0


*Chi-squared statistic = 42.1, degree of freedom = 1, p = 0.0*

-----

condition,A,B
outdoors_r,Unnamed: 1_level_1,Unnamed: 2_level_1
1. Yes,60.9,71.0
9. No,39.1,29.0
Total n,1045.0,1039.0


*Chi-squared statistic = 23.5, degree of freedom = 1, p = 0.0*

-----