## Social desirability experiment data analysis: checking image effect - control

In [42]:
import numpy as np
import pandas as pd
import utility as util

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw responses. The path is relative, so it is resolved against the
# kernel's working directory.
week3and4control = pd.read_csv('../input/34control.csv')

In [4]:
# manage columns
# Columns retained from the raw export: the response id and completion flag,
# Q5/Q6, the four experimental items for each of the four wording conditions
# (Control, A, B, C), and the Q9-Q12 questions.
id_and_attitude_cols = ['ResponseId', 'Finished', 'Q5', 'Q6']
condition_prefixes = ['Control', 'A', 'B', 'C']
experiment_item_cols = [
    f'{prefix}{item}' for prefix in condition_prefixes for item in range(1, 5)
]
demographic_cols = ['Q9', 'Q9_6_TEXT', 'Q10', 'Q11', 'Q12']

keep_columns = id_and_attitude_cols + experiment_item_cols + demographic_cols

In [5]:
# Restrict the raw frame to the columns listed in keep_columns (all rows kept).
week3and4control = week3and4control.loc[:, keep_columns]

In [6]:
# Sanity check: (rows, columns) after restricting to keep_columns
week3and4control.shape

(484, 25)

In [8]:
# drop breakoffs: retain only respondents whose Finished flag equals 1
is_finished = week3and4control['Finished'].eq(1)
completes = week3and4control[is_finished]

In [9]:
# Gender filter: drop respondents whose Q9 code is 5 or 6. Every other code,
# including a missing answer, is retained.
excluded_gender = completes['Q9'].isin([5, 6])
df = completes[~excluded_gender]

In [10]:
# Exclude respondents whose age-group code (Q11) is 1, i.e. those under 18.
# .copy() makes `df` an independent frame: the recoding cells below assign
# columns on it, and assigning on a filtered slice of `completes` is the
# pattern that triggers pandas' SettingWithCopyWarning.
df = df.loc[~df['Q11'].isin([1])].copy()

In [15]:
# Column groups that share a recoding scheme (the schemes themselves are
# applied in the loops that follow).
recode_set2_cols = [f'Control{i}' for i in (2, 3, 4)] + [f'A{i}' for i in (1, 2, 3, 4)]
recode_set3_cols = [f'B{i}' for i in (1, 2, 3, 4)] + ['C2', 'C4']

In [16]:
# Columns with their own raw coding, expressed as (raw codes, harmonised codes).
# Code 1 is left untouched in both columns.
# NOTE(review): in the crosstab output at the end of this notebook the Control
# column of gone_to_friend is 100% yes with no "No" answers, and Control keeps
# only 38 of its 83 respondents after the "Unsure" filter. That pattern would
# arise if Control1 were coded 1/4/5 in this file rather than 1/2/3 - confirm
# the raw coding of Control1 before relying on those figures.
single_column_recodes = {
    'Control1': ([2, 3], [3, 4]),
    'C3': ([7, 8, 9], [2, 3, 4]),
}
for column_name, (raw_codes, new_codes) in single_column_recodes.items():
    df[column_name] = df[column_name].replace(raw_codes, new_codes)

In [17]:
# Harmonise the set-2 columns: raw code 4 becomes 3 and raw code 5 becomes 4;
# all other values are left as they are.
set2_raw_codes, set2_new_codes = [4, 5], [3, 4]
for col in recode_set2_cols:
    df[col] = df[col].replace(set2_raw_codes, set2_new_codes)

In [18]:
# Harmonise the set-3 columns: raw codes 4, 5, 6 become 2, 3, 4 respectively;
# all other values are left as they are.
set3_raw_codes, set3_new_codes = [4, 5, 6], [2, 3, 4]
for col in recode_set3_cols:
    df[col] = df[col].replace(set3_raw_codes, set3_new_codes)

In [19]:
# Derive each respondent's experimental condition from whichever first-item
# column they answered. Columns are checked in the order Control1, A1, B1, C1
# and the label of the first non-missing one is used; respondents with all
# four missing get the placeholder string 'NAN' (filtered out further below).
grouping = np.array(['Control', 'A', 'B', 'C'])
first_item_cols = ['Control1', 'A1', 'B1', 'C1']

# Boolean matrix (rows = respondents, columns = conditions): True where answered.
answered = df[first_item_cols].notna().values
# argmax gives the position of the first True in each row. It also returns 0
# for an all-False row, so those rows are overridden with 'NAN' explicitly.
first_answered = answered.argmax(axis=1)
row_groupings = np.where(answered.any(axis=1), grouping[first_answered], 'NAN').tolist()

In [20]:
# Attach the per-respondent condition labels built in the previous cell
# (row_groupings has one entry per row of df, in row order).
df['condition'] = row_groupings

In [21]:
# Respondents per condition; 'NAN' counts rows with no answer in any of the
# first-item columns.
df['condition'].value_counts(dropna=False).sort_index()

A          74
B          83
C          77
Control    83
NAN         3
Name: condition, dtype: int64

In [22]:
# Collapse the per-condition item columns into one column per behaviour.
# Missing cells are filled with 0 before the row sum, so a respondent who
# answered none of the four source columns for an item ends up with code 0.
item_sources = {
    'gone_to_friend': ['Control1', 'A1', 'B1', 'C1'],
    'had_visitors': ['Control2', 'A2', 'B2', 'C2'],
    'had_close_contact': ['Control3', 'A3', 'B3', 'C3'],
    'gone_outside': ['Control4', 'A4', 'B4', 'C4'],
}
for item_name, source_cols in item_sources.items():
    df[item_name] = df[source_cols].fillna(0).sum(axis=1).astype(int)

In [23]:
# exclude those with complete missing data in experimental variables
# (rows labelled 'NAN' when the condition was derived); the index is renumbered.
has_condition = df['condition'].ne('NAN')
df = df[has_condition].reset_index(drop=True)

In [24]:
# List the columns currently on df before trimming to the analysis set
df.columns

Index(['ResponseId', 'Finished', 'Q5', 'Q6', 'Control1', 'Control2',
       'Control3', 'Control4', 'A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4',
       'C1', 'C2', 'C3', 'C4', 'Q9', 'Q9_6_TEXT', 'Q10', 'Q11', 'Q12',
       'condition', 'gone_to_friend', 'had_visitors', 'had_close_contact',
       'gone_outside'],
      dtype='object')

In [26]:
# Keep the response id, Q5/Q6, the Q9-Q12 questions, the four combined items
# and the condition label; the per-condition item columns are dropped here.
analysis_columns = [
    'ResponseId', 'Q5', 'Q6', 'Q9', 'Q10', 'Q11', 'Q12',
    'gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside',
    'condition',
]
df = df.loc[:, analysis_columns]

In [27]:
# Give the survey question codes readable names; the derived columns keep theirs.
readable_names = {
    'ResponseId': 'ID',
    'Q5': 'vaccine',
    'Q6': 'mandate',
    'Q9': 'gender',
    'Q10': 'marital',
    'Q11': 'age_group',
    'Q12': 'education',
}
df = df.rename(columns=readable_names)

In [28]:
# Persist the analysis frame without the index. At this point the four
# behaviour items still hold numeric codes; the label recoding below is
# applied only after this export.
df.to_csv('../output/SD_experiment_df_34control.csv', index=False)

In [29]:
# numeric -> character display
# Codes 1-4 in the four behaviour items are swapped for their display labels;
# any other value is left as it is.
display_change_cols = ['gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside']
response_codes = [1, 2, 3, 4]
response_labels = ["1. Yes/Yes any time", "2. Yes only when neccessary", "3. No", "4. Unsure"]

for col in display_change_cols:
    df[col] = df[col].replace(response_codes, response_labels)

In [30]:
# combine top two response options into "yes"
# Both "yes" variants collapse to "1. Yes"; "No" and "Unsure" are renumbered.
yes_labels = ["1. Yes/Yes any time", "2. Yes only when neccessary"]
combine_top_two = {label: "1. Yes" for label in yes_labels}
combine_top_two.update({"3. No": "2. No", "4. Unsure": "3. Unsure"})

In [31]:
# create recodes that reflect the combinations
# Each behaviour item gets a companion "<item>_r" column holding the collapsed
# label; values without an entry in combine_top_two become NaN.
SD_cols = ['gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside']

collapsed = {f'{name}_r': df[name].map(combine_top_two) for name in SD_cols}
df = df.assign(**collapsed)

In [32]:
# Names of the collapsed ("_r") versions of the four behaviour items
recoded_cols = [
    f'{item}_r'
    for item in ('gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside')
]

In [33]:
# create two separate factor variables
# The four conditions are split along two factors: whether an excuse
# statement was shown, and how many response options were offered.
excuse_map = dict(Control='no_excuse', B='no_excuse', A='with_excuse', C='with_excuse')

response_map = dict(Control='two_response', A='two_response',
                    B='three_response', C='three_response')

In [34]:
# Derive the two factor columns from the condition label
df = df.assign(
    excuse_statement_condition=df['condition'].map(excuse_map),
    response_set_condition=df['condition'].map(response_map),
)

In [35]:
# use variable label for table display
# Each map pairs raw numeric survey codes with display labels. Missing
# answers stay missing, and marital code 8 is also mapped to missing.
gender_map = dict(zip(
    [1.0, 2.0, np.nan],
    ['1. Woman', '2. Man', np.nan],
))

marital_map = dict(zip(
    [1.0, 7.0, 8.0, np.nan],
    ['1. Married', '2. Not married', np.nan, np.nan],
))

age_group_map = dict(zip(
    [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, np.nan],
    ['19-25', '26-35', '36-45', '46-55', '56-65', '66+', np.nan],
))

education_map = dict(zip(
    [4.0, 5.0, 6.0, 7.0, 8.0, np.nan],
    ['1. Less than high school diploma',
     '2. High school diploma',
     '3. Some college',
     '4. Bachelor degree',
     '5. Graduate degree',
     np.nan],
))

In [36]:
# Swap the numeric demographic codes for their display labels.
# The columns are overwritten in place, so running this cell a second time
# would look up the labels themselves in the maps and turn them into NaN.
demographic_label_maps = {
    'gender': gender_map,
    'marital': marital_map,
    'age_group': age_group_map,
    'education': education_map,
}
for demo_col, code_labels in demographic_label_maps.items():
    df[demo_col] = df[demo_col].map(code_labels)

In [37]:
# create a version of the data that excludes "unsure"
# A respondent is dropped if any of the four behaviour items is "4. Unsure".
behaviour_items = ['gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside']
any_unsure = df[behaviour_items].eq('4. Unsure').any(axis=1)
df_no_miss = df[~any_unsure]

In [38]:
# (rows, columns) remaining once respondents with any "4. Unsure" answer are removed
df_no_miss.shape

(268, 18)

In [39]:
# Condition labels, in the order they are passed to the crosstab helper
conditions = ['Control'] + list('ABC')

In [43]:
# One crosstab per behaviour item against condition, on the subset without
# "Unsure" answers and with the chi-square test switched off.
# crosstab_chisq comes from the project-local `utility` module; judging by the
# rendered output it shows column percentages and a "Total n" row - confirm
# against the helper's source.
for col in SD_cols:
    util.crosstab_chisq(col, 'condition', df_no_miss, conditions, chisqtest=False)

condition,Control,A,B,C
gone_to_friend,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Yes/Yes any time,100.0,52.1,31.7,28.0
2. Yes only when neccessary,0.0,0.0,14.6,25.3
3. No,0.0,47.9,53.7,46.7
Total n,38.0,73.0,82.0,75.0


-----

condition,Control,A,B,C
had_visitors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Yes/Yes any time,60.5,57.5,41.5,36.0
2. Yes only when neccessary,0.0,0.0,19.5,25.3
3. No,39.5,42.5,39.0,38.7
Total n,38.0,73.0,82.0,75.0


-----

condition,Control,A,B,C
had_close_contact,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Yes/Yes any time,65.8,76.7,39.0,26.7
2. Yes only when neccessary,0.0,0.0,34.1,44.0
3. No,34.2,23.3,26.8,29.3
Total n,38.0,73.0,82.0,75.0


-----

condition,Control,A,B,C
gone_outside,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Yes/Yes any time,84.2,75.3,62.2,61.3
2. Yes only when neccessary,0.0,0.0,17.1,18.7
3. No,15.8,24.7,20.7,20.0
Total n,38.0,73.0,82.0,75.0


-----