## Social desirability experiment data analysis: checking image effect - COVID

In [1]:
import numpy as np
import pandas as pd
import utility as util

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so any pandas warnings raised by later cells will not be shown.
# Consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw survey export into a DataFrame (path is relative to the notebook).
week3and4covid = pd.read_csv('../input/34COVID.csv')

In [3]:
# manage columns
# Keep the response id / completion flag, Q5-Q6, the four experimental item
# sets (Control, A, B, C; items 1-4 each) and the trailing questions Q9-Q12.
keep_columns = [
    'ResponseId', 'Finished', 'Q5', 'Q6',
    *(f'{arm}{item}'
      for arm in ('Control', 'A', 'B', 'C')
      for item in range(1, 5)),
    'Q9', 'Q9_6_TEXT', 'Q10', 'Q11', 'Q12',
]

In [5]:
# restrict the raw export to the columns listed in keep_columns (all rows kept)
week3and4covid = week3and4covid.loc[:, keep_columns]

In [6]:
# (rows, columns) after restricting to keep_columns
week3and4covid.shape

(723, 25)

In [7]:
# drop breakoffs: keep only rows whose 'Finished' flag equals 1
completes = week3and4covid[week3and4covid['Finished'].eq(1)]

In [8]:
# keep respondents who reported their gender as male or female, or left it
# missing: rows whose Q9 code is 5 or 6 are removed, everything else stays
df = completes[~completes['Q9'].isin([5, 6])]

In [9]:
# exclude those that are age <18 (Q11 code 1); rows with a missing Q11 are kept
# because a missing value is never a member of [1].
# .copy() detaches the result from the frame it was sliced from, so the column
# assignments in the following cells modify an independent DataFrame instead
# of writing into a slice.
df = df.loc[~np.isin(df['Q11'], [1]), :].copy()

In [10]:
# items recoded with the 4/5 -> 3/4 shift
recode_set2_cols = [f'Control{item}' for item in (2, 3, 4)] + [f'A{item}' for item in (1, 2, 3, 4)]
# items recoded with the 4/5/6 -> 2/3/4 shift
recode_set3_cols = [f'B{item}' for item in (1, 2, 3, 4)] + ['C2', 'C4']

In [11]:
# Renumber the answer codes of the two items that have their own coding:
# Control1: 2 -> 3, 3 -> 4;  C3: 7 -> 2, 8 -> 3, 9 -> 4.
# Other values (including missing) are left unchanged.
df['Control1'] = df['Control1'].replace({2: 3, 3: 4})
df['C3'] = df['C3'].replace({7: 2, 8: 3, 9: 4})

In [12]:
# shift answer codes 4 -> 3 and 5 -> 4 on every set-2 item
for item_col in recode_set2_cols:
    df[item_col] = df[item_col].replace({4: 3, 5: 4})

In [13]:
# shift answer codes 4 -> 2, 5 -> 3 and 6 -> 4 on every set-3 item
for item_col in recode_set3_cols:
    df[item_col] = df[item_col].replace({4: 2, 5: 3, 6: 4})

In [14]:
# Infer each respondent's experimental condition from which first-item column
# ('Control1', 'A1', 'B1', 'C1') holds an answer.
grouping = np.array(['Control', 'A', 'B', 'C'])
answered = df[['Control1', 'A1', 'B1', 'C1']].notna().to_numpy()

# Take the first answered condition in column order; a row with no answer in
# any of the four columns gets the placeholder label 'NAN'.
row_groupings = [
    grouping[flags][0] if flags.any() else 'NAN'
    for flags in answered
]

In [15]:
# attach the per-row condition labels (one entry per row of df) as a new column
df['condition'] = row_groupings

In [16]:
# respondents per inferred condition, including the 'NAN' placeholder group
df['condition'].value_counts(dropna=False).sort_index()

A          131
B          130
C          128
Control    131
NAN          4
Name: condition, dtype: int64

In [17]:
# Collapse the four condition-specific versions of each item into a single
# outcome column. Missing cells count as 0, so the row sum equals the one
# answered value (or 0 when none of the four versions was answered).
outcome_sources = {
    'gone_to_friend': ['Control1', 'A1', 'B1', 'C1'],
    'had_visitors': ['Control2', 'A2', 'B2', 'C2'],
    'had_close_contact': ['Control3', 'A3', 'B3', 'C3'],
    'gone_outside': ['Control4', 'A4', 'B4', 'C4'],
}

for outcome_name, source_cols in outcome_sources.items():
    filled = df[source_cols].fillna(0)
    df[outcome_name] = filled.sum(axis=1).astype(int)

In [18]:
# exclude respondents with no data in the experimental variables (condition
# label 'NAN') and renumber the remaining rows from 0
df = df[df['condition'].ne('NAN')].reset_index(drop=True)

In [19]:
# inspect the current column set
df.columns

Index(['ResponseId', 'Finished', 'Q5', 'Q6', 'Control1', 'Control2',
       'Control3', 'Control4', 'A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4',
       'C1', 'C2', 'C3', 'C4', 'Q9', 'Q9_6_TEXT', 'Q10', 'Q11', 'Q12',
       'condition', 'gone_to_friend', 'had_visitors', 'had_close_contact',
       'gone_outside'],
      dtype='object')

In [20]:
# keep the id, the Q-coded questions, the four collapsed outcomes and the
# condition label; the per-condition item columns are dropped here
df = df.loc[:, [
    'ResponseId',
    'Q5', 'Q6', 'Q9', 'Q10', 'Q11', 'Q12',
    'gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside',
    'condition',
]]

In [21]:
# give the columns descriptive names (positional: must match the column order
# of the frame exactly)
df.columns = [
    'ID',
    'vaccine', 'mandate',
    'gender', 'marital', 'age_group', 'education',
    'gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside',
    'condition',
]

In [22]:
# Save the cleaned frame without the index column. This happens before the
# label recodes in the following cells, so the file holds the numeric codes.
# NOTE(review): the file name says "34control" while the input is 34COVID.csv
# — confirm this is the intended output name.
df.to_csv('../output/SD_experiment_df_34control.csv', index=False)

In [23]:
# numeric -> character display
display_change_cols = ['gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside']

# answer code -> display label; values outside 1-4 are left as they are
response_labels = {
    1: "1. Yes/Yes any time",
    2: "2. Yes only when neccessary",
    3: "3. No",
    4: "4. Unsure",
}

for item_col in display_change_cols:
    df[item_col] = df[item_col].replace(response_labels)

In [24]:
# combine top two response options into "yes"; "No" and "Unsure" are only
# renumbered to follow the merged category
combine_top_two = {
    **dict.fromkeys(["1. Yes/Yes any time", "2. Yes only when neccessary"], "1. Yes"),
    "3. No": "2. No",
    "4. Unsure": "3. Unsure",
}

In [25]:
# create recodes that reflect the combinations: each outcome gets a sibling
# '<name>_r' column with the collapsed labels (values without an entry in
# combine_top_two become NaN)
SD_cols = ['gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside']

for outcome in SD_cols:
    recoded_name = f'{outcome}_r'
    df[recoded_name] = df[outcome].map(combine_top_two)

In [26]:
# names of the collapsed ('_r') outcome columns
recoded_cols = [
    f'{stem}_r'
    for stem in ('gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside')
]

In [27]:
# create two separate factor variables out of the four conditions
# factor 1: was an excuse statement shown? (A and C: yes; Control and B: no)
excuse_map = {
    **dict.fromkeys(('Control', 'B'), 'no_excuse'),
    **dict.fromkeys(('A', 'C'), 'with_excuse'),
}

# factor 2: size of the response set (Control and A: two; B and C: three)
response_map = {
    **dict.fromkeys(('Control', 'A'), 'two_response'),
    **dict.fromkeys(('B', 'C'), 'three_response'),
}

In [28]:
# derive the two factor columns from the condition label
factor_maps = (
    ('excuse_statement_condition', excuse_map),
    ('response_set_condition', response_map),
)
for factor_name, factor_map in factor_maps:
    df[factor_name] = df['condition'].map(factor_map)

In [29]:
# use variable label for table display
# Every map carries an explicit NaN -> NaN entry as its last key.
gender_map = dict([
    (1.0, '1. Woman'),
    (2.0, '2. Man'),
    (np.nan, np.nan),
])

# marital code 8.0 is mapped to NaN, i.e. treated as missing
marital_map = dict([
    (1.0, '1. Married'),
    (7.0, '2. Not married'),
    (8.0, np.nan),
    (np.nan, np.nan),
])

# age codes run consecutively from 2.0 to 7.0
age_group_map = {
    **{float(code): label for code, label in enumerate(
        ['19-25', '26-35', '36-45', '46-55', '56-65', '66+'], start=2)},
    np.nan: np.nan,
}

# education codes run consecutively from 4.0 to 8.0
education_map = {
    **{float(code): label for code, label in enumerate(
        ['1. Less than high school diploma',
         '2. High school diploma',
         '3. Some college',
         '4. Bachelor degree',
         '5. Graduate degree'], start=4)},
    np.nan: np.nan,
}

In [30]:
# Replace the demographic codes with their display labels. Codes that have no
# entry in the corresponding map become NaN.
demographic_maps = {
    'gender': gender_map,
    'marital': marital_map,
    'age_group': age_group_map,
    'education': education_map,
}
for demo_col, label_map in demographic_maps.items():
    df[demo_col] = df[demo_col].map(label_map)

In [31]:
# create a version of the data that excludes "unsure": drop every respondent
# who answered '4. Unsure' on at least one of the four outcomes
unsure_anywhere = (
    df[['gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside']]
    .eq('4. Unsure')
    .any(axis=1)
)
df_no_miss = df[~unsure_anywhere]

In [32]:
# (rows, columns) of the subset without any '4. Unsure' answers
df_no_miss.shape

(505, 18)

In [33]:
# condition labels in the order passed to the crosstab helper
conditions = ['Control'] + list('ABC')

In [34]:
# one table per outcome, broken down by condition, on the data without
# "unsure" answers; the chi-square test is switched off
for outcome in SD_cols:
    util.crosstab_chisq(outcome, 'condition', df_no_miss, conditions, chisqtest=False)

condition,Control,A,B,C
gone_to_friend,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Yes/Yes any time,60.5,61.2,48.8,46.3
2. Yes only when neccessary,0.0,0.0,17.1,20.3
3. No,39.5,38.8,34.1,33.3
Total n,124.0,129.0,129.0,123.0


-----

condition,Control,A,B,C
had_visitors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Yes/Yes any time,50.8,57.4,45.0,48.0
2. Yes only when neccessary,0.0,0.0,15.5,20.3
3. No,49.2,42.6,39.5,31.7
Total n,124.0,129.0,129.0,123.0


-----

condition,Control,A,B,C
had_close_contact,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Yes/Yes any time,77.4,82.9,51.2,52.8
2. Yes only when neccessary,0.0,0.0,28.7,30.1
3. No,22.6,17.1,20.2,17.1
Total n,124.0,129.0,129.0,123.0


-----

condition,Control,A,B,C
gone_outside,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. Yes/Yes any time,85.5,88.4,80.6,77.2
2. Yes only when neccessary,0.0,0.0,8.5,8.1
3. No,14.5,11.6,10.9,14.6
Total n,124.0,129.0,129.0,123.0


-----