## Social desirability experiment data merge

Running this notebook merges the week 3 and 4 COVID image-condition data with the week 5 COVID image-only data to create a single analysis dataset.

In [1]:
import numpy as np
import pandas as pd

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas deprecation and chained-assignment warnings raised by later cells;
# consider narrowing the filter to the specific categories that are known noise.
warnings.filterwarnings("ignore")

In [2]:
# Raw survey exports: one file for weeks 3 and 4, one file for week 5.
week3and4_path = '../input/34COVID.csv'
week5_path = '../input/week5.csv'

week3and4 = pd.read_csv(week3and4_path)
week5 = pd.read_csv(week5_path)

In [3]:
# The two exports are stacked row-wise in a later cell, so they must share the same
# columns in the same order. Fail fast with a readable message: a bare `==` between
# Index objects of different lengths raises an opaque ValueError, and a long boolean
# array is easy to misread by eye.
assert week3and4.columns.equals(week5.columns), (
    "week 3-4 and week 5 exports differ in column names or column order; "
    f"names present in only one file: {sorted(set(week3and4.columns) ^ set(week5.columns))}"
)
# Element-wise view of the same check (all True when the assertion passes).
week3and4.columns==week5.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [4]:
# list the column names available in the raw export
week5.columns

Index(['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
       'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
       'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
       'ExternalReference', 'LocationLatitude', 'LocationLongitude',
       'DistributionChannel', 'UserLanguage', 'timer_First Click',
       'timer_Last Click', 'timer_Page Submit', 'timer_Click Count', 'Q1',
       'Q2', 'Q3', 'Q3_1', 'Q4', 'Q5', 'Q6', 'Q7_1', 'Q8', 'Control1',
       'Control2', 'Control3', 'Control4', 'A1', 'A2', 'A3', 'A4', 'B1', 'B2',
       'B3', 'B4', 'C1', 'C2', 'C3', 'C4', 'Q9', 'Q9_6_TEXT', 'Q10', 'Q11',
       'Q12', 'Control_1', 'Control_2', 'Control_3', 'Control_4',
       'Treatment_A_1', 'Treatment_A_2', 'Treatment_A_3', 'Treatment_A_4',
       'Treatment_B_1', 'Treatment_B_2', 'Treatment_B_3', 'Treatment_B_4',
       'Treatment_C_1', 'Treatment_C_2', 'Treatment_C_3', 'Treatment_C_4',
       'SC0', 'timeload', 'DeviceIdentifier', 'ResponseID', 'ipa

In [5]:
# Columns carried into the merged dataset: respondent id and completion flag, Q5/Q6,
# the four items of each experimental condition (Control, A, B, C), and Q9-Q12.
_condition_items = [
    f'{prefix}{item}'
    for prefix in ('Control', 'A', 'B', 'C')
    for item in range(1, 5)
]
keep_columns = [
    'ResponseId', 'Finished', 'Q5', 'Q6',
    *_condition_items,
    'Q9', 'Q9_6_TEXT', 'Q10', 'Q11', 'Q12',
]

In [6]:
# Restrict both exports to the analysis columns, then stack them into one frame
# with a fresh 0..n-1 index.
week3and4 = week3and4[keep_columns]
week5 = week5[keep_columns]
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack frames row-wise.
week_all = pd.concat([week3and4, week5], ignore_index=True)

In [7]:
# (rows, columns) of the stacked data, before any exclusions
week_all.shape

(881, 25)

In [8]:
# Drop breakoffs: keep only respondents whose Finished flag equals 1.
is_complete = week_all['Finished'].eq(1)
week_all_completes = week_all[is_complete]

In [9]:
# frequency of each Q9 code among completes (Q9 is exported as 'gender' at the end
# of the notebook); dropna=False keeps the missing responses in the table
week_all_completes['Q9'].value_counts(dropna=False).sort_index()

1.0    375
2.0    261
5.0      3
6.0     21
NaN      8
Name: Q9, dtype: int64

In [10]:
# Keep respondents who indicated male or female, plus those with a missing answer:
# Q9 codes 5 and 6 are excluded, and a missing Q9 never matches the excluded codes.
excluded_gender = week_all_completes['Q9'].isin([5, 6])
df = week_all_completes.loc[~excluded_gender]

In [11]:
# exclude those that are age <18 (Q11 code 1); respondents with a missing Q11 are kept
# .copy() makes df an independent frame: later cells assign recoded and derived
# columns into it, and assigning into a filtered selection triggers pandas'
# SettingWithCopyWarning (hidden here only because warnings are suppressed).
df = df.loc[~np.isin(df['Q11'], [1]), :].copy()

#### Examine frequencies

Some missing data exists in the demographic questions and the COVID questions.

In [12]:
# Q9 (exported as 'gender') after the exclusions above; NaN = missing answer
df['Q9'].value_counts(dropna=False).sort_index()

1.0    374
2.0    260
NaN      8
Name: Q9, dtype: int64

In [13]:
# Q10 (exported as 'marital'); NaN = missing answer
df['Q10'].value_counts(dropna=False).sort_index()

1.0    467
7.0    162
8.0      2
NaN     11
Name: Q10, dtype: int64

In [14]:
# Q11 (exported as 'age_group'); code 1 was excluded above, NaN = missing answer
df['Q11'].value_counts(dropna=False).sort_index()

2.0     17
3.0     42
4.0     63
5.0    106
6.0    214
7.0    189
NaN     11
Name: Q11, dtype: int64

In [15]:
# Q12 (exported as 'education'); NaN = missing answer
df['Q12'].value_counts(dropna=False).sort_index()

4.0      6
5.0     88
6.0    248
7.0    163
8.0    125
NaN     12
Name: Q12, dtype: int64

In [16]:
# Q5 (exported as 'vaccine'); NaN = missing answer
df['Q5'].value_counts(dropna=False).sort_index()

1.0    304
2.0     46
3.0     78
4.0    172
5.0     38
NaN      4
Name: Q5, dtype: int64

In [17]:
# Q6 (exported as 'mandate'); NaN = missing answer
df['Q6'].value_counts(dropna=False).sort_index()

1.0    280
2.0    263
3.0     95
NaN      4
Name: Q6, dtype: int64

#### Reverse recodes

This step reverses the recoded values that Qualtrics generated, so that every experimental condition uses the response codes below. <br>
Yes for any reason (1), Yes but only when necessary (2), No (3), Unsure (4) <br>
Yes (1), No (3), Unsure (4)

Recodes correction: <br>
Control1: 2=3, 3=4 <br>
Control2, Control3, Control4, A1, A2, A3, A4: 4=3, 5=4 <br>
B1, B2, B3, B4, C2, C4: 4=2, 5=3, 6=4 <br>
C3: 7=2, 8=3, 9=4

In [18]:
# Columns sharing the "4=3, 5=4" correction.
recode_set2_cols = [f'Control{item}' for item in (2, 3, 4)] + [f'A{item}' for item in (1, 2, 3, 4)]
# Columns sharing the "4=2, 5=3, 6=4" correction.
recode_set3_cols = [f'B{item}' for item in (1, 2, 3, 4)] + ['C2', 'C4']

In [19]:
# Control1 and C3 each have their own correction, so they are recoded individually.
# All replacements within a mapping are applied at once (a 2 that becomes 3 is not
# then turned into 4).
# NOTE(review): this overwrites the columns in place, so running the cell a second
# time without reloading the data would recode already-corrected values.
single_column_recodes = {
    'Control1': {2: 3, 3: 4},
    'C3': {7: 2, 8: 3, 9: 4},
}
for column, mapping in single_column_recodes.items():
    df[column] = df[column].replace(mapping)

In [20]:
# Correction for the set-2 columns: 4 -> 3 and 5 -> 4.
# NOTE(review): in-place recode; re-running this cell without reloading the data
# would map the corrected 4s to 3.
set2_recode = {4: 3, 5: 4}
for column in recode_set2_cols:
    df[column] = df[column].replace(set2_recode)

In [21]:
# Correction for the set-3 columns: 4 -> 2, 5 -> 3 and 6 -> 4.
# NOTE(review): in-place recode; re-running this cell without reloading the data
# would map the corrected 4s to 2.
set3_recode = {4: 2, 5: 3, 6: 4}
for column in recode_set3_cols:
    df[column] = df[column].replace(set3_recode)

#### Clean up experimental conditions groupings

In [22]:
# renumber rows 0..n-1 after the exclusions; the old index labels are discarded
df = df.reset_index(drop=True)

In [23]:
# Condition labels, in the same order as the first-item columns inspected below.
grouping = np.array(['Control', 'A', 'B', 'C'])

# True where a respondent has a non-missing answer to that condition's first item.
answered = df[['Control1', 'A1', 'B1', 'C1']].notna().values
# Position of the first answered column in each row (0 when nothing is answered,
# which is why has_answer is checked separately).
first_answered = answered.argmax(axis=1)
has_answer = answered.any(axis=1)

# One label per row; 'NAN' marks respondents with all four first items missing,
# for whom no grouping is available.
row_groupings = [
    grouping[position] if found else 'NAN'
    for position, found in zip(first_answered, has_answer)
]

In [24]:
# Attach the per-row condition labels as a new last column.
df = df.assign(condition=row_groupings)

In [25]:
# respondents per experimental condition; 'NAN' is the placeholder label given to
# rows whose Control1, A1, B1 and C1 are all missing
df['condition'].value_counts(dropna=False).sort_index()

A          157
B          160
C          157
Control    163
NAN          5
Name: condition, dtype: int64

In [26]:
# Collapse the four per-condition columns of each item into one outcome column by
# summing across them with missing values counted as 0.
# Presumably each respondent has answers in only one condition's columns, so the sum
# is that single answer; verify against the survey design.
# NOTE(review): an item the respondent skipped ends up as 0, not as missing.
outcome_sources = {
    'gone_to_friend': ['Control1', 'A1', 'B1', 'C1'],
    'had_visitors': ['Control2', 'A2', 'B2', 'C2'],
    'had_close_contact': ['Control3', 'A3', 'B3', 'C3'],
    'gone_outside': ['Control4', 'A4', 'B4', 'C4'],
}
for outcome, source_columns in outcome_sources.items():
    df[outcome] = df[source_columns].fillna(0).sum(axis=1).astype(int)

In [27]:
# Exclude respondents labelled 'NAN', i.e. those for whom no experimental condition
# could be determined, then renumber the remaining rows.
has_condition = df['condition'].ne('NAN')
df = df[has_condition].reset_index(drop=True)

In [28]:
# inspect the columns available before selecting the final export set
df.columns

Index(['ResponseId', 'Finished', 'Q5', 'Q6', 'Control1', 'Control2',
       'Control3', 'Control4', 'A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4',
       'C1', 'C2', 'C3', 'C4', 'Q9', 'Q9_6_TEXT', 'Q10', 'Q11', 'Q12',
       'condition', 'gone_to_friend', 'had_visitors', 'had_close_contact',
       'gone_outside'],
      dtype='object')

In [29]:
# Final export set: respondent id, Q5/Q6, Q9-Q12, the four collapsed outcome
# columns and the condition label. The per-condition item columns are dropped.
analysis_columns = [
    'ResponseId',
    'Q5', 'Q6',
    'Q9', 'Q10', 'Q11', 'Q12',
    'gone_to_friend', 'had_visitors', 'had_close_contact', 'gone_outside',
    'condition',
]
df = df.loc[:, analysis_columns]

In [30]:
# Give the survey codes readable names. An explicit old -> new mapping is used instead
# of assigning a positional list of names: a positional list silently mislabels the
# data if the column selection order ever changes. Columns not listed here
# (the outcome columns and 'condition') keep their names.
column_names = {
    'ResponseId': 'ID',
    'Q5': 'vaccine',
    'Q6': 'mandate',
    'Q9': 'gender',
    'Q10': 'marital',
    'Q11': 'age_group',
    'Q12': 'education',
}
# errors='raise' makes a missing source column fail loudly rather than being skipped.
df = df.rename(columns=column_names, errors='raise')

In [31]:
# Write the analysis dataset; index=False leaves the row index out of the file.
output_path = '../output/SD_experiment_df.csv'
df.to_csv(output_path, index=False)