In [1]:
import pandas as pd
import pickle

# Data Processing

## Load Data

### Pre-survey

Load treatment data, rename columns, and make a dictionary so we can reference back to the verbose names given in the raw survey data

In [2]:
presurvey = pd.read_csv('./data/presurvey_raw_treatment.csv')

presurvey_names = ['timestamp_pre', 'id', 'freq_browser', 'freq_phone',
         'length_use', 'freq_use', 'ability_find_tasks',
         'promptly_find_tasks', 'text_search_preference',
         'notification_preference', 'message_preference',
         'understood', 'browsers', 'work_aids', 'fora',
         'other_markets', 'other_markets_length_use',
         'authority_comfort', 'collective_individual_scale',
         'planning_scale', 'timeliness_scale', 'emotion_scale',
         'success_rating', 'leadership_rating', 'leadership_preference',
         'gender_preference', 'multitasking', 'short_term_plans',
         'linear_work_style', 'plan_change_aversion', 'plan_change_adaptable',
         'lateness_aversion', 'blank', 'spare_time_alone',
         'commitment_others_over_self', 'close_over_casual_friends',
         'internet_friends', 'support_from_friends', 'pandemic_effect_magnitude',
         'chosen_task_completion_confidence', 'complex_task_completion_confidence',
         'neutral', 'task_satisfaction_pre', 'likelihood_continued_use',
         'task_motivation', 'plugin_preference', 'current_country',
         'home_country', 'only_one_country', 'education', 'gender', 'age']

presurvey_alias = dict(zip(presurvey_names, presurvey.columns))

presurvey.columns = presurvey_names

Load control data and add a column marking observations according to their group

In [3]:
presurvey['group'] = 'treatment'

control1 = pd.read_csv('./data/presurvey_raw_control-1.csv', names=presurvey_names, header=0)
control2 = pd.read_csv('./data/presurvey_raw_control-2.csv', names=presurvey_names, header=0)
control1['group'] = 'control'
control2['group'] = 'control'

presurvey = pd.concat([presurvey, control1, control2])
del control1
del control2

### Post-survey
Load treatment data and change the column names into simpler ones

In [4]:
postsurvey = pd.read_csv('./data/postsurvey_raw_control.csv', encoding='latin1')

postsurvey_names = ['timestamp_post', 'id', 'change_work', 'explain_change_work',
                    'create_new_work', 'explain_new_work', 'change_work_enjoyment',
                    'explain_change_work_enjoyment', 'create_new_experience',
                    'explain_new_experience', 'alert_usefulness', 
                    'explain_alert_usefulness', 'work_style_change', 'likelihood_use', 
                    'likelihood_use_in_page_alert','likelihood_use_in_browser_alert', 
                    'likelihood_receive_msg_request', 'likelihood_receive_msg_worker', 
                    'fav_thing','least_fav_thing', 'feature_change','further_participation', 
                    'confidence_all_task_completion', 'confidence_hard_task', 'attention_check',
                    'task_satisfaction_post', 'future_toloka_use',
                    'best_attempt_on_tokola', 'plugin_assistance', 'comment']

postsurvey_alias = dict(zip(postsurvey_names, postsurvey.columns))

postsurvey.columns = postsurvey_names

Load treatment data and add a column marking observations according to their group

In [5]:
postsurvey['group'] = 'control'

treatment = pd.read_csv('./data/postsurvey_raw_treatment.csv', names=postsurvey_names, header=0)
treatment['group'] = 'treatment'

postsurvey = pd.concat([postsurvey, treatment])
del treatment

Make sure everyone in the post-survey was seen in the pre-survey

In [6]:
postsurvey[~postsurvey['id'].isin(presurvey['id'])].drop_duplicates('id')

Unnamed: 0,timestamp_post,id,change_work,explain_change_work,create_new_work,explain_new_work,change_work_enjoyment,explain_change_work_enjoyment,create_new_experience,explain_new_experience,...,further_participation,confidence_all_task_completion,confidence_hard_task,attention_check,task_satisfaction_post,future_toloka_use,best_attempt_on_tokola,plugin_assistance,comment,group
0,1/16/2022 18:47:31,FINALFORM,Incredible perform very interesting tasks,,,,,,,,...,Yes,,,Strongly disagree,Slightly Agree,,,,Increasing homework and exams for the most act...,control
1,1/16/2022 18:49:45,FINALSURVEY2,nothing,,,,,,,,...,Yes,,,Strongly disagree,Neutral,,,,,control
2,1/23/2022 2:02:08,FINALSURVEY3,3,very fast,2.0,2 days,4.0,I enjoy it,5.0,very helpful,...,Yes,5.0,5.0,Strongly agree,Strongly agree,4.0,5.0,5.0,Nothing else. Thanks for the plug-in,control
5,2/20/2022 15:08:05,d27e322316b99d8bd347bf330e4f5b2,4,Cho tÃ´i nhÃ¬n tháº¥y khÃ¡ nhiá»u nhiá»m vu ...,4.0,LÃ m tÃ´i quáº£n lÃ½ viá»c tá»t hÆ¡n,5.0,lÃ m tÃ´i khÃ¡ thÃ­ch hÆ¡n vÃ¬ nÃ³ dá» sá»­ d...,3.0,khÃ¡ á»n,...,Yes,3.0,4.0,Disagree,Agree,5.0,3.0,4.0,Toi nghÄ© tÃ´i váº«n sáº½ dÃ¹ng plugin,control


Some records have invalid ids so let's remove those

In [7]:
postsurvey = postsurvey[~postsurvey['id'].isin(['FINALFORM','FINALSURVEY2','FINALSURVEY3',
                                                'd27e322316b99d8bd347bf330e4f5b2'])]

### Add Chronicity Data

Chronicity data is manually inferred and/or recorded for each country based on Table 14 from [1] and Table 3 from [2]. We label each subject with the chronicity corresponding to their home country.

[1] https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.531.7470&rep=rep1&type=pdf

[2] https://www.researchgate.net/publication/4781353_A_multi-country_study_of_the_adoption_of_ERP_systems

In [8]:
chronicity = pd.read_csv('./data/country_chronicity.csv')
chronicity = dict(zip(chronicity['country'], chronicity['chronicity']))
presurvey['chronicity'] = [chronicity[country] for country in presurvey['home_country']]
id_chronicity = dict(zip(presurvey[['id', 'chronicity']].drop_duplicates()['id'],
                         presurvey[['id', 'chronicity']].drop_duplicates()['chronicity']))
postsurvey['chronicity'] = [id_chronicity.get(_id, None) for _id in postsurvey['id']]
del chronicity
del id_chronicity

### Add Economic Region Data

Add a column indicating whether the worker is from Global North or South

data source: https://github.com/wikimedia-research/canonical-data/blob/master/countries.csv

In [9]:
region = pd.read_csv('./data/country_region.csv')
region = dict(zip(region['name'], region['economic_region']))
presurvey['region'] = [region[country].lower().replace(' ', '_')
                       for country in presurvey['home_country']]
id_region = dict(zip(presurvey[['id', 'region']].drop_duplicates()['id'],
                     presurvey[['id', 'region']].drop_duplicates()['region']))
postsurvey['region'] = [id_region[_id] for _id in postsurvey['id']]
del region
del id_region

## Data Cleaning

There are a considerable number of respondents who completed the survey multiple times, and they did not give the same answers each time. Worse, some managed to fill out both treatment and control group surveys. Let's go through and clean up these things. For respondents with multiple submissions, we'll keep only their first submission. First, let's look at all of the repeat submissions to the pre-survey.

In [10]:
presurvey.sort_values(['timestamp_pre'], inplace=True)

for row in presurvey['id'].value_counts().rename_axis('id').reset_index(name='count').itertuples(index=False):
    if row.count == 1:
        pass
    else:
        print(presurvey[presurvey['id'] == row.id][['timestamp_pre', 'id', 'group']])
        print('-----------------------------------------------------------------------')

         timestamp_pre                              id      group
21  1/23/2022 11:19:08  ae18ab1aea6162de2276501ad651b6  treatment
44  1/26/2022 11:00:05  ae18ab1aea6162de2276501ad651b6  treatment
59   1/31/2022 6:23:19  ae18ab1aea6162de2276501ad651b6  treatment
66   2/11/2022 0:28:17  ae18ab1aea6162de2276501ad651b6  treatment
69  2/11/2022 10:42:47  ae18ab1aea6162de2276501ad651b6  treatment
82   2/12/2022 8:43:45  ae18ab1aea6162de2276501ad651b6  treatment
-----------------------------------------------------------------------
         timestamp_pre                                id      group
4   1/26/2022 23:38:35  5fc991717e944f5142979e8878555271    control
5   1/26/2022 23:42:48  5fc991717e944f5142979e8878555271    control
22   1/31/2022 0:30:53  5fc991717e944f5142979e8878555271    control
29   2/11/2022 1:28:46  5fc991717e944f5142979e8878555271    control
88  2/13/2022 23:55:05  5fc991717e944f5142979e8878555271  treatment
----------------------------------------------------------

It seems safe to drop duplicates and keep only the first submission, it seems that the respondents who filled out both treatment and control surveys were in the control group and somehow got the treatment survey after.

In [11]:
presurvey.drop_duplicates(subset=['id'], keep='first', ignore_index=True, inplace=True)

Now let's do the same for the post-survey. First, look at all of the repeat submissions to ensure it's safe to drop duplicates and keep only the first submission.

In [12]:
postsurvey.sort_values(['timestamp_post'], inplace=True)

for row in postsurvey['id'].value_counts().rename_axis('id').reset_index(name='count').itertuples(index=False):
    if row.count == 1:
        pass
    else:
        print(postsurvey[postsurvey['id'] == row.id][['timestamp_post', 'id', 'group']])
        print('-----------------------------------------------------------------------')

        timestamp_post                                id      group
28  2/24/2022 12:57:42  356cd2c5f07830925f6a46ee42e9c748  treatment
28  2/24/2022 12:57:42  356cd2c5f07830925f6a46ee42e9c748    control
29  2/24/2022 13:17:03  356cd2c5f07830925f6a46ee42e9c748  treatment
29  2/24/2022 13:17:03  356cd2c5f07830925f6a46ee42e9c748    control
-----------------------------------------------------------------------
        timestamp_post                              id      group
30  2/24/2022 16:56:10  fbd2ee5b8dcfa3d939cb5df1a92552    control
30  2/24/2022 16:56:10  fbd2ee5b8dcfa3d939cb5df1a92552  treatment
-----------------------------------------------------------------------
        timestamp_post                              id      group
25  2/21/2022 13:47:44  6b4cd1c49a60f0cd87c83910d58249  treatment
25  2/21/2022 13:47:44  6b4cd1c49a60f0cd87c83910d58249    control
-----------------------------------------------------------------------
        timestamp_post                          

Since we have duplicated records with the same timestamp and different groups, we will assign records in the postsurvey data to the group associated with their id in the now cleaned presurvey data.

In [13]:
id_group = dict(zip(presurvey['id'], presurvey['group']))
postsurvey['group'] = [id_group[_id] for _id in postsurvey['id']]
del id_group

postsurvey.drop_duplicates(subset=['id'], keep='first', ignore_index=True, inplace=True)

## Numericize Likert Scales

Translate likert scales to numerical representations and create alias dictionaries so we can recover the original entries.

Start by identifying which variables need to be numericized and creating dictionaries to help switch out values:

In [14]:
# NOTE: field 'understood' does contain a likert scale but the desired input to this question is
# the word 'understood' written in an open text field, so we do not convert it
presurvey_scales = ['freq_browser', 'freq_phone',
                    'length_use', 'freq_use', 'ability_find_tasks',
                    'promptly_find_tasks', 'text_search_preference',
                    'notification_preference', 'message_preference',
                    'other_markets_length_use', 'authority_comfort', 'collective_individual_scale',
                    'planning_scale', 'timeliness_scale', 'emotion_scale',
                    'success_rating', 'leadership_rating', 'leadership_preference',
                    'gender_preference', 'neutral', 'task_satisfaction_pre']

postsurvey_scales = ['likelihood_use', 'likelihood_use_in_page_alert',
                     'likelihood_use_in_browser_alert', 'likelihood_receive_msg_request',
                     'likelihood_receive_msg_worker', 'attention_check',
                     'task_satisfaction_post']

freq_scale = {'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4, 'Always': 5}
agree_scale = {'Strongly disagree': 1, 'Disagree': 2, 'Neutral': 3, 'Agree': 4, 'Strongly Agree': 5}
likely_scale = {'Very unlikely': 1, 'Unlikely': 2, 'Neutral': 3, 'Likely': 4, 'Very likely': 5}
year_range = {'More than 3 years': 7, 'Less than a month': 1, 'Between 1 and 2 years': 5,
              'Between 6 months and 1 year': 4, 'Between 1 and 3 months': 2,
              'Between 3 and 6 months': 3, 'More than 2 and 3 years': 6}
day_range = {'Everyday': 5, 'From three to four days a week': 3,
             'From five to six days a week': 4, 'Once or twice a week': 2,
             'Less than once a week': 1}
freq_scale_inv = {y:x for x,y in freq_scale.items()}
agree_scale_inv = {y:x for x,y in agree_scale.items()}
likely_scale_inv = {y:x for x,y in likely_scale.items()}
year_range_inv = {y:x for x,y in year_range.items()}
day_range_inv = {y:x for x,y in day_range.items()}

authority_comfort = dict(zip([int(item[:1]) for item in presurvey['authority_comfort'].unique()],
                             presurvey['authority_comfort'].unique()))
collective_individual_scale = dict(zip([int(item[:1]) 
                                        for item in presurvey['collective_individual_scale'].unique()],
                                       presurvey['collective_individual_scale'].unique()))
planning_scale = dict(zip([int(item[:1]) for item in presurvey['planning_scale'].unique()],
                          presurvey['planning_scale'].unique()))
timeliness_scale = dict(zip([int(item[:1]) for item in presurvey['timeliness_scale'].unique()],
                            presurvey['timeliness_scale'].unique()))
emotion_scale = dict(zip([int(item[:1]) for item in presurvey['emotion_scale'].unique()],
                         presurvey['emotion_scale'].unique()))
success_rating = dict(zip([int(item[:1]) for item in presurvey['success_rating'].unique()],
                          presurvey['success_rating'].unique()))
leadership_rating = dict(zip([int(item[:1]) for item in presurvey['leadership_rating'].unique()],
                             presurvey['leadership_rating'].unique()))
leadership_preference = dict(zip([int(item[:1]) 
                                  for item in presurvey['leadership_preference'].unique()],
                                 presurvey['leadership_preference'].unique()))
gender_preference = dict(zip([int(item[:1]) for item in presurvey['gender_preference'].unique()],
                             presurvey['gender_preference'].unique()))

Create alias dictionaries:

In [15]:
presurvey_scale_alias_values = [freq_scale_inv, freq_scale_inv, year_range_inv, day_range_inv,
                                agree_scale_inv, agree_scale_inv, agree_scale_inv, agree_scale_inv,
                                agree_scale_inv, year_range_inv, authority_comfort,
                                collective_individual_scale, planning_scale, timeliness_scale,
                                emotion_scale, success_rating, leadership_rating, leadership_preference,
                                gender_preference, agree_scale_inv, agree_scale_inv]
presurvey_scale_alias = dict(zip(presurvey_scales, presurvey_scale_alias_values))

postsurvey_scale_alias_values = [likely_scale_inv, likely_scale_inv, likely_scale_inv, likely_scale_inv,
                                 likely_scale_inv, agree_scale_inv, agree_scale_inv]
postsurvey_scale_alias = dict(zip(postsurvey_scales, postsurvey_scale_alias_values))

Numericize values in the presurvey dataset:

In [16]:
presurvey['freq_browser'] = [freq_scale.get(response) for response in presurvey['freq_browser']]
presurvey['freq_phone'] = [freq_scale.get(response) for response in presurvey['freq_phone']]
presurvey['length_use'] = [year_range.get(response) for response in presurvey['length_use']]
presurvey['freq_use'] = [day_range.get(response) for response in presurvey['freq_use']]
presurvey['ability_find_tasks'] = [agree_scale.get(response) 
                                   for response in presurvey['ability_find_tasks']]
presurvey['promptly_find_tasks'] = [agree_scale.get(response)
                                    for response in presurvey['promptly_find_tasks']]
presurvey['text_search_preference'] = [agree_scale.get(response)
                                       for response in presurvey['text_search_preference']]
presurvey['notification_preference'] = [agree_scale.get(response)
                                        for response in presurvey['notification_preference']]
presurvey['message_preference'] = [agree_scale.get(response)
                                   for response in presurvey['message_preference']]
presurvey['other_markets_length_use'] = [year_range.get(response)
                                         for response in presurvey['other_markets_length_use']]
presurvey['authority_comfort'] = [int(response[:1]) for response in presurvey['authority_comfort']]
presurvey['collective_individual_scale'] = [int(response[:1])
                                            for response in presurvey['collective_individual_scale']]
presurvey['planning_scale'] = [int(response[:1]) for response in presurvey['planning_scale']]
presurvey['timeliness_scale'] = [int(response[:1]) for response in presurvey['timeliness_scale']]
presurvey['emotion_scale'] = [int(response[:1]) for response in presurvey['emotion_scale']]
presurvey['success_rating'] = [int(response[:1]) for response in presurvey['success_rating']]
presurvey['leadership_rating'] = [int(response[:1]) for response in presurvey['leadership_rating']]
presurvey['leadership_preference'] = [int(response[:1])
                                      for response in presurvey['leadership_preference']]
presurvey['gender_preference'] = [int(response[:1]) for response in presurvey['gender_preference']]
presurvey['neutral'] = [agree_scale.get(response) for response in presurvey['neutral']]
presurvey['task_satisfaction_pre'] = [agree_scale.get(response)
                                      for response in presurvey['task_satisfaction_pre']]

Numericize values in the postsurvey dataset:

In [17]:
postsurvey['likelihood_use'] = [likely_scale.get(response) for response in postsurvey['likelihood_use']]
postsurvey['likelihood_use_in_page_alert'] = [likely_scale.get(response) 
                                              for response 
                                              in postsurvey['likelihood_use_in_page_alert']]
postsurvey['likelihood_use_in_browser_alert'] = [likely_scale.get(response)
                                                 for response 
                                                 in postsurvey['likelihood_use_in_browser_alert']]
postsurvey['likelihood_receive_msg_request'] = [likely_scale.get(response)
                                                for response 
                                                in postsurvey['likelihood_receive_msg_request']]
postsurvey['likelihood_receive_msg_worker'] = [likely_scale.get(response)
                                               for response 
                                               in postsurvey['likelihood_receive_msg_worker']]
postsurvey['attention_check'] = [agree_scale.get(response) 
                                 for response in postsurvey['attention_check']]
postsurvey['task_satisfaction_post'] = [agree_scale.get(response)
                                        for response in postsurvey['task_satisfaction_post']]

## Join Datasets

In [18]:
combined = pd.merge(presurvey, postsurvey, how="outer", on=["id", "group", "chronicity", "region"])

combined_alias = {**presurvey_alias, **postsurvey_alias}

combined_scale_alias = {**presurvey_scale_alias, **postsurvey_scale_alias}

## Export Data

In [19]:
presurvey.to_pickle('./data/presurvey.pkl')
postsurvey.to_pickle('./data/postsurvey.pkl')
combined.to_pickle('./data/combined.pkl')
with open('./data/presurvey_alias.pkl', 'wb') as handle:
    pickle.dump(presurvey_alias, handle)
with open('./data/presurvey_scale_alias.pkl', 'wb') as handle:
    pickle.dump(presurvey_scale_alias, handle)
with open('./data/postsurvey_alias.pkl', 'wb') as handle:
    pickle.dump(postsurvey_alias, handle)
with open('./data/postsurvey_scale_alias.pkl', 'wb') as handle:
    pickle.dump(postsurvey_scale_alias, handle)
with open('./data/combined_alias.pkl', 'wb') as handle:
    pickle.dump(combined_alias, handle)
with open('./data/combined_scale_alias.pkl', 'wb') as handle:
    pickle.dump(combined_scale_alias, handle)