In [1]:
import pandas as pd
import pickle

# Data Processing

## Load Data

### Pre-survey

Load treatment data, rename columns, and make a dictionary so we can reference back to the verbose names given in the raw survey data

In [2]:
presurvey = pd.read_csv('./data/presurvey_raw_treatment.csv')

presurvey_names = ['timestamp', 'id', 'freq_browser', 'freq_phone',
         'length_use', 'freq_use', 'ability_find_tasks',
         'promptly_find_tasks', 'text_search_preference',
         'notification_preference', 'message_preference',
         'understood', 'browsers', 'work_aids', 'fora',
         'other_markets', 'other_markets_length_use',
         'authority_comfort', 'collective_individual_scale',
         'planning_scale', 'timeliness_scale', 'emotion_scale',
         'success_rating', 'leadership_rating', 'leadership_preference',
         'gender_preference', 'multitasking', 'short_term_plans',
         'linear_work_style', 'plan_change_aversion', 'plan_change_adaptable',
         'lateness_aversion', 'blank', 'spare_time_alone',
         'commitment_others_over_self', 'close_over_casual_friends',
         'internet_friends', 'support_from_friends', 'pandemic_effect_magnitude',
         'chosen_task_completion_confidence', 'complex_task_completion_confidence',
         'neutral', 'task_satisfaction', 'likelihood_continued_use',
         'task_motivation', 'plugin_preference', 'current_country',
         'home_country', 'only_one_country', 'education', 'gender', 'age']

presurvey_alias = dict(zip(presurvey_names, presurvey.columns))

presurvey.columns = presurvey_names

Load control data and add a column marking observations according to their group

In [3]:
presurvey['group'] = 'treatment'

control1 = pd.read_csv('./data/presurvey_raw_control-1.csv', names=presurvey_names, header=0)
control2 = pd.read_csv('./data/presurvey_raw_control-2.csv', names=presurvey_names, header=0)
control1['group'] = 'control'
control2['group'] = 'control'

presurvey = pd.concat([presurvey, control1, control2])
del control1
del control2

### Post-survey
Load treatment data and change the column names into simpler ones

In [4]:
postsurvey = pd.read_csv('./data/postsurvey_raw_control.csv', encoding='latin1')

postsurvey_names = ['timestamp', 'id', 'change_work', 'explain_change_work',
                    'create_new_work', 'explain_new_work', 'change_work_enjoyment',
                    'explain_change_work_enjoyment', 'create_new_experience',
                    'explain_new_experience', 'alert_usefulness', 
                    'explain_alert_usefulness', 'work_style_change', 'likelihood_use', 
                    'likelihood_use_in_page_alert','likelihood_use_in_browser_alert', 
                    'likelihood_receive_msg_request', 'likelihood_receive_msg_worker', 
                    'fav_thing','least_fav_thing', 'feature_change','further_participation', 
                    'confidence_all_task_completion', 'confidence_hard_task', 'attention_check',
                    'task_satisfaction', 'future_toloka_use',
                    'best_attempt_on_tokola', 'plugin_assistance', 'comment']

postsurvey_alias = dict(zip(postsurvey_names, postsurvey.columns))

postsurvey.columns = postsurvey_names

Load treatment data and add a column marking observations according to their group

In [5]:
postsurvey['group'] = 'control'

treatment = pd.read_csv('./data/postsurvey_raw_treatment.csv', names=postsurvey_names, header=0)
treatment['group'] = 'treatment'

postsurvey = pd.concat([postsurvey, treatment])
del treatment

### Add Chronicity Data

Chronicity data is manually inferred and/or recorded for each country based on Table 14 from [1] and Table 3 from [2]. We label each subject with the chronicity corresponding to their home country.

[1] https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.531.7470&rep=rep1&type=pdf

[2] https://www.researchgate.net/publication/4781353_A_multi-country_study_of_the_adoption_of_ERP_systems

In [6]:
chronicity = pd.read_csv('./data/country_chronicity.csv')
chronicity = dict(zip(chronicity['country'], chronicity['chronicity']))
presurvey['chronicity'] = [chronicity[country] for country in presurvey['home_country']]
id_chronicity = dict(zip(presurvey[['id', 'chronicity']].drop_duplicates()['id'],
                         presurvey[['id', 'chronicity']].drop_duplicates()['chronicity']))
postsurvey['chronicity'] = [id_chronicity.get(_id, None) for _id in postsurvey['id']]
del chronicity
del id_chronicity

### Add Economic Region Data

Add a column indicating whether the worker is from Global North or South

data source: https://github.com/wikimedia-research/canonical-data/blob/master/countries.csv

In [7]:
region = pd.read_csv('./data/country_region.csv')
region = dict(zip(region['name'], region['economic_region']))
presurvey['region'] = [region[country].lower().replace(' ', '_')
                       for country in presurvey['home_country']]
id_region = dict(zip(presurvey[['id', 'region']].drop_duplicates()['id'],
                     presurvey[['id', 'region']].drop_duplicates()['region']))
postsurvey['region'] = [id_region.get(_id, None) for _id in postsurvey['id']]
del region
del id_region

## Data Cleaning

There are a considerable number of respondents who completed the survey multiple times, and they did not give the same answers each time. Worse, some managed to fill out both treatment and control group surveys. Let's go through and clean up these things. For respondents with multiple submissions, we'll keep only their first submission. First, let's look at all of the repeat submissions to the pre-survey.

In [8]:
presurvey.sort_values(['timestamp'], inplace=True)

for row in presurvey['id'].value_counts().rename_axis('id').reset_index(name='count').itertuples(index=False):
    if row.count == 1:
        pass
    else:
        print(presurvey[presurvey['id'] == row.id][['timestamp', 'id', 'group']])
        print('-----------------------------------------------------------------------')

             timestamp                              id      group
21  1/23/2022 11:19:08  ae18ab1aea6162de2276501ad651b6  treatment
44  1/26/2022 11:00:05  ae18ab1aea6162de2276501ad651b6  treatment
59   1/31/2022 6:23:19  ae18ab1aea6162de2276501ad651b6  treatment
66   2/11/2022 0:28:17  ae18ab1aea6162de2276501ad651b6  treatment
69  2/11/2022 10:42:47  ae18ab1aea6162de2276501ad651b6  treatment
82   2/12/2022 8:43:45  ae18ab1aea6162de2276501ad651b6  treatment
-----------------------------------------------------------------------
             timestamp                                id      group
4   1/26/2022 23:38:35  5fc991717e944f5142979e8878555271    control
5   1/26/2022 23:42:48  5fc991717e944f5142979e8878555271    control
22   1/31/2022 0:30:53  5fc991717e944f5142979e8878555271    control
29   2/11/2022 1:28:46  5fc991717e944f5142979e8878555271    control
88  2/13/2022 23:55:05  5fc991717e944f5142979e8878555271  treatment
----------------------------------------------------------

It seems safe to drop duplicates and keep only the first submission, it seems that the respondents who filled out both treatment and control surveys were in the control group and somehow got the treatment survey after.

In [9]:
presurvey.drop_duplicates(subset=['id'], keep='first', ignore_index=True, inplace=True)

Now let's do the same for the post-survey. First, look at all of the repeat submissions to ensure it's safe to drop duplicates and keep only the first submission.

In [10]:
postsurvey.sort_values(['timestamp'], inplace=True)

for row in postsurvey['id'].value_counts().rename_axis('id').reset_index(name='count').itertuples(index=False):
    if row.count == 1:
        pass
    else:
        print(postsurvey[postsurvey['id'] == row.id][['timestamp', 'id', 'group']])
        print('-----------------------------------------------------------------------')

             timestamp                                id      group
28  2/24/2022 12:57:42  356cd2c5f07830925f6a46ee42e9c748  treatment
28  2/24/2022 12:57:42  356cd2c5f07830925f6a46ee42e9c748    control
29  2/24/2022 13:17:03  356cd2c5f07830925f6a46ee42e9c748  treatment
29  2/24/2022 13:17:03  356cd2c5f07830925f6a46ee42e9c748    control
-----------------------------------------------------------------------
            timestamp         id      group
0  1/16/2022 18:47:31  FINALFORM    control
0  1/16/2022 18:47:31  FINALFORM  treatment
-----------------------------------------------------------------------
            timestamp            id      group
1  1/16/2022 18:49:45  FINALSURVEY2    control
1  1/16/2022 18:49:45  FINALSURVEY2  treatment
-----------------------------------------------------------------------
            timestamp                                id      group
27  2/22/2022 5:32:47  5fc991717e944f5142979e8878555271  treatment
27  2/22/2022 5:32:47  5fc991717e944f

This time it's more confusing. There are some ids that seem invalid, and it looks like somehow each submission was entirely duplicated into both treatment and control groups. If we can confirm that the deduplication was correct for the presurvey, we can make sure that we only keep the records where group in postsurvey matches group in presurvey, and then deduplicate for actuall repeat submissions.

## Export Data

In [11]:
presurvey.to_pickle('./data/presurvey.pkl')
postsurvey.to_pickle('./data/postsurvey.pkl')
with open('./data/presurvey_alias.pkl', 'wb') as handle:
    pickle.dump(presurvey_alias, handle)
with open('./data/postsurvey_alias.pkl', 'wb') as handle:
    pickle.dump(postsurvey_alias, handle)