In [1]:
import pandas as pd
import pickle

# Data Processing

## Load Data

### Pre-survey

Load treatment data, rename columns, and make a dictionary so we can reference back to the verbose names given in the raw survey data

In [2]:
presurvey = pd.read_csv('./data/presurvey_raw_treatment.csv')

presurvey_names = ['timestamp', 'id', 'freq_browser', 'freq_phone',
         'length_use', 'freq_use', 'ability_find_tasks',
         'promptly_find_tasks', 'text_search_preference',
         'notification_preference', 'message_preference',
         'understood', 'browsers', 'work_aids', 'fora',
         'other_markets', 'other_markets_length_use',
         'authority_comfort', 'collective_individual_scale',
         'planning_scale', 'timeliness_scale', 'emotion_scale',
         'success_rating', 'leadership_rating', 'leadership_preference',
         'gender_preference', 'multitasking', 'short_term_plans',
         'linear_work_style', 'plan_change_aversion', 'plan_change_adaptable',
         'lateness_aversion', 'blank', 'spare_time_alone',
         'commitment_others_over_self', 'close_over_casual_friends',
         'internet_friends', 'support_from_friends', 'pandemic_effect_magnitude',
         'chosen_task_completion_confidence', 'complex_task_completion_confidence',
         'neutral', 'task_satisfaction', 'likelihood_continued_use',
         'task_motivation', 'plugin_preference', 'current_country',
         'home_country', 'only_one_country', 'education', 'gender', 'age']

presurvey_alias = dict(zip(presurvey_names, presurvey.columns))

presurvey.columns = presurvey_names

Load control data and add a column marking observations according to their group

In [3]:
presurvey['group'] = 'treatment'

control1 = pd.read_csv('./data/presurvey_raw_control-1.csv', names=presurvey_names, header=0)
control2 = pd.read_csv('./data/presurvey_raw_control-2.csv', names=presurvey_names, header=0)
control1['group'] = 'control'
control2['group'] = 'control'

presurvey = pd.concat([presurvey, control1, control2])
del control1
del control2

### Post-survey
Load treatment data and change the column names into simpler ones

In [4]:
postsurvey = pd.read_csv('./data/postsurvey_raw_control.csv', encoding='latin1')

postsurvey_names = ['timestamp', 'id', 'change_work', 'explain_change_work',
                    'create_new_work', 'explain_new_work', 'change_work_enjoyment',
                    'explain_change_work_enjoyment', 'create_new_experience',
                    'explain_new_experience', 'alert_usefulness', 
                    'explain_alert_usefulness', 'work_style_change', 'likelihood_use', 
                    'likelihood_use_in_page_alert','likelihood_use_in_browser_alert', 
                    'likelihood_receive_msg_request', 'likelihood_receive_msg_worker', 
                    'fav_thing','least_fav_thing', 'feature_change','further_participation', 
                    'confidence_all_task_completion', 'confidence_hard_task', 'attention_check',
                    'task_satisfaction', 'future_toloka_use',
                    'best_attempt_on_tokola', 'plugin_assistance', 'comment']

postsurvey_alias = dict(zip(postsurvey_names, postsurvey.columns))

postsurvey.columns = postsurvey_names

Load treatment data and add a column marking observations according to their group

In [5]:
postsurvey['group'] = 'control'

treatment = pd.read_csv('./data/postsurvey_raw_treatment.csv', names=postsurvey_names, header=0)
treatment['group'] = 'treatment'

postsurvey = pd.concat([postsurvey, treatment])
del treatment

Make sure everyone in the post-survey was seen in the pre-survey

In [6]:
postsurvey[~postsurvey['id'].isin(presurvey['id'])].drop_duplicates('id')

Unnamed: 0,timestamp,id,change_work,explain_change_work,create_new_work,explain_new_work,change_work_enjoyment,explain_change_work_enjoyment,create_new_experience,explain_new_experience,...,further_participation,confidence_all_task_completion,confidence_hard_task,attention_check,task_satisfaction,future_toloka_use,best_attempt_on_tokola,plugin_assistance,comment,group
0,1/16/2022 18:47:31,FINALFORM,Incredible perform very interesting tasks,,,,,,,,...,Yes,,,Strongly disagree,Slightly Agree,,,,Increasing homework and exams for the most act...,control
1,1/16/2022 18:49:45,FINALSURVEY2,nothing,,,,,,,,...,Yes,,,Strongly disagree,Neutral,,,,,control
2,1/23/2022 2:02:08,FINALSURVEY3,3,very fast,2.0,2 days,4.0,I enjoy it,5.0,very helpful,...,Yes,5.0,5.0,Strongly agree,Strongly agree,4.0,5.0,5.0,Nothing else. Thanks for the plug-in,control
5,2/20/2022 15:08:05,d27e322316b99d8bd347bf330e4f5b2,4,Cho tÃ´i nhÃ¬n tháº¥y khÃ¡ nhiá»u nhiá»m vu ...,4.0,LÃ m tÃ´i quáº£n lÃ½ viá»c tá»t hÆ¡n,5.0,lÃ m tÃ´i khÃ¡ thÃ­ch hÆ¡n vÃ¬ nÃ³ dá» sá»­ d...,3.0,khÃ¡ á»n,...,Yes,3.0,4.0,Disagree,Agree,5.0,3.0,4.0,Toi nghÄ© tÃ´i váº«n sáº½ dÃ¹ng plugin,control


Some records have invalid ids so let's remove those

In [7]:
postsurvey = postsurvey[~postsurvey['id'].isin(['FINALFORM','FINALSURVEY2','FINALSURVEY3',
                                                'd27e322316b99d8bd347bf330e4f5b2'])]

### Add Chronicity Data

Chronicity data is manually inferred and/or recorded for each country based on Table 14 from [1] and Table 3 from [2]. We label each subject with the chronicity corresponding to their home country.

[1] https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.531.7470&rep=rep1&type=pdf

[2] https://www.researchgate.net/publication/4781353_A_multi-country_study_of_the_adoption_of_ERP_systems

In [8]:
chronicity = pd.read_csv('./data/country_chronicity.csv')
chronicity = dict(zip(chronicity['country'], chronicity['chronicity']))
presurvey['chronicity'] = [chronicity[country] for country in presurvey['home_country']]
id_chronicity = dict(zip(presurvey[['id', 'chronicity']].drop_duplicates()['id'],
                         presurvey[['id', 'chronicity']].drop_duplicates()['chronicity']))
postsurvey['chronicity'] = [id_chronicity.get(_id, None) for _id in postsurvey['id']]
del chronicity
del id_chronicity

### Add Economic Region Data

Add a column indicating whether the worker is from Global North or South

data source: https://github.com/wikimedia-research/canonical-data/blob/master/countries.csv

In [9]:
region = pd.read_csv('./data/country_region.csv')
region = dict(zip(region['name'], region['economic_region']))
presurvey['region'] = [region[country].lower().replace(' ', '_')
                       for country in presurvey['home_country']]
id_region = dict(zip(presurvey[['id', 'region']].drop_duplicates()['id'],
                     presurvey[['id', 'region']].drop_duplicates()['region']))
postsurvey['region'] = [id_region[_id] for _id in postsurvey['id']]
del region
del id_region

## Data Cleaning

There are a considerable number of respondents who completed the survey multiple times, and they did not give the same answers each time. Worse, some managed to fill out both treatment and control group surveys. Let's go through and clean up these things. For respondents with multiple submissions, we'll keep only their first submission. First, let's look at all of the repeat submissions to the pre-survey.

In [10]:
presurvey.sort_values(['timestamp'], inplace=True)

for row in presurvey['id'].value_counts().rename_axis('id').reset_index(name='count').itertuples(index=False):
    if row.count == 1:
        pass
    else:
        print(presurvey[presurvey['id'] == row.id][['timestamp', 'id', 'group']])
        print('-----------------------------------------------------------------------')

             timestamp                              id      group
21  1/23/2022 11:19:08  ae18ab1aea6162de2276501ad651b6  treatment
44  1/26/2022 11:00:05  ae18ab1aea6162de2276501ad651b6  treatment
59   1/31/2022 6:23:19  ae18ab1aea6162de2276501ad651b6  treatment
66   2/11/2022 0:28:17  ae18ab1aea6162de2276501ad651b6  treatment
69  2/11/2022 10:42:47  ae18ab1aea6162de2276501ad651b6  treatment
82   2/12/2022 8:43:45  ae18ab1aea6162de2276501ad651b6  treatment
-----------------------------------------------------------------------
             timestamp                                id      group
4   1/26/2022 23:38:35  5fc991717e944f5142979e8878555271    control
5   1/26/2022 23:42:48  5fc991717e944f5142979e8878555271    control
22   1/31/2022 0:30:53  5fc991717e944f5142979e8878555271    control
29   2/11/2022 1:28:46  5fc991717e944f5142979e8878555271    control
88  2/13/2022 23:55:05  5fc991717e944f5142979e8878555271  treatment
----------------------------------------------------------

It seems safe to drop duplicates and keep only the first submission, it seems that the respondents who filled out both treatment and control surveys were in the control group and somehow got the treatment survey after.

In [11]:
presurvey.drop_duplicates(subset=['id'], keep='first', ignore_index=True, inplace=True)

Now let's do the same for the post-survey. First, look at all of the repeat submissions to ensure it's safe to drop duplicates and keep only the first submission.

In [12]:
postsurvey.sort_values(['timestamp'], inplace=True)

for row in postsurvey['id'].value_counts().rename_axis('id').reset_index(name='count').itertuples(index=False):
    if row.count == 1:
        pass
    else:
        print(postsurvey[postsurvey['id'] == row.id][['timestamp', 'id', 'group']])
        print('-----------------------------------------------------------------------')

             timestamp                                id      group
28  2/24/2022 12:57:42  356cd2c5f07830925f6a46ee42e9c748  treatment
28  2/24/2022 12:57:42  356cd2c5f07830925f6a46ee42e9c748    control
29  2/24/2022 13:17:03  356cd2c5f07830925f6a46ee42e9c748  treatment
29  2/24/2022 13:17:03  356cd2c5f07830925f6a46ee42e9c748    control
-----------------------------------------------------------------------
             timestamp                               id      group
19  2/20/2022 22:18:25  3e276dee7587aa0b0db8d1aca5998c5  treatment
19  2/20/2022 22:18:25  3e276dee7587aa0b0db8d1aca5998c5    control
-----------------------------------------------------------------------
             timestamp                              id      group
30  2/24/2022 16:56:10  fbd2ee5b8dcfa3d939cb5df1a92552    control
30  2/24/2022 16:56:10  fbd2ee5b8dcfa3d939cb5df1a92552  treatment
-----------------------------------------------------------------------
            timestamp                        

Since we have duplicated records with the same timestamp and different groups, we will assign records in the postsurvey data to the group associated with their id in the now cleaned presurvey data.

In [18]:
id_group = dict(zip(presurvey['id'], presurvey['group']))
postsurvey['group'] = [id_group[_id] for _id in postsurvey['id']]
del id_group

postsurvey.drop_duplicates(subset=['id'], keep='first', ignore_index=True, inplace=True)

In [19]:
postsurvey

Unnamed: 0,timestamp,id,change_work,explain_change_work,create_new_work,explain_new_work,change_work_enjoyment,explain_change_work_enjoyment,create_new_experience,explain_new_experience,...,confidence_hard_task,attention_check,task_satisfaction,future_toloka_use,best_attempt_on_tokola,plugin_assistance,comment,group,chronicity,region
0,2/19/2022 13:08:36,518cd56975ccc59032da3137d845c6b,5,helps search tasks,5.0,it made me adjust my timing to complete tasks,5.0,finding convenient tasks,5.0,made interact more with toloka tasks,...,4.0,Strongly disagree,Strongly agree,4.0,5.0,5.0,great plugin,control,Polychronic,global_south
1,2/20/2022 14:59:35,32ae306c14a39c07f685ea94adf6475,3,It didn't change how I work that much really b...,2.0,The plugin didn't really change my work schedu...,4.0,Since installing the plugin I felt more inclin...,4.0,I checked into Toloka more during the period t...,...,4.0,Strongly disagree,Agree,4.0,5.0,5.0,Enjoyed this project and I am currently enjoyi...,control,Monochronic,global_north
2,2/20/2022 15:08:31,7efc45e5132625d5314ddfc349384541,1,It didn't changed much of my work on Toloka. I...,1.0,I work on tasks whenever i want or when my fav...,3.0,I always enjoy my work on toloka but i Believe...,3.0,Not much. But i think it is good to have a plu...,...,3.0,Strongly agree,Slightly Agree,5.0,5.0,2.0,It would be really helpful if there are a lot ...,control,Polychronic,global_south
3,2/20/2022 15:21:27,27fab4b7ccdc585dc35897de76e44a8c,4,I am satisfied with it but there should be a a...,4.0,It made me to remember to do the tasks,5.0,It engaged me more,5.0,It is good but it takes more time to load,...,4.0,Strongly disagree,Agree,4.0,5.0,4.0,Nice experience with the plugin but one thing ...,treatment,Polychronic,global_south
4,2/20/2022 15:23:55,86a36fbbe35a283dee16ff8280e5ed5,4,More tasks seemed available,4.0,Worked more in daytime,5.0,Created more success,4.0,General improvement in all areas,...,4.0,Strongly disagree,Slightly Agree,4.0,5.0,5.0,,control,Monochronic,global_north
5,2/20/2022 15:35:22,6ecf5c6464172449bad6820af297e50,3,The plugin didn't change anything about my wor...,5.0,I complete any task that is available on Toluc...,3.0,The plug-in didn't change anything in my perfo...,3.0,The tool did not add much to my performance in...,...,5.0,Strongly disagree,Strongly agree,5.0,5.0,3.0,"First, I want the plugin to be a tool to provi...",treatment,Monochronic,global_north
6,2/20/2022 16:25:42,d3b3aaccd59dbdb3385a06dba56113,4,Tool much good for remeber tasks.,5.0,Many times of the day.,5.0,Changed all for better.,5.0,Create new mode to do tasks more efficient.,...,4.0,Strongly disagree,Strongly agree,5.0,5.0,5.0,,treatment,Polychronic,global_south
7,2/20/2022 16:50:36,25daec93559c8d639738bb2c2d8f3194,2,It took a long time for the extension to activate,2.0,It didn't change anything because the extensio...,1.0,nothing change,1.0,nothing change,...,4.0,Strongly disagree,Neutral,4.0,4.0,2.0,,treatment,Polychronic,global_south
8,2/20/2022 19:17:09,3741da8e9dd5e15f64a570dbb218a7,5,easy to findd available tasks,5.0,one could easily find the available jobs,5.0,faster access,5.0,faster access,...,5.0,Strongly disagree,Agree,5.0,5.0,5.0,no,treatment,Polychronic,global_south
9,2/20/2022 19:36:05,95d3785df122a99679fdd8cb4d6e4a0,4,Thanks to the plugin I could notice tasks that...,3.0,I could do more tasks during the week but it d...,3.0,It was a productive tool,4.0,It was a tool that helped me to improve my pro...,...,4.0,Strongly disagree,Strongly agree,5.0,5.0,4.0,,control,Monochronic,global_north


## Export Data

In [20]:
presurvey.to_pickle('./data/presurvey.pkl')
postsurvey.to_pickle('./data/postsurvey.pkl')
with open('./data/presurvey_alias.pkl', 'wb') as handle:
    pickle.dump(presurvey_alias, handle)
with open('./data/postsurvey_alias.pkl', 'wb') as handle:
    pickle.dump(postsurvey_alias, handle)