In [16]:
import pandas as pd
import plotly.express as px

ModuleNotFoundError: No module named 'plotly'

# Data Processing

## Load data, join treatment and control groups

Load treatment data, rename columns, and make a dictionary so we can reference back to the verbose names given in the raw survey data

In [17]:
df = pd.read_csv('./data/survey_raw_treatment.csv')

names = ['timestamp', 'id', 'freq_browser', 'freq_phone',
         'length_use', 'freq_use', 'ability_find_tasks',
         'promptly_find_tasks', 'text_search_preference',
         'notification_preference', 'message_preference',
         'understood', 'browsers', 'work_aids', 'fora',
         'other_markets', 'other_markets_length_use',
         'authority_comfort', 'collective_individual_scale',
         'planning_scale', 'timeliness_scale', 'emotion_scale',
         'success_rating', 'leadership_rating', 'leadership_preference',
         'gender_preference', 'multitasking', 'short_term_plans',
         'linear_work_style', 'plan_change_aversion', 'plan_change_adaptable',
         'lateness_aversion', 'blank', 'spare_time_alone',
         'commitment_others_over_self', 'close_over_casual_friends',
         'internet_friends', 'support_from_friends', 'pandemic_effect_magnitude',
         'chosen_task_completion_confidence', 'complex_task_completion_confidence',
         'neutral', 'task_satisfaction', 'likelihood_continued_use',
         'task_motivation', 'plugin_preference', 'current_country',
         'home_country', 'only_one_country', 'education', 'gender', 'age']

aliases = dict(zip(names, df.columns))

df.columns = names

Load control data and add a column marking observations according to their group

In [18]:
df['group'] = 'treatment'

control1 = pd.read_csv('./data/survey_raw_control-1.csv', names=names, header=0)
control2 = pd.read_csv('./data/survey_raw_control-2.csv', names=names, header=0)
control1['group'] = 'control'
control2['group'] = 'control'

df = pd.concat([df, control1, control2])
del control1
del control2

## Add Chronicity Data

Chronicity data is manually inferred and/or recorded for each country based on Table 14 from [1] and Table 3 from [2]. We label each subject with the chronicity corresponding to their home country.

[1] https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.531.7470&rep=rep1&type=pdf

[2] https://www.researchgate.net/publication/4781353_A_multi-country_study_of_the_adoption_of_ERP_systems

In [19]:
chronicity = pd.read_csv('./data/country_chronicity.csv')
chronicity = dict(zip(chronicity['country'], chronicity['chronicity']))
df['chronicity'] = [chronicity[country] for country in df['home_country']]
del chronicity

## Add Economic Region Data

Add a column indicating whether the worker is from Global North or South

data source: https://github.com/wikimedia-research/canonical-data/blob/master/countries.csv

In [20]:
# Inspect what countries are the workers from
df['home_country'].unique()

array(['Belarus', 'Mexico', 'Russia', 'Philippines', 'United States',
       'United Kingdom', 'Ghana', 'Jordan', 'Dominican Republic',
       'Nigeria', 'Pakistan', 'Brazil', 'Costa Rica', 'Vietnam',
       'Venezuela', 'Canada', 'Kenya', 'Egypt', 'Morocco', 'India',
       'Algeria', 'Myanmar', 'Portugal', 'Spain', 'South Africa',
       'Bulgaria', 'Georgia', 'Poland', 'Saudi Arabia', 'Tanzania',
       'Kazakhstan', 'Ukraine'], dtype=object)

In [21]:
country_region = pd.read_csv('./data/country_region.csv')
country_region.index = country_region.index + 1
country_region = country_region.iloc[:-1 , :][["name","economic_region"]]
country_region

Unnamed: 0,name,economic_region
1,Afghanistan,Global South
2,Åland Islands,Global North
3,Albania,Global North
4,Algeria,Global South
5,American Samoa,Global South
...,...,...
246,Wallis and Futuna,Global South
247,Western Sahara,Global South
248,Yemen,Global South
249,Zambia,Global South


In [22]:
global_south = list(country_region[country_region['economic_region'] == 'Global South']['name'])
global_north = list(country_region[country_region['economic_region'] == 'Global North']['name'])

# Add a column indicating whether the worker is from Global North or South
def replace_region(row):
    if row['home_country'] in global_south:
        return 'global_south'
    
    if row['home_country'] in global_north:
        return 'global_north'
    else: 
        print ("Error!")
        
df['region'] = df.apply(lambda row: replace_region(row), axis=1)
df[['id', 'home_country','region']]

# fron here we can slice the data based on region

Unnamed: 0,id,home_country,region
0,2aef4422f3276e44b1eea0c2a89374,Belarus,global_south
1,af2a7da96237f5baa54f6e67c1c0f1,Mexico,global_south
2,1cb48555a624b4c6fbe5e811271eb,Russia,global_north
3,8d87dd60d895e59dd459a6593144461,Philippines,global_south
4,e2f37f95c0b82dc037a56f768d591b1e,United States,global_north
...,...,...,...
33,732db8b6b7c6daeb4e36ab82758b9e42,Tanzania,global_south
34,559cbf8dad41224391887f67b67777f6,Kenya,global_south
35,e11e709a1d4b887ae15b821629a899,Russia,global_north
36,fa50dd3e40cd54201a6f4a3c5f45ecda,Kazakhstan,global_south


In [23]:
# count the number of people from global_north and global south
print("num_south: ", len(list(filter(lambda x: x == "global_south", df['region']))))
print("num_north: ", len(list(filter(lambda x: x == "global_north", df['region']))))

num_south:  93
num_north:  71


## Data Exploration

### Treatment Group

Summarize what workers from global north and global south responded to question 1-4

In [24]:
fig = px.histogram(df[df['group']=='treatment'], y='freq_browser', color='region', barmode='group',
                   histnorm='percent', title=aliases['freq_browser'],
                   category_orders=dict(freq_browser=["Always", "Often", "Sometimes", "Rarely", "Never"]))
fig.show()

NameError: name 'px' is not defined

1. Always: People from global_south are more likely (and more often) to use Toloka on the web browser vs. people from global north.
2. Often: People from global_north are more likely to use Toloka often on the web browser
3. Sometimes: People from global_north are more likely to use Toloka some times on the web browser

In [25]:
fig = px.histogram(df[df['group']=='treatment'], y='freq_phone', color='region', barmode='group',
                   histnorm='percent', title=aliases['freq_phone'],
                   category_orders=dict(freq_phone=["Always", "Often", "Sometimes", "Rarely", "Never"]))
fig.show()

NameError: name 'px' is not defined

1. Always: People from global_north are more likely to always use Toloka 

2. Often: People from global_south are more likely to use Toloka often
3. Sometimes: People from global_north are more likely to use Toloka sometimes 
4. Rarely: People from global_north are more likely to use Toloka rarely
5. Never: People from global_north are more likely to never use Toloka 

In [26]:
fig = px.histogram(df[df['group']=='treatment'], y='length_use', color='region', barmode='group',
                   histnorm='percent', title=aliases['length_use'],
                   category_orders=dict(length_use=["Less than a month",
                                                    "Between 1 and 3 months",
                                                    "Between 3 and 6 months",
                                                    "Between 6 months and 1 year",
                                                    "Between 1 and 2 years",
                                                    "More than 2 and 3 years",
                                                    "More than 3 years"]))
fig.show()

NameError: name 'px' is not defined

People from global south tend to work on Toloka longer than people from global north.

In [27]:
fig = px.histogram(df[df['group']=='treatment'], y='freq_use', color='region', barmode='group',
                   histnorm='percent', title=aliases['freq_use'],
                   category_orders=dict(freq_use=["Everyday", "From five to six days a week",
                                                  "From three to four days a week",
                                                  "Once or twice a week", "Less than once a week"]))
fig.show()

NameError: name 'px' is not defined

People from globak_south are more likely to work on Toloka everyday, while people from global_north are more likely to work on Toloka from 5-6 days a week, from 3-4 days a week, and once or twice a week.

In [28]:
fig = px.histogram(df[df['group']=='treatment'], y='ability_find_tasks', color='region',
                   barmode='group', histnorm='percent',
                   title=aliases['ability_find_tasks'].replace(' [', '<br>['),
                   category_orders=dict(ability_find_tasks=["Strongly disagree", "Disagree",
                                                            "Neutral", "Agree", "Strongly Agree"]))
fig.show()

NameError: name 'px' is not defined

Strongly Agree:
    North: 13/30 = 0.433
    South: 27/59 = 0.458
Agree:
    North: 12/30 = 0.4
    South: 16/59 = 0.271
    
People from global_north and global_south have relatively similar possibilities of having a positive feeling towards Toloka.
While people from global_north are more likely to strongly disgaree that Toloka's interface allows them to find their preferred tasks.

In [29]:
fig = px.histogram(df[df['group']=='treatment'], y='promptly_find_tasks', color='region',
                   barmode='group', histnorm='percent',
                   title=aliases['promptly_find_tasks'].replace(' [', '<br>['),
                   category_orders=dict(promptly_find_tasks=["Strongly disagree", "Disagree",
                                                            "Neutral", "Agree", "Strongly Agree"]))
fig.show()

NameError: name 'px' is not defined

More than half of the people from global_north strongly agrees that they are able to find promptly new tasks that interests them the most while only 37.3% of people from global_south think so.

About 18.6% of people from global south think they neither can or cannot find promptly new tasks that interests them the most, while only 6.7% of people from global north thinks so.

In [30]:
fig = px.histogram(df[df['group']=='treatment'], y='text_search_preference', color='region',
                   barmode='group', histnorm='percent',
                   title=aliases['text_search_preference'].replace(' [', '<br>['),
                   category_orders=dict(text_search_preference=["Strongly disagree", "Disagree",
                                                            "Neutral", "Agree", "Strongly Agree"]))
fig.show()

NameError: name 'px' is not defined

63.3% of people from global_north strongly agrees that a text search for tasks feature would improve the interface; while 44.1% of people from global south thinks so. 

26.7% of people from north agrees vs. 37.3% of people agrees from the south

In [31]:
fig = px.histogram(df[df['group']=='treatment'], y='notification_preference', color='region', 
                   barmode='group', histnorm='percent',
                   title=aliases['notification_preference'].replace(' [', '<br>['),
                   category_orders=dict(notification_preference=["Strongly disagree", "Disagree",
                                                            "Neutral", "Agree", "Strongly Agree"]))
fig.show()

NameError: name 'px' is not defined

In [32]:
fig = px.histogram(df[df['group']=='treatment'], y='message_preference', color='region',
                   barmode='group', histnorm='percent',
                   title=aliases['message_preference'].replace(
                       ' [', '<br>[').replace('would ', 'would<br>'),
                   category_orders=dict(message_preference=["Strongly disagree", "Disagree",
                                                            "Neutral", "Agree", "Strongly Agree"]))
fig.show()

NameError: name 'px' is not defined

### Control vs. Treatment Group Comparison

#### Summary Statistics (Quantitative Variables only? I think...)

In [33]:
df[df['group']=='treatment'].describe()

Unnamed: 0,multitasking,short_term_plans,linear_work_style,plan_change_aversion,plan_change_adaptable,lateness_aversion,blank,spare_time_alone,commitment_others_over_self,close_over_casual_friends,internet_friends,support_from_friends,pandemic_effect_magnitude,chosen_task_completion_confidence,complex_task_completion_confidence,likelihood_continued_use,task_motivation,plugin_preference
count,92.0,92.0,92.0,92.0,92.0,92.0,16.0,92.0,92.0,92.0,92.0,92.0,92.0,89.0,87.0,86.0,88.0,88.0
mean,3.152174,3.521739,4.26087,4.173913,3.434783,4.304348,3.875,3.891304,3.641304,3.934783,3.315217,3.554348,3.26087,4.370787,3.965517,4.569767,4.772727,4.511364
std,1.482015,1.143335,0.924271,1.001194,1.21623,0.946238,1.258306,0.954533,1.043889,1.211704,1.221864,1.286994,1.451559,0.788872,0.957811,0.759887,0.496592,0.758012
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0
25%,2.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,3.0,4.0,5.0,4.0
50%,3.0,3.0,5.0,4.5,3.0,5.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,5.0,4.0,5.0,5.0,5.0
75%,4.25,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [34]:
df[df['group']=='control'].describe()

Unnamed: 0,multitasking,short_term_plans,linear_work_style,plan_change_aversion,plan_change_adaptable,lateness_aversion,blank,spare_time_alone,commitment_others_over_self,close_over_casual_friends,internet_friends,support_from_friends,pandemic_effect_magnitude,chosen_task_completion_confidence,complex_task_completion_confidence,likelihood_continued_use,task_motivation,plugin_preference
count,72.0,72.0,72.0,72.0,72.0,72.0,4.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,69.0
mean,2.805556,3.305556,3.986111,4.055556,3.486111,4.319444,4.0,4.111111,3.638889,4.083333,3.111111,3.25,2.652778,4.319444,3.986111,4.305556,4.736111,3.956522
std,1.507156,1.108722,0.999902,0.917573,1.16272,0.885348,1.154701,0.986608,0.968599,1.097372,1.273296,1.329513,1.557894,0.747322,0.778099,0.882361,0.649877,0.881756
min,1.0,1.0,1.0,2.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
25%,1.0,3.0,4.0,3.0,2.75,4.0,3.0,3.0,3.0,3.0,2.0,2.0,1.0,4.0,3.0,4.0,5.0,3.0
50%,3.0,3.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,4.0,4.0,5.0,5.0,4.0
75%,4.0,4.0,5.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


#### Number of Workers from Global North vs. South

In [35]:
print("\nNumber of Workers from Global North vs. South\n")
# Treament group
print("Treatment")
print("num_south: ", len(list(filter(lambda x: x == "global_south", df[df['group']=='treatment']['region']))))
print("num_north: ", len(list(filter(lambda x: x == "global_north", df[df['group']=='treatment']['region']))))

print("---------------")

# Control group
print("Control")
print("num_south: ", len(list(filter(lambda x: x == "global_south", df[df['group']=='control']['region']))))
print("num_north: ", len(list(filter(lambda x: x == "global_north", df[df['group']=='control']['region']))), "\n")


Number of Workers from Global North vs. South

Treatment
num_south:  61
num_north:  31
---------------
Control
num_south:  32
num_north:  40 



#### Number of Workers from Polychronic and Monochronic cultures

In [36]:
print("\nNumber of Workers from Polychronic vs. Monochronic cultures\n")

# Treament group
print("Treatment")
print("num_poly: ", len(list(filter(lambda x: x == "Polychronic", df[df['group']=='treatment']['chronicity']))))
print("num_mono: ", len(list(filter(lambda x: x == "Monochronic", df[df['group']=='treatment']['chronicity']))))

print("--------------")

# Control group
print("Control")
print("num_poly: ", len(list(filter(lambda x: x == "Polychronic", df[df['group']=='control']['chronicity']))))
print("num_mono: ", len(list(filter(lambda x: x == "Monochronic", df[df['group']=='control']['chronicity']))), "\n")


Number of Workers from Polychronic vs. Monochronic cultures

Treatment
num_poly:  54
num_mono:  38
--------------
Control
num_poly:  31
num_mono:  41 



### Basic Demographics

Note that 'current_country' and 'home_country' is not summarized here as we are more interested in the economic region and chronicity (which are implied by the countries)

#### Level of education

In [37]:
# Treatment
df[df['group']=='treatment'].groupby(['education']).size().reset_index(name='counts')

Unnamed: 0,education,counts
0,Associate degree,3
1,Bachelor’s degree,37
2,Doctorate degree,1
3,Elementary school,1
4,"High school graduate, diploma or the equivalen...",22
5,Master’s degree,12
6,"Some college credit, no degree",12
7,"Some high school, no diploma",3
8,Trade/technical/vocational training,1


In [38]:
# Control
df[df['group']=='control'].groupby(['education']).size().reset_index(name='counts')

Unnamed: 0,education,counts
0,Associate degree,1
1,Bachelor’s degree,22
2,"High school graduate, diploma or the equivalen...",11
3,Master’s degree,9
4,Professional degree,2
5,"Some college credit, no degree",23
6,"Some high school, no diploma",2
7,Trade/technical/vocational training,2


Note that the control group is missing the "elementary" category

#### Gender

In [39]:
# Treatment
df[df['group']=='treatment'].groupby(['gender']).size().reset_index(name='counts')

Unnamed: 0,gender,counts
0,Female,25
1,Male,67


In [40]:
# Control
df[df['group']=='control'].groupby(['gender']).size().reset_index(name='counts')

Unnamed: 0,gender,counts
0,Female,33
1,Male,39


#### Age

In [41]:
# Treatment
df[df['group']=='treatment'].groupby(['age']).size().reset_index(name='counts')

Unnamed: 0,age,counts
0,18-24 years old,25
1,25-34 years old,47
2,35-44 years old,16
3,45-54 years old,1
4,55-64 years old,3


In [42]:
# Control
df[df['group']=='control'].groupby(['age']).size().reset_index(name='counts')

Unnamed: 0,age,counts
0,18-24 years old,22
1,25-34 years old,30
2,35-44 years old,15
3,45-54 years old,4
4,55-64 years old,1


### Summary

### Next Step: Demographics given in North vs. South, or Poly vs. Mono?