In [1]:
import pandas as pd
import os
import pickle
from collections import Counter
import time

### Loading in data

In [2]:
# setting up the data directory
data_dir = '../../AI_perception_survey_data/WE/cleaned'

In [3]:
# listing data files
all_files = os.listdir(data_dir)

In [4]:
# extracting the .pickle files
all_files = [file for file in all_files if file.split('.')[1] == 'pickle']
all_files

['gm_all.pickle', 'pf_all.pickle']

In [5]:
# loading in the dataframes
with open(os.path.join(data_dir, all_files[0]), 'rb') as handle:
    gm_all = pickle.load(handle)
with open(os.path.join(data_dir, all_files[1]), 'rb') as handle:
    pf_all = pickle.load(handle)

In [66]:
# concatenating
we = pd.concat([gm_all, pf_all])

### Standardisation of Country of Residence

We went ahead and created a conversion dict, see country_conversion.py

In [3]:
from country_conversion import country_conversion_dict

In [68]:
# recode
we.country_of_residence = [country_conversion_dict[country] for country in we.country_of_residence]

In [69]:
Counter(we.country_of_residence)

Counter({'BE': 12,
         'CH': 4,
         'DE': 77,
         'ES': 2,
         'FR': 125,
         'IE': 1,
         'LU': 1,
         'NL': 8,
         'PT': 1,
         'UK': 89,
         'non_WE': 8})

OK so we've cleaned this up nicely, now let's go on and remove the non_WE entries

In [70]:
we = we[[country != 'non_WE' for country in we.country_of_residence]].reset_index(drop = True)

In [71]:
we.head(3)

Unnamed: 0,country_of_residence,q01,q02,q03,q04,q05,q06,q07,q08,q09,...,q20,q21,q22,q23,q24,q25,q26,q27,q28,q29
0,UK,C,"[B, C, D, G, I]",A,6,2,4,9,4,5,...,A,,C,B,"[B, C]",D,It engineer,C,A,3
1,UK,C,"[B, D]",A,3,2,7,7,4,3,...,B,"[A, B, C, D]",E,B,"[A, B, C, D]",C,Marketing,B,A,6
2,UK,C,"[B, D]",C,6,6,6,7,4,6,...,B,[D],D,B,"[A, B, D]",C,Consultancy,B,A,6


### Categorisation of profession

In [4]:
# we have gone done written a very simple class to translate
# it uses selenium (web version and not the API) and sends the texts to google translate
# see gtranslate.py
from gtranslate import selenium_gtranslate

In [79]:
# initiate
transl = selenium_gtranslate()

In [80]:
# translate in bulk
translated = transl.translate_in_bulk(we.q26)

navigating to google translate
now translating ...
text no.12 complete,total time lapsed: 12.88108229637146 seconds, estimated time remaining: 332.7612926562627 secondsss... stale element ...!
text no.96 complete,total time lapsed: 102.23595905303955 seconds, estimated time remaining: 240.68048693736392 seconds... stale element ...!
text no.112 complete,total time lapsed: 119.28154015541077 seconds, estimated time remaining: 223.6528877913952 secondss... stale element ...!
text no.320 complete,total time lapsed: 341.778391122818 seconds, estimated time remaining: 2.1361149445176126 secondsss

As indicated above, we only had only 3 stale element error! hooray!

let's comparthe res and the OGs

In [81]:
# just for double safe guarding, let's see if we have any duplicated values
suspect_idx = [i for i in range(1, len(translated)) if translated[i-1] == translated[i]]

In [82]:
for suspect in suspect_idx:
    print('**************** {} ****************'.format(suspect))
    print('before: {} vs after: {}'.format(translated[suspect-1], translated[suspect]))

**************** 17 ****************
before: HACKING vs after: HACKING
**************** 22 ****************
before: ENGINEER vs after: ENGINEER
**************** 59 ****************
before: Self Employed... vs after: Self Employed...
**************** 107 ****************
before: student vs after: student
**************** 134 ****************
before: Financial Services Consultant vs after: Financial Services Consultant
**************** 179 ****************
before: Data science student vs after: Data science student


Ok so we need to do this by hand for these values + the couple of stable element error lines

In [83]:
# stale element idx
empty_idx = [i for i,text in enumerate(translated) if text == '']

In [84]:
# combing the lists
all_idx_manual = empty_idx + suspect_idx

In [85]:
# let's see what the original values are
for idx in all_idx_manual:
    print('**************** {} ****************'.format(idx))
    print('Original: {} vs Translated: {}'.format(we.q26[idx], translated[idx]))

**************** 12 ****************
Original: Student vs Translated: 
**************** 96 ****************
Original: health psychology student vs Translated: 
**************** 112 ****************
Original: Étudiant en commerce vs Translated: 
**************** 17 ****************
Original: HACKING vs Translated: HACKING
**************** 22 ****************
Original: ENGINEER vs Translated: ENGINEER
**************** 59 ****************
Original: Project Management vs Translated: Self Employed...
**************** 107 ****************
Original: student vs Translated: student
**************** 134 ****************
Original: Consultant Services Financiers vs Translated: Financial Services Consultant
**************** 179 ****************
Original: Data science student vs Translated: Data science student


Funnily nothing is wrong, we could have a couple of duplicated responses coming directly from the surveys (raw data), let's compare the suspect idx

We deal with the untranslated cases first otherwise the index is going to get messy

In [86]:
empty_idx

[12, 96, 112]

In [87]:
# reassigning values
we.loc[empty_idx[0], 'q26'] = 'student'
we.loc[empty_idx[1], 'q26'] = 'health psychology student'
we.loc[empty_idx[2], 'q26'] = 'business student'

Now we deal with the duplicated lines

In [88]:
# indices of the rows directly above the suspects
above_suspect_idx = [idx-1 for idx in suspect_idx]

In [89]:
# Let's measure the corresponding row's similarity
similarity = []

for i in range(len(suspect_idx)):
    above = list(we.loc[above_suspect_idx[i], ])
    below = list(we.loc[suspect_idx[i], ])
    similarity.append(sum([above[i] == below[i] for i in range(len(text_above))])/len(text_above))

In [90]:
similarity

[0.5, 0.5, 0.26666666666666666, 0.16666666666666666, 0.4, 0.26666666666666666]

None of them are exactly the same though, it's still highly unlikely that we have consecutive jobs = 'HACKING', 'ENGINEERING' and 'self_employed ...' <br>
We'll go ahead and remove these 6 lines because I'm scared they come from amazon mechanical turk and haven't been filtered out properly

In [91]:
# dropping suspects
we.drop(suspect_idx[:3], axis = 0, inplace = True)

In [92]:
# dropping suspects' upstairs neightbours
we.drop(above_suspect_idx[:3], axis = 0, inplace = True)

In [93]:
# resetting the index
we.reset_index(drop = True, inplace = True)

In [94]:
# specifying which ones to remove from the translated list
to_remove_from_translated = suspect_idx[:3] + above_suspect_idx[:3]

# removing these values from the translated list
translated = [text for i,text in enumerate(translated) if i not in to_remove_from_translated]

Now let's take care of the untranslated cases

In [97]:
len(translated), len(we)

(314, 314)

Ok we can now replace q26 with the translated texts

In [98]:
we.q26 = translated

In [100]:
# we save this for now so that we can continue tomorrow
with open(os.path.join(data_dir, 'we.pickle'), 'wb') as handle:
    pickle.dump(we, handle)

## Cont'd: recoding jobs
Dis gon be messy doe, argh

In [3]:
# loading in the data again
with open(os.path.join(data_dir, 'we.pickle'), 'rb') as handle:
    we = pickle.load(handle)

In [4]:
# some minor processing: getting rid of empty space, lower case everything, recoding empty entries
we.q26 = [job.strip().lower() for job in we.q26]
we.q26 = ['unemployed' if job in ['', '.', 'no', 'no occupation', 'none', 'unfortunately, i am currently unemployed. but when i work it has always been in retail.', "i'm not working right now", 'without'] else job for job in we.q26]
we.q26 = ['carpenter' if job == 'tischlermeister' else job for job in we.q26]
we.q26 = ['restaurant service' if job == 'servicefachkraft (gastronomy)' else job for job in we.q26]
we.q26 = ['real estate' if job == 'immobilienkauffrau' else job for job in we.q26]

In [5]:
# getting chars (all unique alphanumeric characters)
chars = [[char for char in job] for job in we.q26]
# flatten and creat one single list of unique vals
chars_unique = list(set([item for sublist in chars for item in sublist]))
# getting all the unique jobs
jobs_unique = list(set(we.q26))

In [6]:
len(chars_unique), len(we), len(jobs_unique)

(38, 314, 219)

In [7]:
# let's take a look
jobs_unique

['software professional',
 'product manager',
 'economist',
 'lehramtstudent',
 'administrative employee',
 'student, business administration, working student software company',
 'ladder',
 'delivery man do products.',
 'software developer',
 'baker',
 'banking',
 'financial services consultant',
 'business employee',
 'phd student',
 'it student',
 'education',
 'aeronautical engineer',
 'm&a analyst',
 'business administrator',
 'data processing',
 'real estate',
 'students',
 'ict',
 'automotive designer',
 'hr employee',
 'official',
 'nursery',
 'student in data science',
 'educator',
 'special educational needs teacher',
 'doctor',
 'pathologue',
 'business school student',
 'prison officer',
 'it engineer',
 'lecturer',
 'sales representative',
 'medical student',
 'electrical engineering/computer science',
 'employee',
 'financial audit',
 'dipl.physiker + computer scientist',
 'economics',
 'i am an educator',
 'process and biotechnology engineer',
 'student',
 'business schoo

We have a few suspects: 'eu overcomes disability', 'pka', 'ceeo', 'as', 'koch', 'official', 'with', 'pathologue' <br>
We'll go ahead and remove them

In [8]:
jobs_to_remove = ['eu overcomes disability', 'pka', 'ceeo', 'as', 'koch', 'official', 'with', 'pathologue','metallurgical engineer','mechanic','location netzer','in interest', 'ladder','unable acquisition']

In [9]:
# only 8 of them, so the risk associated with the removal is very limited, let's go!
we = we[[job not in jobs_to_remove for job in we.q26]]
we.reset_index(drop = True, inplace = True)

In [10]:
len(we)

299

We still have nearly 300 results, so we've gotten rid of around 80 data points

### Going forward:
in order to keep consistency with the Cn dataset, and considering the volumn is not too big, we choose to do this by hand (which will be probably fast than running clusters etc)

In [11]:
from jobs_dict import jobs_dict

In [12]:
# let's take a look at the distribution
Counter([jobs_dict[job] for job in we.q26])

Counter({'comp_sci_software_eng': 37,
         'education': 15,
         'engineering_various': 20,
         'finance_acc_aud': 17,
         'internet_digital_non_engineering': 4,
         'media_culture_creative': 6,
         'medical_professionals': 6,
         'mgmt_various': 25,
         'other': 78,
         'professional_services': 6,
         'public_service': 2,
         'research': 16,
         'student_various': 53,
         'unemployed_retired': 14})

In [13]:
# recode jobs
we.q26 = [jobs_dict[job] for job in we.q26]

In [14]:
we.head(3)

Unnamed: 0,country_of_residence,q01,q02,q03,q04,q05,q06,q07,q08,q09,...,q20,q21,q22,q23,q24,q25,q26,q27,q28,q29
0,UK,C,"[B, C, D, G, I]",A,6,2,4,9,4,5,...,A,,C,B,"[B, C]",D,comp_sci_software_eng,C,A,3
1,UK,C,"[B, D]",A,3,2,7,7,4,3,...,B,"[A, B, C, D]",E,B,"[A, B, C, D]",C,other,B,A,6
2,UK,C,"[B, D]",C,6,6,6,7,4,6,...,B,[D],D,B,"[A, B, D]",C,professional_services,B,A,6


### VERYGOOD
We done this stage of cleaning, now let's move onto the actual analysis and make a dash board of some sorts

In [15]:
# save_again
# we save this for now so that we can continue tomorrow
with open(os.path.join(data_dir, 'we_v2.pickle'), 'wb') as handle:
    pickle.dump(we, handle)