In [1]:
import pandas as pd
import os
import pickle
from collections import Counter

### Loading in data

In [2]:
# directory holding the cleaned survey data files
# (relative path, so it is resolved against the notebook's working directory)
data_dir = '../../AI_perception_survey_data/WE/cleaned'

In [3]:
# list every entry in the data directory
# (os.listdir returns bare names, in arbitrary order)
all_files = os.listdir(data_dir)

In [4]:
# keep only the .pickle files
# - os.path.splitext inspects the real (last) extension and does not raise on
#   names without a dot, unlike name.split('.')[1]
# - sorting makes the order deterministic: os.listdir gives no ordering
#   guarantee, and the loading step indexes this list by position
all_files = sorted(name for name in all_files if os.path.splitext(name)[1] == '.pickle')
all_files

['gm_all.pickle', 'pf_all.pickle']

In [5]:
# load the two pickled dataframes; each file is picked by its position in all_files
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted files
gm_path = os.path.join(data_dir, all_files[0])
with open(gm_path, 'rb') as handle:
    gm_all = pickle.load(handle)

pf_path = os.path.join(data_dir, all_files[1])
with open(pf_path, 'rb') as handle:
    pf_all = pickle.load(handle)

In [6]:
# stack both frames row-wise into a single dataframe
# (ignore_index is left at its default, so each frame keeps its own row labels)
we = pd.concat(objs=(gm_all, pf_all), axis=0)

### Standardisation of Country of Residence

We created a conversion dict that maps the raw country entries to standardised codes; see `country_conversion.py`.

In [7]:
from country_conversion import country_conversion_dict

In [8]:
# recode every country_of_residence entry through the conversion dict
# (presumably a plain dict, so an entry missing from it raises KeyError - confirm in country_conversion.py)
recoded_countries = [country_conversion_dict[raw_country] for raw_country in we['country_of_residence']]
we['country_of_residence'] = recoded_countries

In [9]:
# count how often each value occurs in country_of_residence
Counter(we['country_of_residence'])

Counter({'BE': 18,
         'CH': 6,
         'DE': 102,
         'ES': 2,
         'FR': 153,
         'IE': 1,
         'LU': 1,
         'NL': 8,
         'PT': 1,
         'UK': 89,
         'non_WE': 8})

The country codes are now standardised. Next, we remove the `non_WE` entries.

In [10]:
# drop the rows recoded as 'non_WE' and renumber the index from 0
# (vectorised comparison builds the boolean mask instead of a Python-level loop over the column)
we = we[we['country_of_residence'] != 'non_WE'].reset_index(drop=True)

In [11]:
# preview the first three rows of the filtered, re-indexed frame
we.head(3)

Unnamed: 0,country_of_residence,q01,q02,q03,q04,q05,q06,q07,q08,q09,...,q20,q21,q22,q23,q24,q25,q26,q27,q28,q29
0,UK,C,"[B, C, D, G, I]",A,6,2,4,9,4,5,...,A,,C,B,"[B, C]",D,It engineer,C,A,3
1,UK,C,"[B, D]",A,3,2,7,7,4,3,...,B,"[A, B, C, D]",E,B,"[A, B, C, D]",C,Marketing,B,A,6
2,UK,C,"[B, D]",C,6,6,6,7,4,6,...,B,[D],D,B,"[A, B, D]",C,Consultancy,B,A,6


### Categorisation of profession

In [12]:
# we have written a very simple class to translate
# it uses selenium (web version and not the API) and sends the texts to google translate
# see gtranslate.py
from gtranslate import selenium_gtranslate

In [13]:
# initiate
# instantiate the translator class imported from gtranslate.py
transl = selenium_gtranslate()

In [14]:
# translate in bulk
# translate every entry of column q26 in one bulk call
# NOTE(review): the return value is not captured here - presumably the results are kept on `transl`; confirm in gtranslate.py
# NOTE(review): the logged output shows roughly 1.6 s per text (about 10 min estimated in total) and the saved run ends in KeyboardInterrupt
transl.translate_in_bulk(we.q26)

navigating to google translate
now translating ...
text no.1 translation complete 
total time lapsed so far: 1.5933358669281006 seconds
estimated time remaining: 608.6543011665344 seconds
			
text no.2 translation complete 
total time lapsed so far: 3.1667938232421875 seconds
estimated time remaining: 603.2742233276367 seconds
			
text no.3 translation complete 
total time lapsed so far: 4.743923902511597 seconds
estimated time remaining: 600.8970276514689 seconds
			
text no.4 translation complete 
total time lapsed so far: 6.300476789474487 seconds
estimated time remaining: 596.9701758027077 seconds
			


KeyboardInterrupt: 