## Data cleaning & exploring

In [3]:
import json
import pandas as pd

ModuleNotFoundError: No module named 'pandas'

### San Francisco

In [9]:
df_sf = pd.read_json('data_raw/san_francisco.json', orient='records', encoding='utf-8')
df_sf.head()

Unnamed: 0,name,position
0,Jennifer Hasche,Head of Tech Recruiting - EMEA
1,Nevada Griffin,Strategic Partnerships Lead - Uber Health at U...
2,Rachel Liu,Analytics @ Uber
3,Tanmay Binaykiya,Software Engineer II at Uber
4,Anthony Dang,Senior Corporate Recruiter at Uber


In [10]:
df_sf.describe()

Unnamed: 0,name,position
count,940,940
unique,936,680
top,LinkedIn Member,Software Engineer at Uber
freq,5,92


In [11]:
print(f'Initial San Francisco employee count: {len(df_sf)}')

Initial San Francisco employee count: 940


In [12]:
# Check for duplicates
df_sf[df_sf.duplicated(keep=False)]

Unnamed: 0,name,position
114,LinkedIn Member,Software Engineer at Uber
801,LinkedIn Member,Software Engineer at Uber


In [13]:
df_sf[df_sf['name'] == 'LinkedIn Member']

Unnamed: 0,name,position
114,LinkedIn Member,Software Engineer at Uber
280,LinkedIn Member,Sr Software Engineer at Uber
617,LinkedIn Member,Senior Software Engineer at Uber
801,LinkedIn Member,Software Engineer at Uber
893,LinkedIn Member,"Sr. Software Engineer, Reliability Engineering..."


In [14]:
# Remove LinkedIn Member as a name
df_sf = df_sf[df_sf['name'] != 'LinkedIn Member']
df_sf.name.str.contains('LinkedIn Member').any()

False

In [15]:
df_sf.groupby('position').describe()

Unnamed: 0_level_0,name,name,name,name
Unnamed: 0_level_1,count,unique,top,freq
position,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"""""",1,1,Harshit Mittal,1
"A creative at heart, processor at core, brewer of imagination, plus a doer and more.",1,1,Sean Lopez,1
"AI & Product Strategy | ex-Uber, Eats, Elevate",1,1,Ryan Cunningham,1
AI leader / Manager of Uber's Routing Engine & APIs / 1st 5 hires @ Siri / Ex Senior Manager SiriKit / Ex WWDC,1,1,Vineet Khosla,1
Account Executive at Uber,1,1,Alexander Letvinchuk,1
...,...,...,...,...
iOS Engineer at Uber,1,1,Xiaoxue Wang,1
incoming @uber / building cscareers.dev,1,1,Joey Colon,1
senior software engineer @Uber | ex data scientist @Uber @FB,1,1,Larry Wei,1
software engineer,1,1,Wei Sun,1


In [16]:
# Group positions to see the most common
df_position = df_sf.groupby('position').describe()['name'].sort_values(by='count', ascending=False).iloc[:, :1]
df_position

Unnamed: 0_level_0,count
position,Unnamed: 1_level_1
Software Engineer at Uber,90
Senior Software Engineer at Uber,52
Software Engineer II at Uber,19
Engineering Manager at Uber,13
Product Manager at Uber,12
...,...
Head of Engineering @ Uber Rides & Marketplace,1
Head of Enterprise at Uber for Business,1
"Head of Financial Products, Uber Money",1
Head of Global Marketing & Strategic Initiatives,1


In [17]:
# Removes all employees not currently working at Uber
df_sf_uber = df_sf[df_sf['position'].str.contains('uber', case=False)]
df_sf_uber.describe()

Unnamed: 0,name,position
count,783,783
unique,783,536
top,Zeyuan Tan,Software Engineer at Uber
freq,1,90


In [18]:
# Removes all employees working at other Uber services
ignore = 'uber air | uberair | freight | elevate | ex-uber | driver'
df_sf_uber[df_sf_uber['position'].str.contains(ignore, case=False)]

Unnamed: 0,name,position
12,Tolga Irdem,Actively seeking new opportunities | Ex-Uber &...
154,Nathan Alan Dinh,Senior Mobile Marketing Lead - Rider/ Driver a...
443,Reid Smiegiel,"Senior Sales Operations Associate, Uber Freigh..."
618,Scott G.,Human Resources at Uber Freight and Uber Elevate


In [19]:
df_sf_final = df_sf_uber[~df_sf_uber['position'].str.contains(ignore, case=False)]
df_sf_final.describe()

Unnamed: 0,name,position
count,779,779
unique,779,532
top,Zeyuan Tan,Software Engineer at Uber
freq,1,90


In [20]:
print(f'Final San Francisco employee count: {len(df_sf_final)}')

Final San Francisco employee count: 779


## São Paulo

In [21]:
df_sp = pd.read_json('data_raw/sao_paulo.json', orient='records', encoding='utf-8')
df_sp.head()

Unnamed: 0,name,position
0,Claudia Woods,General Manager at Uber Brasil
1,Ralph Weigand,"Sales Manager, Uber for Business"
2,Rayana Peled,Sr Community Operations Manager at Uber
3,Luiz Felipe da Costa Silva,Estagiario no Grupo Kobayashi Fintech & Cowork
4,Ana Luiza Faraco,Associate Consultant


In [22]:
df_sp.describe()

Unnamed: 0,name,position
count,1000,1000
unique,986,861
top,LinkedIn Member,Motorista na Uber
freq,12,24


In [23]:
print(f'Initial Sao Paulo employee count: {len(df_sf)}')

Initial Sao Paulo employee count: 935


In [24]:
df_sp[df_sp.duplicated(keep=False)]

Unnamed: 0,name,position
199,Beatriz Hercules,Central Operations Intern | Uber
200,Beatriz Hercules,Central Operations Intern | Uber
519,Aline de Oliveira,Chauffeur na Uber
520,Aline de Oliveira,Chauffeur na Uber
759,Valdirene Alexandre da Silva,Analytics Security and Risk | Uber
760,Valdirene Alexandre da Silva,Analytics Security and Risk | Uber


In [25]:
df_sp = df_sp[df_sp['name'] != 'LinkedIn Member']
df_sp.name.str.contains('LinkedIn Member').any()

False

In [26]:
df_sp.groupby('position').describe()

Unnamed: 0_level_0,name,name,name,name
Unnamed: 0_level_1,count,unique,top,freq
position,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
--,3,3,marcel giordo,1
.,3,3,Alan Nascimento,1
A procura de novas oportunidades,1,1,Rubens Tavares,1
AR LATAM manager at UBER,1,1,Eduardo Santos Araujo,1
Aberto a novas oportunidades e desafios.,1,1,George Alexandre Souza Barbosa,1
...,...,...,...,...
tecnico em desempenho at Inmetrics,1,1,David Ferreira,1
teste,1,1,Raiane Oliveira,1
universidade uninove memoria,1,1,Diego cruz souza,1
| Aux. Juridico | Estudante de Direito | Uber*Driver |,1,1,Wellington Rodrigues Fernandes,1


In [27]:
# Group positions to see the most common
df_position = df_sp.groupby('position').describe()['name'].sort_values(by='count', ascending=False).iloc[:, :1]
df_position

Unnamed: 0_level_0,count
position,Unnamed: 1_level_1
Motorista na Uber,23
Motorista profissional na Uber,14
Motorista | Uber,11
Motorista de automovel na Uber,11
Software Engineer at Uber,6
...,...
Especialista em Gestao de Seguranca Privada,1
Especialista em atendimento II na Uber,1
Especialista em atendimento | Uber Eats,1
Especialista em qualidade | Uber,1


In [28]:
# Removes all employees not currently working at Uber
df_sp_uber = df_sp[df_sp['position'].str.contains('uber', case=False)]
df_sp_uber.describe()

Unnamed: 0,name,position
count,694,694
unique,691,571
top,Valdirene Alexandre da Silva,Motorista na Uber
freq,2,23


In [29]:
# Removes all employees working at other Uber services
ignore = 'uber air | uberair | freight | elevate | ex-uber | motorista'
df_sp_uber[df_sp_uber['position'].str.contains(ignore, case=False)]

Unnamed: 0,name,position
561,Edson Spina,Motorista de automovel | Uber e motorista part...
723,Anderson Leao Gimenes,Desenvolvedor Full Stack - Analista CRM - Moto...


In [30]:
df_sp_final = df_sp_uber[~df_sp_uber['position'].str.contains(ignore, case=False)]
df_sp_final.describe()

Unnamed: 0,name,position
count,692,692
unique,689,569
top,Beatriz Hercules,Motorista na Uber
freq,2,23


In [31]:
print(f'Final Sao Paulo employee count: {len(df_sp_final)}')

Final Sao Paulo employee count: 692
