In [231]:
import pandas as pd
import plotly.express as px
import country_converter as coco

In [232]:
# Load the data-science salaries dataset. The 'Unnamed: 0' column (the label
# pandas gives a header-less column) becomes the row index.

df = pd.read_csv(
    './data/ds_salaries.csv',
    index_col='Unnamed: 0',
)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [234]:
# Replace the employee-residence country codes with their ISO3 equivalents
# so the column can be used as a merge key against ISO3-keyed data later on.

df = df.assign(
    employee_residence=coco.convert(names=df['employee_residence'], to="ISO3")
)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,79833,DEU,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,JPN,0,JP,S
2,2020,SE,FT,Big Data Engineer,109024,GBR,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,HND,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USA,50,US,L


In [235]:
# Keep only the columns this analysis needs.
# The drop is non-inplace and uses errors='ignore' so the cell can be re-run
# safely: an in-place drop raises KeyError on a second execution because the
# listed columns have already been removed from `df`.

df = df.drop(
    columns=[
        'experience_level', 'employment_type', 'remote_ratio',
        'company_size', 'salary', 'salary_currency',
    ],
    errors='ignore',
)
df.head()

Unnamed: 0,work_year,job_title,salary_in_usd,employee_residence,company_location
0,2020,Data Scientist,79833,DEU,DE
1,2020,Machine Learning Scientist,260000,JPN,JP
2,2020,Big Data Engineer,109024,GBR,GB
3,2020,Product Data Analyst,20000,HND,HN
4,2020,Machine Learning Engineer,150000,USA,US


In [236]:
# Download the full Big Mac index table from The Economist's GitHub repository

df_big_mac = pd.read_csv(
    'https://raw.githubusercontent.com/TheEconomist/big-mac-data/master/output-data/big-mac-full-index.csv'
)
df_big_mac.head()

Unnamed: 0,date,iso_a3,currency_code,name,local_price,dollar_ex,dollar_price,USD_raw,EUR_raw,GBP_raw,JPY_raw,CNY_raw,GDP_bigmac,adj_price,USD_adjusted,EUR_adjusted,GBP_adjusted,JPY_adjusted,CNY_adjusted
0,2000-04-01,ARG,ARS,Argentina,2.5,1.0,2.5,0.11607,0.05007,-0.16722,-0.09864,1.09091,7803.328512,1.922652,0.39117,,-0.06626,0.10096,0.97153
1,2000-04-01,AUS,AUD,Australia,2.59,1.68,1.541667,-0.31176,-0.35246,-0.48645,-0.44416,0.28939,29144.876973,2.30155,-0.28335,,-0.51898,-0.43285,0.01563
2,2000-04-01,BRA,BRL,Brazil,2.95,1.79,1.648045,-0.26427,-0.30778,-0.45102,-0.40581,0.37836,4822.738983,1.869734,-0.05696,,-0.36704,-0.25369,0.33645
3,2000-04-01,CAN,CAD,Canada,2.85,1.47,1.938776,-0.13448,-0.18566,-0.35417,-0.30099,0.62152,26087.329235,2.247266,-0.07698,,-0.38047,-0.26953,0.30809
4,2000-04-01,CHE,CHF,Switzerland,5.9,1.7,3.470588,0.54937,0.45774,0.15609,0.2513,1.90267,23872.716095,2.207948,0.68172,,0.12876,0.3309,1.3833


In [237]:
# Parse the 'date' column into datetimes so it can be filtered and grouped by year

df_big_mac['date'] = pd.to_datetime(df_big_mac['date'])

In [238]:
# Keep only Big Mac observations dated after 2018, i.e. from 2019-01-01 onwards.
# The comparison is inclusive on purpose: a strict `> '2019'` test compares
# against 2019-01-01 00:00 and would silently discard a reading dated exactly
# 1 January 2019, which contradicts the "after 2018" intent.

df_big_mac = (
    df_big_mac[df_big_mac['date'] >= '2019-01-01']
    .reset_index(drop=True)
)
df_big_mac.head()

Unnamed: 0,date,iso_a3,currency_code,name,local_price,dollar_ex,dollar_price,USD_raw,EUR_raw,GBP_raw,JPY_raw,CNY_raw,GDP_bigmac,adj_price,USD_adjusted,EUR_adjusted,GBP_adjusted,JPY_adjusted,CNY_adjusted
0,2019-07-09,ARE,AED,United Arab Emirates,14.75,3.67315,4.015627,-0.14743,-0.12174,-0.02089,0.1199,0.31648,,,,,,,
1,2019-07-09,ARG,ARS,Argentina,120.0,41.8045,2.870504,-0.39055,-0.37219,-0.3001,-0.19946,-0.05894,13006.861368,3.240098,-0.15031,-0.23968,-0.11244,0.05486,-0.04695
2,2019-07-09,AUS,AUD,Australia,6.15,1.443626,4.260105,-0.09552,-0.06827,0.03872,0.18808,0.39662,57626.147712,4.385329,-0.06829,-0.16629,-0.02677,0.15668,0.04504
3,2019-07-09,AZE,AZN,Azerbaijan,3.95,1.6965,2.328323,-0.50566,-0.49077,-0.4323,-0.35067,-0.23669,,,,,,,
4,2019-07-09,BHR,BHD,Bahrain,1.4,0.377,3.713528,-0.21157,-0.18781,-0.09455,0.03565,0.21744,,,,,,,


In [240]:
# Average the Big Mac figures per calendar year and country (ISO3 code).
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises TypeError on the string columns (currency_code, name)
# instead of silently dropping them. The original datetime 'date' column is
# also excluded, so reset_index() can insert the year under the name 'date'.

df_big_mac = (
    df_big_mac
    .groupby([df_big_mac['date'].dt.year, 'iso_a3'])
    .mean(numeric_only=True)
    .reset_index()
)
df_big_mac.head()


Unnamed: 0,date,iso_a3,local_price,dollar_ex,dollar_price,USD_raw,EUR_raw,GBP_raw,JPY_raw,CNY_raw,GDP_bigmac,adj_price,USD_adjusted,EUR_adjusted,GBP_adjusted,JPY_adjusted,CNY_adjusted
0,2019,ARE,14.75,3.67315,4.015627,-0.14743,-0.12174,-0.02089,0.1199,0.31648,,,,,,,
1,2019,ARG,120.0,41.8045,2.870504,-0.39055,-0.37219,-0.3001,-0.19946,-0.05894,13006.861368,3.240098,-0.15031,-0.23968,-0.11244,0.05486,-0.04695
2,2019,AUS,6.15,1.443626,4.260105,-0.09552,-0.06827,0.03872,0.18808,0.39662,57626.147712,4.385329,-0.06829,-0.16629,-0.02677,0.15668,0.04504
3,2019,AZE,3.95,1.6965,2.328323,-0.50566,-0.49077,-0.4323,-0.35067,-0.23669,,,,,,,
4,2019,BHR,1.4,0.377,3.713528,-0.21157,-0.18781,-0.09455,0.03565,0.21744,,,,,,,


In [241]:
# Keep just the merge keys (year, ISO3 code) and the adjusted price,
# discarding any row with a missing value in those columns

merge_df = df_big_mac.filter(items=['date', 'iso_a3', 'adj_price']).dropna()
merge_df.head()

Unnamed: 0,date,iso_a3,adj_price
1,2019,ARG,3.240098
2,2019,AUS,4.385329
5,2019,BRA,3.138321
6,2019,CAN,3.985136
7,2019,CHE,4.484086


In [242]:
# Attach the matching Big Mac adjusted price to each salary row, joining on
# (year, country of residence). This is pandas' default inner merge, so salary
# rows without a matching price are not kept. The right-hand key columns are
# dropped afterwards because they duplicate work_year / employee_residence.

new_df = (
    df.merge(
        merge_df,
        left_on=['work_year', 'employee_residence'],
        right_on=['date', 'iso_a3'],
    )
    .drop(columns=['date', 'iso_a3'])
)
new_df.head()

Unnamed: 0,work_year,job_title,salary_in_usd,employee_residence,company_location,adj_price
0,2020,Machine Learning Scientist,260000,JPN,JP,4.386802
1,2020,Data Engineer,41689,JPN,JP,4.386802
2,2020,Big Data Engineer,109024,GBR,GB,4.2047
3,2020,Data Engineer,112872,GBR,GB,4.2047
4,2020,Data Scientist,76958,GBR,GB,4.2047


In [243]:
# Number of Big Macs each salary can buy: USD salary divided by the adjusted
# Big Mac price matched to that row

new_df['num_bigmacs'] = new_df['salary_in_usd'].div(new_df['adj_price'])
new_df.head()

Unnamed: 0,work_year,job_title,salary_in_usd,employee_residence,company_location,adj_price,num_bigmacs
0,2020,Machine Learning Scientist,260000,JPN,JP,4.386802,59268.687683
1,2020,Data Engineer,41689,JPN,JP,4.386802,9503.278157
2,2020,Big Data Engineer,109024,GBR,GB,4.2047,25929.082022
3,2020,Data Engineer,112872,GBR,GB,4.2047,26844.248477
4,2020,Data Scientist,76958,GBR,GB,4.2047,18302.853447


In [244]:
# Map the median number of Big Macs a salary buys, per country of residence.
# The median is taken over every salary row of a country. Grouping by the
# distinct num_bigmacs values first would collapse identical salaries into a
# single row and skew the median toward rarer values.
# NOTE: the variable is called `average` for continuity, but it holds medians.

average = (
    new_df
    .groupby('employee_residence')['num_bigmacs']
    .median()
    .reset_index()
)

fig = px.choropleth(locations=average['employee_residence'],
                    color=average['num_bigmacs'],
                    color_continuous_scale=px.colors.sequential.haline,
                    template='plotly_dark',
                    title = 'Median Number of BigMacs a salary would be able to buy by Employee Location')
fig.update_layout(font = dict(size=15,family="Helvetica"))
fig.show()

In [245]:
# Look at the rows for Malaysia (ISO3 code 'MYS') to see what drives its value on the map

new_df.loc[new_df['employee_residence'].eq('MYS')]

Unnamed: 0,work_year,job_title,salary_in_usd,employee_residence,company_location,adj_price,num_bigmacs
484,2022,Head of Data,200000,MYS,US,3.719575,53769.578229
