In [4]:
import pandas as pd
import plotly.express as px
import country_converter as coco

In [5]:
# Load the raw salary records from the local data folder and preview them
salaries_path = './data/salaries.csv'
df = pd.read_csv(salaries_path)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021,MI,FT,Data Engineer,100000,AUD,75050,AU,50,AU,L
1,2022,SE,FT,Data Engineer,225000,USD,225000,US,0,US,M
2,2022,SE,FT,Data Engineer,184100,USD,184100,US,0,US,M
3,2022,SE,FT,Data Scientist,185900,USD,185900,US,0,US,M
4,2022,SE,FT,Data Scientist,129300,USD,129300,US,0,US,M


In [6]:
# Count missing values per column (isnull is an alias of isna)
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [7]:
# Convert the employee's country of residence to its ISO3 code for a possible
# merge later, and keep only the columns the analysis uses.
# Selecting the columns to keep (rather than dropping the others in place) means
# this cell can be re-run without a KeyError on already-removed columns.
# Note: company_location is left untouched (still a 2-letter code).
KEEP_COLUMNS = ['work_year', 'job_title', 'salary_in_usd', 'employee_residence', 'company_location']
df = df[KEEP_COLUMNS].assign(
    employee_residence=lambda frame: coco.convert(names=frame['employee_residence'], to="ISO3")
)

df.head()

Unnamed: 0,work_year,job_title,salary_in_usd,employee_residence,company_location
0,2021,Data Engineer,75050,AUS,AU
1,2022,Data Engineer,225000,USA,US
2,2022,Data Engineer,184100,USA,US
3,2022,Data Scientist,185900,USA,US
4,2022,Data Scientist,129300,USA,US


In [8]:
# Read BigMac index Data
df_big_mac = pd.read_csv('https://raw.githubusercontent.com/TheEconomist/big-mac-data/master/output-data/big-mac-full-index.csv')

# Convert date to datetime for easier filtering
df_big_mac['date'] = pd.to_datetime(df_big_mac['date'])

# Keep only observations from 2019 onwards. Comparing on the year (instead of
# `date > '2019'`) also keeps a reading dated exactly 1 January 2019.
df_big_mac = df_big_mac[df_big_mac['date'].dt.year >= 2019].reset_index(drop=True)

# Average the numeric columns per (year, country).
# numeric_only=True is required on pandas >= 2.0, where calling mean() on a
# frame that still holds text columns raises a TypeError.
df_big_mac = (
    df_big_mac
    .groupby([df_big_mac['date'].dt.year, df_big_mac['iso_a3']])
    .mean(numeric_only=True)
    .reset_index()
)

# Filter bigmac for relevant data and discard rows with a missing value
merge_df = df_big_mac.filter(['date', 'iso_a3', 'adj_price']).dropna()

# Merge filtered salary data with bigmac data on (year, country); this is an
# inner join, so salary rows without a matching Big Mac entry are dropped
new_df = pd.merge(df, merge_df, left_on=['work_year', 'employee_residence'], right_on=['date', 'iso_a3']).drop(columns=['date', 'iso_a3'])

# Calculate the number of Big Macs a salary can buy in a country
new_df['num_bigmacs'] = new_df['salary_in_usd'] / new_df['adj_price']
new_df.head()


Unnamed: 0,work_year,job_title,salary_in_usd,employee_residence,company_location,adj_price,num_bigmacs
0,2021,Data Engineer,75050,AUS,AU,4.642132,16167.140283
1,2021,Data Analyst,42028,AUS,AU,4.642132,9053.598558
2,2022,Data Engineer,225000,USA,US,4.606592,48843.046973
3,2022,Data Engineer,184100,USA,US,4.606592,39964.466434
4,2022,Data Scientist,185900,USA,US,4.606592,40355.21081


In [9]:
# Create function to draw a map
def create_map(df, measurement, location):
    '''
    Draw a choropleth map of the median of `measurement` for each `location`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both the `measurement` and `location` columns.
    measurement : str
        Name of the numeric column whose per-location median is coloured.
    location : str
        Name of the column with the location codes handed to plotly as
        `locations`.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`.
    '''
    # Median over every row of each location. Grouping by
    # (measurement, location) first would collapse repeated values and yield
    # the median of the distinct values instead of the true median.
    median_by_location = df.groupby(location)[measurement].median().reset_index()

    fig = px.choropleth(locations=median_by_location[location],
                        color=median_by_location[measurement],
                        color_continuous_scale=px.colors.sequential.haline,
                        template='plotly_dark',
                        title = f'{measurement} for {location}')
    fig.update_layout(font = dict(size=15,family="Helvetica"))
    fig.show()

In [10]:
# Map the median number of Big Macs a salary buys per country of residence
create_map(new_df, measurement='num_bigmacs', location='employee_residence')

In [11]:
# What's going on in Russia? Show every row whose employee residence is RUS.

is_russia = new_df['employee_residence'].eq('RUS')
new_df.loc[is_russia]

Unnamed: 0,work_year,job_title,salary_in_usd,employee_residence,company_location,adj_price,num_bigmacs
947,2022,Data Scientist,48000,RUS,US,3.913002,12266.795985
985,2021,Head of Data Science,85000,RUS,RU,3.815083,22279.988843
986,2021,Head of Data,230000,RUS,RU,3.815083,60287.028635
1026,2020,Computer Vision Engineer,60000,RUS,US,3.622955,16561.066649


In [12]:
# Page that holds the cost-of-living table
url = 'https://www.worlddata.info/cost-of-living.php'
# The first table parsed from the page becomes the data frame
cost_df = pd.read_html(url)[0]
# Add an ISO3 code column derived from the country names
cost_df['iso_a3'] = coco.convert(names=cost_df['Country'], to="ISO3")
# Join the salaries to the cost table on the ISO3 code, discard the columns that
# are not needed, then divide each salary by the two indices (scaled by 100)
df_cost = (
    pd.merge(df, cost_df, left_on='employee_residence', right_on='iso_a3')
    .drop(columns=['Ø Monthly income', 'iso_a3'])
    .assign(
        adj_salary_purch=lambda merged: merged['salary_in_usd'] / merged['Purchasing power index'] * 100,
        adj_salary_cost=lambda merged: merged['salary_in_usd'] / merged['Cost index'] * 100,
    )
)

df_cost.head()


Unnamed: 0,work_year,job_title,salary_in_usd,employee_residence,company_location,Rank,Country,Cost index,Purchasing power index,adj_salary_purch,adj_salary_cost
0,2021,Data Engineer,75050,AUS,AU,12,Australia,118.0,68.3,109882.869693,63601.694915
1,2022,Data Scientist,83964,AUS,AU,12,Australia,118.0,68.3,122934.114202,71155.932203
2,2022,BI Data Analyst,45480,AUS,AU,12,Australia,118.0,68.3,66588.579795,38542.372881
3,2021,Data Analyst,42028,AUS,AU,12,Australia,118.0,68.3,61534.407028,35616.949153
4,2022,BI Data Analyst,53877,AUS,AU,12,Australia,118.0,68.3,78882.869693,45658.474576


In [13]:
# Map the median salary adjusted by the purchasing power index per country
create_map(df_cost, measurement='adj_salary_purch', location='employee_residence')

In [14]:
# Map the median salary adjusted by the cost index per country
create_map(df_cost, measurement='adj_salary_cost', location='employee_residence')