#### Latest Data Science Job Salaries 2020 - 2024

Dataset credit: SAURABH BADOLE <br>

Link to dataset: https://www.kaggle.com/datasets/saurabhbadole/latest-data-science-job-salaries-2024

Description: This dataset provides insights into data science job salaries from 2020 to 2024, including information on experience levels, employment types, job titles, and company characteristics. It serves as a valuable resource for understanding salary trends and factors influencing compensation in the data science field.


In [43]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import holoviews as hv
import hvplot.pandas
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

# Install a global "ignore" filter so no warnings are printed in cell output.
# NOTE(review): no category is given, so this also hides deprecation warnings
# from the libraries imported above — consider narrowing it.
warnings.filterwarnings('ignore')


In [44]:
# Read the salary CSV; the path is relative, so it is resolved against the
# current working directory.
job_salaries = pd.read_csv(
    filepath_or_buffer="Latest Data Science Job Salaries 2020 - 2024/DataScience_salaries_2024.csv"
)

# Preview the first five records.
job_salaries.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021,MI,FT,Data Scientist,30400000,CLP,40038,CL,100,CL,L
1,2021,MI,FT,BI Data Analyst,11000000,HUF,36259,HU,50,US,L
2,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
3,2021,MI,FT,ML Engineer,8500000,JPY,77364,JP,50,JP,S
4,2022,SE,FT,Lead Machine Learning Engineer,7500000,INR,95386,IN,50,IN,L


#### Data Cleaning


In [45]:
# Count missing values in each column (isna is the canonical spelling of isnull).
job_salaries.isna().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [46]:
# Show the dtype pandas inferred for each column when the CSV was read.
job_salaries.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [47]:
# Map every column name to the array of distinct values found in that column.
unique_values = {
    column: job_salaries[column].unique()
    for column in job_salaries.columns
}

unique_values


{'work_year': array([2021, 2020, 2022, 2023, 2024], dtype=int64),
 'experience_level': array(['MI', 'SE', 'EN', 'EX'], dtype=object),
 'employment_type': array(['FT', 'FL', 'CT', 'PT'], dtype=object),
 'job_title': array(['Data Scientist', 'BI Data Analyst', 'ML Engineer',
        'Lead Machine Learning Engineer', 'Data Science Manager',
        'Head of Machine Learning', 'Research Engineer',
        'Head of Data Science', 'AI Programmer',
        'Machine Learning Engineer', 'Lead Data Scientist',
        'Data Engineer', 'Applied Machine Learning Scientist',
        'Lead Data Analyst', 'Data Analytics Manager',
        'Data Integration Specialist', 'Principal Data Architect',
        'NLP Engineer', 'Big Data Engineer', 'AI Research Engineer',
        'Machine Learning Software Engineer', 'Data Analyst',
        'Applied Data Scientist', 'AI Scientist', 'Data Analytics Lead',
        'Business Data Analyst', 'Product Data Analyst',
        'Computer Vision Engineer', 'Data Scienc

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
job_salaries.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,14838.0,14838.0,14838.0,14838.0
mean,2023.1389,165022.7,149874.718763,32.76048
std,0.700799,356235.4,69009.181349,46.488278
min,2020.0,14000.0,15000.0,0.0
25%,2023.0,102100.0,102000.0,0.0
50%,2023.0,142200.0,141300.0,0.0
75%,2024.0,187500.0,185900.0,100.0
max,2024.0,30400000.0,800000.0,100.0


### EDA Visualization

In [92]:
# Number of records per work year

# value_counts() returns a Series indexed by year, ordered by frequency.
year_count = job_salaries['work_year'].value_counts()

# Flatten the Series into a two-column frame that Plotly can address by name.
year_count_df = year_count.reset_index()
year_count_df.columns = ['work_year', 'count']

fig = px.bar(
    year_count_df,
    x='work_year',
    y='count',
    title='Work Year Count',
    labels={'work_year': 'Work Year', 'count': 'Job Count Per Year'},
    color_discrete_sequence=['#AEC6CF'],
)

# Render the chart.
fig.show()

In [91]:
# Total Salary (USD) Per Work Year
#
# Sum `salary_in_usd`, not `salary`: the raw `salary` column is expressed in each
# record's own `salary_currency` (CLP, HUF, JPY, INR, ...), so adding it across
# rows mixes currencies and the yearly totals are not comparable.
year_salary = job_salaries.groupby('work_year')['salary_in_usd'].sum().sort_values()

# Turn the Series (indexed by work_year) into a frame with named columns.
year_salary_df = year_salary.reset_index()

# Bar chart of the USD total for each year.
fig = px.bar(
    year_salary_df,
    x='work_year',
    y='salary_in_usd',
    title='Total Salary Per Work Year (USD)',
    labels={'work_year': 'Work Year', 'salary_in_usd': 'Total Salary (USD)'},
    color_discrete_sequence=['#AEC6CF'],
)

# Show the plot
fig.show()

In [113]:
# Share of records at each experience level

# One count per experience level, indexed by the level code.
exp_level = job_salaries['experience_level'].value_counts()

fig = px.pie(
    exp_level,
    values=exp_level.values,
    names=exp_level.index,
    title='Experience Level Distribution',
    color_discrete_sequence=['#00435c', '#006971', '#008c59', '#81a51c', '#ffa600'],
)

# Render the chart.
fig.show()

In [106]:
# Work Year and Experience
#
# Count records for every (work_year, experience_level) pair and pivot the
# levels into columns. fill_value=0 turns a pair with no records into an
# explicit zero instead of NaN.
year_experience = job_salaries.groupby(['work_year', 'experience_level']).size().unstack(fill_value=0)
year_experience_df = year_experience.reset_index()

# Take the level columns from the pivot itself rather than hard-coding
# ['EN', 'EX', 'MI', 'SE'], so the cell still runs if a level is absent from
# (or added to) the data.
experience_levels = year_experience.columns.tolist()

fig = px.bar(
    year_experience_df,
    y='work_year',
    x=experience_levels,
    barmode='group',
    orientation='h',
    title='Experience Level Count Per Work Year',
    color_discrete_sequence=['#00435c', '#006971', '#008c59', '#81a51c', '#ffa600'],
    labels={'work_year': 'Work Year', 'value': 'Count', 'variable': 'Experience Level'},
)

fig.show()



In [124]:
# Work Year and Remote Work Distribution
#
# Count records for every (work_year, remote_ratio) pair and pivot the ratios
# into columns. fill_value=0 turns a pair with no records into an explicit
# zero instead of NaN.
year_remote = job_salaries.groupby(['work_year', 'remote_ratio']).size().unstack(fill_value=0)
year_remote_df = year_remote.reset_index()

# After reset_index the first column is 'work_year'; every remaining column is
# one remote_ratio value.
x_columns = year_remote_df.columns[1:]

fig = px.bar(
    year_remote_df,
    y='work_year',
    x=x_columns,
    barmode='group',
    orientation='h',
    title='Remote Ratio Count Per Work Year',
    color_discrete_sequence=['#00435c', '#006971', '#008c59'],
    labels={'work_year': 'Work Year', 'value': 'Count', 'variable': 'Remote Ratio'},
)

fig.show()


In [137]:
# Job Title Count
#
# Keep only the ten job titles with the most records.
job_title = job_salaries.groupby(['job_title']).size().nlargest(10)

# Flatten to a two-column frame with display-ready column names.
# (The bare `job_title_df` expression that used to follow was dropped: only the
# last expression of a cell is displayed, so it had no effect.)
job_title_df = job_title.reset_index()
job_title_df.columns = ['Job Title', 'Count']

fig = px.bar(
    job_title_df,
    x='Job Title',
    y='Count',
    color_discrete_sequence=['#AEC6CF'],
    title='Job Title Count',
    barmode='group',
)

fig.show()


In [142]:
# Ten most frequent (job title, experience level) combinations
title_experience = (
    job_salaries
    .groupby(['job_title', 'experience_level'])
    .size()
    .nlargest(10)
)
title_experience


job_title                  experience_level
Data Scientist             SE                  2142
Data Engineer              SE                  2085
Machine Learning Engineer  SE                  1243
Data Analyst               SE                  1189
Data Engineer              MI                   809
Data Scientist             MI                   702
Data Analyst               MI                   526
                           EN                   452
Data Architect             SE                   342
Research Scientist         SE                   327
dtype: int64