#### Latest Data Science Job Salaries 2020 - 2024

Credits by: SAURABH BADOLE <br>

Link to dataset: https://www.kaggle.com/datasets/saurabhbadole/latest-data-science-job-salaries-2024

Description: This dataset provides insights into data science job salaries from 2020 to 2024, including information on experience levels, employment types, job titles, and company characteristics. It serves as a valuable resource for understanding salary trends and factors influencing compensation in the data science field.


In [42]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import holoviews as hv
import hvplot.pandas
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

warnings.filterwarnings('ignore')


In [43]:
job_salaries = pd.read_csv("Latest Data Science Job Salaries 2020 - 2024/DataScience_salaries_2024.csv")
job_salaries.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021,MI,FT,Data Scientist,30400000,CLP,40038,CL,100,CL,L
1,2021,MI,FT,BI Data Analyst,11000000,HUF,36259,HU,50,US,L
2,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
3,2021,MI,FT,ML Engineer,8500000,JPY,77364,JP,50,JP,S
4,2022,SE,FT,Lead Machine Learning Engineer,7500000,INR,95386,IN,50,IN,L


#### Data Cleaning


In [44]:
job_salaries.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [45]:
job_salaries.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [46]:
# Unique Values in the columns
unique_values = {}

for column in job_salaries.columns:
    unique_values[column] = job_salaries[column].unique()
    
unique_values


{'work_year': array([2021, 2020, 2022, 2023, 2024], dtype=int64),
 'experience_level': array(['MI', 'SE', 'EN', 'EX'], dtype=object),
 'employment_type': array(['FT', 'FL', 'CT', 'PT'], dtype=object),
 'job_title': array(['Data Scientist', 'BI Data Analyst', 'ML Engineer',
        'Lead Machine Learning Engineer', 'Data Science Manager',
        'Head of Machine Learning', 'Research Engineer',
        'Head of Data Science', 'AI Programmer',
        'Machine Learning Engineer', 'Lead Data Scientist',
        'Data Engineer', 'Applied Machine Learning Scientist',
        'Lead Data Analyst', 'Data Analytics Manager',
        'Data Integration Specialist', 'Principal Data Architect',
        'NLP Engineer', 'Big Data Engineer', 'AI Research Engineer',
        'Machine Learning Software Engineer', 'Data Analyst',
        'Applied Data Scientist', 'AI Scientist', 'Data Analytics Lead',
        'Business Data Analyst', 'Product Data Analyst',
        'Computer Vision Engineer', 'Data Scienc

In [47]:
job_salaries.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,14838.0,14838.0,14838.0,14838.0
mean,2023.1389,165022.7,149874.718763,32.76048
std,0.700799,356235.4,69009.181349,46.488278
min,2020.0,14000.0,15000.0,0.0
25%,2023.0,102100.0,102000.0,0.0
50%,2023.0,142200.0,141300.0,0.0
75%,2024.0,187500.0,185900.0,100.0
max,2024.0,30400000.0,800000.0,100.0


### EDA Visualization

In [48]:
# Work Count By Year

year_count = job_salaries['work_year'].value_counts()

year_count_df = year_count.reset_index()
year_count_df.columns = ['work_year', 'count']


fig = px.bar(year_count_df, x='work_year', y='count', color_discrete_sequence=['#AEC6CF'],title='Work Year Count', labels={'work_year': 'Work Year', 'count': 'Job Count Per Year'},)

# Show the plot
fig.show()

In [49]:
# Work Count By Salary

year_salary = job_salaries.groupby('work_year')['salary'].sum().sort_values()

year_salary_df = year_salary.reset_index()

# Create a bar chart using Plotly Express (px.bar)
fig = px.bar(year_salary_df, x='work_year', y='salary', title='Total Salary Per Work Year',
             labels={'work_year': 'Work Year', 'salary': 'Total Salary'},color_discrete_sequence=['#AEC6CF'])

# Show the plot
fig.show()

In [50]:
# Experience Level Total Count

exp_level = job_salaries['experience_level'].value_counts()
fig = px.pie(exp_level, values=exp_level.values, names=exp_level.index,
             title='Experience Level Distribution',color_discrete_sequence=['#00435c','#006971','#008c59','#81a51c','#ffa600'])

# Show the plot
fig.show()

In [51]:
# Employment Type

employment_type = job_salaries['employment_type'].value_counts()
employment_type_df = employment_type.reset_index()
employment_type_df.columns = ['Employment Type', 'count']

total_jobs = employment_type_df['count'].sum()
employment_type_df['Percentage'] = (employment_type_df['count'] / total_jobs) * 100

employment_type_df['Percentage'] = employment_type_df['Percentage'].map('{:.2f}%'.format)

employment_type_df


Unnamed: 0,Employment Type,count,Percentage
0,FT,14772,99.56%
1,PT,27,0.18%
2,CT,26,0.18%
3,FL,13,0.09%


In [52]:
# Work Year and Experience

year_experience = job_salaries.groupby(['work_year','experience_level']).size().unstack()
year_experience_df = year_experience.reset_index()

fig = px.bar(year_experience_df,y='work_year',x=['EN', 'EX', 'MI', 'SE'],barmode='stack',orientation =  'h',color_discrete_sequence=['#00435c','#006971','#008c59','#81a51c','#ffa600'],labels={'work_year': 'Work Year', 'value': 'Count', 'variable': 'Experience Level'})

fig.show()



In [53]:
# Work Year and Remote Work Distribution

year_remote = job_salaries.groupby(['work_year','remote_ratio']).size().unstack()
year_remote_df = year_remote.reset_index()

x_columns = year_remote_df.columns[1:]

fig = px.bar(year_remote_df,y='work_year',x=x_columns,barmode='stack',orientation =  'h',color_discrete_sequence=['#00435c','#006971','#008c59'],labels={'work_year': 'Work Year', 'value': 'Count', 'variable': 'Remote Ratio'})

fig.show()


In [54]:
# Job Title Count 

job_title = job_salaries.groupby(['job_title']).size().nlargest(10)
job_title_df = job_title.reset_index()
job_title_df.columns = ['Job Title', 'Count']
job_title_df

fig = px.bar(job_title_df, x='Job Title', y='Count', color_discrete_sequence=['#AEC6CF'],title='Job Title Count',barmode='group')

fig.show()


In [55]:
# High Paying Job TItle
highest_job_title = job_salaries.groupby(['job_title'])['salary'].sum().nlargest(10)
highest_job_title_df = highest_job_title.reset_index()
highest_job_title_df


Unnamed: 0,job_title,salary
0,Data Scientist,542863406
1,Data Engineer,475118058
2,Machine Learning Engineer,297781197
3,Data Analyst,239963676
4,Research Scientist,92212888
5,Applied Scientist,72904321
6,Analytics Engineer,63834856
7,Data Architect,60261371
8,Research Engineer,57815316
9,ML Engineer,47553681


In [56]:
# Work Title and Experience Level by Salary
title_experience = job_salaries.groupby(['job_title','experience_level'])['salary'].sum().nlargest(10)
top_title_experience_df = title_experience.reset_index()
top_title_experience_df


Unnamed: 0,job_title,experience_level,salary
0,Data Scientist,SE,363102362
1,Data Engineer,SE,326946705
2,Machine Learning Engineer,SE,250992483
3,Data Scientist,MI,146220743
4,Data Analyst,SE,144508956
5,Data Engineer,MI,98766043
6,Research Scientist,SE,66881886
7,Data Architect,SE,56607712
8,Applied Scientist,SE,56464031
9,Data Analyst,MI,53348853


In [57]:
# Company Size Distribution
company_size = job_salaries['company_size'].value_counts()
fig = px.pie(company_size, values=company_size.values, names=company_size.index,
             title='Experience Level Distribution',color_discrete_sequence=['#00435c','#006971','#008c59','#81a51c','#ffa600'])

# Show the plot
fig.show()

In [58]:
# Salary and Salary Currency

salary_currency = job_salaries.groupby('salary_currency')['salary'].sum().sort_values().nlargest(10)

salary_currency_df = salary_currency.reset_index()
salary_currency_df.columns = ['Salary Currency', 'count']

fig = px.bar(salary_currency_df, x='Salary Currency', y='count', color_discrete_sequence=['#AEC6CF'],title='Highest Salary Currency', labels={'Salary Currency': 'Currency', 'count': 'Salary'},barmode='group')

# Show the plot
fig.show()


In [59]:
# Highest Company Lovation

company_location = job_salaries['company_location'].value_counts().nlargest(10)

company_location_df = company_location.reset_index()
company_location_df.columns = ['Company Location', 'count']

fig = px.bar(company_location_df, y='Company Location', x='count', color_discrete_sequence=['#AEC6CF'],title='Top Company Locations', labels={'Company Location': 'Company Location', 'count': 'count'},barmode='group',orientation='h')

# Show the plot
fig.show()

In [60]:
# Company Location

company_location = job_salaries['company_location'].value_counts().nlargest(10)

company_location_df = company_location.reset_index()
company_location_df

Unnamed: 0,index,company_location
0,US,12975
1,GB,655
2,CA,392
3,ES,127
4,DE,98
5,FR,61
6,IN,59
7,AU,53
8,NL,28
9,PT,28


In [61]:
# Salary Currency

salary_cuurrency = job_salaries['salary_currency'].value_counts().nlargest(10)

salary_cuurrency_df = salary_cuurrency.reset_index()
salary_cuurrency_df

Unnamed: 0,index,salary_currency
0,USD,13682
1,GBP,567
2,EUR,424
3,INR,53
4,CAD,51
5,AUD,12
6,CHF,8
7,PLN,7
8,SGD,6
9,BRL,4


In [62]:
# Salary converted to USD and Salary Converted

salary_converted = job_salaries.groupby('salary_currency',)['salary_in_usd'].sum().nlargest(10)


salary_converted_df = salary_converted.reset_index()

salary_converted_df

Unnamed: 0,salary_currency,salary_in_usd
0,USD,2138870399
1,GBP,45976889
2,EUR,27241382
3,CAD,5615524
4,INR,1810805
5,CHF,1097500
6,AUD,943920
7,SGD,490346
8,ILS,417937
9,PLN,252696


#### Train and Test Model

In [63]:
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.linear_model import LinearRegression

x = job_salaries.drop('salary_in_usd',axis=1)
y = job_salaries['salary_in_usd']


In [65]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.30,random_state = 42)

In [67]:
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (10386, 10) (10386,)
Test (4452, 10) (4452,)


In [64]:
encoder = ce.OrdinalEncoder(cols=['work_year','experience_level','employment_type',
       'job_title', 'salary_currency',  'employee_residence','company_location', 'company_size'])


In [68]:
model = LinearRegression()
model.fit(X_train, y_train)

In [69]:
#Model Testing
y_pred = model.predict(X_test)

In [70]:
#Accuracy of the model
print('Training set score: {:.4f}'.format(model.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(model.score(X_test, y_test)))

Training set score: 0.1374
Test set score: -0.0254


Random Forest
