In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import datetime as dt
import warnings
import plotly.graph_objects as go
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns',None)

# 1. Extracting

In [35]:
# Read the salaries CSV into the working DataFrame.
csv_path = 'datasets/data_science/ds_salaries.csv'
data = pd.read_csv(csv_path)

In [36]:
# Print the column dtypes and non-null counts of the loaded frame...
data.info()
# ...then display its first five rows as the cell output.
data.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          607 non-null    int64 
 1   work_year           607 non-null    int64 
 2   experience_level    607 non-null    object
 3   employment_type     607 non-null    object
 4   job_title           607 non-null    object
 5   salary              607 non-null    int64 
 6   salary_currency     607 non-null    object
 7   salary_in_usd       607 non-null    int64 
 8   employee_residence  607 non-null    object
 9   remote_ratio        607 non-null    int64 
 10  company_location    607 non-null    object
 11  company_size        607 non-null    object
dtypes: int64(5), object(7)
memory usage: 57.0+ KB


Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


# 2. Transform

In [37]:
# Drop the leftover index column that was read from the CSV.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a plain drop would raise KeyError on the second execution.
data = data.drop(columns='Unnamed: 0', errors='ignore')
# Count missing values per column (displayed as the cell output).
data.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

# 3. Data statistics

In [38]:
# Correlation between attributes.
# Only numeric columns can be correlated; the frame also holds string columns
# (job_title, salary_currency, ...), and calling corr() on those raises in
# recent pandas versions, so the numeric subset is selected explicitly.
numeric_corr = data.select_dtypes(include='number').corr()
fig = px.imshow(
    numeric_corr,
    text_auto=True,
    height=600,
    width=600,
    template='ggplot2',
    aspect='auto',
    title='Correlation of attributes',
    color_continuous_scale='reds',
)
fig.show()

In [39]:
# Expand the experience-level codes into readable labels.
# The result is assigned back to the column: replace(..., inplace=True) on
# data['experience_level'] acts on an intermediate Series, which is deprecated
# chained assignment and does not update `data` under pandas Copy-on-Write.
data['experience_level'] = data['experience_level'].replace(
    {'EN': 'Entry', 'MI': 'Mid', 'EX': 'Executive', 'SE': 'Senior'}
)
# Box plot of salaries in USD by experience level (displayed as the cell output).
px.box(
    data,
    x='experience_level',
    y='salary_in_usd',
    color='experience_level',
    template='ggplot2',
    labels={'experience_level': 'Experience', 'salary_in_usd': 'Salary in USD'},
    title='Data Science salaries by experience',
)

In [40]:
# Expand the employment-type codes into readable labels.
# The result is assigned back to the column: replace(..., inplace=True) on
# data['employment_type'] acts on an intermediate Series, which is deprecated
# chained assignment and does not update `data` under pandas Copy-on-Write.
data['employment_type'] = data['employment_type'].replace(
    {'PT': 'Part Time', 'FT': 'Full Time', 'CT': 'Contract', 'FL': 'Freelance'}
)
# Box plot of salaries in USD by employment type (displayed as the cell output).
# The x-axis label names the employment type; it previously read 'Experience',
# a copy-paste leftover from the experience-level plot.
px.box(
    data,
    x='employment_type',
    y='salary_in_usd',
    color='employment_type',
    template='ggplot2',
    labels={'employment_type': 'Employment type', 'salary_in_usd': 'Salary in USD'},
    title='Data Science salaries by employment type',
)

# 4. Exploratory Analysis

In [41]:
# Violin plot of the USD salary distribution for each work year
# (displayed as the cell output).
violin_labels = {'work_year': 'year', 'salary_in_usd': 'salary in usd'}
px.violin(
    data,
    x='work_year',
    y='salary_in_usd',
    color='work_year',
    labels=violin_labels,
    template='seaborn',
    title='Data Science salaries by work year',
)

In [42]:
# Twelve most frequent job titles, as an explicit two-column frame.
# Naming the columns here keeps the axis labels independent of how the
# installed pandas version names the value_counts() index and values.
top_roles = (
    data['job_title']
    .value_counts()
    .head(12)
    .rename_axis('job_title')
    .reset_index(name='count')
)
fig = px.bar(
    top_roles,
    x='job_title',
    y='count',
    text='count',  # print the count on each bar
    labels={'job_title': 'Job Title', 'count': 'Number of jobs'},
    template='seaborn',
    title='Top 12 Roles in Data Science',
)
fig.show()

In [46]:
# Mean USD salary per job title, highest first, rounded to two decimals.
data_ = (
    data.groupby('job_title', as_index=False)['salary_in_usd']
    .mean()
    .sort_values(by='salary_in_usd', ascending=False)
    .assign(salary_in_usd=lambda frame: frame['salary_in_usd'].round(2))
)
# Bar chart of the twelve best-paid roles, with the average printed on each bar.
best_paid = data_.head(12)
fig = px.bar(
    best_paid,
    x='job_title',
    y='salary_in_usd',
    labels={'job_title': 'Job Title', 'salary_in_usd': 'Avg salary in USD'},
    text='salary_in_usd',
    template='seaborn',
    title='Top 12 Roles in Data Science based on average salaries',
)
fig.show()