In [12]:
import pandas as pd
import plotly.express as px
from datetime import datetime
from google.colab import files
import io
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import MinMaxScaler

In [111]:

# Raw-GitHub location of the salaries CSV; the literal is split across two
# adjacent string pieces only for readability (they concatenate at compile time).
url = (
    'https://raw.githubusercontent.com/schwarzschlyle/ds-projects/master/'
    'Regression%20Projects/Data%20Science%20Salaries/ds_salaries.csv'
)

# Read the remote CSV into the notebook-wide working frame.
df = pd.read_csv(filepath_or_buffer=url)

In [112]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [113]:
# Show the column labels of the loaded frame before any cleaning is applied.
df.columns

Index(['Unnamed: 0', 'work_year', 'experience_level', 'employment_type',
       'job_title', 'salary', 'salary_currency', 'salary_in_usd',
       'employee_residence', 'remote_ratio', 'company_location',
       'company_size'],
      dtype='object')

# Data Cleaning

In [114]:
# salary_in_usd is used as the main output metric, so the raw salary and its
# currency are dropped, along with 'Unnamed: 0' (which looks like a saved row
# index -- TODO confirm against the CSV).
#
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips labels that are already gone, so re-running this cell
# no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'salary', 'salary_currency'], errors='ignore')

In [115]:
# Distinct employee-residence codes present in the data.
df['employee_residence'].unique()

array(['DE', 'JP', 'GB', 'HN', 'US', 'HU', 'NZ', 'FR', 'IN', 'PK', 'PL',
       'PT', 'CN', 'GR', 'AE', 'NL', 'MX', 'CA', 'AT', 'NG', 'PH', 'ES',
       'DK', 'RU', 'IT', 'HR', 'BG', 'SG', 'BR', 'IQ', 'VN', 'BE', 'UA',
       'MT', 'CL', 'RO', 'IR', 'CO', 'MD', 'KE', 'SI', 'HK', 'TR', 'RS',
       'PR', 'LU', 'JE', 'CZ', 'AR', 'DZ', 'TN', 'MY', 'EE', 'AU', 'BO',
       'IE', 'CH'], dtype=object)

In [116]:
# Overall mean of the USD salary column across every row.
df['salary_in_usd'].mean()

112297.86985172982

In [117]:
# Distinct employment-type codes present in the data.
df['employment_type'].unique()

array(['FT', 'CT', 'PT', 'FL'], dtype=object)

# Questions to ask

## Averages 

1. Average trend in salaries per year
2. Average relationship between experience level and salaries
3. Average relationship between employment type and salaries
4. Average salaries for each job title
5. Average salaries for each employee residence.
6. Average salaries for PH employees
7. Relationship between remote ratio and salaries
8. Average salaries for each company location
9. Average relationship between company size and salaries

## Optimizing salaries

1. Find the optimal parameters for the current year's entry-level data science salaries by determining the most favourable:

1.1. employment type

1.2. job title

1.3. remote ratio

1.4. company location

1.5. company size 

for PH-based employees.

2. Estimate how much salary increases with each step up in experience level.

Goal: Produce an approximate salary output by calibrating the input parameters. 

## Availability metrics

1. experience level counts
2. employment type counts
3. job title counts
4. distribution of salaries
5. employee residence counts
6. remote ratio counts
7. company location counts
8. company size counts

# Exploratory Data Analysis

## Average trend in salaries per year

In [148]:
# Mean USD salary per work year.
# The salary column is selected *before* averaging so the text columns are
# never passed to mean() (averaging them raises TypeError on pandas >= 2.0).
# reset_index() turns the group keys into a real 'work_year' column, so the
# x-axis follows whatever years are in the data instead of a hard-coded list.
mean_df = (
    df.groupby('work_year')['salary_in_usd']
      .mean()
      .reset_index()
)

px.line(mean_df, x='work_year', y='salary_in_usd')

In [151]:
# Yearly mean USD salary restricted to entry-level ('EN') rows.
# Only the salary column is averaged (text columns would make mean() raise on
# pandas >= 2.0), and the years come from the group keys via reset_index()
# rather than from a hard-coded [2020, 2021, 2022] list, which would fail or
# mislabel the plot if a year had no entry-level rows.
mean_df = (
    df.query("experience_level == 'EN'")
      .groupby('work_year')['salary_in_usd']
      .mean()
      .reset_index()
)

px.line(mean_df, x='work_year', y='salary_in_usd')

## Average relationship between experience level and salaries

In [78]:
# Display names for the experience-level codes (same wording as the labels
# previously typed into the plot call). Codes missing from this mapping are
# left unchanged by rename().
experience_labels = {
    'EN': 'Entry Level',
    'EX': 'Expert',
    'MI': 'Mid-level',
    'SE': 'Senior-level',
}

# Mean USD salary per experience level. Averaging only the salary column
# avoids the TypeError pandas >= 2.0 raises for text columns, and mapping the
# codes to names keeps every bar tied to its own group instead of relying on
# the positional order of a hand-written list.
mean_df = (
    df.groupby('experience_level')['salary_in_usd']
      .mean()
      .rename(index=experience_labels)
      .reset_index()
)

px.bar(mean_df, x='experience_level', y='salary_in_usd')


## Average relationship between employment type and salaries

In [157]:
# Display names for the employment-type codes (same wording as the labels
# previously typed into the plot call). Unmapped codes are left unchanged.
employment_labels = {
    'CT': 'Contract',
    'FL': 'Freelance',
    'FT': 'Full-time',
    'PT': 'Part-time',
}

# Mean USD salary per employment type. Only the salary column is averaged
# (text columns make mean() raise on pandas >= 2.0) and labels are attached
# by code, not by list position.
mean_df = (
    df.groupby('employment_type')['salary_in_usd']
      .mean()
      .rename(index=employment_labels)
      .reset_index()
)

px.bar(mean_df, x='employment_type', y='salary_in_usd')


In [156]:
# Display names for the employment-type codes. Mapping by code means the plot
# stays correct whichever employment types occur among entry-level rows; the
# earlier three-item label list silently assumed freelance was absent.
employment_labels = {
    'CT': 'Contract',
    'FL': 'Freelance',
    'FT': 'Full-time',
    'PT': 'Part-time',
}

# Mean USD salary per employment type for entry-level ('EN') rows only.
# Only the salary column is averaged, because text columns make mean() raise
# on pandas >= 2.0.
mean_df = (
    df.query("experience_level == 'EN'")
      .groupby('employment_type')['salary_in_usd']
      .mean()
      .rename(index=employment_labels)
      .reset_index()
)

px.bar(mean_df, x='employment_type', y='salary_in_usd')


## Average salaries for each job title

In [87]:
# Peek at the job-title column (all rows, single column).
df.loc[:, 'job_title']

0                  Data Scientist
1      Machine Learning Scientist
2               Big Data Engineer
3            Product Data Analyst
4       Machine Learning Engineer
                  ...            
602                 Data Engineer
603                 Data Engineer
604                  Data Analyst
605                  Data Analyst
606                  AI Scientist
Name: job_title, Length: 607, dtype: object

In [176]:
# Ten job titles with the highest mean USD salary, as a Series indexed by
# job title. Only the salary column is averaged (text columns make mean()
# raise on pandas >= 2.0).
mean_df = (
    df.groupby('job_title')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# The bar labels are taken from the computed index rather than a hand-typed
# list, so each bar is guaranteed to carry the title its value belongs to.
px.bar(mean_df.reset_index(), x='job_title', y='salary_in_usd')


In [162]:
# Ten job titles with the highest mean USD salary among entry-level ('EN')
# rows, as a Series indexed by job title. Only the salary column is averaged
# (text columns make mean() raise on pandas >= 2.0).
mean_df = (
    df.query("experience_level == 'EN'")
      .groupby('job_title')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# Labels come from the computed index instead of a hand-typed list, so they
# cannot drift out of sync with the values.
px.bar(mean_df.reset_index(), x='job_title', y='salary_in_usd')


In [178]:
# All rows whose job title is exactly 'Machine Learning Scientist'.
is_ml_scientist = df['job_title'] == 'Machine Learning Scientist'
df[is_ml_scientist]

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
1,2020,SE,FT,Machine Learning Scientist,260000,JP,0,JP,S
115,2021,EN,FT,Machine Learning Scientist,225000,US,100,US,L
126,2021,SE,FT,Machine Learning Scientist,120000,US,50,US,S
184,2021,MI,FL,Machine Learning Scientist,12000,PK,50,PK,M
224,2021,SE,FT,Machine Learning Scientist,225000,US,100,CA,L
419,2022,MI,FT,Machine Learning Scientist,160000,US,100,US,L
420,2022,MI,FT,Machine Learning Scientist,112300,US,100,US,L
495,2022,MI,FT,Machine Learning Scientist,153000,US,50,US,M


## Average salaries for each employee residence

In [106]:
# Ten employee-residence codes with the highest mean USD salary, as a Series
# indexed by residence code. Only the salary column is averaged (text columns
# make mean() raise on pandas >= 2.0).
mean_df = (
    df.groupby('employee_residence')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# Use the computed residence codes as bar labels instead of a hard-coded list.
px.bar(mean_df.reset_index(), x='employee_residence', y='salary_in_usd')

In [167]:
# Ten employee-residence codes with the highest mean USD salary among
# entry-level ('EN') rows, as a Series indexed by residence code. Only the
# salary column is averaged (text columns make mean() raise on pandas >= 2.0).
mean_df = (
    df.query("experience_level == 'EN'")
      .groupby('employee_residence')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# Use the computed residence codes as bar labels instead of a hard-coded list.
px.bar(mean_df.reset_index(), x='employee_residence', y='salary_in_usd')

## Average salaries for PH employees

In [134]:
# All rows whose employee residence code is 'PH'.
is_ph_resident = df['employee_residence'] == 'PH'
df[is_ph_resident]

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
40,2020,MI,FT,Data Scientist,45760,PH,100,US,S


## Relationship between remote ratio and salaries

In [142]:
# Mean USD salary per remote ratio, highest first, as a Series indexed by
# remote ratio. Only the salary column is averaged (text columns make mean()
# raise on pandas >= 2.0).
mean_df = (
    df.groupby('remote_ratio')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# The ratio values are cast to str so plotly receives text labels, as it did
# with the previous hand-written ['100', '0', '50'] list -- but now the order
# is derived from the sorted result instead of being typed in.
px.bar(
    mean_df.reset_index().astype({'remote_ratio': str}),
    x='remote_ratio',
    y='salary_in_usd',
)

In [169]:
# Mean USD salary per remote ratio for entry-level ('EN') rows, highest first,
# as a Series indexed by remote ratio. Only the salary column is averaged
# (text columns make mean() raise on pandas >= 2.0).
mean_df = (
    df.query("experience_level == 'EN'")
      .groupby('remote_ratio')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# The ratio values are cast to str so plotly receives text labels; the label
# order now follows the sorted result rather than a hand-typed list.
px.bar(
    mean_df.reset_index().astype({'remote_ratio': str}),
    x='remote_ratio',
    y='salary_in_usd',
)

## Average salaries for each company location

In [145]:
# Ten company-location codes with the highest mean USD salary, as a Series
# indexed by location code. Only the salary column is averaged (text columns
# make mean() raise on pandas >= 2.0).
mean_df = (
    df.groupby('company_location')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# Bar labels come from the computed index, not from a hand-typed code list.
px.bar(mean_df.reset_index(), x='company_location', y='salary_in_usd')

In [172]:
# Ten company-location codes with the highest mean USD salary among
# entry-level ('EN') rows, as a Series indexed by location code. Only the
# salary column is averaged (text columns make mean() raise on pandas >= 2.0).
mean_df = (
    df.query("experience_level == 'EN'")
      .groupby('company_location')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# Bar labels come from the computed index, not from a hand-typed code list.
px.bar(mean_df.reset_index(), x='company_location', y='salary_in_usd')

## Average relationship between company size and salaries

In [147]:
# Mean USD salary per company-size code, highest first, as a Series indexed
# by size code. Only the salary column is averaged (text columns make mean()
# raise on pandas >= 2.0).
mean_df = (
    df.groupby('company_size')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# The size codes on the x-axis are read from the sorted result, so the bar
# order always matches the data instead of a hard-coded ['L', 'M', 'S'].
px.bar(mean_df.reset_index(), x='company_size', y='salary_in_usd')

In [175]:
# Mean USD salary per company-size code for entry-level ('EN') rows, highest
# first, as a Series indexed by size code. Only the salary column is averaged
# (text columns make mean() raise on pandas >= 2.0).
mean_df = (
    df.query("experience_level == 'EN'")
      .groupby('company_size')['salary_in_usd']
      .mean()
      .sort_values(ascending=False)
      .head(10)
)

# The size codes on the x-axis are read from the sorted result rather than
# from a hand-typed order.
px.bar(mean_df.reset_index(), x='company_size', y='salary_in_usd')

Some inferred conclusions:

For entry-level positions, apply for:

1. Either contractual or full-time positions

2. The Machine Learning Scientist job title occupies the top position for an entry-level job, with an average of 225k USD (note: this average is based on a single entry-level record in the dataset).

3. Moving from a 0% to a 50% remote ratio increases the average salary by about 3k USD over the 57k USD baseline at 0% remote ratio.

4. The top 5 company locations to apply to are: Australia, China, Algeria, Iraq, and the US.

5. Pick the largest company. However, smaller companies seem to pay more than medium-sized companies.