# Data Science Jobs Salary
This project focuses on salary analysis for various data science jobs in the US.

### End-to-end Project
- Data Collection
- Data Cleaning
- EDA
- Model Building
- Production

## Data Cleaning

### TODO:
- understand what kind of data we're dealing with
- salary parsing (delete Glassdoor part, delete NULL)
- company name, text only
- state field, delete city name
- whether the job location is the company's headquarters
- age of company
- parsing job description

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

def print_full(x):
    """Print a pandas object without truncating its rows.

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Object to print in full.

    The ``display.max_rows`` limit is lifted only while printing. The value
    that was active before the call (which may be a user-chosen value, not
    the pandas default) is restored afterwards, even if printing raises.
    """
    # None means "no row limit"; option_context restores the previous setting on exit.
    with pd.option_context('display.max_rows', None):
        print(x)

In [None]:
# Read the Glassdoor postings CSV from a path relative to the notebook
# (presumably the Kaggle input directory layout) and preview the first rows.
csv_path = "../input/glassdoor-data-science-jobs-salary-dataset/glassdoor_jobs.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
data.shape

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved row index from the CSV export)
data = data.drop(columns=['Unnamed: 0'])

In [None]:
## Salary Parsing

In [None]:
# Drop the rows (not the column) whose 'Salary Estimate' is the '-1' placeholder.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells do not raise SettingWithCopyWarning on a slice.
data = data[data['Salary Estimate'] != '-1'].copy()
data.shape

In [None]:
data['Salary Estimate']

In [None]:
# 0/1 indicator columns derived from the raw 'Salary Estimate' text:
#   hourly            -> the text mentions 'per hour'
#   employer_provided -> the text mentions 'employer provided salary'
# Matching is case-insensitive.
data['hourly'] = data['Salary Estimate'].map(
    lambda text: int('per hour' in text.lower())
)
data['employer_provided'] = data['Salary Estimate'].map(
    lambda text: int('employer provided salary' in text.lower())
)

data.head()

In [None]:
# Reduce each 'Salary Estimate' string to its bare "min-max" range:
#   1. keep only the text before the first '('
#   2. remove the 'K' and '$' symbols
#   3. lower-case, then remove the 'per hour' and 'employer provided salary:' markers
salary = (
    data['Salary Estimate']
    .str.split('(').str[0]
    .str.replace('K', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.lower()
    .str.replace('per hour', '', regex=False)
    .str.replace('employer provided salary:', '', regex=False)
)
salary.head()

In [None]:
# Split every cleaned "min-max" range on '-' and store both bounds as integers,
# plus their midpoint.
salary_bounds = salary.str.split('-')
data['min_salary'] = salary_bounds.str[0].map(int)
data['max_salary'] = salary_bounds.str[1].map(int)
data['mean_salary'] = data[['min_salary', 'max_salary']].mean(axis=1)

# Sanity check: dtype of the parsed bound and a sample of the three new columns
print(data['min_salary'].dtype)
print(data[['min_salary', 'max_salary', 'mean_salary']].head(20))

In [None]:
# Company Name
# Text only

In [None]:
# Build a text-only company name.
# Rows with a negative Rating keep 'Company Name' unchanged; every other row
# loses its last three characters (presumably the rating appended to the name).
data['company_txt'] = [
    name if rating < 0 else name[:-3]
    for name, rating in zip(data['Company Name'], data['Rating'])
]
data

In [None]:
# State Field

In [None]:
# Keep only the text after the last comma of 'Location'.
# Surrounding whitespace is not stripped here.
data["job_state"] = data['Location'].str.split(',').str[-1]
data['job_state'].value_counts()

In [None]:
# Age of company: 2021 minus the founding year.
# 'Founded' values below 1 are kept unchanged instead of being subtracted
# (presumably placeholders for an unknown founding year).
data['age'] = data['Founded'].where(data['Founded'] < 1, 2021 - data['Founded'])
data['age']

In [None]:
# 1 when the posting's 'Location' equals the company's 'Headquarters', else 0.
# The two strings are compared whole, not just their state part.
data['same_state'] = (data['Location'] == data['Headquarters']).astype('int64')
data['same_state']

In [None]:
# Parse the job descriptions into 0/1 skill flags.
# Each flag column maps to a case-insensitive regular expression.
# 'aws' and 'excel' are matched as whole words only: as plain substrings they
# also fire on words such as "laws"/"draws" and "excellent"/"excellence",
# which inflates those two flags.
skill_patterns = {
    'python_yn': r'python',
    'R_yn': r'r[ -]studio',   # 'r studio' or 'r-studio'
    'spark': r'spark',
    'aws': r'\baws\b',
    'excel': r'\bexcel\b',
}
for skill_col, skill_pattern in skill_patterns.items():
    # na=False: a missing description counts as "skill not mentioned" instead of raising
    data[skill_col] = (
        data['Job Description']
        .str.contains(skill_pattern, case=False, regex=True, na=False)
        .astype('int64')
    )

# Print the same summaries as before (python_yn was never printed)
for skill_col in ('R_yn', 'spark', 'aws', 'excel'):
    print(data[skill_col].value_counts())

In [None]:
data.Industry.value_counts()

In [None]:
data.Sector.value_counts()

In [None]:
# Bind the shorter name 'df' to the same DataFrame object as 'data'.
# This is plain assignment, not a copy, so columns added through 'df' below
# are also visible through 'data'.
df = data # easier to work with 'df' keyword
df.columns

In [None]:
print_full(df['Job Title'].value_counts())

In [None]:
def title_simplifier(title):
    """Collapse a raw job title into one coarse role label.

    The title is lower-cased and checked against the keywords below in order.
    The first keyword found as a substring decides the label. Returns 'na'
    when no keyword occurs.
    """
    lowered = title.lower()
    # Order matters: a title containing both 'data scientist' and 'manager'
    # is labelled 'data scientist'.
    keyword_labels = (
        ('data scientist', 'data scientist'),
        ('data engineer', 'data engineer'),
        ('analyst', 'analyst'),
        ('machine learning', 'mle'),
        ('manager', 'manager'),
        ('director', 'director'),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return 'na'
    
def seniority(title):
    """Classify a job title by seniority level.

    Matching is a case-insensitive substring test on the title.

    Returns
    -------
    str
        'senior' if the title contains 'sr', 'senior', 'lead' or 'principal';
        otherwise 'jr' if it contains 'junior' or 'jr'; otherwise 'na'.
        Senior markers are checked first, so they win when both kinds occur.
    """
    lowered = title.lower()
    if any(marker in lowered for marker in ('sr', 'senior', 'lead', 'principal')):
        return 'senior'
    # 'jr' already covers 'jr.', so no separate check is needed for the dotted form.
    if any(marker in lowered for marker in ('junior', 'jr')):
        return 'jr'
    return 'na'

In [None]:
# Job title and seniority 
		
# Fix state Los Angeles 

# Job description length 

# Competitor count

# Hourly wage to annual 

# Remove new line from job title

In [None]:
# Coarse role label for every posting, then the number of postings per label
df['job_simp'] = df['Job Title'].map(title_simplifier)
df['job_simp'].value_counts()

In [None]:
# Seniority label ('senior' / 'jr' / 'na') for every posting, then the count per label
df['seniority'] = df['Job Title'].map(seniority)
df['seniority'].value_counts()

In [None]:
df.job_state

In [None]:
# Normalise job_state: trim surrounding whitespace, then map the value
# 'los angeles' (any letter case) to 'CA'.
df['job_state'] = df['job_state'].str.strip()
df.loc[df['job_state'].str.lower() == 'los angeles', 'job_state'] = 'CA'
df['job_state'].value_counts()

In [None]:
# Length of each job description, in characters
df['desc_len'] = df['Job Description'].map(len)
df['desc_len']

In [None]:
# Number of competitors: 0 for the '-1' placeholder, otherwise the number of
# comma-separated entries in 'Competitors'.
df['num_comp'] = df['Competitors'].map(
    lambda names: 0 if names == '-1' else len(names.split(','))
)
df['Competitors']

In [None]:
# Looking at the number of competitors
df[df['num_comp'] != 0].num_comp.head(20)

In [None]:
df[df.hourly == 1][['hourly','min_salary','max_salary']]

In [None]:
# Convert hourly wages to annual salaries, expressed in thousands like the other rows.
# A full-time year is 40 hours/week * 52 weeks = 2080 hours (the previous factor used 53 weeks).
hours_per_year = 40 * 52
hourly_rows = df['hourly'] == 1
for bound_col in ('min_salary', 'max_salary'):
    # Keep non-hourly rows as they are; scale only the hourly ones.
    df[bound_col] = df[bound_col].where(~hourly_rows, df[bound_col] * hours_per_year / 1000)

# mean_salary was computed before this conversion. Refresh it so hourly rows
# no longer keep a per-hour mean while every other row is an annual figure.
df['mean_salary'] = (df['min_salary'] + df['max_salary']) / 2

In [None]:
df[df.hourly == 1][['hourly','min_salary','max_salary']]

In [None]:
# Remove every newline character from the text-only company name
df['company_txt'] = df['company_txt'].str.replace('\n', '', regex=False)
df['company_txt']

## Exploratory Data Analysis

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
# Histograms of the main numeric columns
df[['Rating', 'mean_salary', 'age', 'desc_len']].hist()

In [None]:
# Box plots of age, mean_salary and Rating on a shared axis.
# Observation: age has many outliers, and so does mean_salary.
df[['age', 'mean_salary', 'Rating']].boxplot()

In [None]:
# Rating on its own axis. Observation: rating has negative outliers.
df.boxplot(column=['Rating'])

In [None]:
# Pairwise correlations between the main numeric columns
df.loc[:, ['age', 'mean_salary', 'Rating', 'desc_len']].corr()

In [None]:
# Heatmap of the numeric correlations, drawn with a diverging palette centred on 0
corr_matrix = df[['age', 'mean_salary', 'Rating', 'desc_len', 'num_comp']].corr()
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(
    corr_matrix,
    cmap=cmap,
    center=0,
    vmax=.3,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

In [None]:
# Categorical and flag columns to summarise with bar charts
categorical_cols = [
    'Location', 'Headquarters', 'Size', 'Type of ownership', 'Industry',
    'Sector', 'Revenue', 'company_txt', 'job_state', 'same_state',
    'python_yn', 'R_yn', 'spark', 'aws', 'excel', 'job_simp', 'seniority',
]
df_cat = df[categorical_cols]

In [None]:
# One bar chart per categorical column: how often each distinct value occurs
for col_name in df_cat.columns:
    value_freq = df_cat[col_name].value_counts()
    print(f"graph for {col_name}: total = {len(value_freq)}")
    sns.barplot(x=value_freq.index, y=value_freq)
    # Rotate the category labels so long names stay readable
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Same charts for the three high-cardinality columns, limited to the 20 most frequent values
for col_name in ['Location', 'Headquarters', 'company_txt']:
    top_freq = df_cat[col_name].value_counts()[:20]
    print(f"graph for {col_name}: total = {len(top_freq)}")
    sns.barplot(x=top_freq.index, y=top_freq)
    # Rotate the category labels so long names stay readable
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Average mean_salary per simplified job role
df.pivot_table(index='job_simp', values='mean_salary')

In [None]:
# Average mean_salary per job role, broken down by seniority level
df.pivot_table(index=['job_simp', 'seniority'], values='mean_salary')

In [None]:
# Which state / role combinations pay the most: average mean_salary, highest first
state_role_mean = df.pivot_table(index=['job_state', 'job_simp'], values='mean_salary')
print_full(state_role_mean.sort_values('mean_salary', ascending=False))

In [None]:
# Number of job postings per state / role combination, most frequent first.
# The count is taken over the mean_salary column.
state_role_count = df.pivot_table(
    index=['job_state', 'job_simp'], values='mean_salary', aggfunc='count'
)
print_full(state_role_count.sort_values('mean_salary', ascending=False))

In [None]:
# Average 'data scientist' mean_salary per state, highest first
data_scientists = df[df['job_simp'] == 'data scientist']
data_scientists.pivot_table(index='job_state', values='mean_salary').sort_values(
    'mean_salary', ascending=False
)

In [None]:
# For each listed feature, print the average mean_salary per feature value, highest first
pivot_features = [
    'Rating', 'Industry', 'Sector', 'Revenue', 'num_comp', 'hourly',
    'employer_provided', 'python_yn', 'R_yn', 'spark', 'aws', 'excel',
    'Type of ownership',
]
df_pivots = df[pivot_features]

for feature in df_pivots.columns:
    print(feature)
    feature_mean = df.pivot_table(index=feature, values='mean_salary')
    print(feature_mean.sort_values('mean_salary', ascending=False))

In [None]:
# Is Python used more in companies with higher revenue?
# Counts postings per Revenue bracket, split by the python_yn flag.
df.pivot_table(index='Revenue', columns='python_yn', values='mean_salary', aggfunc='count')