# Predicting the salary of a data science job position

In [1]:
# Import Statements
import pandas as pd
import numpy as np
import matplotlib as plt

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings raised by the cleaning cells.
warnings.simplefilter('ignore')

## Dataset

In [2]:
# Read the job postings CSV (path relative to the working directory) into a DataFrame.
raw_data_path = 'glassdoor_jobs.csv'
dataset = pd.read_csv(raw_data_path)

## Data Cleaning

In [3]:
# Drop the rows whose 'Salary Estimate' is the literal string '-1'
# (presumably the placeholder for "no estimate listed" - confirm against the CSV).
# The explicit .copy() makes `dataset` an independent frame, so the column
# assignments in the following cells never write to a slice of the raw data.
dataset = dataset[dataset['Salary Estimate'] != '-1'].copy()

In [4]:
# Salary parsing
# Record how each estimate was quoted (0/1 flags) before the text is reduced to numbers.
salary_text = dataset['Salary Estimate'].str.lower()
dataset['hourly'] = salary_text.str.contains('per hour', regex=False).astype(int)
dataset['emp_provided'] = salary_text.str.contains(
    'employer provided salary:', regex=False).astype(int)

In [5]:
# Keep only the text before the first '(' of each estimate.
salary = dataset['Salary Estimate'].str.split('(', n=1).str[0]

In [6]:
# Strip the 'K' and '$' symbols, then lowercase and remove the textual labels,
# leaving only the numeric range text.
temp = salary.str.replace('K', '', regex=False).str.replace('$', '', regex=False)
minus_hr = temp.str.lower().str.replace('per hour', '', regex=False)
# minus_hr is already lowercase, so the label can be removed directly.
minus_emp_prov = minus_hr.str.replace('employer provided salary:', '', regex=False)

In [7]:
def _salary_bound(salary_range, position):
    """Return the integer found at `position` of a '-'-separated salary string."""
    return int(salary_range.split('-')[position])


# NOTE(review): rows flagged `hourly` are parsed exactly like the others;
# nothing in this cell rescales them - confirm how downstream code treats them.
dataset['min_salary'] = minus_emp_prov.apply(_salary_bound, args=(0,))
dataset['max_salary'] = minus_emp_prov.apply(_salary_bound, args=(1,))

In [8]:
# Midpoint of the parsed salary range.
salary_floor = dataset['min_salary']
salary_ceiling = dataset['max_salary']
dataset['average_salary'] = (salary_floor + salary_ceiling) / 2

In [9]:
# Company Name
def _company_without_rating(row):
    """Return the company name with its trailing rating text removed.

    Rows with a negative 'Rating' keep 'Company Name' unchanged. Otherwise the
    last three characters are dropped, as before (presumably the rating such
    as '3.8' appended to the name - confirm against the CSV), and any
    whitespace or newline left in front of that suffix is stripped so it
    does not linger at the end of the name.
    """
    name = row['Company Name']
    if row['Rating'] < 0:
        return name
    return name[:-3].rstrip()


dataset['company_txt'] = dataset.apply(_company_without_rating, axis=1)

In [10]:
# State field
# Take the last comma-separated part of 'Location' and strip the surrounding
# whitespace. Compared with taking part [1] unstripped, this:
#   * removes the leading space that follows the comma ('City, ST' -> 'ST'),
#   * picks the final part when a location has more than one comma,
#   * falls back to the whole value instead of raising IndexError when
#     there is no comma at all.
# NOTE(review): assumes the state is always the last component - confirm
# against the CSV.
dataset['job_state'] = dataset['Location'].apply(
    lambda location: location.split(',')[-1].strip())

In [11]:
# 1 when 'Location' and 'Headquarters' hold exactly the same string, else 0.
# NOTE(review): despite the column name, this compares the full strings.
location_matches_hq = dataset['Location'] == dataset['Headquarters']
dataset['same_state'] = location_matches_hq.astype(int)

In [12]:
# Year the company age is measured against.
REFERENCE_YEAR = 2021


def _company_age(founded_year):
    """Return REFERENCE_YEAR minus `founded_year`; values below 1 pass through unchanged."""
    if founded_year < 1:
        return founded_year
    return REFERENCE_YEAR - founded_year


dataset['age'] = dataset['Founded'].apply(_company_age)

In [13]:
# Parsing the job descriptions
# Lowercase the text once instead of once per keyword.
job_text = dataset['Job Description'].str.lower()


def _mentions(pattern, *, regex=False):
    """Return a 0/1 Series marking which job descriptions contain `pattern`.

    `pattern` is matched against the lowercased description, as a literal
    substring by default or as a regular expression when `regex` is True.
    Rows with a missing description are flagged 0.
    """
    return job_text.str.contains(pattern, regex=regex, na=False).astype(int)


dataset['python'] = _mentions('python')
# Whole-word match for 'r studio' / 'r-studio' / 'rstudio'; a plain substring
# test for 'r studio' also fires on text such as "our studio".
dataset['r_lang'] = _mentions(r'\br[ -]?studio\b', regex=True)
# Kept as a substring test so compound names such as 'pyspark' still count.
dataset['spark'] = _mentions('spark')
# Whole-word matches: a substring test for 'aws' also fires on "laws" and
# "draws", and one for 'excel' fires on "excellent".
dataset['aws'] = _mentions(r'\baws\b', regex=True)
dataset['excel'] = _mentions(r'\bexcel\b', regex=True)

In [15]:
# Remove the 'Unnamed: 0' column in place
# (presumably a stale index column written by an earlier CSV export).
del dataset['Unnamed: 0']

In [17]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
cleaned_data_path = 'salary_data_cleaned.csv'
dataset.to_csv(cleaned_data_path, index=False)