# Predicting the salaries of US data scientists

#### If the salaries of data scientists are determined by factors such as education, experience, and job title, then we should be able to make hiring managers' lives easier by building a predictive model. This is a first attempt at building such a model.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the raw Kaggle 2021 survey responses into a DataFrame
survey_csv_path = '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
data = pd.read_csv(survey_csv_path)

In [None]:
# Pick the initial features by position: columns 1 through 19 plus column 127
selected_positions = [*range(1, 20), 127]
df = data.iloc[:, selected_positions]

In [None]:
# Friendlier names for the 20 selected columns, in positional order
cols = [
    'Age', 'Gender', 'Country_of_residence', 'Education', 'Job_title',
    'Experience',
    'Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia',
    'Swift', 'Bash', 'MATLAB', 'None', 'Other',
    'Salary_range',
]

df.columns = cols

In [None]:
# Row label 0 holds the question text rather than a response; remove it
df = df.drop(index=0)

In [None]:
# Restrict to respondents residing in the United States
is_usa = df['Country_of_residence'] == 'United States of America'
df = df[is_usa]

In [None]:
# Keep only respondents who reported a salary range. The explicit copy makes
# df an independent DataFrame rather than a filtered slice, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['Salary_range'].notna()].copy()

In [None]:
# Inspect the distinct categories (and their counts) of each categorical column
cat_cols = df.select_dtypes(include=['object', 'category']).columns

for column in cat_cols:
    category_counts = df[column].value_counts()
    print(column, '***********', sep='\n')
    print(category_counts)
    print('--'*20)

In [None]:
# First step toward a numeric salary: split each range string on the hyphen
# into its lower part (column 0) and upper part (column 1)
temp1 = df['Salary_range'].str.split(pat='-', expand=True)
df['Salary_min'], df['Salary_max'] = temp1[0], temp1[1]

In [None]:
# some cleanups
def replace_symbols(x):
    """Return the string x with every ',', '$' and '>' character removed."""
    for symbol in (',', '$', '>'):
        x = x.replace(symbol, '')
    return x

# Strip the formatting symbols from the lower bound, then store it as integers
min_without_symbols = df['Salary_min'].apply(replace_symbols)
df['Salary_min'] = min_without_symbols.astype('int')

In [None]:
# additional cleanups
def clean_text(x):
    """Return a placeholder for a missing salary bound, else x unchanged.

    Missing values (None, NaN, pd.NA) are replaced by the string '0,000',
    which the subsequent symbol stripping and integer cast turn into 0.
    Any other value is returned as-is.
    """
    # pd.isna covers None as well as NaN / pd.NA; comparing with `== None`
    # misses NaN and raises on pd.NA.
    if pd.isna(x):
        return '0,000'
    return x

# Fill missing upper bounds with the placeholder, strip the formatting
# symbols, and store the result as integers
df['Salary_max'] = (
    df['Salary_max']
    .apply(clean_text)
    .apply(replace_symbols)
    .astype('int')
)

In [None]:
# Point estimate of the salary: the middle of the reported range.
# NOTE(review): the +1 presumably turns a range such as 1,000-1,999 into a
# round midpoint of 1,500 -- confirm against the survey's range labels.
range_total = df['Salary_min'] + df['Salary_max'] + 1
df['Salary_est'] = range_total / 2

In [None]:
# Keep rows whose estimated salary lies strictly between 30,000 and 300,000.
# The explicit copy makes df an independent DataFrame rather than a filtered
# slice, so later column writes do not trigger SettingWithCopyWarning.
df = df[(df['Salary_est']>30000) & (df['Salary_est']<300000)].copy()

In [None]:
# Recode each language column as a 0/1 indicator: a cell holding the
# language's own name becomes 1 and a missing cell becomes 0.
languages = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript',
             'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other']

for lang in languages:
    # Assign the result back instead of calling replace(inplace=True) on
    # df[lang]: the in-place call operates on an intermediate Series
    # (chained assignment), which recent pandas versions warn about and,
    # under Copy-on-Write, does not update df at all.
    df[lang] = df[lang].replace({lang: 1, np.nan: 0})

## Some visualization

In [None]:
# Distribution of the estimated salary for each gender
plt.figure(figsize=(10, 5))
sns.boxplot(
    data=df,
    x='Salary_est',
    y='Gender',
);

In [None]:
# Distribution of the estimated salary for each education level
plt.figure(figsize=(10, 5))
sns.boxplot(
    data=df,
    x='Salary_est',
    y='Education',
);

In [None]:
# Distribution of the estimated salary for each job title
plt.figure(figsize=(10, 5))
sns.boxplot(
    data=df,
    x='Salary_est',
    y='Job_title',
);

In [None]:
# Distribution of the estimated salary for each experience bracket
plt.figure(figsize=(10, 5))
sns.boxplot(
    data=df,
    x='Salary_est',
    y='Experience',
);

In [None]:
# Distribution of the estimated salary for each age group.
# This cell used to repeat the Experience box plot verbatim; Age is the one
# varying feature that had no plot, so it is shown here instead.
plt.figure(figsize=(10,5))
sns.boxplot(x='Salary_est', y = 'Age', data = df);

## Base model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Feature matrix: everything except Gender, the salary columns (the raw range
# and the values derived from it, which would leak the target), and
# Country_of_residence, which is constant after the USA filter and therefore
# contributes no columns once one-hot encoded with drop_first=True.
# 'Salary_range' was previously listed twice in the drop list.
X = df.drop(
    columns=['Gender', 'Country_of_residence', 'Salary_range',
             'Salary_min', 'Salary_max', 'Salary_est'],
)
# Target: the midpoint salary estimate
y = df['Salary_est']

In [None]:
# One-hot encode the categorical features, dropping each feature's first level
X = pd.get_dummies(data=X, drop_first=True)

In [None]:
# Hold out a test set; the fixed random_state keeps the split reproducible
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create the linear-regression baseline and fit it on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict salaries for the held-out rows
y_pred = model.predict(X_test)

# Mean squared error on the test split
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Root mean squared error on the test split
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

In [None]:
# Coefficient of determination (R^2) on the test split
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute error on the test split
mean_absolute_error(y_true=y_test, y_pred=y_pred)

### Improved models forthcoming ........