In [23]:
from pycaret.regression import *
import pandas as pd, numpy as np


In [24]:
# Load the raw salary survey from the Kaggle input directory.
DATA_PATH = '/kaggle/input/data-science-job-salaries/ds_salaries.csv'
df = pd.read_csv(DATA_PATH)
# Show the first rows to confirm the file parsed as expected.
df.head()

In [25]:
# Recode the abbreviated category codes into human-readable labels.
#
# `Series.replace` is used rather than `Series.map`: `map` converts every value
# that is missing from the dict into NaN, so re-running this cell on the already
# expanded labels would silently wipe all three columns. With `replace`, values
# not listed in the dict are left untouched, which makes the cell idempotent.
CATEGORY_LABELS = {
    'experience_level': {
        'EN': 'Entry-level/Junior',
        'MI': 'Mid-level/Intermediate',
        'SE': 'Senior-level/Expert',
        'EX': 'Executive-level/Director',
    },
    'employment_type': {
        'PT': 'Part-time',
        'FT': 'Full-time',
        'CT': 'Contract',
        'FL': 'Freelance',
    },
    'company_size': {
        'M': 'Medium',
        'L': 'Large',
        'S': 'Small',
    },
}
for column, labels in CATEGORY_LABELS.items():
    df[column] = df[column].replace(labels)
# Visualize DF
df.head()

In [26]:
# Remove the 'salary' and 'salary_currency' columns from the frame.
df = df.drop(columns=['salary', 'salary_currency'])
df.head()

In [27]:
# Clean career values: collapse the raw job titles into a few job families.
#
# Rules are applied in order and a later rule overwrites an earlier one, so a
# title such as 'Lead Data Scientist' ends up as 'Manager'. Titles that match
# no rule are left missing (NaN) in `job_title_clean`.

# Titles mapped to 'Machine Learning Engineer' by exact equality.
ML_ENGINEER_EXACT_TITLES = ['ML Engineer', 'NLP Engineer']

# (substring searched for in `job_title`, cleaned label), in application order.
# The 'Machine Learning' substring already covers 'Machine Learning Developer'
# and 'Machine Learning Scientist', so those need no rule of their own.
JOB_TITLE_RULES = [
    ('Machine Learning', 'Machine Learning Engineer'),
    ('Data Scientist', 'Data Scientist'),
    ('Data Analyst', 'Data Analyst'),
    ('Data Engineer', 'Data Engineer'),
    ('Manager', 'Manager'),
    ('Lead', 'Manager'),
    ('Director', 'Manager'),
    ('Head', 'Manager'),
]

# Build the column from scratch so that re-running the cell after editing the
# rules never keeps labels assigned by a previous run.
job_title_clean = pd.Series(np.nan, index=df.index, dtype=object)
job_title_clean.loc[df['job_title'].isin(ML_ENGINEER_EXACT_TITLES)] = 'Machine Learning Engineer'

for pattern, label in JOB_TITLE_RULES:
    # regex=False: the patterns are literal text, not regular expressions.
    # na=False: a missing job title matches nothing instead of producing a NaN
    # entry in the mask (which boolean indexing rejects).
    mask = df['job_title'].str.contains(pattern, regex=False, na=False)
    job_title_clean.loc[mask] = label

df['job_title_clean'] = job_title_clean

df.job_title_clean.value_counts()

In [28]:
# Drop the 'Unnamed: 0' column (presumably the row index saved into the CSV).
df = df.drop(columns='Unnamed: 0')
df.head()

In [29]:
# Set aside the last rows of the frame as data the model never sees in training.
holdout_rows = 122
unseen_data = df.iloc[-holdout_rows:]
# Keep everything except those held-out rows for modelling.
df = df.drop(index=unseen_data.index)
df

# **Prediction of Salaries Using Pycaret**

In [31]:
from pycaret.regression import *

# Category orderings used for ordinal encoding, lowest level first.
experience_order = ['Entry-level/Junior', 'Mid-level/Intermediate',
                    'Senior-level/Expert', 'Executive-level/Director']
company_size_order = ['Small', 'Medium', 'Large']

setup(
    # Data and target
    data = df,                   # frame used for the training/validation split
    target = 'salary_in_usd',    # variable to predict
    # Reproducibility and cross-validation
    session_id = 4588,
    fold = 5,
    # Preprocessing
    normalize = True,            # normalize the data
    transformation = True,       # transform the distribution of the other features
    transform_target = True,     # transform the target towards a normal/gaussian distribution
    remove_outliers = True,      # remove outliers
    # Encoding
    ordinal_features = {'experience_level': experience_order,
                        'company_size': company_size_order},  # ordinal encoding
    high_cardinality_features = ['employee_residence'],       # high-cardinality encoding
)

In [32]:
# Compare the available regressors, ranked by MAE, and keep the three best.
top_3 = compare_models(
    sort = 'MAE',
    n_select = 3,
)

In [33]:
# Train the model registered under the PyCaret id 'br' (Bayesian Ridge).
br = create_model('br')

In [34]:
# Train the model registered under the PyCaret id 'ridge' (Ridge Regression).
ridge = create_model('ridge')

In [40]:
# Train the model registered under the PyCaret id 'rf' (Random Forest).
rf = create_model('rf')

In [37]:
# Hyper-parameter search for the 'br' model: 1000 iterations, optimising MAE.
tuned_br = tune_model(
    br,
    optimize = 'MAE',
    n_iter = 1000,
)

In [38]:
# Hyper-parameter search for the 'ridge' model: 1000 iterations, optimising MAE.
tuned_ridge = tune_model(
    ridge,
    optimize = 'MAE',
    n_iter = 1000,
)

In [41]:
# Hyper-parameter search for the 'rf' model: 1000 iterations, optimising MAE.
tuned_rf = tune_model(
    rf,
    optimize = 'MAE',
    n_iter = 1000,
)

In [43]:
# Blend the three tuned regressors, evaluated on 10 folds and optimising MAE.
# NOTE(review): with choose_better = True the returned object may not be the
# blend itself — confirm against the PyCaret documentation.
base_models = [tuned_br, tuned_ridge, tuned_rf]
blended_model = blend_models(
    estimator_list = base_models,
    optimize = 'MAE',
    fold = 10,
    choose_better = True,
)

In [44]:
# Tune the blended model itself: 1000 iterations, optimising MAE.
# NOTE(review): no later cell reads `tune_blended_model`; the following cells
# keep working with `blended_model`.
tune_blended_model = tune_model(
    blended_model,
    optimize = 'MAE',
    n_iter = 1000,
    choose_better = True,
)

**Best Model: Blended_Model**

In [45]:
# Inspect the selected model with PyCaret's evaluate_model.
evaluate_model(blended_model)

In [52]:
# Rank the training features by the absolute size of the fitted coefficients.
feature_names = get_config('X_train').columns
coefficients = getattr(blended_model, 'coef_', None)
if coefficients is None:
    # Estimators without linear coefficients cannot be ranked this way;
    # fail with an explicit message instead of an opaque attribute lookup error.
    raise AttributeError(
        f"{type(blended_model).__name__} exposes no 'coef_' attribute; "
        "a coefficient-based ranking needs a linear model."
    )

# Flatten so that both (n_features,) and (1, n_features) coefficient arrays give
# one value per feature. Indexing `coef_[0]` on a 1-D array would instead pick a
# single scalar and broadcast it to every feature.
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Value': np.abs(np.ravel(coefficients)),
}).sort_values(by='Value', ascending=False)
feature_importance
# Built-in alternative: plot_model(blended_model, plot = 'feature')

The 5 most important features for predicting salary in USD were:


Employee Residence

Company Location in India

Job Title: Data Analyst

Company Location in Japan

Job Title: Principal Data Engineer

In [47]:
# Fetch the 'y_test' object from PyCaret's stored configuration
# (presumably the hold-out target values — confirm) and display it.
ytest = get_config('y_test')
ytest

In [48]:
# Predicting on validation sample: no `data` argument is passed, so the
# prediction runs on the sample PyCaret held out during setup.
predict_model(blended_model)

MAE Score on hold-out sample: 30038.8795

In [49]:
# Finalizing model
# NOTE(review): presumably refits the model on the full setup data, hold-out
# included — confirm against the PyCaret documentation.
final_blended_model = finalize_model(blended_model)
# Print the finalized object so its configuration is visible in the output.
print(final_blended_model)

Prediction

In [50]:
# Run the finalized model on the rows that were set aside before setup().
unseen_predictions = predict_model(
    final_blended_model,
    data = unseen_data,
)

In [51]:
# Show actual salary next to the model's prediction for the first 10 unseen rows,
# rounded to whole numbers.
comparison_columns = ['salary_in_usd', 'Label']
unseen_predictions.loc[:, comparison_columns].head(10).round(0)

MAE Score on Unseen Data: 33588.8826
