<a href="https://colab.research.google.com/github/sanhiitaa/salary-prediction/blob/main/salary_prediction_final_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [68]:
# data handling and visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# data preparation libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_regression

# model
from sklearn.ensemble import GradientBoostingRegressor

# evaluation
from sklearn import metrics

# Pipeline and transformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# random seed
np.random.seed(1111)

In [69]:
# loading the dataset
df=pd.read_csv('/content/salary_data_cleaned.csv')
df.head()

Unnamed: 0,SEX,DESIGNATION,AGE,SALARY,UNIT,LEAVES USED,LEAVES REMAINING,RATINGS,PAST EXP,years_experience
0,F,Analyst,21.0,44570,Finance,24.0,6.0,2.0,0,2
1,F,Associate,25.0,89207,Web,22.0,13.0,2.0,7,3
2,F,Analyst,21.0,40955,Finance,23.0,7.0,3.0,0,2
3,F,Analyst,22.0,45550,IT,22.0,8.0,3.0,0,3
4,M,Analyst,25.0,43161,Operations,27.0,3.0,2.0,3,2


In [70]:
df=df.drop(columns=['AGE'])

# Splitting the data

In [71]:
# splitting the data into dependent and independent variables
X=df.drop('SALARY',axis=1)
y=df['SALARY']

In [72]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [73]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1734, 8), (744, 8), (1734,), (744,))

# Creating a pipeline

In [75]:
# preprocessing transformer
ordinal_columns = ['DESIGNATION']
desired_order= [['Analyst', 'Associate', 'Senior Analyst', 'Manager', 'Senior Manager', 'Director']]
ordinal_columns_no_order = ['SEX', 'UNIT']
trans1 = ColumnTransformer(
                  [('ordinal-encoding-order-based', OrdinalEncoder(categories = desired_order), ordinal_columns),
                   ('ordinal-encoding-no-order', OrdinalEncoder(), ordinal_columns_no_order)],
                   remainder='passthrough')

trans2 = SelectKBest(f_regression, k=5)

trans3 = GradientBoostingRegressor()

In [76]:
# creating a pipeline
pipeline = Pipeline(steps=[('preprocessing', trans1),
                           ('feature_selection', trans2),
                           ('model', trans3)])

In [77]:
pipeline.fit(x_train, y_train)

# Hyperparameter tuning

In [78]:
# parameters for hyperparameter tuning

param_dist = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'model__max_depth': [2, 3, 4, 5],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__subsample': [0.6, 0.8, 1.0]
}


In [79]:
# hyperparameter tuning using RandomizedSearchCV
random_search= RandomizedSearchCV(pipeline, param_dist, n_iter=100, cv=3, scoring= 'r2', n_jobs=-1)
random_search.fit(x_train, y_train)

In [80]:
print('Best Parameters: ', random_search.best_params_)
print('Best Score: ', random_search.best_score_)

Best Parameters:  {'model__subsample': 0.6, 'model__n_estimators': 50, 'model__min_samples_split': 10, 'model__min_samples_leaf': 4, 'model__max_depth': 4, 'model__learning_rate': 0.1}
Best Score:  0.947314738061936


generating a new pipeline with the best parameters

In [81]:
# extracting the SelectKBest feature selection step from the best model
best_model = random_search.best_estimator_
select_k_best = best_model.named_steps['feature_selection']

In [82]:
# extracting the names of the best features selected through SelectKBest
mask = select_k_best.get_support()

feature_names = x_train.columns
selected_features = feature_names[mask]

In [83]:
# updating training and test datasets with only the selected best features
x_train_new = x_train[selected_features]
x_test_new = x_test[selected_features]

In [84]:
x_train_new.head()

Unnamed: 0,SEX,DESIGNATION,UNIT,PAST EXP,years_experience
646,F,Analyst,IT,1,3
1074,M,Analyst,Marketing,0,2
244,F,Analyst,IT,1,3
2071,M,Analyst,Marketing,2,3
1780,M,Analyst,Finance,0,3


# Final Pipeline

In [85]:
# creating the final pipeline
final_pipeline = Pipeline(steps=[('preprocessing', trans1),
                                 ('model', GradientBoostingRegressor())])
# extracting best paramteres
best_params= random_search.best_params_

# updating pipeline with the best parameters
final_pipeline.set_params(**best_params)

In [86]:
x_train_new

Unnamed: 0,SEX,DESIGNATION,UNIT,PAST EXP,years_experience
646,F,Analyst,IT,1,3
1074,M,Analyst,Marketing,0,2
244,F,Analyst,IT,1,3
2071,M,Analyst,Marketing,2,3
1780,M,Analyst,Finance,0,3
...,...,...,...,...,...
1942,M,Analyst,IT,1,1
152,F,Analyst,Finance,0,3
674,F,Analyst,Management,2,2
311,M,Analyst,Web,0,2


In [87]:
final_pipeline.fit(x_train_new, y_train)

# Exporting the pipeline

In [88]:
import pickle
pickle.dump(final_pipeline, open('final-pipeline-salary-prediction.pkl', 'wb'))

In [91]:
# test_input for pipeline testing in the next notebook
num=np.random.randint(1000)
a = x_train_new.iloc[num,:]# random number for index value
{x : [value] for x, value in a.items()}

{'SEX': ['F'],
 'DESIGNATION': ['Analyst'],
 'UNIT': ['Marketing'],
 'PAST EXP': [0],
 'years_experience': [3]}

In [92]:
y_train.iloc[num]

49432