<a href="https://colab.research.google.com/github/sanhiitaa/salary-prediction/blob/main/salary_prediction_using_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# load the pipeline
pipe=pickle.load(open('/content/final-pipeline-salary-prediction.pkl', 'rb'))

In [3]:
# sample test input from x train itself
test_input = pd.DataFrame({'SEX': ['F'],
 'DESIGNATION': ['Analyst'],
 'UNIT': ['Marketing'],
 'PAST EXP': [0],
 'years_experience': [3]})
print(test_input)

  SEX DESIGNATION       UNIT  PAST EXP  years_experience
0   F     Analyst  Marketing         0                 3


In [4]:
# making prediction using the pipeline
pipe.predict(test_input)

array([45233.39584925])

- actual value: `49432`
- predicted value: `45233.4`

# Predicting x_test values using pipeline

In [11]:
df=pd.read_csv('/content/salary_data_cleaned.csv')
df

Unnamed: 0,SEX,DESIGNATION,AGE,SALARY,UNIT,LEAVES USED,LEAVES REMAINING,RATINGS,PAST EXP,years_experience
0,F,Analyst,21.0,44570,Finance,24.0,6.0,2.0,0,2
1,F,Associate,25.0,89207,Web,22.0,13.0,2.0,7,3
2,F,Analyst,21.0,40955,Finance,23.0,7.0,3.0,0,2
3,F,Analyst,22.0,45550,IT,22.0,8.0,3.0,0,3
4,M,Analyst,25.0,43161,Operations,27.0,3.0,2.0,3,2
...,...,...,...,...,...,...,...,...,...,...
2473,F,Senior Manager,36.0,185977,Management,15.0,15.0,5.0,10,5
2474,F,Analyst,23.0,45758,IT,17.0,13.0,2.0,0,2
2475,F,Analyst,21.0,47315,Web,29.0,1.0,5.0,0,2
2476,F,Analyst,24.0,45172,Web,23.0,7.0,3.0,1,2


In [12]:
# only retaining the columns which are required for prediction
cols=[ 'SEX', 'DESIGNATION', 'UNIT', 'PAST EXP',  'years_experience', 'SALARY']
df=df[cols]
df

Unnamed: 0,SEX,DESIGNATION,UNIT,PAST EXP,years_experience,SALARY
0,F,Analyst,Finance,0,2,44570
1,F,Associate,Web,7,3,89207
2,F,Analyst,Finance,0,2,40955
3,F,Analyst,IT,0,3,45550
4,M,Analyst,Operations,3,2,43161
...,...,...,...,...,...,...
2473,F,Senior Manager,Management,10,5,185977
2474,F,Analyst,IT,0,2,45758
2475,F,Analyst,Web,0,2,47315
2476,F,Analyst,Web,1,2,45172


In [13]:
# splitting the data into dependent and independent variables
x=df.drop('SALARY', axis=1)
y=df['SALARY']

In [14]:
# splitting the data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [16]:
# predicting values
pred= pipe.predict(x_test)

In [19]:
# custom made helper function to compute and return metrics
def scorer(model_name, dictionary_name, y_test, pred):
  '''
    A metric scorer for regression tasks

    This function takes the following arguments:
    - model_name: a string representing the name of the model.
    - dictionary_name: a dictionary where the scores will be stored.
    - y_test: a pandas Series or array representing the true target values for the test set.
    - pred: a pandas Series or array representing the predicted target values for the test set.

    The function calculates Mean Absolute Error, Mean Squared Error, Root Mean Squared Error, and R2 score for the model,
    stores them in the provided dictionary under the key of the model name, and returns the dictionary.

  '''
  from sklearn import metrics
  import numpy as np
  dictionary_name[model_name]={'Mean Absolute Error' : metrics.mean_absolute_error(y_test, pred),
                              'Mean Squared Error' : metrics.mean_squared_error(y_test, pred),
                              'Root Mean Squared Error': np.sqrt(metrics.mean_squared_error(y_test, pred)),
                              'R2 score' : metrics.r2_score(y_test,pred)}
  return dictionary_name[model_name]

In [20]:
score={}
scorer('Gradient Boosting Machine', score, y_test, pred)

{'Mean Absolute Error': 4077.9765047702363,
 'Mean Squared Error': 45826367.445759386,
 'Root Mean Squared Error': 6769.517519421852,
 'R2 score': 0.9674823146258182}

The model has an R2 score of `0.9674823146258182` on the x_test dataset.