# Machine Learning Project

## Salary Prediction for Data Professions using Machine Learning


### 🔴 Important: Read before proceeding ⬇️
### To create and view stats on various types of models, run steps 1-13.
### To create and save a model locally using a pipeline, run steps 1-5, then step 8, and finally steps 14-15.


### Step 1 - Import Required Modules

In [None]:
# Importing Data Manipulation and Visualisation Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing Data Preprocessing Libraries
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import f_regression, SelectKBest

# Importing SkLearn ML Model Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

import pickle

### Step 2 - Load The Dataset Into The Project

In [None]:
# Load the dataset from a CSV file; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='salary_dataset.csv')

In [None]:
# Quick look at the loaded data: the first rows, the (rows, columns) shape,
# the per-column info report, and descriptive statistics.
print("First 5 lines of dataset\n")
print(df.head())

print("\n\n\nRows and Columns in dataset\n")
print(df.shape)

print("\n\n\nInformation about the dataset\n")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()
print(df.describe())

### Step 3 - Clean Some Of The Data

In [None]:
# Remove duplicate rows first, then discard every remaining row that has a
# missing value in any column.
df = df.drop_duplicates().dropna()

### Step 4 - Feature Engineering

In [None]:
# Derive EXPERIENCE as the difference between the calendar years of
# CURRENT DATE and DOJ (months and days are not taken into account), then
# discard the name columns and the two raw date columns.
df = df.assign(
    EXPERIENCE=(
        pd.to_datetime(df['CURRENT DATE']).dt.year
        - pd.to_datetime(df['DOJ']).dt.year
    )
).drop(columns=['FIRST NAME', 'LAST NAME', 'DOJ', 'CURRENT DATE'])

### Step 5 - Split Data Into Training And Testing Sets

In [None]:
# Splitting the Data
# Everything except SALARY is a predictor; SALARY is the regression target.
X = df.drop('SALARY', axis=1)
y = df['SALARY']

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same rows land in each set on every run, keeping results comparable between
# reruns of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Run The Code Below Only If You Want Stats Of Different Models (Step 8 Is Also Required When Saving A Model)
### Step 6 - Encode The Data

In [None]:
# Ordinal Encoding
# DESIGNATION is mapped to the position of each value in the list below
# (0 for the first entry, 1 for the second, and so on).
order = [[
    'Analyst',
    'Senior Analyst',
    'Associate',
    'Manager',
    'Senior Manager',
    'Director',
]]
# Fit on the training split only, then apply the same mapping to both splits.
oencoder = OrdinalEncoder(categories=order).fit(X_train[['DESIGNATION']])
X_train['DESIGNATION'] = oencoder.transform(X_train[['DESIGNATION']])
X_test['DESIGNATION'] = oencoder.transform(X_test[['DESIGNATION']])

In [None]:
# Label Encoding
# Turn the unordered categorical features into numeric codes.
# scikit-learn documents LabelEncoder as a transformer for target values (y),
# not input features, so one OrdinalEncoder fitted on both columns is used
# instead. This matches how the Step 15 pipeline encodes SEX and UNIT, and a
# single fitted encoder now covers both columns rather than being overwritten
# on every loop iteration. Codes are returned as floats instead of ints.
labelColumns = ['SEX', 'UNIT']
# The name `lencoder` is kept so any later cell that refers to it still resolves.
lencoder = OrdinalEncoder()
# Learn the categories from the training split only, then reuse them for test.
X_train[labelColumns] = lencoder.fit_transform(X_train[labelColumns])
X_test[labelColumns] = lencoder.transform(X_test[labelColumns])

### Step 7 - Select The Best Features And Reassign Only Them To The Training And Testing Sets

In [None]:
# Feature Selection
# Score each feature against the target with f_regression using the training
# split only, keep the 5 best-scoring ones, and reduce both splits to them.
featureSelector = SelectKBest(f_regression, k=5)
featureSelector.fit(X_train, y_train)
X_train, X_test = (
    featureSelector.transform(part) for part in (X_train, X_test)
)

In [None]:
# Wrap the selector's output back into DataFrames, labelling the columns with
# the names of the features the selector kept.
X_train, X_test = (
    pd.DataFrame(part, columns=featureSelector.get_feature_names_out())
    for part in (X_train, X_test)
)

### Step 8 - Perform Correlation Analysis And Adjust Training And Testing Sets

In [None]:
# Correlation Analysis
# Pairwise correlations between the numeric training features, drawn as an
# annotated heatmap on a 10x8 figure.
plt.figure(figsize=(10, 8))
matrix = X_train.corr(numeric_only=True)
sns.heatmap(data=matrix, annot=True).set_title('Matrix')
plt.show()

In [None]:
# Remove AGE from both splits.
# NOTE(review): presumably chosen from the heatmap above; this raises KeyError
# if AGE is not among the current columns (e.g. not kept by feature selection).
X_train, X_test = (
    part.drop(columns=['AGE']) for part in (X_train, X_test)
)

### Step 9 - Write Helper Functions For Evaluating Models

In [None]:
# Model Evaluation
def findScores(y_test, predictions):
    """Score a set of predictions against the true target values.

    Args:
        y_test: True target values.
        predictions: Predicted values, in the same order as ``y_test``.

    Returns:
        A list ``[MAE, MSE, RMSE, R2]``: mean absolute error, mean squared
        error, its square root, and the R^2 score.
    """
    mae = metrics.mean_absolute_error(y_test, predictions)
    # Compute the MSE once and reuse it for the RMSE rather than evaluating
    # the same metric twice.
    mse = metrics.mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_test, predictions)
    return [mae, mse, rmse, r2]

### Step 10 - Use Linear Regression To Make Predictions

In [None]:
# LinearRegression
# Fit on the training split, predict the held-out rows, and leave the scores
# as the final expression of the cell.
linearRegressor = LinearRegression().fit(X_train, y_train)
predictionslr = linearRegressor.predict(X_test)

findScores(y_test=y_test, predictions=predictionslr)

### Step 11 - Use A Random Forest Regressor To Make Predictions

In [None]:
# RandomForest
# A forest of 110 trees: fit on the training split, predict the held-out rows,
# and leave the scores as the final expression of the cell.
randomForest = RandomForestRegressor(n_estimators=110).fit(X_train, y_train)
predictionsrf = randomForest.predict(X_test)

findScores(y_test=y_test, predictions=predictionsrf)

### Step 12 - Use A Decision Tree Regressor To Make Predictions

In [None]:
# DecisionTree
# A single regression tree with default settings: fit on the training split,
# predict the held-out rows, and leave the scores as the cell's final expression.
dtRegressor = DecisionTreeRegressor().fit(X_train, y_train)
predictionsdt = dtRegressor.predict(X_test)

findScores(y_test=y_test, predictions=predictionsdt)

### Step 13 - Use A Gradient Boosting Regressor To Make Predictions

In [None]:
# GradientBoosting
# Gradient-boosted trees with default settings: fit on the training split,
# predict the held-out rows, and leave the scores as the cell's final expression.
gradientBooster = GradientBoostingRegressor().fit(X_train, y_train)
predictionsgb = gradientBooster.predict(X_test)
findScores(y_test=y_test, predictions=predictionsgb)

## Continue Below To Create And Save Model


### Step 14 - Remove Columns Not Needed In Model

In [None]:
# Remove the three columns that are not fed to the saved model from both
# splits, then show the resulting training frame as the cell output.
X_train, X_test = (
    part.drop(columns=['LEAVES USED', 'LEAVES REMAINING', 'RATINGS'])
    for part in (X_train, X_test)
)
X_train

### Step 15 - Create A Pipeline

In [None]:
# DESIGNATION is encoded by position in this list (0 for the first entry, 1
# for the second, and so on).
order = [['Analyst', 'Senior Analyst', 'Associate', 'Manager', 'Senior Manager', 'Director']]
oColumnsOrder = ['DESIGNATION']
oColumnsNonOrder = ['SEX', 'UNIT']

# Stage 1: encode the categorical columns; all other columns pass through
# unchanged.
transformer1 = ColumnTransformer(
    [
        ('ordinal-encoding-order-based', OrdinalEncoder(categories=order), oColumnsOrder),
        ('ordinal-encoding-no-order', OrdinalEncoder(), oColumnsNonOrder),
    ],
    remainder='passthrough',
)

# Stage 2: keep the 5 features with the best f_regression scores.
transformer2 = SelectKBest(f_regression, k=5)

# Stage 3: the regressor that produces the salary prediction.
transformer3 = GradientBoostingRegressor()

pipeline = Pipeline(
    steps=[
        ('preprocessing', transformer1),
        ('feature_selection', transformer2),
        ('model', transformer3),
    ]
)
pipeline.fit(X_train, y_train)

# Write the fitted pipeline through a context manager so the file handle is
# flushed and closed even if pickling raises.
with open('salaryPredictionModel.pkl', 'wb') as modelFile:
    pickle.dump(pipeline, modelFile)