In [153]:
# Import all the necessary libraries : 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [155]:
# Read the raw salary records; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='Salary Data.csv')

# ---------------------------------- Feature Engineering (FE) --------------------------------

### 1_ Handling Duplicates 

In [157]:
# Keep only the first occurrence of each fully identical row.
df = df[~df.duplicated(keep="first")]

In [159]:
# Row/column count after removing duplicate rows.
df.shape

(325, 6)

### 2_ Missing Value Handling - Only one row has missing values (better to drop it)

In [161]:
# Number of missing values in each column.
df.isnull().sum()

Age                    1
Gender                 1
Education Level        1
Job Title              1
Years of Experience    1
Salary                 1
dtype: int64

#### Dropped the 1 row containing null values

In [163]:
# Remove every row that has a missing value in at least one column.
df = df.dropna(axis=0, how="any")

In [105]:
# Row/column count after dropping rows with missing values.
df.shape

(324, 6)

In [107]:
# Verify that no missing values remain in any column.
df.isnull().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

### 3_ Encoding (Categorical to Numerical) - Gender, Education Level, Job Title

#### 1_ Gender Encoding : 

In [165]:
# One-hot encode Gender. drop_first=True drops the first category's indicator,
# leaving Gender_Male as the only indicator column.
df = pd.get_dummies(data=df, columns=["Gender"], drop_first=True)

In [167]:
# Store the Gender_Male indicator as 0/1 integers.
df = df.astype({"Gender_Male": int})

In [169]:
# Display the frame after Gender has been one-hot encoded.
df

Unnamed: 0,Age,Education Level,Job Title,Years of Experience,Salary,Gender_Male
0,32.0,Bachelor's,Software Engineer,5.0,90000.0,1
1,28.0,Master's,Data Analyst,3.0,65000.0,0
2,45.0,PhD,Senior Manager,15.0,150000.0,1
3,36.0,Bachelor's,Sales Associate,7.0,60000.0,0
4,52.0,Master's,Director,20.0,200000.0,1
...,...,...,...,...,...,...
348,28.0,Bachelor's,Junior Operations Manager,1.0,35000.0,0
349,36.0,Bachelor's,Senior Business Development Manager,8.0,110000.0,1
350,44.0,PhD,Senior Data Scientist,16.0,160000.0,0
351,31.0,Bachelor's,Junior Marketing Coordinator,3.0,55000.0,1


#### 2_ Education Level Encoding : 

In [171]:
from sklearn.preprocessing import OrdinalEncoder

# Encode Education Level as integers following the explicit category order
# below: PhD -> 0, Master's -> 1, Bachelor's -> 2.
# NOTE(review): a larger code therefore means a lower degree - confirm that
# this direction is what downstream consumers of the column expect.
categories = [["PhD", "Master's", "Bachelor's"]]
encoder = OrdinalEncoder(categories=categories)
encoder.fit(df[['Education Level']])
df['edu_ord'] = encoder.transform(df[['Education Level']]).astype(int)

In [173]:
# Display the frame with the new edu_ord column.
df

Unnamed: 0,Age,Education Level,Job Title,Years of Experience,Salary,Gender_Male,edu_ord
0,32.0,Bachelor's,Software Engineer,5.0,90000.0,1,2
1,28.0,Master's,Data Analyst,3.0,65000.0,0,1
2,45.0,PhD,Senior Manager,15.0,150000.0,1,0
3,36.0,Bachelor's,Sales Associate,7.0,60000.0,0,2
4,52.0,Master's,Director,20.0,200000.0,1,1
...,...,...,...,...,...,...,...
348,28.0,Bachelor's,Junior Operations Manager,1.0,35000.0,0,2
349,36.0,Bachelor's,Senior Business Development Manager,8.0,110000.0,1,2
350,44.0,PhD,Senior Data Scientist,16.0,160000.0,0,0
351,31.0,Bachelor's,Junior Marketing Coordinator,3.0,55000.0,1,2


#### 3_ Job Title Encoding : Label Encoding

In [175]:
from sklearn.preprocessing import LabelEncoder

# Assign every distinct job title an integer code.
# NOTE(review): this rebinds `encoder`, so the OrdinalEncoder fitted earlier
# for Education Level can no longer be reached through that name.
encoder = LabelEncoder()
encoder.fit(df['Job Title'])
df['job_title'] = encoder.transform(df['Job Title'])

In [121]:
# Display the frame with the new job_title column.
df

Unnamed: 0,Age,Education Level,Job Title,Years of Experience,Salary,Gender_Male,edu_ord,job_title
0,32.0,Bachelor's,Software Engineer,5.0,90000.0,1,2,159
1,28.0,Master's,Data Analyst,3.0,65000.0,0,1,17
2,45.0,PhD,Senior Manager,15.0,150000.0,1,0,130
3,36.0,Bachelor's,Sales Associate,7.0,60000.0,0,2,101
4,52.0,Master's,Director,20.0,200000.0,1,1,22
...,...,...,...,...,...,...,...,...
348,28.0,Bachelor's,Junior Operations Manager,1.0,35000.0,0,2,68
349,36.0,Bachelor's,Senior Business Development Manager,8.0,110000.0,1,2,111
350,44.0,PhD,Senior Data Scientist,16.0,160000.0,0,0,115
351,31.0,Bachelor's,Junior Marketing Coordinator,3.0,55000.0,1,2,63


In [177]:
# The encoded columns replace the original text columns, so drop the latter.
df = df.drop(columns=['Education Level', 'Job Title'])

In [179]:
# Display the frame after the original text columns were dropped.
df

Unnamed: 0,Age,Years of Experience,Salary,Gender_Male,edu_ord,job_title
0,32.0,5.0,90000.0,1,2,159
1,28.0,3.0,65000.0,0,1,17
2,45.0,15.0,150000.0,1,0,130
3,36.0,7.0,60000.0,0,2,101
4,52.0,20.0,200000.0,1,1,22
...,...,...,...,...,...,...
348,28.0,1.0,35000.0,0,2,68
349,36.0,8.0,110000.0,1,2,111
350,44.0,16.0,160000.0,0,0,115
351,31.0,3.0,55000.0,1,2,63


In [181]:
# Keep the model columns in a fixed order and store them as integers, but only
# where that is lossless: astype(int) truncates toward zero, so a fractional
# value (e.g. 1.5 years of experience) would silently lose its fraction.
# Columns that hold fractional values keep their existing dtype.
df = df[['Age', 'Years of Experience', 'Salary', 'Gender_Male', 'edu_ord', 'job_title']]
whole_number_cols = [col for col in df.columns if (df[col] % 1 == 0).all()]
df = df.astype({col: int for col in whole_number_cols})

In [183]:
# Display the frame after the integer cast.
df

Unnamed: 0,Age,Years of Experience,Salary,Gender_Male,edu_ord,job_title
0,32,5,90000,1,2,159
1,28,3,65000,0,1,17
2,45,15,150000,1,0,130
3,36,7,60000,0,2,101
4,52,20,200000,1,1,22
...,...,...,...,...,...,...
348,28,1,35000,0,2,68
349,36,8,110000,1,2,111
350,44,16,160000,0,0,115
351,31,3,55000,1,2,63


In [185]:
# Switch to snake_case column names; job_title already follows that convention.
df = df.rename(
    columns={
        'Age': 'age',
        'Years of Experience': 'years_of_experience',
        'Salary': 'salary',
        'Gender_Male': 'gender_male',
        'edu_ord': 'education_level',
    }
)

In [187]:
# Display the frame with the renamed columns.
df

Unnamed: 0,age,years_of_experience,salary,gender_male,education_level,job_title
0,32,5,90000,1,2,159
1,28,3,65000,0,1,17
2,45,15,150000,1,0,130
3,36,7,60000,0,2,101
4,52,20,200000,1,1,22
...,...,...,...,...,...,...
348,28,1,35000,0,2,68
349,36,8,110000,1,2,111
350,44,16,160000,0,0,115
351,31,3,55000,1,2,63


In [189]:
# Final look at the feature-engineered frame before it is saved.
df

Unnamed: 0,age,years_of_experience,salary,gender_male,education_level,job_title
0,32,5,90000,1,2,159
1,28,3,65000,0,1,17
2,45,15,150000,1,0,130
3,36,7,60000,0,2,101
4,52,20,200000,1,1,22
...,...,...,...,...,...,...
348,28,1,35000,0,2,68
349,36,8,110000,1,2,111
350,44,16,160000,0,0,115
351,31,3,55000,1,2,63


In [191]:
from joblib import dump

# Saving the DataFrame after FE

In [193]:
# Persist the feature-engineered frame to disk with joblib.
dump(value=df, filename='df_fe.joblib')

['df_fe.joblib']