### Data Preprocessing

#### Load data

In [1]:
# Import the libraries
import os

import pandas as pd

In [2]:
# Get the dataframe
# The dataset location used to be fixed to an absolute path on one machine.
# It can now be overridden with the SALARY_DATASET_PATH environment variable;
# when the variable is not set, the original location is used unchanged.
DEFAULT_DATASET_PATH = "E:\\RajaRajeshwari\\MyFolders\\Projects\\mlops_salary_prediction\\mlops_salary_prediction_main\\data\\raw\\salary_dataset.csv"
dataset_path = os.environ.get("SALARY_DATASET_PATH", DEFAULT_DATASET_PATH)
df = pd.read_csv(dataset_path, sep=",", encoding="utf-8")

#### Preprocess data based on Statistical Data Analysis

In [3]:
# Discard every row that contains at least one missing value
df = df.dropna(axis="index", how="any")

In [4]:
# Remove duplicate rows, keeping the first occurrence of each.
# `keep=False` would discard *every* copy of a repeated row, so a record that
# merely appears more than once would disappear from the data entirely.
# The result is re-bound to `df` rather than mutating the frame in place.
df = df.drop_duplicates(keep="first")

In [5]:
# Make the column labels space-free by swapping each space for an underscore
new_cols = [label.replace(" ", "_") for label in df.columns.tolist()]
df.columns = new_cols

#### Preprocess data based on Exploratory Data Analysis

In [6]:
# Remove the 'Age' column from the dataframe
df = df.drop(columns=["Age"])

In [7]:
# Persist the "min" and "max" rows of describe() as a JSON schema file
overview = df.describe()
min_max_rows = overview.loc[["min", "max"]]
min_max_rows.to_json("schema_data.json")

In [8]:
# Label Encode rankable Categorical columns
import json
import joblib
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Education_Level', 'Job_Title']
label_encoder = LabelEncoder()
label_map = {}

for col in categorical_cols:
    # Swap the column's categories for their integer codes
    df[col] = label_encoder.fit_transform(df[col])

    # Persist the encoder as fitted on this column (one file per column)
    joblib.dump(label_encoder, f"{col}_LabelEncoder.joblib", compress=9)

    # Record category -> code; a category's code is its position in `classes_`
    encoder_mapping = {
        category: code
        for code, category in enumerate(label_encoder.classes_.tolist())
    }
    label_map[col] = encoder_mapping

print(label_map)

# Write the collected mappings to disk
with open("label_map.json", 'w') as f:
    json.dump(label_map, f)

{'Education_Level': {"Bachelor's": 0, "Master's": 1, 'PhD': 2}, 'Job_Title': {'Account Manager': 0, 'Accountant': 1, 'Administrative Assistant': 2, 'Business Analyst': 3, 'Business Development Manager': 4, 'Business Intelligence Analyst': 5, 'CEO': 6, 'Chief Data Officer': 7, 'Chief Technology Officer': 8, 'Content Marketing Manager': 9, 'Copywriter': 10, 'Creative Director': 11, 'Customer Service Manager': 12, 'Customer Service Rep': 13, 'Customer Service Representative': 14, 'Customer Success Manager': 15, 'Customer Success Rep': 16, 'Data Analyst': 17, 'Data Entry Clerk': 18, 'Data Scientist': 19, 'Digital Content Producer': 20, 'Digital Marketing Manager': 21, 'Director': 22, 'Director of Business Development': 23, 'Director of Engineering': 24, 'Director of Finance': 25, 'Director of HR': 26, 'Director of Human Capital': 27, 'Director of Human Resources': 28, 'Director of Marketing': 29, 'Director of Operations': 30, 'Director of Product Management': 31, 'Director of Sales': 32, '

In [9]:
# One Hot Encode non-rankable Categorical Data Column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import joblib

# One-hot encode the column at position 0 and pass every other column through
# unchanged. NOTE(review): position 0 is presumably the remaining categorical
# column -- confirm against the dataframe's column order.
onehot_step = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[onehot_step], remainder='passthrough')
df = ct.fit_transform(df)

# Save the fitted transformer
joblib.dump(ct, "OneHotEncoder.joblib", compress=9)

['OneHotEncoder.joblib']

In [10]:
# Show the data produced by the ColumnTransformer's fit_transform call
print(df)

[[0.00e+00 1.00e+00 0.00e+00 1.57e+02 5.00e+00 9.00e+04]
 [1.00e+00 0.00e+00 1.00e+00 1.70e+01 3.00e+00 6.50e+04]
 [0.00e+00 1.00e+00 2.00e+00 1.28e+02 1.50e+01 1.50e+05]
 ...
 [1.00e+00 0.00e+00 2.00e+00 1.14e+02 1.60e+01 1.60e+05]
 [0.00e+00 1.00e+00 0.00e+00 6.30e+01 3.00e+00 5.50e+04]
 [0.00e+00 1.00e+00 1.00e+00 3.00e+01 1.90e+01 1.70e+05]]
