## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [19]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

#### Import the CSV Data as Pandas DataFrame

In [22]:

df = pd.read_csv("data/cleaned_employee_data.csv")

#### Show Top 5 Records

In [23]:
df.head()

Unnamed: 0,Salary,Age,Sex,Dependents,HRA,DA,PF,Gross Salary,Insurance,Marital Status,In Company Years,Year of Experience,Department,Position
0,149289.04,53,Other,2.0,8776.096,35233.666375,22142.724765,171156.07761,,Widowed,26,32,Sales,Sales Director
1,69761.64,26,Male,0.0,9302.836,16464.425985,10347.127918,85181.774067,Both,Single,3,5,Human Resources,HR Executive
2,107633.14,44,Female,2.0,13357.686,25402.468564,15964.273028,130429.021537,,Divorced,3,23,Marketing,Senior Marketing Executive
3,147408.65,58,Male,1.0,6026.135,34789.876034,21863.823124,166360.83791,,Married,17,37,Finance,Account Director
4,53447.06,31,Male,2.0,7554.294,12614.026326,7927.330359,65688.049967,Life,Married,7,10,Sales,Sales Executive


In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore")

# Load data
df = pd.read_csv("data/cleaned_employee_data.csv")

# Optional: remove leaky features
leak_columns = ['Gross Salary', 'HRA', 'DA', 'PF']
df = df.drop(columns=leak_columns)

# Define X and y
X = df.drop("Salary", axis=1)
y = df["Salary"]

# Column types
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

# Preprocessor
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),        
    ]
)

# Apply transformations
X = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluation function
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

# Models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"{name}")
    print("Model performance for Training set")
    print(f"- Root Mean Squared Error: {train_rmse:.4f}")
    print(f"- Mean Absolute Error: {train_mae:.4f}")
    print(f"- R2 Score: {train_r2:.4f}")
    print('----------------------------------')
    print("Model performance for Test set")
    print(f"- Root Mean Squared Error: {test_rmse:.4f}")
    print(f"- Mean Absolute Error: {test_mae:.4f}")
    print(f"- R2 Score: {test_r2:.4f}")
    print("=" * 35)
    
    model_list.append(name)
    r2_list.append(test_r2)

# Tabular summary
summary_df = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score'])
summary_df = summary_df.sort_values(by='R2_Score', ascending=False)
print("\n✅ Model Ranking by R2 Score:")
print(summary_df)


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 10235.7231
- Mean Absolute Error: 8788.0542
- R2 Score: 0.9091
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 10239.6667
- Mean Absolute Error: 8727.4027
- R2 Score: 0.9086
Lasso
Model performance for Training set
- Root Mean Squared Error: 10234.6838
- Mean Absolute Error: 8787.0311
- R2 Score: 0.9091
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 10238.7198
- Mean Absolute Error: 8721.9880
- R2 Score: 0.9086
Ridge
Model performance for Training set
- Root Mean Squared Error: 10300.7130
- Mean Absolute Error: 8824.8700
- R2 Score: 0.9079
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 10290.6946
- Mean Absolute Error: 8738.8149
- R2 Score: 0.9077
K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 9744.1657
- Mean Absolute Error: 8090.6344


In [39]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
8,AdaBoost Regressor,0.909853
1,Lasso,0.908621
0,Linear Regression,0.908604
2,Ridge,0.907691
5,Random Forest Regressor,0.899955
7,CatBoosting Regressor,0.8996
6,XGBRegressor,0.88839
3,K-Neighbors Regressor,0.877514
4,Decision Tree,0.811332


In [44]:
import os
import joblib

# Create the folder if it doesn't exist
os.makedirs("model", exist_ok=True)

# Save the trained pipeline
joblib.dump(final_pipeline, "model/salary_model.pkl")
print("✅ AdaBoost pipeline saved successfully.")


✅ AdaBoost pipeline saved successfully.


#### Preparing X and Y variables

In [25]:
X = df.drop("Salary", axis=1)
y = df["Salary"]

In [26]:
X.head()

Unnamed: 0,Age,Sex,Dependents,HRA,DA,PF,Gross Salary,Insurance,Marital Status,In Company Years,Year of Experience,Department,Position
0,53,Other,2.0,8776.096,35233.666375,22142.724765,171156.07761,,Widowed,26,32,Sales,Sales Director
1,26,Male,0.0,9302.836,16464.425985,10347.127918,85181.774067,Both,Single,3,5,Human Resources,HR Executive
2,44,Female,2.0,13357.686,25402.468564,15964.273028,130429.021537,,Divorced,3,23,Marketing,Senior Marketing Executive
3,58,Male,1.0,6026.135,34789.876034,21863.823124,166360.83791,,Married,17,37,Finance,Account Director
4,31,Male,2.0,7554.294,12614.026326,7927.330359,65688.049967,Life,Married,7,10,Sales,Sales Executive


In [27]:
print("Categories in 'Sex' variable:                ", df['Sex'].unique())
print("Categories in 'Insurance' variable:          ", df['Insurance'].unique())
print("Categories in 'Marital Status' variable:     ", df['Marital Status'].unique())
print("Categories in 'Department' variable:         ", df['Department'].unique())
print("Categories in 'Position' variable:           ", df['Position'].unique())


Categories in 'Sex' variable:                 ['Other' 'Male' 'Female']
Categories in 'Insurance' variable:           [nan 'Both' 'Life' 'Medical']
Categories in 'Marital Status' variable:      ['Widowed' 'Single' 'Divorced' 'Married']
Categories in 'Department' variable:          ['Sales' 'Human Resources' 'Marketing' 'Finance' 'IT']
Categories in 'Position' variable:            ['Sales Director' 'HR Executive' 'Senior Marketing Executive'
 'Account Director' 'Sales Executive' 'Regional Sales Manager'
 'National Account Head' 'Senior Executive' 'Technical Lead'
 'National Marketing Manager' 'IT Manager' 'Sales Representative'
 'Software Engineer II' 'National Sales Manager' 'Senior HR' 'ERP Head'
 'Senior Account Executive' 'Regional Marketing Manager' 'HR Director'
 'Marketing Intern' 'QA Lead' 'Regional Account Head' 'QA Engineer II'
 'Software Engineer III' 'Account Associate' 'Software Engineer I'
 'QA Engineer I' 'HR Associate' 'Marketing Director' 'Recruitment Manager'
 'Marketi

In [28]:
y

0       149289.04
1        69761.64
2       107633.14
3       147408.65
4        53447.06
          ...    
4995    129159.11
4996    134664.03
4997     92852.65
4998     17412.05
4999     91693.80
Name: Salary, Length: 5000, dtype: float64

## pipeline


In [29]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [30]:
X = preprocessor.fit_transform(X)

In [31]:
X.shape

(5000, 64)

In [32]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((4000, 64), (1000, 64))

#### Create an Evaluate Function to give all metrics after model Training

In [33]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [34]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.1209
- Mean Absolute Error: 0.0863
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1205
- Mean Absolute Error: 0.0856
- R2 Score: 1.0000


Lasso
Model performance for Training set
- Root Mean Squared Error: 4.7149
- Mean Absolute Error: 3.7587
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.6978
- Mean Absolute Error: 3.7644
- R2 Score: 1.0000


Ridge
Model performance for Training set
- Root Mean Squared Error: 35.1878
- Mean Absolute Error: 28.4504
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 36.0195
- Mean Absolute Error: 29.4108
- R2 Score: 1.0000


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 5173.2254
- Mean Absolute Error: 4187.2976
- R2 Score: 0.9768
-------------

### Results

In [35]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Linear Regression,1.0
1,Lasso,1.0
5,Random Forest Regressor,0.999999
2,Ridge,0.999999
4,Decision Tree,0.999997
6,XGBRegressor,0.999958
7,CatBoosting Regressor,0.999907
8,AdaBoost Regressor,0.992181
3,K-Neighbors Regressor,0.96646
