In [None]:
import pandas as pd

# Location of the raw HR attrition CSV that the rest of the notebook reads from.
# NOTE(review): this is an absolute path — confirm it exists in the runtime
# where the notebook is executed.
DATA_PATH = '/content/HR_Employee_Attrition.csv'

df = pd.read_csv(DATA_PATH)

In [None]:
# Columns the author considers not useful for the model.
unused_columns = ['EmployeeNumber', 'EmployeeCount', 'StandardHours', 'Over18']

# Build the modelling frame from a copy of the dataset so `df` itself is left
# untouched: recode the Attrition target ('Yes' -> 1, 'No' -> 0), then drop
# the unused columns.
df_ml = (
    df.copy()
    .assign(Attrition=lambda frame: frame['Attrition'].map({'Yes': 1, 'No': 0}))
    .drop(columns=unused_columns)
)

# Target vector and feature matrix (everything except the target).
y = df_ml['Attrition']
X = df_ml.drop(columns='Attrition')

In [None]:
# Replace the categorical columns of X with indicator columns; drop_first=True
# omits the first level of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)
X.head(n=5)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1102,1,2,2,94,3,2,4,5993,...,False,False,False,False,False,True,False,False,True,True
1,49,279,8,1,3,61,2,2,2,5130,...,False,False,False,False,True,False,False,True,False,False
2,37,1373,2,2,4,92,2,1,3,2090,...,True,False,False,False,False,False,False,False,True,True
3,33,1392,3,4,4,56,3,1,3,2909,...,False,False,False,False,True,False,False,True,False,True
4,27,591,2,1,1,40,3,1,2,3468,...,True,False,False,False,False,False,False,True,False,False


In [None]:
# Compact feature subset used to build X_deploy; the order of this list is the
# column order of X_deploy.
features = [
    'Age', 'MonthlyIncome', 'JobSatisfaction',
    'WorkLifeBalance', 'YearsAtCompany', 'OverTime',
]

# Shared Yes/No -> 1/0 encoding for the OverTime feature and the target.
yes_no_codes = {'Yes': 1, 'No': 0}

# Copy the selected columns so the encoding below does not touch `df`.
X_deploy = df.loc[:, features].copy()
X_deploy['OverTime'] = X_deploy['OverTime'].map(yes_no_codes)

# Target taken straight from the raw frame.
y = df['Attrition'].map(yes_no_codes)


In [None]:
# Count the missing values in each column of X_deploy.
X_deploy.isna().sum()

Unnamed: 0,0
Age,0
MonthlyIncome,0
JobSatisfaction,0
WorkLifeBalance,0
YearsAtCompany,0
OverTime,0


In [None]:
# Show the dtype of each column of X_deploy.
X_deploy.dtypes


Unnamed: 0,0
Age,int64
MonthlyIncome,int64
JobSatisfaction,int64
WorkLifeBalance,int64
YearsAtCompany,int64
OverTime,int64


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target and with a
# fixed seed.
split_options = {'test_size': 0.2, 'stratify': y, 'random_state': 42}

X_train, X_test, y_train, y_test = train_test_split(X_deploy, y, **split_options)


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with balanced class weights and an iteration cap of
# 1000, trained on the scaled training features.
lr_options = {'max_iter': 1000, 'class_weight': 'balanced'}
lr_deploy = LogisticRegression(**lr_options)

lr_deploy.fit(X_train_scaled, y_train)


In [None]:
# One hand-written record used as a sanity check for the logistic model.
# The scaler was fitted on the DataFrame X_train, so the sample is built as a
# one-row frame carrying the same column labels: this makes the column order
# explicit and lets scikit-learn validate the feature names instead of
# receiving an unlabelled nested list.
sample = pd.DataFrame([[28, 3500, 2, 2, 1, 1]], columns=X_train.columns)

# Scale once and reuse the result for both calls.
sample_scaled = scaler.transform(sample)

print("LR:", lr_deploy.predict_proba(sample_scaled))
print("Prediction:", lr_deploy.predict(sample_scaled))


LR: [[0.14165045 0.85834955]]
Prediction: [1]




In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Settings common to both tree-based models: fixed seed, balanced class weights.
shared_options = {'random_state': 42, 'class_weight': 'balanced'}

# Single decision tree limited to depth 5, fitted on the unscaled training frame.
dt_deploy = DecisionTreeClassifier(max_depth=5, **shared_options)
dt_deploy.fit(X_train, y_train)

# Forest of 200 trees, each limited to depth 8, fitted on the same frame.
rf_deploy = RandomForestClassifier(n_estimators=200, max_depth=8, **shared_options)
rf_deploy.fit(X_train, y_train)


In [None]:
# Class probabilities from the two tree models for the sanity-check sample.
# Both models were fitted on the DataFrame X_train, so the sample is wrapped
# in a frame with the same column labels before predicting; this works whether
# `sample` is a plain nested list or already a frame with those columns, and
# avoids handing unlabelled input to estimators fitted with feature names.
sample_frame = pd.DataFrame(sample, columns=X_train.columns)

print("DT:", dt_deploy.predict_proba(sample_frame))
print("RF:", rf_deploy.predict_proba(sample_frame))


DT: [[0.13149616 0.86850384]]
RF: [[0.30127224 0.69872776]]




In [None]:
import joblib

# Write the three fitted models to disk, one file per estimator.
for estimator, filename in (
    (lr_deploy, "logistic_model.pkl"),
    (dt_deploy, "decision_tree.pkl"),
    (rf_deploy, "random_forest.pkl"),
):
    joblib.dump(estimator, filename)

# The scaler is saved last, as the cell's final expression, so the list of
# written paths returned by joblib.dump is still displayed.
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [None]:
# Rank features by the importance scores stored on the fitted model `rf`,
# pairing them positionally with the column labels of `X`, highest first.
# NOTE(review): `rf` is not assigned in any cell visible in this notebook
# (only `rf_deploy` is) — confirm where it is fitted; unless it is defined
# elsewhere, this cell raises NameError on a fresh kernel.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Show the ten highest-ranked features.
feature_importance.head(10)


Unnamed: 0,Feature,Importance
9,MonthlyIncome,0.076838
0,Age,0.070334
16,TotalWorkingYears,0.059787
1,DailyRate,0.050175
19,YearsAtCompany,0.049344
43,OverTime_Yes,0.047843
2,DistanceFromHome,0.045145
5,HourlyRate,0.04465
10,MonthlyRate,0.042436
22,YearsWithCurrManager,0.037476


In [None]:
import joblib

# Persist three models plus the scaler with joblib.
# NOTE(review): `lr`, `dt` and `rf` are not assigned in any cell visible in
# this notebook (the visible models are `lr_deploy`, `dt_deploy` and
# `rf_deploy`) — confirm where they are fitted before relying on this cell.
joblib.dump(lr, 'logistic_regression_model.pkl')
joblib.dump(dt, 'decision_tree_model.pkl')
joblib.dump(rf, 'random_forest_model.pkl')
# NOTE(review): "scaler.pkl" is also the filename written by the other dump
# cells, and the visible `scaler` is fitted on the six-column X_train — verify
# that it is the scaler that belongs with `lr`.
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [None]:
import joblib

# Save the logistic-regression model and the fitted scaler to disk.
joblib.dump(value=lr_deploy, filename="logistic_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")


['scaler.pkl']

In [None]:
# Test sample used in FastAPI (per the original note).
# The scaler was fitted on the DataFrame X_train, so the sample is built as a
# one-row frame with the same column labels rather than an unlabelled nested
# list: the column order becomes explicit and scikit-learn can validate the
# feature names.
sample = pd.DataFrame([[28, 3500, 2, 2, 1, 1]], columns=X_train.columns)

sample_scaled = scaler.transform(sample)

print("Raw probability:", lr_deploy.predict_proba(sample_scaled))
print("Prediction:", lr_deploy.predict(sample_scaled))


Raw probability: [[0.40596125 0.59403875]]
Prediction: [1]




In [None]:
import joblib

# Output files for the logistic-regression model and its scaler.
model_path, scaler_path = "logistic_model.pkl", "scaler.pkl"

joblib.dump(lr_deploy, model_path)
joblib.dump(scaler, scaler_path)


['scaler.pkl']