<a href="https://colab.research.google.com/github/razon1494/Employee-Attrition-Predictor/blob/main/Employee_Attrition_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Employee Attrition Prediction**
### By Mohammad Arifur Rahman
Future AI ML Expert


### Importing Necessary Libraries


In [None]:
import numpy as np
import os, pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    roc_auc_score, accuracy_score, classification_report, confusion_matrix, average_precision_score
)
import matplotlib.pyplot as plt
import joblib, json, os


## 1. Data Loading
Load the chosen dataset into your environment and display the first few rows along with the shape to verify correctness.



In [None]:
CSV_NAME = "WA_Fn-UseC_-HR-Employee-Attrition.csv"

assert CSV_NAME in os.listdir("/content/sample_data"), f"{CSV_NAME} not found in /content/sample_data. Upload it first."

df = pd.read_csv(f"/content/sample_data/{CSV_NAME}")
print("Shape:", df.shape)
df.head()

Shape: (1470, 35)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


The target column is **Attrition**. I am seeing it is a class-imbalanced data. So accuracy can be misleading. So I am going to use ROC-AUC, F1 Score, Precision /recall

In [None]:
print("Columns:", df.columns.tolist())
print("\nMissing values (top 15):")
display(df.isna().sum().sort_values(ascending=False).head(15))


Columns: ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

Missing values (top 15):


Unnamed: 0,0
Age,0
Attrition,0
BusinessTravel,0
DailyRate,0
Department,0
DistanceFromHome,0
Education,0
EducationField,0
EmployeeCount,0
EmployeeNumber,0


## Target Encoding and dropping unnecessary columns

In [None]:
target_col = "Attrition"

# y as 0/1
y = df[target_col].map({"Yes": 1, "No": 0})
X = df.drop(columns=[target_col])

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())





X shape: (1470, 34)
y distribution:
 Attrition
0    1233
1     237
Name: count, dtype: int64


## Dropping Unnecessary Columns

In [None]:
drop_cols = [c for c in ["EmployeeCount", "EmployeeNumber", "Over18", "StandardHours"] if c in df.columns]

print("\nDropping columns:", drop_cols)
df = df.drop(columns=drop_cols)

print("\nShape after dropping:", df.shape)
df.head()



Dropping columns: ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']

Shape after dropping: (1470, 31)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


## 2. Data Preprocessing
Perform and document at least 5 distinct preprocessing steps (e.g., handling missing values, encoding, scaling, outlier detection, feature engineering).



### Seperating Target column

In [None]:
# Target
y = df["Attrition"].map({"Yes": 1, "No": 0})

# Features
X = df.drop(columns=["Attrition"])

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())


X shape: (1470, 30)
y distribution:
 Attrition
0    1233
1     237
Name: count, dtype: int64


### Identify Numeric & Categorical Columns

In [None]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print("Numeric features:", len(num_cols))
print("Categorical features:", len(cat_cols))
print("\nCategorical columns:", cat_cols)


Numeric features: 23
Categorical features: 7

Categorical columns: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']


### Feature Engineering Transformer

In [None]:
class RatioFeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        if "MonthlyIncome" in X.columns and "TotalWorkingYears" in X.columns:
            X["IncomePerYearExp"] = X["MonthlyIncome"] / (X["TotalWorkingYears"] + 1)

        if "YearsAtCompany" in X.columns and "TotalWorkingYears" in X.columns:
            X["CompanyTenureRatio"] = X["YearsAtCompany"] / (X["TotalWorkingYears"] + 1)

        if "Age" in X.columns and "TotalWorkingYears" in X.columns:
            X["YearsPerAge"] = X["TotalWorkingYears"] / (X["Age"] + 1)

        return X

### IQR Outlier Clipper Transformer

In [None]:
class IQRClipper(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5):
        self.factor = factor

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.q1_ = X.quantile(0.25)
        self.q3_ = X.quantile(0.75)
        self.iqr_ = self.q3_ - self.q1_
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        lower = self.q1_ - self.factor * self.iqr_
        upper = self.q3_ + self.factor * self.iqr_
        return X.clip(lower=lower, upper=upper, axis=1)


## Section 3: Pipeline Creation

### Train Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape, " Test:", X_test.shape)
print("Train y distribution:\n", y_train.value_counts())


Train: (1176, 30)  Test: (294, 30)
Train y distribution:
 Attrition
0    986
1    190
Name: count, dtype: int64


### Define Column Lists

In [None]:
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print("Numeric:", len(num_cols), "Categorical:", len(cat_cols))
print("Categorical columns:", cat_cols)


Numeric: 23 Categorical: 7
Categorical columns: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']


### Preprocessors

In [None]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("clipper", IQRClipper(factor=1.5)),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ],
    remainder="drop"
)


# Section 3,4 Pipeline & Primary Model Selection (Logistic Regression)

In [None]:
clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    solver="liblinear"
)

model_pipeline = Pipeline(steps=[
    ("feateng", RatioFeatureEngineer()),
    ("preprocess", preprocessor),
    ("model", clf)
])

model_pipeline


# Section 5: Model Training

In [None]:
model_pipeline.fit(X_train, y_train)
print("Pipeline fitted successfully ✅")


Pipeline fitted successfully ✅


In [None]:
y_pred = model_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7517006802721088

Confusion Matrix:
 [[191  56]
 [ 17  30]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.77      0.84       247
           1       0.35      0.64      0.45        47

    accuracy                           0.75       294
   macro avg       0.63      0.71      0.65       294
weighted avg       0.83      0.75      0.78       294



In [None]:
Xt = model_pipeline.named_steps["preprocess"].transform(
    model_pipeline.named_steps["feateng"].transform(X_train)
)
print("Transformed train shape:", Xt.shape)


Transformed train shape: (1176, 51)


In [None]:
y_proba = model_pipeline.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("PR-AUC :", average_precision_score(y_test, y_proba))

ROC-AUC: 0.7945559479714014
PR-AUC : 0.5565759455786149


# Section 6: Cross-Validation

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use ROC-AUC (good for imbalanced classification)
cv_scores = cross_val_score(
    model_pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1
)

print("Cross-Validation (ROC-AUC)")
print("Scores:", np.round(cv_scores, 4))
print(f"Mean ROC-AUC: {cv_scores.mean():.4f}")
print(f"Std  ROC-AUC: {cv_scores.std():.4f}")


Cross-Validation (ROC-AUC)
Scores: [0.8173 0.8107 0.8388 0.8791 0.8193]
Mean ROC-AUC: 0.8330
Std  ROC-AUC: 0.0249


The low standard deviation indicates stable performance across folds,
demonstrating that the model generalizes well and is not sensitive to
data partitioning.

# Section 7: Hyperparameter Tuning

In [None]:
param_grid = {
    "model__C": [0.01, 0.1, 1.0, 3.0, 10.0],
    "model__penalty": ["l1", "l2"],
    "model__solver": ["liblinear"],
}


In [None]:
grid = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring="average_precision",   # PR-AUC
    cv=cv,
    n_jobs=-1,
    verbose=1,
    refit=True,
)

grid.fit(X_train, y_train)

print("Best CV PR-AUC:", grid.best_score_)
print("Best Params:", grid.best_params_)
best_model = grid.best_estimator_


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best CV PR-AUC: 0.6220477666484883
Best Params: {'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'liblinear'}


In [None]:
results = pd.DataFrame(grid.cv_results_)
cols_to_show = [
    "mean_test_score", "std_test_score",
    "param_model__C", "param_model__penalty", "param_model__solver"
]

results_view = results[cols_to_show].sort_values("mean_test_score", ascending=False)
results_view.head(10)


Unnamed: 0,mean_test_score,std_test_score,param_model__C,param_model__penalty,param_model__solver
3,0.622048,0.047341,0.1,l2,liblinear
1,0.617144,0.043469,0.01,l2,liblinear
2,0.60881,0.030045,0.1,l1,liblinear
5,0.60756,0.071786,1.0,l2,liblinear
4,0.606264,0.072258,1.0,l1,liblinear
7,0.602659,0.074648,3.0,l2,liblinear
8,0.602503,0.081381,10.0,l1,liblinear
9,0.602129,0.079843,10.0,l2,liblinear
6,0.601671,0.075983,3.0,l1,liblinear
0,0.343636,0.054945,0.01,l1,liblinear


In [None]:
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("=== Tuned Model Test Metrics ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC :", roc_auc_score(y_test, y_proba))
print("PR-AUC  :", average_precision_score(y_test, y_proba))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


=== Tuned Model Test Metrics ===
Accuracy: 0.7755102040816326
ROC-AUC : 0.8054957360668447
PR-AUC  : 0.5882541130009683

Confusion Matrix:
 [[196  51]
 [ 15  32]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.79      0.86       247
           1       0.39      0.68      0.49        47

    accuracy                           0.78       294
   macro avg       0.66      0.74      0.67       294
weighted avg       0.84      0.78      0.80       294



# Section 8: Best Model Selection

## Based on the hyperparameter tuning results obtained using GridSearchCV, the **Logistic Regression** model with optimized regularization was selected as the final best-performing model.

### Base Model vs Tuned Model Comparison
**Baseline Logistic Regression (Before Tuning)**

Accuracy: ~0.75

ROC-AUC: ~0.79

PR-AUC: ~0.56

Minority Class (Attrition = 1) Recall: ~0.64

The baseline model provided a strong and interpretable starting point. However, performance on the minority class was limited, which is critical in an imbalanced classification task such as employee attrition prediction.

**Tuned Logistic Regression (After GridSearchCV)**

Accuracy: ~0.78

ROC-AUC: ~0.81

PR-AUC: ~0.59

Minority Class Recall: ~0.68

**Best Hyperparameters:**

C = 0.1

penalty = l2

solver = liblinear

Hyperparameter tuning led to consistent improvements across all evaluation metrics, particularly ROC-AUC and PR-AUC, which are more informative than accuracy for imbalanced datasets. The tuned model demonstrates better discrimination capability and improved detection of employee attrition cases.

# Section 9: Model Performance Evaluation

The final tuned Logistic Regression model was evaluated on the held-out test set using multiple metrics suitable for an imbalanced binary classification problem (employee attrition).

In [None]:
# Section 9: Model Performance Evaluation

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report
)

# Predictions
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Metrics
print("=== Final Model Performance on Test Set ===")
print(f"Accuracy  : {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC   : {roc_auc_score(y_test, y_proba):.4f}")
print(f"PR-AUC    : {average_precision_score(y_test, y_proba):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


=== Final Model Performance on Test Set ===
Accuracy  : 0.7755
ROC-AUC   : 0.8055
PR-AUC    : 0.5883

Confusion Matrix:
[[196  51]
 [ 15  32]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.79      0.86       247
           1       0.39      0.68      0.49        47

    accuracy                           0.78       294
   macro avg       0.66      0.74      0.67       294
weighted avg       0.84      0.78      0.80       294



#Saving Model


In [None]:
import pickle

FILENAME = "employee_attrition_pipeline.pkl"

with open(FILENAME, "wb") as f:
    pickle.dump(best_model, f)   # or model_pipeline if that's your final one

print("Saved:", FILENAME)


Saved: employee_attrition_pipeline.pkl


In [None]:
with open("employee_attrition_pipeline.pkl", "rb") as f:
    loaded_model = pickle.load(f)

sample = X_test.iloc[[0]]
print("Pred:", loaded_model.predict(sample))

# optional: probability if available
if hasattr(loaded_model, "predict_proba"):
    print("Prob:", loaded_model.predict_proba(sample))


Pred: [0]
Prob: [[0.63914103 0.36085897]]


In [None]:
MODEL_PATH = "employee_attrition_model.pkl"
joblib.dump(best_model, MODEL_PATH)

print(f"Model saved to: {MODEL_PATH}")

Model saved to: employee_attrition_model.pkl


In [None]:
sample = X_test.iloc[[0]]
loaded_model.predict(sample)


array([0])

In [None]:
print(type(best_model))
best_model


<class 'sklearn.pipeline.Pipeline'>


In [None]:
type(best_model)


In [None]:
import os
print(os.getcwd())
!ls -lh

/content
total 24K
-rw-r--r-- 1 root root  11K Jan 17 18:22 employee_attrition_model.pkl
-rw-r--r-- 1 root root 6.9K Jan 17 18:22 employee_attrition_pipeline.pkl
drwxr-xr-x 1 root root 4.0K Jan 17 18:07 sample_data


In [None]:
import pickle

final_model = best_model  # your tuned pipeline

FEATURE_COLS = list(X_train.columns)
CAT_COLS = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
NUM_COLS = [c for c in FEATURE_COLS if c not in CAT_COLS]

CATEGORY_CHOICES = {
    col: sorted(X_train[col].dropna().unique().tolist())
    for col in CAT_COLS
}

bundle = {
    "model": final_model,
    "feature_cols": FEATURE_COLS,
    "cat_cols": CAT_COLS,
    "num_cols": NUM_COLS,
    "category_choices": CATEGORY_CHOICES,
}

with open("employee_attrition_bundle.pkl", "wb") as f:
    pickle.dump(bundle, f)

print("✅ Saved employee_attrition_bundle.pkl")


✅ Saved employee_attrition_bundle.pkl
