In [1]:
import os
import re
import json
import joblib
import pandas as pd
import numpy as np
from typing import Dict, Any

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from catboost import CatBoostClassifier, Pool

# Config
CSV_PATH = "data/emp-attrition-data.csv"
MODEL_PATH = "catattmodel.cbm"
ENCODER_PATH = "feature_encoder.joblib"
SEED = 42

In [2]:
def load_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # def to_snake(s: str) -> str:
    #     return re.sub(r'[^0-9a-zA-Z]+', '_', s).strip('_').lower()
    # df.columns = [to_snake(c) for c in df.columns]
    return df

def quick_eda(df: pd.DataFrame) -> None:
    print("Shape:", df.shape)
    print("Columns:", df.columns.tolist())
    print("\nAttrition value counts:")
    print(df['Attrition'].value_counts())
    print("\nMissing values:")
    print(df.isna().sum().sort_values(ascending=False).head(10))

df = load_data(CSV_PATH)
quick_eda(df)
df.head()


Shape: (1470, 13)
Columns: ['Age', 'Attrition', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'WorkLifeBalance', 'YearsAtCompany']

Attrition value counts:
Attrition
No     1233
Yes     237
Name: count, dtype: int64

Missing values:
Age                        0
Attrition                  0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EnvironmentSatisfaction    0
JobSatisfaction            0
MaritalStatus              0
MonthlyIncome              0
dtype: int64


Unnamed: 0,Age,Attrition,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,WorkLifeBalance,YearsAtCompany
0,41,Yes,Sales,1,2,Life Sciences,2,4,Single,5993,8,1,6
1,49,No,Research & Development,8,1,Life Sciences,3,2,Married,5130,1,3,10
2,37,Yes,Research & Development,2,2,Other,4,3,Single,2090,6,3,0
3,33,No,Research & Development,3,4,Life Sciences,4,3,Married,2909,1,3,8
4,27,No,Research & Development,2,1,Medical,1,2,Married,3468,9,3,2


In [3]:
def map_position(row):
    dept = str(row.get("Department", "")).lower()
    # jobrole = str(row.get("JobRole", "")).lower()
    env_satisfy = row.get("EnvironmentSatisfaction", np.nan)

    if "research" in dept or "development" in dept or "res" in dept:
        dept_norm = "R&D"
    elif "sales" in dept:
        dept_norm = "Sales"
    else:
        dept_norm = "Other"

    # is_manager = ("manager" in dept) or (pd.notna(joblevel) and joblevel >= 3)
    is_manager = (pd.notna(env_satisfy) and env_satisfy >= 3) # ("manager" in dept) or 

    if is_manager:
        return "Manager"
    elif dept_norm == "R&D":
        return "R&D"
    elif dept_norm == "Sales":
        return "Sales"
    elif dept_norm == "Other":
        return "Employee"
    


In [4]:
def engineer_features(df) -> pd.DataFrame:
    df = df.copy()
    df["Churn"] = df["Attrition"].map({"Yes": 1, "No": 0}).astype(int)
    df["EmployeeSatisfaction"] = (df["JobSatisfaction"].astype(float) - 1) / 3
    df["Salary"] = df["MonthlyIncome"].astype(float) * 12
    df["Position"] = df.apply(map_position, axis=1)

    df_final = df[["YearsAtCompany", "EmployeeSatisfaction", "Position", "Salary", "Churn"]]
    return df_final    

df_final = engineer_features(df)
df_final.head()

Unnamed: 0,YearsAtCompany,EmployeeSatisfaction,Position,Salary,Churn
0,6,1.0,Sales,71916.0,1
1,10,0.333333,Manager,61560.0,0
2,0,0.666667,Manager,25080.0,1
3,8,0.666667,Manager,34908.0,0
4,2,0.333333,R&D,41616.0,0


# FROM HERE, IT'S MODEL TRAINING EXPERIMENTS

### Simple random forest classifier

In [None]:
"""
MODEL TRAINING CODE GOES HERE
"""

"""
RandomForest baseline training (mirrors the "CATBOOST - Training" cell).

Assumptions:
- `df_final` is already in memory, with 4 columns total.
- The dependent/target column is 'Churn'.
- The 3 remaining columns are features (may be numeric and/or categorical).

What this cell does:
1) Splits df_final into X/y with stratification on Churn.
2) Builds a robust sklearn Pipeline:
   - OneHotEncoder for categorical (object / category / boolean) features
   - Passthrough for numeric features
   - RandomForestClassifier (with class_weight='balanced' as a safe default)
3) Trains, evaluates, and prints quick metrics.

You can tune the RF params (n_estimators, max_depth, etc.) as needed.
"""

from __future__ import annotations

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    accuracy_score,
    classification_report,
)

# --- Split ---
assert "Churn" in df_final.columns, "Expected a 'Churn' column in df_final."
X = df_final.drop(columns=["Churn"])
y = df_final["Churn"]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Preprocess ---
cat_sel = selector(dtype_include=["object", "bool", "category"])
num_sel = selector(dtype_include=["number"])

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_sel),
        ("num", "passthrough", num_sel),
    ],
    remainder="drop",
)

# --- Model ---
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)

pipe = Pipeline(steps=[("prep", preprocess), ("rf", rf)])

# --- Train ---
pipe.fit(X_train, y_train)

# --- Eval ---
pred = pipe.predict(X_valid)
proba = (
    pipe.predict_proba(X_valid)[:, 1]
    if hasattr(pipe.named_steps["rf"], "predict_proba")
    else None
)

print("Accuracy:", accuracy_score(y_valid, pred))
print("F1:", f1_score(y_valid, pred, zero_division=0))
if proba is not None:
    print("ROC-AUC:", roc_auc_score(y_valid, proba))
print("\nClassification report:\n", classification_report(y_valid, pred, zero_division=0))

Accuracy: 0.7891156462585034
F1: 0.225
ROC-AUC: 0.6028943061417864

Classification report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.88       247
           1       0.27      0.19      0.23        47

    accuracy                           0.79       294
   macro avg       0.56      0.55      0.55       294
weighted avg       0.76      0.79      0.77       294



In [5]:
from __future__ import annotations

import numpy as np
import pandas as pd
from typing import Dict, Any

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix,
    precision_score, recall_score, balanced_accuracy_score, matthews_corrcoef, average_precision_score,
)

In [6]:
def split_data(df_final: pd.DataFrame, test_size: float = 0.2, seed: int = 42):
    assert "Churn" in df_final.columns, "Expected a 'Churn' column in df_final."
    X = df_final.drop(columns=["Churn"])
    y = df_final["Churn"]

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )
    return X_train, X_valid, y_train, y_valid

In [7]:
def build_preprocess():
    cat_sel = selector(dtype_include=["object", "bool", "category"])
    num_sel = selector(dtype_include=["number"])

    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_sel),
            ("num", "passthrough", num_sel),
        ],
        remainder="drop",
    )
    return preprocess

In [8]:
def train_model(
    df_final: pd.DataFrame,
    n_estimators: int = 400,
    max_depth=None,
    min_samples_leaf: int = 1,
    seed: int = 42,
    class_weight: str | dict | None = "balanced",
):
    """
    Build RF + preprocessing Pipeline and fit it.
    Returns the trained pipeline along with the validation split for evaluation.
    """
    X_train, X_valid, y_train, y_valid = split_data(df_final, test_size=0.2, seed=seed)
    preprocess = build_preprocess()

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        n_jobs=-1,
        class_weight=class_weight,
        random_state=seed,
    )

    pipe = Pipeline(steps=[("prep", preprocess), ("rf", rf)])
    pipe.fit(X_train, y_train)
    return pipe, X_valid, y_valid

In [9]:
pipe, X_valid, y_valid = train_model(df_final)

In [10]:
def evaluate_model(
    pipe: Pipeline, X_valid: pd.DataFrame, 
    y_valid: pd.Series, print_report: bool = True) -> Dict[str, Any]:
    """
    Evaluate the trained pipeline on the validation set and print metrics.
    Returns a dict of metrics for reuse/logging.
    """
    pred = pipe.predict(X_valid)
    proba = None
    if hasattr(pipe.named_steps["rf"], "predict_proba"):
        proba = pipe.predict_proba(X_valid)[:, 1]

    metrics = {
        "accuracy": float(accuracy_score(y_valid, pred)),
        "f1": float(f1_score(y_valid, pred, zero_division=0)),
        "precision": float(precision_score(y_valid, pred, zero_division=0)),
        "recall": float(recall_score(y_valid, pred, zero_division=0)),
        "balanced_accuracy": float(balanced_accuracy_score(y_valid, pred)),
        "mcc": float(matthews_corrcoef(y_valid, pred)),
        "roc_auc": float(roc_auc_score(y_valid, proba)) if proba is not None else None,
        "pr_auc": float(average_precision_score(y_valid, proba)) if proba is not None else None,
        "confusion_matrix": confusion_matrix(y_valid, pred).tolist(),
        "classification_report": classification_report(y_valid, pred, zero_division=0),
    }

    if print_report:
        print("Accuracy:", round(metrics["accuracy"], 6))
        print("F1:", round(metrics["f1"], 6))
        print("Precision:", round(metrics["precision"], 6))
        print("Recall:", round(metrics["recall"], 6))
        print("Balanced Acc:", round(metrics["balanced_accuracy"], 6))
        print("MCC:", round(metrics["mcc"], 6))
        if metrics["roc_auc"] is not None:
            print("ROC-AUC:", round(metrics["roc_auc"], 6))
        if metrics["pr_auc"] is not None:
            print("PR-AUC (Average Precision):", round(metrics["pr_auc"], 6))
        print("\nConfusion Matrix [[TN, FP], [FN, TP]]:", metrics["confusion_matrix"])
        print("\nClassification report:\n", metrics["classification_report"])

    return metrics

In [11]:
evaluate_model(pipe, X_valid, y_valid, print_report=True)

Accuracy: 0.789116
F1: 0.225
Precision: 0.272727
Recall: 0.191489
Balanced Acc: 0.547162
MCC: 0.109506
ROC-AUC: 0.602894
PR-AUC (Average Precision): 0.241868

Confusion Matrix [[TN, FP], [FN, TP]]: [[223, 24], [38, 9]]

Classification report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.88       247
           1       0.27      0.19      0.23        47

    accuracy                           0.79       294
   macro avg       0.56      0.55      0.55       294
weighted avg       0.76      0.79      0.77       294



{'accuracy': 0.7891156462585034,
 'f1': 0.225,
 'precision': 0.2727272727272727,
 'recall': 0.19148936170212766,
 'balanced_accuracy': 0.5471616848996468,
 'mcc': 0.10950639700302685,
 'roc_auc': 0.6028943061417864,
 'pr_auc': 0.2418675621720766,
 'confusion_matrix': [[223, 24], [38, 9]],
 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.85      0.90      0.88       247\n           1       0.27      0.19      0.23        47\n\n    accuracy                           0.79       294\n   macro avg       0.56      0.55      0.55       294\nweighted avg       0.76      0.79      0.77       294\n'}

In [12]:
"""
Save the trained model as a neutral .skops bundle.
Note: because we used make_column_selector, loading later via skops may require
trusted=[...] in sio.load(...). Saving works fine.
"""

import skops.io as sio
from datetime import datetime, timezone
import platform, sklearn, numpy as _np

SKOPS_PATH = "ECRFmodel.skops"

def save_model_skops(pipe: Pipeline, path: str = SKOPS_PATH):
    meta = {
        "saved_at": datetime.now(timezone.utc).isoformat(),
        "env": {"python": platform.python_version(), "numpy": _np.__version__, "sklearn": sklearn.__version__},
    }
    sio.dump({"pipeline": pipe, "meta": meta}, path)
    print(f"Saved skops model -> {path}")


save_model_skops(pipe)

Saved skops model -> ECRFmodel.skops


In [13]:
"""
Save the trained model as a pickle for convenience (not version-neutral).
"""

import joblib

PKL_PATH = "ECRFmodel.pkl"

def save_model_pickle(pipe: Pipeline, path: str = PKL_PATH):
    joblib.dump(pipe, path)
    print(f"Saved pickle model -> {path}")


save_model_pickle(pipe)


Saved pickle model -> ECRFmodel.pkl


In [14]:
SKOPS_PATH = "ECRFmodel.skops"
PKL_PATH   = "ECRFmodel.pkl"


def load_skops_model(path: str = SKOPS_PATH):
    """
    Load the trained sklearn Pipeline from a skops bundle.
    Notes:
      - Because the pipeline uses make_column_selector, we allowlist it via `trusted=[...]`.
      - Returns just the pipeline (model); the saved bundle may also contain a meta dict.
    """
    import skops.io as sio
    bundle = sio.load(
        path,
        trusted=[
            "sklearn.pipeline.Pipeline",
            "sklearn.compose._column_transformer.ColumnTransformer",
            "sklearn.preprocessing._encoders.OneHotEncoder",
            "sklearn.ensemble._forest.RandomForestClassifier",
            "sklearn.compose._column_transformer.make_column_selector",
        ],
    )
    return bundle["pipeline"]


def load_pickle_model(path: str = PKL_PATH):
    """
    Load the trained sklearn Pipeline from a pickle file.
    (Less portable across versions than skops, but simple.)
    """
    import joblib
    return joblib.load(path)

In [15]:
from __future__ import annotations
from typing import Any, Dict, List
import pandas as pd
import numpy as np

# Normalize inputs to model (Accounting for spelling mistakes, whitespaces etc.)
def _normalize_position(raw: Any) -> str:
    """
    Map free-form 'Position' strings into the buckets used in training.
    Falls back to 'Employee' for anything not matched.
    """
    s = str(raw or "").strip().lower()
    if "sales" in s:
        return "Sales"
    if "r&d" in s or "research" in s or "dev" in s:
        return "R&D"
    if "manager" in s:
        return "Manager"
    # Common alt: "non-manager" etc. → bucket as Employee
    return "Employee"


def _payload_to_dataframe(payload: Dict[str, Any]) -> pd.DataFrame:
    """
    Validate and coerce a single prediction payload to a 1-row DataFrame
    with the exact training-time column names (order doesn’t matter for the Pipeline).
    """
    years = float(payload.get("YearsAtCompany", 0))
    sat   = float(payload.get("EmployeeSatisfaction", 0))
    pos   = _normalize_position(payload.get("Position"))
    sal   = float(payload.get("Salary", 0.0))

    row = {
        "YearsAtCompany": years,
        "EmployeeSatisfaction": sat,
        "Position": pos,
        "Salary": sal,
    }
    return pd.DataFrame([row])


In [16]:
def predict_single(
    payload: Dict[str, Any],
    model=None,
    load_from: str = "skops",
    threshold: float = 0.5,
) -> Dict[str, Any]:
    """
    Predict for one payload.
    - If `model` is None, loads from disk using `load_from` ('skops' or 'pkl').
    - Returns: {"attrition": 0/1, "prob": float, "details": {...}}
    """
    if model is None:
        if load_from == "skops":
            model = load_skops_model(SKOPS_PATH)
        elif load_from == "pkl":
            model = load_pickle_model(PKL_PATH)
        else:
            raise ValueError("load_from must be 'skops' or 'pkl'")

    X = _payload_to_dataframe(payload)
    # prob of positive class (churn=1)
    proba = float(model.predict_proba(X)[0, 1])
    pred = int(proba >= threshold)

    return {
        "attrition": pred,
        "prob": round(proba, 4),
        "details": {
            "threshold": threshold,
            "normalized_input": X.iloc[0].to_dict(),
        },
    }


def predict_batch(
    payloads: List[Dict[str, Any]],
    model=None,
    load_from: str = "skops",
    threshold: float = 0.5,
) -> List[Dict[str, Any]]:
    """
    Predict for a list of payloads.
    Loads model if not provided.
    """
    if model is None:
        model = load_skops_model(SKOPS_PATH) if load_from == "skops" else load_pickle_model(PKL_PATH)

    X = pd.concat([_payload_to_dataframe(p) for p in payloads], ignore_index=True)
    probs = model.predict_proba(X)[:, 1]
    preds = (probs >= threshold).astype(int)

    results = []
    for i in range(len(payloads)):
        results.append({
            "attrition": int(preds[i]),
            "prob": round(float(probs[i]), 4),
            "details": {
                "threshold": threshold,
                "normalized_input": X.iloc[i].to_dict(),
            },
        })
    return results

In [17]:
"""
Example usage of the reusable prediction workflow.

This cell:
- Loads the model from ECRFmodel.skops via load_skops_model()
- Scores your provided sample input
- Shows a batch example (optional)
"""

# Single prediction (your sample input)
sample_payload = {
    "YearsAtCompany": 1,
    "EmployeeSatisfaction": 0.01,
    "Position": "Non-Manager",  # will be normalized to "Employee"
    "Salary": 4.0,
}

batch_payloads = [
    {"YearsAtCompany": 3, "EmployeeSatisfaction": 0.7, "Position": "R&D", "Salary": 75000.0},
    {"YearsAtCompany": 6, "EmployeeSatisfaction": 0.3, "Position": "Manager", "Salary": 98000.0},
    sample_payload,
    {"YearsAtCompany": 2, "EmployeeSatisfaction": 0.5, "Position": "Sales", "Salary": 52000.0},
]


In [18]:
# pipe = load_skops_model()   #  or if you prefer the .pkl
# pipe = load_pickle_model()

In [19]:
# Single Prediction
result = predict_single(sample_payload, model=pipe, load_from="skops", threshold=0.5)
print("Single prediction:", result)

Single prediction: {'attrition': 1, 'prob': 0.9525, 'details': {'threshold': 0.5, 'normalized_input': {'YearsAtCompany': 1.0, 'EmployeeSatisfaction': 0.01, 'Position': 'Manager', 'Salary': 4.0}}}


In [20]:
# Batch Prediction
batch_results = predict_batch(batch_payloads, model=pipe, load_from="skops", threshold=0.5)
print("\nBatch predictions:")
for i, r in enumerate(batch_results, 1):
    print(f"{i}. {r}")


Batch predictions:
1. {'attrition': 0, 'prob': 0.0075, 'details': {'threshold': 0.5, 'normalized_input': {'YearsAtCompany': 3.0, 'EmployeeSatisfaction': 0.7, 'Position': 'R&D', 'Salary': 75000.0}}}
2. {'attrition': 0, 'prob': 0.0329, 'details': {'threshold': 0.5, 'normalized_input': {'YearsAtCompany': 6.0, 'EmployeeSatisfaction': 0.3, 'Position': 'Manager', 'Salary': 98000.0}}}
3. {'attrition': 1, 'prob': 0.9525, 'details': {'threshold': 0.5, 'normalized_input': {'YearsAtCompany': 1.0, 'EmployeeSatisfaction': 0.01, 'Position': 'Manager', 'Salary': 4.0}}}
4. {'attrition': 0, 'prob': 0.195, 'details': {'threshold': 0.5, 'normalized_input': {'YearsAtCompany': 2.0, 'EmployeeSatisfaction': 0.5, 'Position': 'Sales', 'Salary': 52000.0}}}


## Sample inputs + Test BELOW 👇🏽

In [None]:
sample_input00 = {
    "YearsAtCompany": 3,
    "EmployeeSatisfaction": 0.7,
    "Position": "R&D",
    "Salary": 75000.0
}

In [None]:
# Examples covering each Position bucket
sample_input01 = [
    {"YearsAtCompany": 1, "EmployeeSatisfaction": 2, "Position": "R&D",      "Salary": 60000.0},
    {"YearsAtCompany": 3, "EmployeeSatisfaction": 4, "Position": "Sales",    "Salary": 75000.0},
    {"YearsAtCompany": 6, "EmployeeSatisfaction": 3, "Position": "Manager",  "Salary": 98000.0},
    {"YearsAtCompany": 2, "EmployeeSatisfaction": 1, "Position": "Employee", "Salary": 52000.0},
]

# for i, s in enumerate(sample_input01, 1):
#     print(f"Sample {i}: {s}")
#     print(" →", predict(s))


Sample 1: {'YearsAtCompany': 1, 'EmployeeSatisfaction': 2, 'Position': 'R&D', 'Salary': 60000.0}
 → {'attrition': 0, 'prob': 0.1552}
Sample 2: {'YearsAtCompany': 3, 'EmployeeSatisfaction': 4, 'Position': 'Sales', 'Salary': 75000.0}
 → {'attrition': 1, 'prob': 0.3083}
Sample 3: {'YearsAtCompany': 6, 'EmployeeSatisfaction': 3, 'Position': 'Manager', 'Salary': 98000.0}
 → {'attrition': 0, 'prob': 0.044}
Sample 4: {'YearsAtCompany': 2, 'EmployeeSatisfaction': 1, 'Position': 'Employee', 'Salary': 52000.0}
 → {'attrition': 0, 'prob': 0.1973}
