In [1]:
import os
import re
import json
import joblib
import pandas as pd
import numpy as np
from typing import Dict, Any

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from catboost import CatBoostClassifier, Pool

# Config
CSV_PATH = "data/emp-attrition-data.csv"
MODEL_PATH = "catattmodel.cbm"
ENCODER_PATH = "feature_encoder.joblib"
SEED = 42

In [2]:
def load_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # def to_snake(s: str) -> str:
    #     return re.sub(r'[^0-9a-zA-Z]+', '_', s).strip('_').lower()
    # df.columns = [to_snake(c) for c in df.columns]
    return df

def quick_eda(df: pd.DataFrame) -> None:
    print("Shape:", df.shape)
    print("Columns:", df.columns.tolist())
    print("\nAttrition value counts:")
    print(df['Attrition'].value_counts())
    print("\nMissing values:")
    print(df.isna().sum().sort_values(ascending=False).head(10))

df = load_data(CSV_PATH)
quick_eda(df)
df.head()


Shape: (1470, 13)
Columns: ['Age', 'Attrition', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'WorkLifeBalance', 'YearsAtCompany']

Attrition value counts:
Attrition
No     1233
Yes     237
Name: count, dtype: int64

Missing values:
Age                        0
Attrition                  0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EnvironmentSatisfaction    0
JobSatisfaction            0
MaritalStatus              0
MonthlyIncome              0
dtype: int64


Unnamed: 0,Age,Attrition,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,WorkLifeBalance,YearsAtCompany
0,41,Yes,Sales,1,2,Life Sciences,2,4,Single,5993,8,1,6
1,49,No,Research & Development,8,1,Life Sciences,3,2,Married,5130,1,3,10
2,37,Yes,Research & Development,2,2,Other,4,3,Single,2090,6,3,0
3,33,No,Research & Development,3,4,Life Sciences,4,3,Married,2909,1,3,8
4,27,No,Research & Development,2,1,Medical,1,2,Married,3468,9,3,2


In [3]:
def map_position(row):
    dept = str(row.get("Department", "")).lower()
    # jobrole = str(row.get("JobRole", "")).lower()
    env_satisfy = row.get("EnvironmentSatisfaction", np.nan)

    if "research" in dept or "development" in dept or "res" in dept:
        dept_norm = "R&D"
    elif "sales" in dept:
        dept_norm = "Sales"
    else:
        dept_norm = "Other"

    # is_manager = ("manager" in dept) or (pd.notna(joblevel) and joblevel >= 3)
    is_manager = (pd.notna(env_satisfy) and env_satisfy >= 3) # ("manager" in dept) or 

    if is_manager:
        return "Manager"
    elif dept_norm == "R&D":
        return "R&D"
    elif dept_norm == "Sales":
        return "Sales"
    elif dept_norm == "Other":
        return "Employee"
    


In [4]:
def engineer_features(df) -> pd.DataFrame:
    df = df.copy()
    df["Churn"] = df["Attrition"].map({"Yes": 1, "No": 0}).astype(int)
    df["EmployeeSatisfaction"] = (df["JobSatisfaction"].astype(float) - 1) / 3
    df["Salary"] = df["MonthlyIncome"].astype(float) * 12
    df["Position"] = df.apply(map_position, axis=1)

    df_final = df[["YearsAtCompany", "EmployeeSatisfaction", "Position", "Salary", "Churn"]]
    return df_final    

df_final = engineer_features(df)
df_final.head()

Unnamed: 0,YearsAtCompany,EmployeeSatisfaction,Position,Salary,Churn
0,6,1.0,Sales,71916.0,1
1,10,0.333333,Manager,61560.0,0
2,0,0.666667,Manager,25080.0,1
3,8,0.666667,Manager,34908.0,0
4,2,0.333333,R&D,41616.0,0


FROM HERE, IT'S MODEL TRAINING EXPERIMENTS

### Simple random forest classifier

In [6]:
"""
MODEL TRAINING CODE GOES HERE
"""

"""
RandomForest baseline training (mirrors the "CATBOOST - Training" cell).

Assumptions:
- `df_final` is already in memory, with 4 columns total.
- The dependent/target column is 'Churn'.
- The 3 remaining columns are features (may be numeric and/or categorical).

What this cell does:
1) Splits df_final into X/y with stratification on Churn.
2) Builds a robust sklearn Pipeline:
   - OneHotEncoder for categorical (object / category / boolean) features
   - Passthrough for numeric features
   - RandomForestClassifier (with class_weight='balanced' as a safe default)
3) Trains, evaluates, and prints quick metrics.

You can tune the RF params (n_estimators, max_depth, etc.) as needed.
"""

from __future__ import annotations

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    accuracy_score,
    classification_report,
)

# --- Split ---
assert "Churn" in df_final.columns, "Expected a 'Churn' column in df_final."
X = df_final.drop(columns=["Churn"])
y = df_final["Churn"]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Preprocess ---
cat_sel = selector(dtype_include=["object", "bool", "category"])
num_sel = selector(dtype_include=["number"])

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_sel),
        ("num", "passthrough", num_sel),
    ],
    remainder="drop",
)

# --- Model ---
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)

pipe = Pipeline(steps=[("prep", preprocess), ("rf", rf)])

# --- Train ---
pipe.fit(X_train, y_train)

# --- Eval ---
pred = pipe.predict(X_valid)
proba = (
    pipe.predict_proba(X_valid)[:, 1]
    if hasattr(pipe.named_steps["rf"], "predict_proba")
    else None
)

print("Accuracy:", accuracy_score(y_valid, pred))
print("F1:", f1_score(y_valid, pred, zero_division=0))
if proba is not None:
    print("ROC-AUC:", roc_auc_score(y_valid, proba))
print("\nClassification report:\n", classification_report(y_valid, pred, zero_division=0))

# Trained pipeline is in `pipe`


Accuracy: 0.7891156462585034
F1: 0.225
ROC-AUC: 0.6028943061417864

Classification report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.88       247
           1       0.27      0.19      0.23        47

    accuracy                           0.79       294
   macro avg       0.56      0.55      0.55       294
weighted avg       0.76      0.79      0.77       294



### CATBOOST - Training

In [2]:
def train_model(df_final):
    X = df_final[["YearsAtCompany", "EmployeeSatisfaction", "Position", "Salary"]]
    y = df_final["Churn"]

    cat_cols = ["Position"]
    cat_idx = [X.columns.get_loc(c) for c in cat_cols]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )

    train_pool = Pool(X_train, y_train, cat_features=cat_idx)
    test_pool  = Pool(X_test,  y_test,  cat_features=cat_idx)

    model = CatBoostClassifier(
        depth=6,
        learning_rate=0.08,
        iterations=1500,
        l2_leaf_reg=6,
        loss_function="Logloss",
        eval_metric="AUC",
        od_type="Iter",
        od_wait=50,
        random_seed=SEED,
        verbose=100
    )

    model.fit(train_pool, eval_set=test_pool)

    pred_proba = model.predict_proba(test_pool)[:, 1]
    preds = (pred_proba >= 0.5).astype(int)

    auc = roc_auc_score(y_test, pred_proba)
    print("\nAUC:", round(auc, 4))                 # <-- fix: use built-in round()
    print("\nClassification Report:\n", classification_report(y_test, preds))

    model.save_model(MODEL_PATH)
    joblib.dump({"feature_order": list(X.columns), "cat_cols": cat_cols}, ENCODER_PATH)
    return model


### Random Forest - Modular

## Test VERSION BELOW 👇🏽

In [8]:
# --- CONFIG (update paths as you like) ---
MODEL_BUNDLE = "rfmodel0.skops"   # neutral bundle (safer than pickle)

def train_model(df_final):
    """
    Train RF pipeline on df_final (expects 'Churn' target) and return (model, meta).
    Stores a tuned decision threshold (by F1 on the validation set) in meta['threshold'].
    Also saves a neutral bundle via skops at MODEL_BUNDLE.
    """
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.compose import ColumnTransformer, make_column_selector as selector
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report
    import skops.io as sio
    from datetime import datetime, timezone
    import platform
    import sklearn, numpy

    assert "Churn" in df_final.columns, "Expected 'Churn' in df_final."
    X = df_final.drop(columns=["Churn"])
    y = df_final["Churn"]

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    cat_sel = selector(dtype_include=["object", "bool", "category"])
    num_sel = selector(dtype_include=["number"])

    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_sel),
            ("num", "passthrough", num_sel),
        ],
        remainder="drop",
    )

    rf = RandomForestClassifier(
        n_estimators=800,             # a bit larger for stability on small data
        max_depth=None,
        min_samples_leaf=2,           # slightly >1 often helps generalization
        max_features="sqrt",
        class_weight="balanced",
        n_jobs=-1,
        random_state=42,
    )

    pipe = Pipeline(steps=[("prep", preprocess), ("rf", rf)])
    pipe.fit(X_train, y_train)

    # --- Threshold tuning for minority class F1 ---
    proba_valid = pipe.predict_proba(X_valid)[:, 1]
    grid = np.linspace(0.10, 0.90, 33)
    best_t, best_f1 = 0.5, -1.0
    for t in grid:
        pred = (proba_valid >= t).astype(int)
        f1v = f1_score(y_valid, pred, zero_division=0)
        if f1v > best_f1:
            best_f1, best_t = f1v, t

    pred_valid = (proba_valid >= best_t).astype(int)
    print("\n=== RF (tuned threshold) ===")
    print("Chosen threshold:", round(best_t, 3))
    print("Accuracy:", accuracy_score(y_valid, pred_valid))
    print("F1:", f1_score(y_valid, pred_valid, zero_division=0))
    print("ROC-AUC:", roc_auc_score(y_valid, proba_valid))
    print("\nClassification report:\n", classification_report(y_valid, pred_valid, zero_division=0))

    # --- Build meta + save neutral bundle ---
    feature_order = list(X.columns)
    cat_cols = [c for c in feature_order if str(X[c].dtype) in ("object", "bool") or pd.api.types.is_categorical_dtype(X[c])]
    meta = {
        "feature_order": feature_order,
        "cat_cols": cat_cols,
        "threshold": float(best_t),
        "trained_at": datetime.now(timezone.utc).isoformat() + "Z",
        "env": {
            "python": platform.python_version(),
            "numpy": numpy.__version__,
            "sklearn": sklearn.__version__,
        },
    }

    sio.dump({"pipeline": pipe, "meta": meta}, MODEL_BUNDLE)
    print(f"\nSaved neutral bundle -> {MODEL_BUNDLE}")

    return pipe, meta


In [9]:
model, meta = train_model(df_final)


=== RF (tuned threshold) ===
Chosen threshold: 0.25
Accuracy: 0.6666666666666666
F1: 0.36363636363636365
ROC-AUC: 0.6435954862606599

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.68      0.77       247
           1       0.26      0.60      0.36        47

    accuracy                           0.67       294
   macro avg       0.58      0.64      0.57       294
weighted avg       0.80      0.67      0.71       294



  cat_cols = [c for c in feature_order if str(X[c].dtype) in ("object", "bool") or pd.api.types.is_categorical_dtype(X[c])]



Saved neutral bundle -> rfmodel0.skops


In [12]:
# Neutral load via skops; trust the column selector callable used in your pipeline
def load_model_skops(path: str = MODEL_BUNDLE):
    import skops.io as sio
    bundle = sio.load(
        path,
        trusted=[
            "sklearn.pipeline.Pipeline",
            "sklearn.compose._column_transformer.ColumnTransformer",
            "sklearn.preprocessing._encoders.OneHotEncoder",
            "sklearn.ensemble._forest.RandomForestClassifier",
            # this is what triggers UntrustedTypesFoundException if not whitelisted:
            "sklearn.compose._column_transformer.make_column_selector",
        ],
    )
    return bundle["pipeline"], bundle["meta"]


In [13]:
from typing import Any, Dict
import pandas as pd

def _preprocess_payload(payload: Dict[str, Any]) -> pd.DataFrame:
    # normalize Position into expected buckets
    pos_raw = str(payload.get("Position", "")).lower()
    if "sales" in pos_raw:
        pos_norm = "Sales"
    elif "r&d" in pos_raw or "research" in pos_raw or "dev" in pos_raw:
        pos_norm = "R&D"
    elif "manager" in pos_raw:
        pos_norm = "Manager"
    else:
        pos_norm = "Employee"

    row = {
        "YearsAtCompany": float(payload.get("YearsAtCompany", 0)),         # accept int, cast to float
        "EmployeeSatisfaction": float(payload.get("EmployeeSatisfaction", 0)),
        "Position": pos_norm,
        "Salary": float(payload.get("Salary", 0.0)),
    }
    return pd.DataFrame([row])


def predict(payload: Dict[str, Any], model_path: str = MODEL_BUNDLE) -> Dict[str, Any]:
    pipe, meta = load_model_skops(model_path)
    X = _preprocess_payload(payload)

    # match training column order (and fill any missing)
    if "feature_order" in meta:
        X = X.reindex(columns=meta["feature_order"], fill_value=0)

    proba = float(pipe.predict_proba(X)[0, 1])
    threshold = float(meta.get("threshold", 0.5))
    pred = int(proba >= threshold)
    return {"attrition": pred, "prob": round(proba, 4)}


In [14]:
# Examples covering each Position bucket
samples = [
    {"YearsAtCompany": 1, "EmployeeSatisfaction": 2, "Position": "R&D",      "Salary": 60000.0},
    {"YearsAtCompany": 3, "EmployeeSatisfaction": 4, "Position": "Sales",    "Salary": 75000.0},
    {"YearsAtCompany": 6, "EmployeeSatisfaction": 3, "Position": "Manager",  "Salary": 98000.0},
    {"YearsAtCompany": 2, "EmployeeSatisfaction": 1, "Position": "Employee", "Salary": 52000.0},
]

for i, s in enumerate(samples, 1):
    print(f"Sample {i}: {s}")
    print(" →", predict(s))


Sample 1: {'YearsAtCompany': 1, 'EmployeeSatisfaction': 2, 'Position': 'R&D', 'Salary': 60000.0}
 → {'attrition': 0, 'prob': 0.1552}
Sample 2: {'YearsAtCompany': 3, 'EmployeeSatisfaction': 4, 'Position': 'Sales', 'Salary': 75000.0}
 → {'attrition': 1, 'prob': 0.3083}
Sample 3: {'YearsAtCompany': 6, 'EmployeeSatisfaction': 3, 'Position': 'Manager', 'Salary': 98000.0}
 → {'attrition': 0, 'prob': 0.044}
Sample 4: {'YearsAtCompany': 2, 'EmployeeSatisfaction': 1, 'Position': 'Employee', 'Salary': 52000.0}
 → {'attrition': 0, 'prob': 0.1973}


## MAIN VERSION BELOW 👇🏽

In [7]:
MODEL_BUNDLE = "rfmodel.skops"   # neutral, version-safe bundle

from __future__ import annotations
from typing import Any, Dict, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    accuracy_score,
    classification_report,
)


In [15]:
def split_data(df_final: pd.DataFrame, test_size: float = 0.2, seed: int = 42):
    """
    Split df_final into train/valid.
    Expects 'Churn' column as the target; all other columns are features.
    Returns: X_train, X_valid, y_train, y_valid, feature_order, cat_cols
    """
    assert "Churn" in df_final.columns, "Expected 'Churn' in df_final."
    X = df_final.drop(columns=["Churn"])
    y = df_final["Churn"]

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=seed
    )

    feature_order = list(X.columns)
    cat_cols = [
        c for c in feature_order
        if str(X[c].dtype) in ("object", "bool") or pd.api.types.is_categorical_dtype(X[c])
    ]
    return X_train, X_valid, y_train, y_valid, feature_order, cat_cols


In [16]:
def build_pipeline(
    cat_cols,
    num_cols,
    n_estimators: int = 800,
    min_samples_leaf: int = 2,
    max_depth=None,
    class_weight: str | dict | None = "balanced",
    seed: int = 42,
):
    """
    Build a Pipeline using explicit cat/num column name lists (no callables),
    so skops doesn't flag untrusted types on load.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier

    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), list(cat_cols)),
            ("num", "passthrough", list(num_cols)),
        ],
        remainder="drop",
    )

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_features="sqrt",
        class_weight=class_weight,
        n_jobs=-1,
        random_state=seed,
    )
    return Pipeline([("prep", preprocess), ("rf", rf)])


In [17]:
def train_model(X_train: pd.DataFrame, y_train: pd.Series, pipe: Pipeline) -> Pipeline:
    """
    Fit the provided pipeline on the training split and return the trained pipeline.
    """
    pipe.fit(X_train, y_train)
    return pipe


In [None]:
def evaluation_metrics(
    pipe: Pipeline,
    X_valid: pd.DataFrame,
    y_valid: pd.Series,
    threshold_grid = np.linspace(0.10, 0.90, 33),
    print_report: bool = True,
) -> Tuple[float, dict]:
    """
    Compute probabilities on the validation set, choose a decision threshold
    that maximizes F1 (for the positive class), optionally print a detailed report,
    and return (best_threshold, metrics_dict).
    """
    proba_valid = pipe.predict_proba(X_valid)[:, 1]
    best_t, best_f1 = 0.5, -1.0

    for t in threshold_grid:
        pred_t = (proba_valid >= t).astype(int)
        f1v = f1_score(y_valid, pred_t, zero_division=0)
        if f1v > best_f1:
            best_f1, best_t = f1v, t

    final_pred = (proba_valid >= best_t).astype(int)
    metrics = {
        "threshold": float(best_t),
        "accuracy": float(accuracy_score(y_valid, final_pred)),
        "f1": float(f1_score(y_valid, final_pred, zero_division=0)),
        "roc_auc": float(roc_auc_score(y_valid, proba_valid)),
        "report": classification_report(y_valid, final_pred, zero_division=0),
    }

    if print_report:
        print("\n=== Validation (threshold tuned for F1) ===")
        print("Chosen threshold:", round(metrics["threshold"], 3))
        print("Accuracy:", round(metrics["accuracy"], 4))
        print("F1:", round(metrics["f1"], 4))
        print("ROC-AUC:", round(metrics["roc_auc"], 4))
        print("\nClassification report:\n", metrics["report"])

    return best_t, metrics


In [None]:
def save_model(pipe: Pipeline, feature_order, cat_cols, threshold: float, path: str = MODEL_BUNDLE):
    """
    Save the trained pipeline + metadata in a neutral, version-safe bundle.
    """
    import skops.io as sio
    from datetime import datetime, timezone
    import platform, sklearn, numpy as _np

    meta = {
        "feature_order": list(feature_order),
        "cat_cols": list(cat_cols),
        "threshold": float(threshold),
        "trained_at": datetime.now(timezone.utc).isoformat() + "Z",
        "env": {
            "python": platform.python_version(),
            "numpy": _np.__version__,
            "sklearn": sklearn.__version__,
        },
    }

    sio.dump({"pipeline": pipe, "meta": meta}, path)
    print(f"Saved neutral bundle -> {path}")
    return meta


def load_model(path: str = MODEL_BUNDLE):
    """
    Load a pipeline + metadata saved via skops.
    Returns (pipeline, meta).
    """
    import skops.io as sio
    bundle = sio.load(path)
    model = bundle["pipeline"]
    meta = bundle["meta"]
    return model, meta


In [None]:
def _preprocess_payload(payload: Dict[str, Any]) -> pd.DataFrame:
    pos_raw = str(payload.get("Position", "")).lower()
    if "sales" in pos_raw:
        pos_norm = "Sales"
    elif "r&d" in pos_raw or "research" in pos_raw or "dev" in pos_raw:
        pos_norm = "R&D"
    elif "manager" in pos_raw:
        pos_norm = "Manager"
    else:
        pos_norm = "Employee"

    row = {
        "YearsAtCompany": float(payload.get("YearsAtCompany", 0.0)),
        "EmployeeSatisfaction": float(payload.get("EmployeeSatisfaction", 0.0)),
        "Position": pos_norm,
        "Salary": float(payload.get("Salary", 0.0)),
    }
    return pd.DataFrame([row])


def predict(payload: Dict[str, Any], model_path: str = MODEL_BUNDLE) -> Dict[str, Any]:
    """
    Load the model bundle, transform payload, score, and apply tuned threshold.
    Returns {"attrition": 0/1, "prob": float}.
    """
    pipe, meta = load_model(model_path)
    X = _preprocess_payload(payload)

    # ensure column order to match training (and provide fill values if needed)
    if "feature_order" in meta:
        X = X.reindex(columns=meta["feature_order"], fill_value=0)

    proba = float(pipe.predict_proba(X)[0, 1])
    threshold = float(meta.get("threshold", 0.5))
    pred = int(proba >= threshold)
    return {"attrition": pred, "prob": round(proba, 4)}


In [21]:
# split (same as before)
X_train, X_valid, y_train, y_valid, feature_order, cat_cols = split_data(df_final)

# derive numeric column list explicitly
num_cols = [c for c in feature_order if c not in cat_cols]

# build without callables + train
pipe = build_pipeline(
    cat_cols=cat_cols,
    num_cols=num_cols,
    n_estimators=800,
    min_samples_leaf=2,
    max_depth=None,
    class_weight="balanced",
)
pipe = train_model(X_train, y_train, pipe)

# evaluate & save (unchanged)
best_t, metrics = evaluation_metrics(pipe, X_valid, y_valid, print_report=True)
meta = save_model(pipe, feature_order, cat_cols, threshold=best_t, path=MODEL_BUNDLE)


  if str(X[c].dtype) in ("object", "bool") or pd.api.types.is_categorical_dtype(X[c])



=== Validation (threshold tuned for F1) ===
Chosen threshold: 0.25
Accuracy: 0.6667
F1: 0.3636
ROC-AUC: 0.6436

Classification report:
               precision    recall  f1-score   support

           0       0.90      0.68      0.77       247
           1       0.26      0.60      0.36        47

    accuracy                           0.67       294
   macro avg       0.58      0.64      0.57       294
weighted avg       0.80      0.67      0.71       294

Saved neutral bundle -> rfmodel.skops


## EXAMPLE USAGE -- TESTING

In [None]:
def load_model():
    model = CatBoostClassifier()
    model.load_model(MODEL_PATH)
    meta = joblib.load(ENCODER_PATH)
    return model, meta

def _preprocess_payload(payload: Dict[str, Any]) -> pd.DataFrame:
    pos_raw = str(payload.get("Position", "")).lower()
    if "sales" in pos_raw:
        pos_norm = "Sales"
    elif "r&d" in pos_raw or "research" in pos_raw or "dev" in pos_raw:
        pos_norm = "R&D"
    elif "manager" in pos_raw:
        pos_norm = "Manager"
    else:
        pos_norm = "Employee"

    row = {
        "YearsAtCompany": float(payload.get("YearsAtCompany", 0.0)),
        "EmployeeSatisfaction": float(payload.get("EmployeeSatisfaction", 0.0)),
        "Position": pos_norm,
        "Salary": float(payload.get("Salary", 0.0))
    }
    return pd.DataFrame([row])

def predict(payload: Dict[str, Any]) -> Dict[str, Any]:
    model, meta = load_model()
    X = _preprocess_payload(payload)
    X = X[meta["feature_order"]]
    cat_idx = [X.columns.get_loc(c) for c in meta["cat_cols"]]
    pool = Pool(X, cat_features=cat_idx)
    proba = float(model.predict_proba(pool)[0, 1])
    pred = int(proba >= 0.5)
    return {"attrition": pred, "prob": round(proba, 4)}


In [None]:
sample_input = {
    "YearsAtCompany": 3,
    "EmployeeSatisfaction": 0.7,
    "Position": "R&D",
    "Salary": 75000.0
}

result = predict(sample_input)
print("Prediction Result:", result)
