## Training employee attrition detection model

**Objective**<br/>
Predict if an employee is likely to quit and identify the factors responsible - to allow HR to intervene on time and remedy the situation to prevent attrition.

> While some level of attrition in a company is inevitable, minimizing it and being prepared for the cases that cannot be helped will significantly help improve the operations of most businesses.

**Data**<br/>
The data set presents an employee survey from IBM, indicating if there is attrition or not. The data set contains approximately 1500 entries. Given the limited size of the data set, the model should only be expected to provide a modest improvement in the identification of attrition compared with a random allocation of attrition probability.

### Import libraries

In [None]:
from typing import Tuple

import matplotlib.pyplot as plt
import mlflow
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    plot_confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

#### Load Data

In [None]:
def load_csv_data(path: str) -> pd.DataFrame:
    """Load a CSV file into a pandas DataFrame.

    Args:
        path: Location handed to ``pd.read_csv`` (a local file path or URL).

    Returns:
        The parsed contents of the CSV file.

    Raises:
        Exception: If reading or parsing fails. The underlying error is kept
            as ``__cause__`` so the real reason stays visible in tracebacks.
    """
    try:
        return pd.read_csv(path)
    # ``except Exception`` (not a bare ``except``) lets KeyboardInterrupt and
    # SystemExit propagate instead of being rewritten as a load failure.
    except Exception as err:
        raise Exception(f"Error while loading the data from {path}") from err

#### Train-test split

In [None]:
def split_data(
    df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Separate the target from the predictors and make a stratified split.

    Args:
        df: Full data set; must contain the ``Attrition`` column and the
            other columns that are excluded from the features.
        test_size: Fraction of rows assigned to the test partition.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(train_features, test_features, train_labels, test_labels)``.
    """
    target = "Attrition"
    # Columns kept out of the feature matrix; the target itself is among them.
    excluded = [
        "EmployeeNumber",
        "EmployeeCount",
        "Over18",
        "StandardHours",
        target,
    ]
    x = df.drop(columns=excluded)
    y = df[target]
    # Stratifying on the target keeps its class proportions in both partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, stratify=y
    )
    return x_train, x_test, y_train, y_test

#### Training
Using a multilayer perceptron (MLP) classifier

In [None]:
def train(
    train_features: pd.DataFrame,
    train_labels: pd.DataFrame,
    random_state: int = 42,
    **kwargs,
) -> Tuple[LabelEncoder, Pipeline]:
    """Fit a label encoder and a preprocessing + MLP classification pipeline.

    Args:
        train_features: Training feature columns; must contain every column
            listed in ``categorical_columns`` below.
        train_labels: Training targets aligned with ``train_features``.
        random_state: Seed passed to ``MLPClassifier``.
        **kwargs: Optional ``max_iter`` (default 500), ``activation``
            (default ``"tanh"``) and ``solver`` (default ``"sgd"``) for the
            classifier. Other keys are ignored.

    Returns:
        ``(label_encoder, model)``: the fitted ``LabelEncoder`` and the fitted
        ``Pipeline``.
    """
    # label encoder
    label_encoder = LabelEncoder()
    label_encoder.fit(train_labels.values)
    encoded_labels = label_encoder.transform(train_labels)
    # pipeline
    categorical_columns = [
        "BusinessTravel",
        "Department",
        "Education",
        "EducationField",
        "EnvironmentSatisfaction",
        "Gender",
        "JobInvolvement",
        "JobLevel",
        "JobRole",
        "JobSatisfaction",
        "MaritalStatus",
        "OverTime",
        "PerformanceRating",
        "RelationshipSatisfaction",
        "StockOptionLevel",
        "WorkLifeBalance",
    ]
    # Integer-coded categorical columns are also int64. Leave them out here so
    # each column goes through exactly one transformer instead of being both
    # one-hot encoded and scaled.
    categorical_lookup = set(categorical_columns)
    numerical_columns = [
        column
        for column in train_features.select_dtypes(include="int64").columns
        if column not in categorical_lookup
    ]
    transformers = [
        # handle_unknown="ignore": a category that was absent from the training
        # split is encoded as all zeros at predict time rather than raising.
        (
            "one_hot_encoder",
            OneHotEncoder(handle_unknown="ignore"),
            categorical_columns,
        ),
        ("scaler", StandardScaler(), numerical_columns),
    ]
    preprocessing = ColumnTransformer(transformers=transformers)
    classifier = MLPClassifier(
        max_iter=kwargs.get("max_iter", 500),
        activation=kwargs.get("activation", "tanh"),
        solver=kwargs.get("solver", "sgd"),
        random_state=random_state,
    )
    model = Pipeline(
        steps=[("preprocessing", preprocessing), ("classifier", classifier)]
    )
    # model building - fit
    model.fit(train_features, encoded_labels)
    return label_encoder, model

#### Metrics

**Accuracy**
* Proportion of true results among the total number of cases
* Accuracy is a valid choice of evaluation metric for classification problems that are well balanced, i.e. not skewed and with no class imbalance

**Precision**
* Proportion of predicted positives that are truly positive
* Precision is a valid choice of evaluation metric when we want to be very sure of our prediction

**Recall**
* What proportion of actual Positives is correctly classified?
* Recall is a valid choice of evaluation metric when we want to capture as many positives as possible
* For example: If we are building a system to predict if a person has cancer or not, we want to capture the disease even if we are not very sure.

**F1-score**
* The F1 score is a number between 0 and 1 and is the harmonic mean of precision and recall
* We use this when we want to have a model with both good precision and recall 
* If you are a police inspector and you want to catch criminals, you want to be sure that the person you catch is a criminal (Precision) and you also want to capture as many criminals (Recall) as possible. The F1 score manages this tradeoff.

In [None]:
def evaluate(
    model: Pipeline,
    label_encoder: LabelEncoder,
    test_features: pd.DataFrame,
    test_labels: pd.DataFrame,
) -> Tuple[float, float, float, float, object, object]:
    """Score the trained model on the held-out test set and plot its confusion matrix.

    Args:
        model: Fitted pipeline used to predict from ``test_features``.
        label_encoder: Fitted encoder used to convert ``test_labels`` to the
            integer codes the model predicts.
        test_features: Held-out feature rows.
        test_labels: Held-out targets in their original (un-encoded) form.

    Returns:
        ``(accuracy, precision, recall, f1, cm, plt)`` where ``cm`` is the
        array returned by ``confusion_matrix`` and ``plt`` is the
        ``matplotlib.pyplot`` module, on which the confusion-matrix figure has
        just been drawn.
    """
    # Imported locally so this function does not depend on
    # ``plot_confusion_matrix``, which was removed in scikit-learn 1.2.
    # NOTE(review): the top-of-file import of ``plot_confusion_matrix`` must
    # also be dropped before the notebook can run on scikit-learn >= 1.2.
    from sklearn.metrics import ConfusionMatrixDisplay

    predictions = model.predict(test_features)
    encoded_labels = label_encoder.transform(test_labels)
    accuracy = accuracy_score(encoded_labels, predictions)
    precision = precision_score(encoded_labels, predictions)
    recall = recall_score(encoded_labels, predictions)
    f1 = f1_score(encoded_labels, predictions)
    cm = confusion_matrix(encoded_labels, predictions)
    # Draw from the matrix computed above instead of predicting a second time.
    ConfusionMatrixDisplay(
        confusion_matrix=cm, display_labels=label_encoder.classes_
    ).plot()
    return accuracy, precision, recall, f1, cm, plt

#### Experiments

In [None]:
def run_experiment(experiment_name: str, **parameters) -> Tuple[str, str]:
    """Run load -> split -> train -> evaluate inside one MLflow run and log the results.

    Args:
        experiment_name: Passed to ``mlflow.start_run`` as ``run_name``.
        **parameters: Forwarded to ``train`` and logged with
            ``mlflow.log_params``.

    Returns:
        ``(run_id, experiment_id)`` of the MLflow run that was created.
    """
    with mlflow.start_run(run_name=experiment_name) as run:
        run_id = run.info.run_uuid
        experiment_id = run.info.experiment_id
        print("\nRun Id", run_id)
        print("Experiment Id", experiment_id)
        data = load_csv_data("https://tinyurl.com/ibmhrattrition")
        train_features, test_features, train_labels, test_labels = split_data(data)
        label_encoder, model = train(train_features, train_labels, **parameters)
        mlflow.sklearn.log_model(model, "hr-attrition-model")
        mlflow.log_params(parameters)
        accuracy, precision, recall, f1, cm, plt_cm = evaluate(
            model, label_encoder, test_features, test_labels
        )
        # scikit-learn lays the matrix out as rows = true class, columns =
        # predicted class, so for two classes ravel() yields TN, FP, FN, TP.
        # The positive class is encoded label 1, the default ``pos_label`` of
        # the precision/recall/F1 scores logged below.
        tn, fp, fn, tp = (int(count) for count in cm.ravel())
        mlflow.log_metric("TP", tp)
        mlflow.log_metric("TN", tn)
        mlflow.log_metric("FP", fp)
        mlflow.log_metric("FN", fn)
        print("accuracy_score", accuracy)
        print("precision_score", precision)
        print("recall_score", recall)
        print("f1_score", f1)
        mlflow.log_metric("accuracy_score", accuracy)
        mlflow.log_metric("precision_score", precision)
        mlflow.log_metric("recall_score", recall)
        mlflow.log_metric("f1_score", f1)
        # Save the confusion-matrix figure locally, then attach it to the run.
        fig_name = "confusion-matrix.png"
        plt_cm.savefig(fig_name)
        mlflow.log_artifact(fig_name, "confusion-matrix-plot")
        return run_id, experiment_id

In [None]:
# Launch one MLflow-tracked run. Edit the keyword arguments, or swap in one of
# the commented-out variants below, to compare hyperparameter settings.
run_experiment(
    "hr-attrition-experiment", max_iter=500, activation="relu", solver="adam"
)
# run_experiment("hr-attrition-experiment", max_iter=1000, activation="relu", solver="adam")
# run_experiment("hr-attrition-experiment", max_iter=500, activation="relu", solver="sgd")
# run_experiment("hr-attrition-experiment", max_iter=1000, activation="relu", solver="sgd")
# run_experiment("hr-attrition-experiment", max_iter=500, activation="tanh", solver="sgd")
# run_experiment("hr-attrition-experiment", max_iter=500, activation="tanh", solver="adam")

### Predict

In [None]:
# Reload the data and rebuild the train/test split with split_data's defaults
# so that held-out rows are available for the prediction demo below.
data = load_csv_data("https://tinyurl.com/ibmhrattrition")
train_features, test_features, train_labels, test_labels = split_data(data)

In [None]:
# Preview the first few rows of the loaded data set.
data.head()

In [None]:
def predict(features, run_id: str = "72a2690bd0c24316984f4e2f9e49f3bd") -> str:
    """Load a logged model from the local MLflow store and classify one sample.

    Args:
        features: Feature row(s) accepted by ``pd.DataFrame``. Only the
            prediction for the first row is returned.
        run_id: MLflow run whose ``hr-attrition-model`` artifact is loaded.
            Defaults to the run this notebook was originally written against;
            pass the id returned by ``run_experiment`` to use another run.

    Returns:
        ``"No"`` when the first prediction equals 0, otherwise ``"Yes"``.
    """
    # NOTE(review): the path assumes the default experiment ("0") under
    # /content/mlruns -- confirm for your environment.
    logged_model = f"file:///content/mlruns/0/{run_id}/artifacts/hr-attrition-model"
    loaded_model = mlflow.pyfunc.load_model(logged_model)
    response = loaded_model.predict(pd.DataFrame(features))
    # Assumes the training LabelEncoder mapped "No" -> 0 and "Yes" -> 1.
    return "No" if response[0] == 0 else "Yes"

In [None]:
# Display the held-out feature rows.
test_features

In [None]:
# Pick the test row with index label 1023; the list inside .loc keeps the
# result a one-row DataFrame rather than a Series.
sample_features = test_features.loc[[1023]]
sample_features

In [None]:
# Classify the selected row with the logged model.
predict(sample_features)

In [None]:
# Actual label of the same row, for comparison with the prediction.
test_labels.loc[[1023]]