In [20]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Patch scikit-learn with Intel Extension
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [21]:
# Load the dataset
data_path = '/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'  # Adjust the path accordingly
df = pd.read_csv(data_path)

In [22]:
# Encode categorical variables (save the column names after encoding)
df_encoded = pd.get_dummies(df, drop_first=True)
encoded_columns = df_encoded.columns.tolist()  # Ensure to get a list

# Define features and target variable
X = df_encoded.drop('Attrition_Yes', axis=1)
y = df_encoded['Attrition_Yes']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

INFO:sklearnex: sklearn.model_selection.train_test_split: fallback to original Scikit-learn
INFO:sklearnex: sklearn.model_selection.train_test_split: fallback to original Scikit-learn


In [23]:
# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

INFO:sklearnex: sklearn.ensemble.RandomForestClassifier.fit: fallback to original Scikit-learn
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU


In [24]:
# Save the model and the column names
joblib.dump(model, 'employee_attrition_model.joblib')
joblib.dump(encoded_columns, 'encoded_columns.joblib')  # Save the column names
print("Model and columns saved successfully.")

Model and columns saved successfully.


In [25]:
# Load the model and encoded column names for predictions
loaded_model = joblib.load('employee_attrition_model.joblib')
encoded_columns = joblib.load('encoded_columns.joblib')

# Example new employee data for prediction
new_employee_data = pd.DataFrame({
    'Age': [30],
    'DistanceFromHome': [1],
    'Education': [2],
    'EnvironmentSatisfaction': [2],
    'JobInvolvement': [3],
    'JobLevel': [1],
    'JobSatisfaction': [4],
    'PerformanceRating': [3],
    'RelationshipSatisfaction': [4],
    'StockOptionLevel': [0],
    'TotalWorkingYears': [5],
    'TrainingTimesLastYear': [3],
    'WorkLifeBalance': [1],
    'YearsAtCompany': [3],
    'YearsInCurrentRole': [2],
    'YearsSinceLastPromotion': [0],
    'YearsWithCurrManager': [1],
    'BusinessTravel_Travel_Frequently': [0],  # Make sure to include one-hot encoded values
    'BusinessTravel_Travel_Rarely': [1],
    'Department_Research & Development': [1],
    'Department_Sales': [0],
    'Gender_Male': [1],
    'JobRole_Research Scientist': [1],
    'MaritalStatus_Single': [1],
})

# Ensure all columns are present, including the encoded ones
new_employee_data_encoded = pd.get_dummies(new_employee_data)

# Add any missing columns (that existed in the training set but are not present in the new data)
for col in encoded_columns:
    if col not in new_employee_data_encoded.columns:
        new_employee_data_encoded[col] = 0

# Reorder columns to match the training data
new_employee_data_encoded = new_employee_data_encoded[encoded_columns]

# Remove any target variable column if mistakenly included
if 'Attrition_Yes' in new_employee_data_encoded.columns:
    new_employee_data_encoded = new_employee_data_encoded.drop('Attrition_Yes', axis=1)

# Make prediction
attrition_prediction = loaded_model.predict(new_employee_data_encoded)
print("Prediction (0: Not leaving, 1: Leaving):", attrition_prediction[0])

INFO:sklearnex: sklearn.ensemble.RandomForestClassifier.predict: fallback to original Scikit-learn
INFO:sklearnex: sklearn.ensemble.RandomForestClassifier.predict_proba: fallback to original Scikit-learn


Prediction (0: Not leaving, 1: Leaving): True
