In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the employee attrition dataset from the notebook's working directory.
data = pd.read_csv('employee_attrition.csv')

# Explore the dataset: first rows, then column dtypes / non-null counts.
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
data.info()


   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

In [3]:
# Define features (X) and target (y): every column except 'Attrition' is a
# candidate feature, 'Attrition' is the label to predict.
X = data.drop('Attrition', axis=1)
y = data['Attrition']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified. If 'Attrition' is imbalanced,
# consider passing stratify=y -- confirm first, since it changes the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Columns actually fed to the models. ColumnTransformer's default
# remainder='drop' discards every column of X that is not listed here.
numerical_features = ['Age', 'MonthlyIncome', 'TotalWorkingYears']
categorical_features = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus']

# Per-type preprocessing: standardise numeric columns, one-hot encode
# categorical columns (dropping the first level of each feature).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))
])

# Combine both transformers into a single preprocessing step.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# The three candidate classifiers, all seeded for reproducibility.
lr_classifier = LogisticRegression(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)
rf_classifier = RandomForestClassifier(random_state=42)

# One pipeline per model. Pipeline keeps the step objects it is given rather
# than copying them, so passing the same `preprocessor` to all three would make
# them share (and repeatedly refit) one ColumnTransformer. The decision-tree and
# random-forest pipelines therefore get their own unfitted clone; `preprocessor`
# itself lives in `lr_pipeline`, so that name still refers to a transformer
# fitted on X_train once this cell has run.
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', lr_classifier)
])

dt_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', dt_classifier)
])

rf_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', rf_classifier)
])

# Fit every pipeline on the training data only.
for pipeline in (lr_pipeline, dt_pipeline, rf_pipeline):
    pipeline.fit(X_train, y_train)


In [4]:
def evaluate_model(pipeline, X_eval, y_eval):
    """Predict with a fitted pipeline and score the predictions.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict``.
    X_eval : feature rows to predict on.
    y_eval : true labels for ``X_eval``.

    Returns
    -------
    tuple
        ``(predictions, accuracy, classification_report_text, confusion_matrix)``
        as produced by ``pipeline.predict`` and the sklearn.metrics functions
        ``accuracy_score``, ``classification_report`` and ``confusion_matrix``.
    """
    predictions = pipeline.predict(X_eval)
    return (
        predictions,
        accuracy_score(y_eval, predictions),
        classification_report(y_eval, predictions),
        confusion_matrix(y_eval, predictions),
    )


# Predict on the held-out test data and evaluate each model. The individual
# result names are kept so they remain available to other cells.
(lr_predictions, lr_accuracy,
 lr_classification_report, lr_confusion_matrix) = evaluate_model(lr_pipeline, X_test, y_test)

(dt_predictions, dt_accuracy,
 dt_classification_report, dt_confusion_matrix) = evaluate_model(dt_pipeline, X_test, y_test)

(rf_predictions, rf_accuracy,
 rf_classification_report, rf_confusion_matrix) = evaluate_model(rf_pipeline, X_test, y_test)

# Print the evaluation metrics, one section per model. Every section after the
# first is preceded by a blank line to separate it from the previous one.
model_results = [
    ("Logistic Regression", lr_accuracy, lr_classification_report, lr_confusion_matrix),
    ("Decision Tree", dt_accuracy, dt_classification_report, dt_confusion_matrix),
    ("Random Forest", rf_accuracy, rf_classification_report, rf_confusion_matrix),
]

for position, (model_name, accuracy, report, matrix) in enumerate(model_results):
    separator = "" if position == 0 else "\n"
    print(f"{separator}{model_name} Accuracy:", accuracy)
    print(f"{model_name} Classification Report:\n", report)
    print(f"{model_name} Confusion Matrix:\n", matrix)


Logistic Regression Accuracy: 0.8809523809523809
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          No       0.89      0.99      0.94       255
         Yes       0.70      0.18      0.29        39

    accuracy                           0.88       294
   macro avg       0.79      0.58      0.61       294
weighted avg       0.86      0.88      0.85       294

Logistic Regression Confusion Matrix:
 [[252   3]
 [ 32   7]]

Decision Tree Accuracy: 0.7517006802721088
Decision Tree Classification Report:
               precision    recall  f1-score   support

          No       0.88      0.82      0.85       255
         Yes       0.20      0.28      0.23        39

    accuracy                           0.75       294
   macro avg       0.54      0.55      0.54       294
weighted avg       0.79      0.75      0.77       294

Decision Tree Confusion Matrix:
 [[210  45]
 [ 28  11]]

Random Forest Accuracy: 0.8775510204081632
Random Fo