In [19]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the dataset into a DataFrame
data = pd.read_csv('Employee-Attrition.csv')

In [3]:
# Explore the first few rows of the dataset
print(data.head())

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

In [4]:
# Check for missing values and handle them
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole     

In [10]:
# Handle missing values by filling with the mean for numeric columns
numeric_cols = data.select_dtypes(include=[np.number])
data[numeric_cols.columns] = numeric_cols.fillna(numeric_cols.mean())

In [11]:
# Check the data types of the columns
print("Data Types:\n", data.dtypes)

Data Types:
 Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWo

In [12]:
# Convert categorical variables into numerical format (one-hot encoding)
categorical_cols = data.select_dtypes(include=['object']).columns
data = pd.get_dummies(data, columns=categorical_cols)

In [14]:
# List the column names in the dataset
print(data.columns)

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition_No',
       'Attrition_Yes', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Human Resources', 'Department_Research & Development',
       'Department_Sales', 'EducationField_Human Resources',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gen

In [15]:
# Use 'Attrition_Yes' as the target variable
y = data['Attrition_Yes']

# Drop the 'Attrition_No' column, we are predicting for people who will leave so this column is not needed
X = data.drop(['Attrition_No', 'Attrition_Yes'], axis=1)

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
#3. Model Building Using Logistic Regression:

In [24]:
# Create a logistic regression model
logreg = LogisticRegression()

In [25]:
# Fit the model to your training data
logreg.fit(X_train, y_train)

LogisticRegression()

In [26]:
# Make predictions
y_pred_logreg = logreg.predict(X_test)

In [27]:
# Calculate accuracy
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

In [28]:
# Display classification report
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logreg))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       255
           1       0.58      0.46      0.51        39

    accuracy                           0.88       294
   macro avg       0.75      0.71      0.72       294
weighted avg       0.88      0.88      0.88       294



In [29]:
# Display confusion matrix
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logreg))

Logistic Regression Confusion Matrix:
[[242  13]
 [ 21  18]]


In [30]:
#Model Building using Decision Tree:

In [31]:
# Create a decision tree model
decision_tree = DecisionTreeClassifier()

In [32]:
# Fit the model to your training data
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [33]:
# Make predictions
y_pred_decision_tree = decision_tree.predict(X_test)

In [34]:
# Calculate accuracy
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)

In [35]:
# Display classification report
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_decision_tree))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       255
           1       0.18      0.21      0.19        39

    accuracy                           0.77       294
   macro avg       0.53      0.53      0.53       294
weighted avg       0.78      0.77      0.78       294



In [36]:
# Display confusion matrix
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_decision_tree))

Decision Tree Confusion Matrix:
[[219  36]
 [ 31   8]]


In [None]:
#In the code above, we:

#1. Create instances of the Logistic Regression and Decision Tree models.
#2. Fit these models to the training data.
#3. Make predictions on the test data.
#4. Calculate the accuracy and display classification reports and confusion matrices to assess the models' performance.