In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

employee_status=pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
employee_status.head()


In [None]:
print(employee_status.shape)
print(employee_status.columns)
print(employee_status.dtypes)

In [None]:
data=employee_status.copy()
for col in data.columns:
    if data[col].dtype in ['float64', 'int64']:
        data[col] = data[col].fillna(data[col].median())
    else:
        data[col] = data[col].fillna(data[col].mode()[0])


In [None]:
data.drop(columns=['Over18'])


In [None]:
data.to_csv("clean_employee_attrition.csv", index=False)

In [None]:
from sklearn.preprocessing import StandardScaler

num_cols = ["Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
            "PercentSalaryHike", "TotalWorkingYears", "TrainingTimesLastYear", "YearsAtCompany", "YearsInCurrentRole",
            "YearsSinceLastPromotion"]

scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
import seaborn as sb
for i in data.select_dtypes(include=['object']).columns:
    data[i] = le.fit_transform(data[i])
corellation = data.corr()['Attrition'].sort_values(ascending=False)
print(corellation)
sb.heatmap(corellation.to_frame(), annot=False, cmap='coolwarm')

In [None]:
import matplotlib.pyplot as plt
sb.countplot(x='Attrition', data=data)
plt.title('Employee Attrition Count')
plt.show()


In [None]:
features=data.drop('Attrition', axis=1)
target=data['Attrition']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [None]:
import pickle
with open('employee_attrition_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sb
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred)
sb.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [18]:
# RESULTS AND TESTING

In [19]:
new_employee_dict = {
    col: 0 for col in features.columns
}
new_employee_dict.update({
    "Age": 28,
    "DistanceFromHome": 5,
    "MonthlyIncome": 42000,
    "OverTime": 1,
    "JobSatisfaction": 3,
    "JobRole": 7
})
new_employee = pd.DataFrame([new_employee_dict])
new_employee[num_cols] = scaler.transform(new_employee[num_cols])

prediction = model.predict(new_employee)
print("Will the employee leave?", "Yes" if prediction[0] == 1 else "No")

Will the employee leave? No


In [20]:
prob = model.predict_proba(new_employee)[0][1]
print(f"Probability of leaving: {prob:.4f}")

Probability of leaving: 0.2550
