## HR Attrition Prediction
Full notebook: EDA, preprocessing, model training (Logistic Regression & RandomForest), evaluation, and export.

Generated dataset contains 12,000 rows.

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,roc_auc_score, roc_curve

In [4]:
df = pd.read_csv("HR_Employee_Data.csv")
df.head()

Unnamed: 0,EmployeeID,Age,Gender,Department,JobRole,Education,EducationField,MaritalStatus,MonthlyIncome,DistanceFromHome,YearsAtCompany,TrainingHours,LastPromotionYears,PerformanceRating,WorkLifeBalance,OverTime,Attrition
0,10001,58,Male,Finance,Financial Analyst,3,Life Sciences,Single,2712,26,4,9,9,5,3,Yes,Yes
1,10002,48,Male,Sales,Sales Executive,3,Life Sciences,Single,6563,22,6,115,10,3,3,No,No
2,10003,34,Female,Operations,Logistics Manager,2,Life Sciences,Single,4407,46,2,92,0,3,3,No,Yes
3,10004,27,Female,Operations,Logistics Manager,3,Life Sciences,Single,5444,40,1,46,6,2,3,No,No
4,10005,40,Male,R&D,Lab Technician,2,Technical Degree,Single,7624,18,3,166,0,4,3,No,No


In [5]:
df.head

<bound method NDFrame.head of        EmployeeID  Age  Gender  Department             JobRole  Education  \
0           10001   58    Male     Finance   Financial Analyst          3   
1           10002   48    Male       Sales     Sales Executive          3   
2           10003   34  Female  Operations   Logistics Manager          2   
3           10004   27  Female  Operations   Logistics Manager          3   
4           10005   40    Male         R&D      Lab Technician          2   
...           ...  ...     ...         ...                 ...        ...   
11995       21996   58    Male         R&D  Research Scientist          4   
11996       21997   40    Male       Sales       Sales Manager          3   
11997       21998   49    Male         R&D  Research Scientist          4   
11998       21999   32  Female       Sales     Sales Executive          4   
11999       22000   56  Female          HR        HR Executive          3   

         EducationField MaritalStatus  Monthl

In [6]:
df.describe()

Unnamed: 0,EmployeeID,Age,Education,MonthlyIncome,DistanceFromHome,YearsAtCompany,TrainingHours,LastPromotionYears,PerformanceRating,WorkLifeBalance
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,16000.5,39.521833,3.247667,6101.750333,25.376417,2.974667,99.675583,5.002917,3.30025,2.960583
std,3464.24595,11.433675,0.994272,2846.529815,14.425015,1.715742,57.869518,3.173181,0.765495,0.739983
min,10001.0,20.0,1.0,1500.0,1.0,0.0,0.0,0.0,1.0,1.0
25%,13000.75,30.0,3.0,3977.75,13.0,2.0,49.0,2.0,3.0,3.0
50%,16000.5,40.0,3.0,6040.5,26.0,3.0,99.0,5.0,3.0,3.0
75%,19000.25,49.0,4.0,8046.0,38.0,4.0,150.0,8.0,4.0,3.0
max,22000.0,59.0,5.0,17570.0,50.0,14.0,200.0,10.0,5.0,4.0


In [7]:
df.isnull().sum()

EmployeeID            0
Age                   0
Gender                0
Department            0
JobRole               0
Education             0
EducationField        0
MaritalStatus         0
MonthlyIncome         0
DistanceFromHome      0
YearsAtCompany        0
TrainingHours         0
LastPromotionYears    0
PerformanceRating     0
WorkLifeBalance       0
OverTime              0
Attrition             0
dtype: int64

In [8]:
df.isnull().any()

EmployeeID            False
Age                   False
Gender                False
Department            False
JobRole               False
Education             False
EducationField        False
MaritalStatus         False
MonthlyIncome         False
DistanceFromHome      False
YearsAtCompany        False
TrainingHours         False
LastPromotionYears    False
PerformanceRating     False
WorkLifeBalance       False
OverTime              False
Attrition             False
dtype: bool

In [9]:
df.isnull()

Unnamed: 0,EmployeeID,Age,Gender,Department,JobRole,Education,EducationField,MaritalStatus,MonthlyIncome,DistanceFromHome,YearsAtCompany,TrainingHours,LastPromotionYears,PerformanceRating,WorkLifeBalance,OverTime,Attrition
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
11996,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
11997,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
11998,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [10]:
df.dtypes

EmployeeID             int64
Age                    int64
Gender                object
Department            object
JobRole               object
Education              int64
EducationField        object
MaritalStatus         object
MonthlyIncome          int64
DistanceFromHome       int64
YearsAtCompany         int64
TrainingHours          int64
LastPromotionYears     int64
PerformanceRating      int64
WorkLifeBalance        int64
OverTime              object
Attrition             object
dtype: object

In [11]:
df.shape

(12000, 17)

In [14]:
df.columns

Index(['EmployeeID', 'Age', 'Gender', 'Department', 'JobRole', 'Education',
       'EducationField', 'MaritalStatus', 'MonthlyIncome', 'DistanceFromHome',
       'YearsAtCompany', 'TrainingHours', 'LastPromotionYears',
       'PerformanceRating', 'WorkLifeBalance', 'OverTime', 'Attrition'],
      dtype='object')

In [34]:
le = LabelEncoder()

for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df_clean[col])

In [42]:
# Encode target
le_target = LabelEncoder()
df['AttritionFlag'] = le_target.fit_transform(df['Attrition'])


In [43]:
x=df[['EmployeeID','Attrition','AttritionFlag']]
x

Unnamed: 0,EmployeeID,Attrition,AttritionFlag
0,10001,1,1
1,10002,0,0
2,10003,1,1
3,10004,0,0
4,10005,0,0
...,...,...,...
11995,21996,0,0
11996,21997,0,0
11997,21998,0,0
11998,21999,0,0


In [44]:
y= df[['AttritionFlag']]
y

Unnamed: 0,AttritionFlag
0,1
1,0
2,1
3,0
4,0
...,...
11995,0
11996,0
11997,0
11998,0


In [45]:
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=1,random_state=50)
print(x_train,x_test,y_train,y_test)

       EmployeeID  Attrition  AttritionFlag
2162        12163          0              0
6143        16144          0              0
2524        12525          0              0
9586        19587          0              0
1124        11125          0              0
...           ...        ...            ...
8324        18325          0              0
10206       20207          0              0
6253        16254          1              1
10123       20124          0              0
5600        15601          0              0

[11999 rows x 3 columns]       EmployeeID  Attrition  AttritionFlag
5789       15790          0              0        AttritionFlag
2162               0
6143               0
2524               0
9586               0
1124               0
...              ...
8324               0
10206              0
6253               1
10123              0
5600               0

[11999 rows x 1 columns]       AttritionFlag
5789              0


In [46]:
scaled=StandardScaler()
x_scaled=scaled.fit_transform(x)

In [47]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(11999, 3)
(1, 3)
(11999, 1)
(1, 1)


In [52]:
 #Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train, y_train)



  y = column_or_1d(y, warn=True)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [51]:
y_pred_lr = lr.predict(x_test)
y_pred_lr

array([0])

In [54]:
print('Logistic Regression Accuracy:', accuracy_score(y_test, y_pred_lr))


Logistic Regression Accuracy: 1.0


In [55]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [69]:
# New hr attrition input
ne = np.array([[10001,1,1]])

# Predict
ps = lr.predict(ne)

# Print properly
print(f"Predicted atrriction: {ps[0].item():.2f}")

Predicted atrriction: 1.00




In [58]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

  return fit_method(estimator, *args, **kwargs)


In [59]:
print('Random Forest Accuracy:', accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 1.0


In [60]:
print(classification_report(y_test, y_pred_rf))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [61]:
cnf=confusion_matrix(y_test, y_pred_rf)
cnf




array([[1]])

In [75]:
# New HR attrition input
ne = np.array([[10001,1,1]])

# Predict (gives 0 or 1)
ps = rf.predict(ne)

# Decode prediction back to Yes/No
decoded = target_encoder.inverse_transform(ps)

print("Predicted Attrition:", decoded[0])


Predicted Attrition: 1




## Recommendations
Implement targeted retention programs for employees in Sales and those with low income.
Focus on early-career employees (0–2 years) with mentoring and growth plans.
Offer flexible work options for employees with long commute distances.
Regularly monitor performance ratings and provide learning opportunities.