In [2]:
import pandas as pd
df = pd.read_csv('/content/archive (13).zip')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [10]:
X = df.drop('Attrition' , axis=1)
y = df['Attrition']

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [12]:
X_train.head(2)

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,EducationField,JobInvolvement,JobSatisfaction,MonthlyIncome,OverTime,PercentSalaryHike,TotalWorkingYears,WorkLifeBalance,YearsAtCompany
1194,47,Travel_Rarely,Sales,2,Life Sciences,4,2,15972,No,14,29,3,3
128,22,Travel_Rarely,Research & Development,2,Technical Degree,3,4,2523,No,14,3,3,2


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(),
         ['Age', 'DistanceFromHome', 'MonthlyIncome',
          'TotalWorkingYears', 'PercentSalaryHike', 'YearsAtCompany']),

        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'),
         ['BusinessTravel', 'Department', 'EducationField']),

        ('bin', OneHotEncoder(drop='if_binary'),
         ['OverTime'])
    ],
    remainder='passthrough'
)


In [14]:
X_train_fe = preprocessor.fit_transform(X_train)


In [15]:
X_test_fe = preprocessor.transform(X_test)


In [16]:
X_train_fe

array([[ 1.09019402, -0.89991452,  2.02675233, ...,  4.        ,
         2.        ,  3.        ],
       [-1.6348276 , -0.89991452, -0.8644084 , ...,  3.        ,
         4.        ,  3.        ],
       [ 0.98119316, -0.77761018,  2.34770578, ...,  3.        ,
         3.        ,  3.        ],
       ...,
       [-1.6348276 , -0.16608847, -0.8798864 , ...,  1.        ,
         1.        ,  2.        ],
       [-0.10881549, -0.89991452,  0.08985307, ...,  3.        ,
         2.        ,  3.        ],
       [ 0.21818711,  1.30156365, -0.51959304, ...,  3.        ,
         4.        ,  3.        ]])

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report , accuracy_score

model = LogisticRegression(
    max_iter =1000,
    class_weight='balanced'
)
model.fit(X_train_fe , y_train)
y_pred = model.predict(X_test_fe)
accuracy_score(y_test , y_pred)


0.7312925170068028

In [26]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm


array([[186,  61],
       [ 18,  29]])

In [25]:


print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

          No       0.91      0.75      0.82       247
         Yes       0.32      0.62      0.42        47

    accuracy                           0.73       294
   macro avg       0.62      0.69      0.62       294
weighted avg       0.82      0.73      0.76       294



In [28]:
from sklearn.metrics import roc_auc_score

y_prob = model.predict_proba(X_test_fe)[:,1]
roc_auc_score(y_test, y_prob)


np.float64(0.7604444827289173)

## Logistic Regression – Baseline Model

A Logistic Regression model was trained to predict employee attrition.
Class imbalance was handled using `class_weight='balanced'`.

The model achieved an accuracy of ~73% and a ROC-AUC score of ~0.76,
indicating good separation between employees likely to leave and stay.


## Model Choice

Logistic Regression could not capture complex patterns in the data.
Random Forest was used to handle these complex relationships better.


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score , accuracy_score

rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42
)

rf_model.fit(X_train_fe,y_train)
y_pred_rf = rf_model.predict(X_test_fe)

In [35]:
accuracy_score(y_test , y_pred)

0.7312925170068028

In [36]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

          No       0.85      0.97      0.91       247
         Yes       0.43      0.13      0.20        47

    accuracy                           0.83       294
   macro avg       0.64      0.55      0.55       294
weighted avg       0.79      0.83      0.79       294



## Final Model Selection

Although Random Forest achieved higher accuracy, it performed poorly in
identifying employees likely to leave.

Logistic Regression was selected as the final model because it achieved
significantly better recall for employee attrition, which is more important
for this business problem.
