In [16]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.model_selection import train_test_split , StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score
from imblearn.pipeline import Pipeline  
from imblearn.over_sampling import SMOTE


In [17]:
df = pd.read_csv('Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [18]:
df.drop(columns=['Over18','EmployeeCount','StandardHours','EmployeeNumber','MonthlyIncome','YearsInCurrentRole','YearsWithCurrManager'],axis =1)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,Yes,11,3,1,0,8,0,1,6,0
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,No,23,4,4,1,10,3,3,10,1
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,Yes,15,3,2,0,7,3,3,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,Yes,11,3,3,0,8,3,3,8,3
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,No,12,3,4,1,6,3,3,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,No,17,3,3,1,17,3,3,5,0
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,No,15,3,1,1,9,5,3,7,1
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,Yes,20,4,2,1,6,0,3,6,0
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,No,14,3,4,0,17,3,2,9,0


In [19]:
travel_map={
    'Non-Travel':0,
    'Travel_Rarely':1,
    'Travel_Frequently':2
}
df['BusinessTravel']=df['BusinessTravel'].map(travel_map)

In [20]:
le=LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])


In [None]:
from sklearn.decomposition import PCA

X = df.drop(columns = 'Attrition',axis = 1)
y = df['Attrition']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .2,random_state = 42,stratify=y)



In [30]:
standard_scaler = StandardScaler()

pca_std = PCA(n_components=30, random_state=0)
X_train_standardized = pca_std.fit_transform(standard_scaler.fit_transform(X_train))
print('20 PCs explain ', np.cumsum(pca_std.explained_variance_ratio_)*100, '% of variance cumulatively')

20 PCs explain  [14.97026583 21.12551135 27.02214426 32.52618841 37.6501061  41.578558
 45.43548813 49.18038419 52.73008544 56.23098723 59.6063302  62.8164911
 65.99689882 69.13265498 72.19935027 75.24831069 78.23639194 81.12112907
 83.97018122 86.69005512 89.33075644 91.74595232 93.48299834 95.06278288
 96.13356518 97.13897125 98.05656622 98.77417359 99.39613088 99.84301224] % of variance cumulatively


In [None]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)), 
    ('pca' , PCA(n_components=23)),
    ('model',  LogisticRegression(random_state=42))
])

cv = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)

param_grid = {
    'model__C': [0.01, 0.1, 1, 10],       
    'model__penalty': ['l1', 'l2'],        
    'model__solver': ['liblinear']         
}

# --- Cross-validation strategy ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Grid Search ---
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='recall',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [23]:
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


Best Parameters: {'model__C': 0.01, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Best Cross-Validation Score: 0.7632


In [24]:
print("\nTest Set Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Test Set Evaluation:
Accuracy: 0.7483
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.75      0.83       247
           1       0.36      0.74      0.49        47

    accuracy                           0.75       294
   macro avg       0.65      0.75      0.66       294
weighted avg       0.85      0.75      0.78       294

Confusion Matrix:
 [[185  62]
 [ 12  35]]


In [29]:
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score
from sklearn.model_selection import cross_val_predict,cross_val_score
import numpy as np

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)), 
    ('pca' , PCA(n_components=23)),
    ('model', LogisticRegression(C= 1, penalty= 'l1',class_weight='balanced' ,solver= 'liblinear',random_state=42))
])
cv_scores = cross_val_score(pipeline,X_train,y_train,cv=cv,scoring='f1')
print(cv_scores)
print(np.mean(cv_scores))
y_pred = cross_val_predict(pipeline,X,y,cv=cv)
print(classification_report(y,y_pred))
print(confusion_matrix(y,y_pred))

[0.53061224 0.5106383  0.57142857 0.45901639 0.52459016 0.45283019
 0.46428571 0.58064516 0.56140351 0.37931034]
0.5034760589430719
              precision    recall  f1-score   support

           0       0.94      0.77      0.85      1233
           1       0.38      0.75      0.51       237

    accuracy                           0.77      1470
   macro avg       0.66      0.76      0.68      1470
weighted avg       0.85      0.77      0.79      1470

[[948 285]
 [ 59 178]]
