In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
import warnings
from imblearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('E:/Employee-Attrition-Prediction-main/Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [10]:
drop_cols = ['EmployeeCount', 'Attrition', 'EmployeeNumber', 'Over18', 'StandardHours']
X = df.drop(columns=drop_cols)
y = df['Attrition']


categories = X.select_dtypes(include=['object']).columns

for col in categories:
    X[col] = LabelEncoder().fit_transform(X[col])


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('model', LogisticRegression(random_state=42))
])

In [12]:
param_grid = {
    'model__C': [0.01, 0.1, 1, 10],       
    'model__penalty': ['l1', 'l2'],        
    'model__solver': ['liblinear']         
}

# --- Cross-validation strategy ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Grid Search ---
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [13]:
param_grid = {
    'model__C': [0.01, 0.1, 1, 10],       
    'model__penalty': ['l1', 'l2'],        
    'model__solver': ['liblinear']         
}

# --- Cross-validation strategy ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Grid Search ---
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)



Best Parameters: {'model__C': 1, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Best Cross-Validation Score: 0.7500


In [15]:
print("\nTest Set Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Test Set Evaluation:
Accuracy: 0.7721
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.77      0.85       247
           1       0.39      0.79      0.52        47

    accuracy                           0.77       294
   macro avg       0.67      0.78      0.69       294
weighted avg       0.86      0.77      0.80       294

Confusion Matrix:
 [[190  57]
 [ 10  37]]
