In [None]:
# Colab-only step: open the browser file picker so the dataset can be uploaded
# into the runtime. The recorded output of this cell shows the file being saved
# under its original name.
# `uploaded` keeps whatever files.upload() returns (presumably a mapping of
# file name -> file contents; verify against the Colab docs before relying on it).
from google.colab import files
uploaded = files.upload()


Saving HR-Employee-Attrition.csv to HR-Employee-Attrition.csv


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from imblearn.over_sampling import SMOTE


In [None]:
# Load the raw HR attrition data.
# The path must match the name the upload step actually saved
# ("HR-Employee-Attrition.csv", with hyphens); the previous underscore spelling
# pointed at a file that the upload never created.
df = pd.read_csv("HR-Employee-Attrition.csv")

# Quick look at the first rows to confirm the load worked.
df.head()


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [None]:
# Drop unnecessary columns.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run after the columns have already been removed.
# NOTE(review): 'EmployeeCount' is 1 in every previewed row; if it is constant
# across the whole file it could be dropped here as well - confirm first.
df = df.drop(columns=['EmployeeNumber', 'Over18', 'StandardHours'], errors='ignore')

# Encode target variable (Yes -> 1, No -> 0).
# Only map while the column is still textual: mapping an already-encoded
# column would turn every value into NaN on a second execution.
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# Any label other than 'Yes'/'No' becomes NaN in the mapping above; fail
# loudly here rather than letting it surface later in the split or the models.
assert df['Attrition'].notna().all(), "Unexpected label in 'Attrition' (expected only 'Yes'/'No')"

# Encode categorical columns as integer codes.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which a linear model will treat as numeric - consider one-hot encoding.
cat_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()

for col in cat_cols:
    # The encoder is re-fitted per column, so `le` ends up holding only the
    # mapping of the last categorical column.
    df[col] = le.fit_transform(df[col])


In [None]:
# Separate the target from the feature matrix.
y = df['Attrition']
X = df.drop(columns='Attrition')

# Hold out 20% of the rows for evaluation; stratifying on y keeps the
# attrition rate the same in both partitions, and the fixed seed makes the
# split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Oversample the minority class on the training partition only, so the test
# set keeps its original class distribution.
# NOTE(review): resampling happens before feature scaling and on integer-coded
# categoricals; SMOTE interpolates between neighbours, so confirm this order
# is intended.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_sm, y_train_sm = resampled

# Report class counts before and after resampling.
for caption, labels in (
    ("Before SMOTE:", y_train),
    ("After SMOTE:", pd.Series(y_train_sm)),
):
    print(caption, labels.value_counts())


Before SMOTE: Attrition
0    986
1    190
Name: count, dtype: int64
After SMOTE: Attrition
0    986
1    986
Name: count, dtype: int64


In [None]:
# Standardise features: the scaler learns its statistics from the (resampled)
# training data only and the same transformation is then applied to the test
# set, so no test-set statistics leak into the fit.
scaler = StandardScaler()
scaler.fit(X_train_sm)

X_train_sm = scaler.transform(X_train_sm)
X_test = scaler.transform(X_test)


In [None]:
# Baseline model: logistic regression fitted on the resampled, scaled data.
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train_sm, y_train_sm)

# ROC-AUC is computed from the predicted probability of the positive class
# (column 1 of predict_proba).
lr_test_scores = lr.predict_proba(X_test)[:, 1]
lr_auc = roc_auc_score(y_test, lr_test_scores)
print("Logistic Regression ROC-AUC:", lr_auc)


Logistic Regression ROC-AUC: 0.7182358514945302


In [None]:
# Class-imbalance weight (negatives / positives) for XGBoost.
# It has to be derived from the labels the model is actually fitted on.
# The previous version used the pre-SMOTE `y_train` counts (about 5:1) while
# fitting on the SMOTE-balanced `y_train_sm`, which compensated for the
# imbalance twice and skewed predictions towards the positive class.
xgb_fit_counts = pd.Series(y_train_sm).value_counts()

xgb = XGBClassifier(
    scale_pos_weight=xgb_fit_counts.loc[0] / xgb_fit_counts.loc[1],
    eval_metric='logloss',
    random_state=42
)

xgb.fit(X_train_sm, y_train_sm)

# Score the held-out set using the probability of the positive class.
xgb_auc = roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1])
print("XGBoost ROC-AUC:", xgb_auc)


XGBoost ROC-AUC: 0.7320182616935137


In [None]:
# Per-class weights [negative, positive] for CatBoost.
# They are derived from the labels the model is actually fitted on. The
# previous version took the ratio from the pre-SMOTE `y_train` while fitting
# on the SMOTE-balanced `y_train_sm`, so the minority class was up-weighted a
# second time after already being oversampled.
cat_fit_counts = pd.Series(y_train_sm).value_counts()

cat = CatBoostClassifier(
    verbose=0,
    class_weights=[1, cat_fit_counts.loc[0] / cat_fit_counts.loc[1]],
    # Explicit seed, matching the other models, so the run is reproducible.
    random_seed=42
)

cat.fit(X_train_sm, y_train_sm)

# Score the held-out set using the probability of the positive class.
cat_auc = roc_auc_score(y_test, cat.predict_proba(X_test)[:, 1])
print("CatBoost ROC-AUC:", cat_auc)


CatBoost ROC-AUC: 0.7450254113188044


In [None]:
# Per-class precision / recall / F1 of the XGBoost model on the held-out set,
# using its default hard predictions.
print("Classification Report (XGBoost):")
xgb_test_pred = xgb.predict(X_test)
xgb_report = classification_report(y_test, xgb_test_pred)
print(xgb_report)


Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       247
           1       0.38      0.40      0.39        47

    accuracy                           0.80       294
   macro avg       0.63      0.64      0.64       294
weighted avg       0.80      0.80      0.80       294



In [None]:
# Pair each XGBoost importance score with its feature name. `X` still carries
# the column labels, whereas the scaled training matrix no longer does.
importances = pd.Series(data=xgb.feature_importances_, index=X.columns)

# Show the ten highest-scoring features, largest first.
ranked_importances = importances.sort_values(ascending=False)
ranked_importances.iloc[:10]


Unnamed: 0,0
StockOptionLevel,0.139506
MaritalStatus,0.091154
YearsWithCurrManager,0.052119
EnvironmentSatisfaction,0.051536
JobSatisfaction,0.049666
WorkLifeBalance,0.047018
JobInvolvement,0.04517
Education,0.038469
PerformanceRating,0.038449
BusinessTravel,0.034968
