In [1]:
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Path to the raw attrition CSV. The default is the original author's local
# copy; set the ATTRITION_CSV environment variable to load the file from
# another location without editing the notebook.
csv_path = os.environ.get(
    'ATTRITION_CSV',
    'C:/Users/amrsa/OneDrive/Documents/GitHub/Employee-Attrition-Prediction/data/raw/Employee-Attrition.csv',
)
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Columns selected for removal from the feature set.
columns_to_drop = [
    'Over18',
    'EmployeeCount',
    'StandardHours',
    'EmployeeNumber',
    'MonthlyIncome',
    'YearsInCurrentRole',
    'YearsWithCurrManager',
]

# DataFrame.drop returns a new frame and leaves `df` untouched, so the result
# has to be kept under a name or it is lost as soon as it is displayed.
# It is stored under a separate name (instead of overwriting `df`) so that
# re-running this cell does not raise KeyError on already-removed columns.
# NOTE(review): the cells below still read `df`, i.e. they train on the full
# column set. Switching them to `df_reduced` is a modelling change: the
# exploratory PCA below requests 30 components, which may exceed the number
# of features left after this drop -- TODO confirm and adjust before switching.
df_reduced = df.drop(columns=columns_to_drop)
df_reduced


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,Yes,11,3,1,0,8,0,1,6,0
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,No,23,4,4,1,10,3,3,10,1
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,Yes,15,3,2,0,7,3,3,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,Yes,11,3,3,0,8,3,3,8,3
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,No,12,3,4,1,6,3,3,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,No,17,3,3,1,17,3,3,5,0
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,No,15,3,1,1,9,5,3,7,1
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,Yes,20,4,2,1,6,0,3,6,0
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,No,14,3,4,0,17,3,2,9,0


In [4]:
# Ordinal codes for BusinessTravel: a larger code means more frequent travel.
travel_map={
    'Non-Travel':0,
    'Travel_Rarely':1,
    'Travel_Frequently':2
}

travel_values = df['BusinessTravel'].dropna()
# Series.map yields NaN for every value that is not a key of the mapping.
# Skip the mapping when the column already holds the integer codes, so that
# re-running this cell does not wipe the column out.
if not travel_values.isin(list(travel_map.values())).all():
    unknown_labels = set(travel_values.unique()) - set(travel_map)
    if unknown_labels:
        # Fail loudly instead of silently introducing NaN into the features.
        raise ValueError(
            f'Unmapped BusinessTravel values: {sorted(map(str, unknown_labels))}'
        )
    df['BusinessTravel']=df['BusinessTravel'].map(travel_map)


In [5]:
# Integer-encode every object-dtype column of `df` in place.
# One LabelEncoder instance is refit for each column, so after the loop `le`
# only holds the classes of the last column it processed.
# NOTE(review): this also encodes the target column if it is still text, and
# gives nominal features an arbitrary integer order -- confirm this is intended.
le = LabelEncoder()
text_columns = [name for name in df.columns if df[name].dtype == 'object']
for col in text_columns:
    df[col] = le.fit_transform(df[col])


In [6]:
from sklearn.decomposition import PCA

# Target vector is the Attrition column; the feature matrix is every other
# column currently present in `df`.
y = df['Attrition']
X = df.drop(columns=['Attrition'])

In [7]:
# Exploratory PCA on the standardized features: prints the cumulative share of
# variance explained by the leading principal components.
# NOTE: the scaler and PCA are fit on the full dataset here, so this cell is
# for inspection only. Despite its name, `X_train_standardized` holds the
# PCA projection of all rows, not of a training split.
standard_scaler = StandardScaler()

# PCA cannot extract more components than min(n_samples, n_features); cap the
# request so the cell keeps working if the feature set shrinks.
n_components = min(30, *X.shape)
pca_std = PCA(n_components=n_components, random_state=0)
X_train_standardized = pca_std.fit_transform(standard_scaler.fit_transform(X))
# Report the number of components actually fitted instead of a fixed label.
print(f'{pca_std.n_components_} PCs explain ', np.cumsum(pca_std.explained_variance_ratio_)*100, '% of variance cumulatively')

20 PCs explain  [15.1177236  21.22709673 27.03762795 32.49652301 37.64650746 41.49890243
 45.21541078 48.84229606 52.41322997 55.86503621 59.20301401 62.50306886
 65.68957269 68.83697581 71.9793182  75.03380451 78.03348644 80.91687859
 83.7415239  86.53173326 89.24734137 91.63610396 93.37259113 95.04566343
 96.09706656 97.12496156 98.02482084 98.74759769 99.39578583 99.85036824] % of variance cumulatively


In [8]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier


# Stacked ensemble with a logistic-regression meta-learner, evaluated with
# 10-fold stratified cross-validation on (X, y).

# Base models whose predictions feed the final estimator.
estimators = [
    ('lr', LogisticRegression(C=1, penalty='l1', class_weight='balanced', solver='liblinear', random_state=42)),
    ('xg', XGBClassifier(colsample_bytree=1.0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8, random_state=42)),
    ('ds', DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=5, min_samples_split=5, random_state=42)),
]

# Final estimator in the stack
final_estimator = LogisticRegression(C=1, penalty='l1', class_weight='balanced', solver='liblinear', random_state=42)

# Full pipeline: scale -> oversample the minority class -> PCA -> stack.
# Using the imblearn Pipeline means SMOTE only resamples during fit.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    # Seeded so the projection is reproducible when PCA selects its
    # randomized SVD solver.
    ('pca', PCA(n_components=23, random_state=42)),
    ('model', StackingClassifier(estimators=estimators, final_estimator=final_estimator)),
])

# Stratified, shuffled 10-fold CV shared by both evaluations below.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold F1 for the positive class and its mean.
cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1')
print(cv_scores)
print(np.mean(cv_scores))

# Out-of-fold predictions for every row, summarised over the whole dataset.
y_pred = cross_val_predict(pipeline, X, y, cv=cv)
print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))

[0.46153846 0.5483871  0.52631579 0.50746269 0.58181818 0.57142857
 0.61016949 0.43137255 0.55882353 0.37288136]
0.5170197713489256
              precision    recall  f1-score   support

           0       0.93      0.83      0.87      1233
           1       0.43      0.66      0.52       237

    accuracy                           0.80      1470
   macro avg       0.68      0.75      0.70      1470
weighted avg       0.85      0.80      0.82      1470

[[1021  212]
 [  80  157]]


In [9]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier


# Stacked ensemble with an RBF-kernel SVC both as a base model and as the
# meta-learner, evaluated with 10-fold stratified cross-validation on (X, y).

# Shared SVC settings. `degree` is omitted: it only applies to the 'poly'
# kernel and is ignored by 'rbf'.
svc_params = dict(C=1, kernel='rbf', gamma=0.01, random_state=42)

# Base models whose predictions feed the final estimator.
# NOTE(review): the first entry is an SVC but is still registered as 'lr';
# the name is kept so any existing `model__lr__*` parameter references keep
# working -- consider renaming it to 'svc'.
estimators = [
    ('lr', SVC(**svc_params)),
    ('xg', XGBClassifier(colsample_bytree=1.0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8, random_state=42)),
    ('ds', DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=5, min_samples_split=5, random_state=42)),
]

# Final estimator in the stack
final_estimator = SVC(**svc_params)

# Full pipeline: scale -> oversample the minority class -> PCA -> stack.
# Using the imblearn Pipeline means SMOTE only resamples during fit.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    # Seeded so the projection is reproducible when PCA selects its
    # randomized SVD solver.
    ('pca', PCA(n_components=23, random_state=42)),
    ('model', StackingClassifier(estimators=estimators, final_estimator=final_estimator)),
])

# Stratified, shuffled 10-fold CV shared by both evaluations below.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold F1 for the positive class and its mean.
cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1')
print(cv_scores)
print(np.mean(cv_scores))

# Out-of-fold predictions for every row, summarised over the whole dataset.
y_pred = cross_val_predict(pipeline, X, y, cv=cv)
print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))

[0.45614035 0.58181818 0.57692308 0.44067797 0.64150943 0.53333333
 0.52459016 0.44897959 0.51724138 0.42105263]
0.5142266109676197
              precision    recall  f1-score   support

           0       0.92      0.85      0.89      1233
           1       0.44      0.61      0.51       237

    accuracy                           0.81      1470
   macro avg       0.68      0.73      0.70      1470
weighted avg       0.84      0.81      0.83      1470

[[1053  180]
 [  93  144]]
