# Import Libraries

In [1]:
import pandas as pd
import numpy as np

from  sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# set columns show max 25
pd.set_option('display.max_columns', 25)

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('preprocessed/df.csv')
df.head()

Unnamed: 0,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,Overtime,Distance from Home,Education Level,Marital Status,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,31,1,19,0,5390,4,2,3,2,0,22,3,1,0,2,2,89,0,0,0,4,2,1
1,59,0,4,3,5534,1,3,1,3,0,21,4,0,3,2,2,21,0,0,0,2,1,1
2,24,0,10,2,8159,3,3,1,0,0,11,2,1,3,2,2,74,0,0,0,1,1,1
3,36,0,7,0,3989,3,3,4,1,0,27,1,2,2,2,1,50,1,0,0,3,2,1
4,56,1,41,0,4821,2,4,3,0,1,71,1,0,0,3,2,68,0,0,0,2,2,1


# Train Test Split

In [3]:
# train test split
X = df.drop('Attrition', axis=1)
y = df['Attrition']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(59546, 22)
(14887, 22)
(59546,)
(14887,)


# Model

**Logistik Regression**

In [14]:
# Logistrik Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
lg_y_pred = logreg.predict(X_test)

print('Classification Report: \n', classification_report(y_test, lg_y_pred, digits=4))

Classification Report: 
               precision    recall  f1-score   support

           0     0.6563    0.5650    0.6073      7051
           1     0.6521    0.7338    0.6906      7836

    accuracy                         0.6539     14887
   macro avg     0.6542    0.6494    0.6489     14887
weighted avg     0.6541    0.6539    0.6511     14887



**KNN**

In [15]:
# KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)

# Show all the matrix evaluation
print(classification_report(y_test, knn_y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.4878    0.4711    0.4793      7051
           1     0.5383    0.5549    0.5465      7836

    accuracy                         0.5152     14887
   macro avg     0.5131    0.5130    0.5129     14887
weighted avg     0.5144    0.5152    0.5147     14887



In [16]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)

# Show all the matrix evaluation
print(classification_report(y_test, rf_y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7265    0.7403    0.7334      7051
           1     0.7623    0.7492    0.7557      7836

    accuracy                         0.7450     14887
   macro avg     0.7444    0.7448    0.7445     14887
weighted avg     0.7453    0.7450    0.7451     14887



In [17]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_y_pred = dt.predict(X_test)

# Show all the matrix evaluation
print(classification_report(y_test, dt_y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6445    0.6498    0.6472      7051
           1     0.6826    0.6775    0.6800      7836

    accuracy                         0.6644     14887
   macro avg     0.6636    0.6637    0.6636     14887
weighted avg     0.6646    0.6644    0.6645     14887



In [18]:
# SVC
svc = SVC()
svc.fit(X_train, y_train)
svc_y_pred = svc.predict(X_test)

# Show all the matrix evaluation
print(classification_report(y_test, svc_y_pred, digits=4))

In [None]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_y_pred = nb.predict(X_test)

# Show all the matrix evaluation
print(classification_report(y_test, nb_y_pred, digits=4))

              precision    recall  f1-score   support

           0       0.69      0.76      0.72      7051
           1       0.76      0.70      0.73      7836

    accuracy                           0.73     14887
   macro avg       0.73      0.73      0.73     14887
weighted avg       0.73      0.73      0.73     14887



In [None]:
# XGBoost Classifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_y_pred = xgb.predict(X_test)

# Show all the matrix evaluation
print(classification_report(y_test, xgb_y_pred, digits=4))

              precision    recall  f1-score   support

           0       0.73      0.74      0.74      7051
           1       0.77      0.75      0.76      7836

    accuracy                           0.75     14887
   macro avg       0.75      0.75      0.75     14887
weighted avg       0.75      0.75      0.75     14887



In [None]:
# ADaBoost Classifier
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
adab_y_pred = ada.predict(X_test)

# Show all the matrix evaluation
print(classification_report(y_test, adab_y_pred, digits=4))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75      7051
           1       0.77      0.77      0.77      7836

    accuracy                           0.76     14887
   macro avg       0.76      0.76      0.76     14887
weighted avg       0.76      0.76      0.76     14887

