In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from BGWOPSO import BGWOPSO
from GWO import GWO
import functools

In [2]:
data = pd.read_csv("employee.csv")
print(data.shape)
data = data.drop(columns=["Over18", "EmployeeCount", "StandardHours"],axis=1)

(1470, 35)


In [3]:
data.nunique()

Age                           43
Attrition                      2
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeNumber              1470
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCompany                37
YearsInCurrentRole            19
YearsSince

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeNumber            1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                 

In [5]:
objectList = data.select_dtypes(include = "object").columns.to_list()
nonObjectList = data.select_dtypes(exclude="object").columns.to_list()

In [6]:
for x in ["Education", 'EnvironmentSatisfaction', "JobInvolvement", "JobLevel", "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "WorkLifeBalance"]:
    nonObjectList.remove(x)
    objectList.append(x)

In [7]:
scaler = MinMaxScaler()
le = LabelEncoder()
for object in objectList:
    data[object] = le.fit_transform(data[object])
    data[object] = scaler.fit_transform(data[object].values.reshape(-1,1))

for nonObject in nonObjectList:
    data[nonObject] = scaler.fit_transform(data[nonObject].values.reshape(-1,1))

In [8]:
data_class_right = data.drop("Attrition",axis=1)
data_class_right["Attrition"] = data["Attrition"]
data_class_right.to_csv("employee_class_on_right.csv", index=False, header=False)

In [9]:
X = data.drop("Attrition", axis=1).values
y = data["Attrition"].values

In [10]:
cat_index = [data.columns.get_loc(c) for c in objectList if c in data.drop('Attrition', axis=1)]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

### Seleksi Fitur BGWOPSO

In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [13]:
def fitness(x, X_train, X_test, y_train, y_test):
    alpha = 0.99
    beta = 1-alpha
    if x.ndim == 1:
        x = x.reshape(1, -1)
    loss = np.zeros(x.shape[0])
    for i in range(x.shape[0]):
        if np.sum(x[i, :]) > 0:
            model = SVC(C=10, gamma=0.1, kernel='rbf')
            model.fit(X_train[:, x[i,:].astype(bool)], y_train)
            acc = model.score(X_test[:, x[i,:].astype(bool)], y_test)
            error_rate = 1 - acc
            loss[i] = alpha * error_rate + beta * (np.sum(x[i, :]) / X.shape[1])
        else:
            loss[i] = np.inf
    return loss

In [14]:
# lossfunc_bgwopso = functools.partial(fitness, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
# optimizer_bgwopso = BGWOPSO(lossfunc_bgwopso, X_train.shape[1], 8, 70)
# optimizer_bgwopso.opt()
# selected_features_bgwopso = optimizer_bgwopso.gBest_X.astype(bool)
# print(selected_features_bgwopso)

In [15]:
# svm3 = SVC(C=10, gamma=0.1, kernel='rbf')
# svm3.fit(X_train[:, selected_features_bgwopso], y_train)
# y_pred3 = svm3.predict(X_test[:, selected_features_bgwopso])

# print("Accuracy ", accuracy_score(y_test,y_pred3))
# print("Precision ", precision_score(y_test, y_pred3))
# print("Recall / Sensifity ", recall_score(y_test, y_pred3))
# print("F1 ", f1_score(y_test, y_pred3))
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred3).ravel()
# specificity = tn / (tn+fp)
# print("Specificity ", specificity)

### Seleksi Fitur GWO

In [16]:
lossfunc_gwo = functools.partial(fitness, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
fit_optimizer_gwo = GWO(lossfunc_gwo, 0, 1, X_train.shape[1], 10, 20)
selected_features_gwo = np.where(fit_optimizer_gwo>0.5)[0]
print(selected_features_gwo)

Completed in 12.457212924957275
[ 0  7  8 11 12 15 19 20 21 22 24 26 27]


In [17]:
svm2 = SVC(C=10, gamma=0.1, kernel='rbf')
svm2.fit(X_train[:, selected_features_gwo], y_train)
y_pred2 = svm2.predict(X_test[:, selected_features_gwo])

print("Accuracy ", accuracy_score(y_test,y_pred2))
print("Precision ", precision_score(y_test, y_pred2))
print("Recall / Sensifity ", recall_score(y_test, y_pred2))
print("F1 ", f1_score(y_test, y_pred2))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred2).ravel()
specificity = tn / (tn+fp)
print("Specificity ", specificity)

Accuracy  0.8469387755102041
Precision  0.5625
Recall / Sensifity  0.19148936170212766
F1  0.2857142857142857
Specificity  0.97165991902834


In [18]:
# import sys
# import numpy
# numpy.set_printoptions(threshold=sys.maxsize)
print(X_test[:, selected_features_gwo])

[[0.14285714 0.72278665 1.         ... 0.025      0.66666667 0.025     ]
 [0.61904762 0.60232221 0.         ... 0.25       0.66666667 0.25      ]
 [0.30952381 0.29608128 0.66666667 ... 0.25       0.33333333 0.125     ]
 ...
 [0.42857143 0.83792937 0.33333333 ... 0.4        0.66666667 0.05      ]
 [0.9047619  0.69666183 0.         ... 0.125      1.         0.075     ]
 [0.45238095 0.18867925 1.         ... 0.425      0.66666667 0.425     ]]


### Tanpa Seleksi Fitur

In [19]:
# svm1 = SVC(C=10, gamma=0.1, kernel='rbf')
# svm1.fit(X_train, y_train).score(X_test, y_test)
# y_pred1 = svm1.predict(X_test)

# print("Accuracy ", accuracy_score(y_test,y_pred1))
# print("Precision ", precision_score(y_test, y_pred1))
# print("Recall / Sensifity ", recall_score(y_test, y_pred1))
# print("F1 ", f1_score(y_test, y_pred1))
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred1).ravel()
# specificity = tn / (tn+fp)
# print("Specificity ", specificity)

In [20]:
# print("list of all selected features: ")
# for i, feature in enumerate(data.columns[np.where(selected_features_gwo)], start=1):
#     print(f"{i}. {feature}")

### PSO

In [21]:

# from Fitness import Data
# import PSO

# d = Data('employee_class_on_right.csv',False, 3) # Object for Data
# dim = d.getDimension()  # Dimensionality of the Features
# selected_pso = PSO.run(d,dim,3) # invoking
# selected_pso_bool = selected_pso.astype(bool)
# print(selected_pso_bool)

In [22]:
# svm4 = SVC(C=10, gamma=0.1, kernel='rbf')
# svm4.fit(X_train[:, selected_pso_bool], y_train)
# y_pred4 = svm4.predict(X_test[:, selected_pso_bool])

# print("Accuracy ", accuracy_score(y_test,y_pred4))
# print("Precision ", precision_score(y_test, y_pred4))
# print("Recall / Sensifity ", recall_score(y_test, y_pred4))
# print("F1 ", f1_score(y_test, y_pred4))
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred4).ravel()
# specificity = tn / (tn+fp)
# print("Specificity ", specificity)