In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from abcde import BGWOPSO
import functools

In [2]:
data = pd.read_csv("employee.csv")
data.nunique()

Age                           43
Attrition                      2
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeCount                  1
EmployeeNumber              1470
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
Over18                         1
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBa

In [3]:
data = data.drop(columns=["Over18", "EmployeeCount", "StandardHours"],axis=1)

In [4]:
data.nunique()

Age                           43
Attrition                      2
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeNumber              1470
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCompany                37
YearsInCurrentRole            19
YearsSince

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeNumber            1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                 

In [6]:
objectList = data.select_dtypes(include = "object").columns.to_list()
nonObjectList = data.select_dtypes(exclude="object").columns.to_list()

In [7]:
for x in ["Education", 'EnvironmentSatisfaction', "JobInvolvement", "JobLevel", "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "WorkLifeBalance"]:
    nonObjectList.remove(x)
    objectList.append(x)

In [8]:
scaler = MinMaxScaler()
le = LabelEncoder()
for object in objectList:
    data[object] = le.fit_transform(data[object])
    data[object] = scaler.fit_transform(data[object].values.reshape(-1,1))

for nonObject in nonObjectList:
    data[nonObject] = scaler.fit_transform(data[nonObject].values.reshape(-1,1))

In [9]:
X = data.drop("Attrition", axis=1).values
y = data["Attrition"].values

In [10]:
cat_index = [data.columns.get_loc(c) for c in objectList if c in data.drop('Attrition', axis=1)]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

In [12]:
def fitness(x, X_train, X_test, y_train, y_test):
    alpha = 0.99
    beta = 1-alpha
    if x.ndim == 1:
        x = x.reshape(1, -1)
    loss = np.zeros(x.shape[0])
    for i in range(x.shape[0]):
        if np.sum(x[i, :]) > 0:
            model = SVC(C=10, gamma=0.1, kernel='rbf')
            model.fit(X_train[:, x[i,:].astype(bool)], y_train)
            acc = model.score(X_test[:, x[i,:].astype(bool)], y_test)
            error_rate = 1 - acc
            loss[i] = alpha * error_rate + beta * (np.sum(x[i, :]) / X.shape[1])
        else:
            loss[i] = np.inf
    return loss

In [13]:
lossfunc = functools.partial(fitness, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

optimizer = BGWOPSO(lossfunc, X_train.shape[1], 8, 70)
optimizer.opt()
selected_features = optimizer.gBest_X.astype(bool)

2.0
1.9714285714285715
1.9428571428571428
1.9142857142857144
1.8857142857142857
1.8571428571428572
1.8285714285714285
1.8
1.7714285714285714
1.7428571428571429
1.7142857142857144
1.6857142857142857
1.657142857142857
1.6285714285714286
1.6
1.5714285714285714
1.542857142857143
1.5142857142857142
1.4857142857142858
1.4571428571428573
1.4285714285714286
1.4
1.3714285714285714
1.342857142857143
1.3142857142857143
1.2857142857142856
1.2571428571428571
1.2285714285714286
1.2
1.1714285714285713
1.1428571428571428
1.1142857142857143
1.0857142857142859
1.0571428571428572
1.0285714285714285
1.0
0.9714285714285715
0.9428571428571428
0.9142857142857144
0.8857142857142857
0.8571428571428572
0.8285714285714285
0.8
0.7714285714285714
0.7428571428571429
0.7142857142857142
0.6857142857142857
0.6571428571428573
0.6285714285714286
0.6000000000000001
0.5714285714285714
0.5428571428571429
0.5142857142857142
0.48571428571428577
0.4571428571428571
0.4285714285714286
0.3999999999999999
0.37142857142857144
0.34

tanpa SF

In [14]:
svm = SVC(C=10, gamma=0.1, kernel='rbf')
svm.fit(X_train, y_train).score(X_test, y_test)

0.8639455782312925

pake SF

In [15]:
print("list of all selected features: ")
for i, feature in enumerate(data.columns[np.where(selected_features)], start=1):
    print(f"{i}. {feature}")

list of all selected features: 
1. Age
2. Attrition
3. BusinessTravel
4. DailyRate
5. DistanceFromHome
6. EducationField
7. EmployeeNumber
8. HourlyRate
9. JobInvolvement
10. JobLevel
11. JobRole
12. MonthlyIncome
13. MonthlyRate
14. NumCompaniesWorked
15. PercentSalaryHike
16. RelationshipSatisfaction
17. StockOptionLevel
18. TrainingTimesLastYear
19. YearsAtCompany
20. YearsInCurrentRole
21. YearsSinceLastPromotion


In [16]:
svm.fit(X_train[:, selected_features], y_train).score(X_test[:, selected_features], y_test)

0.891156462585034