# Feature selection using Bees Algorithm

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from niapy.problems import Problem
from niapy.task import Task

from niapy.algorithms.basic import BeesAlgorithm
import pandas as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
class SVMFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(SVC(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [3]:
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
feature_names = dataset.feature_names

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1234)

problem = SVMFeatureSelection(X_train, y_train)
#task = Task(problem, max_iters=200)

In [4]:
beedict = {}

# Bees Algorithm (1/3: Change to population size)

In [5]:
# Set saved variable for checked parameters
bestScore = 0.0
bestCriterion = None
pop_num = 0
pop_features = None

for y in range(1,4):
    print("=" * 50)
    print("ITERATION " + str(y))
    for x in range(10,100,30):
        task = Task(problem, max_iters=200)
        print("-"*50)
        print("Bees algorithm: Population size = "+ str(x))

        algorithm = BeesAlgorithm(population_size=x, m=5, e=4, ngh=1, nep=4, nsp=2)

        best_features, best_fitness = algorithm.run(task=task)

        selected_features = best_features > 0.5
        print('Number of selected features:', selected_features.sum())
        print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
        features1 = feature_names[selected_features].tolist()
        model_selected = SVC()
        model_all = SVC()

        model_selected.fit(X_train[:, selected_features], y_train)
        subset_acc = model_selected.score(X_test[:, selected_features], y_test)
        print('Subset accuracy:', subset_acc)
        
        if subset_acc > bestScore:
            bestScore = subset_acc
            bestCriterion = x
            pop_num = selected_features.sum()
            pop_features = feature_names[selected_features].tolist()

        model_all.fit(X_train, y_train)
        print('All Features Accuracy:', model_all.score(X_test, y_test))

beedict["Bees-Population"] = "pop: " + str(bestCriterion) + ", accuracy: " + str(round(bestScore, 5))

ITERATION 1
--------------------------------------------------
Bees algorithm: Population size = 10
Number of selected features: 7
Selected features: mean smoothness, texture error, compactness error, concavity error, worst texture, worst perimeter, worst concavity
Subset accuracy: 0.956140350877193
All Features Accuracy: 0.9122807017543859
--------------------------------------------------
Bees algorithm: Population size = 40
Number of selected features: 8
Selected features: mean concavity, mean symmetry, mean fractal dimension, symmetry error, worst radius, worst texture, worst perimeter, worst compactness
Subset accuracy: 0.956140350877193
All Features Accuracy: 0.9122807017543859
--------------------------------------------------
Bees algorithm: Population size = 70
Number of selected features: 6
Selected features: mean radius, mean texture, concavity error, worst texture, worst perimeter, worst concave points
Subset accuracy: 0.956140350877193
All Features Accuracy: 0.912280701754

In [6]:
beedict

{'Bees-Population': 'pop: 10, accuracy: 0.95614'}

# Bees Algorithm (2/3: Change to the number of selected sites (m))

In [7]:
# Set saved variable for checked parameters
bestScore = 0.0
bestCriterion = None
m_num = 0
m_features = None

for y in range(1,4):
    print("=" * 50)
    print("ITERATION " + str(y))
    for x in range(1,10,3):
        task = Task(problem, max_iters=200)
        print("-"*50)
        print("Bees algorithm: m = "+ str(x))

        algorithm = BeesAlgorithm(population_size=40, m=x, e=4, ngh=1, nep=4, nsp=2)

        best_features, best_fitness = algorithm.run(task=task)

        selected_features = best_features > 0.5
        print('Number of selected features:', selected_features.sum())
        print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
        features1 = feature_names[selected_features].tolist()
        model_selected = SVC()
        model_all = SVC()

        model_selected.fit(X_train[:, selected_features], y_train)
        
        subset_acc = model_selected.score(X_test[:, selected_features], y_test)
        print('Subset accuracy:', subset_acc)

        if subset_acc > bestScore:
            bestScore = subset_acc
            bestCriterion = x
            m_num = selected_features.sum()
            m_features = feature_names[selected_features].tolist()
        
        model_all.fit(X_train, y_train)
        print('All Features Accuracy:', model_all.score(X_test, y_test))
beedict["Bees-M"] = "m: " + str(bestCriterion) + ", accuracy: " + str(round(bestScore, 5))

ITERATION 1
--------------------------------------------------
Bees algorithm: m = 1
Number of selected features: 5
Selected features: mean texture, mean concave points, smoothness error, worst texture, worst perimeter
Subset accuracy: 0.956140350877193
All Features Accuracy: 0.9122807017543859
--------------------------------------------------
Bees algorithm: m = 4
Number of selected features: 7
Selected features: mean texture, mean smoothness, mean symmetry, worst texture, worst perimeter, worst smoothness, worst concave points
Subset accuracy: 0.9473684210526315
All Features Accuracy: 0.9122807017543859
--------------------------------------------------
Bees algorithm: m = 7
Number of selected features: 14
Selected features: mean smoothness, mean concavity, mean concave points, mean symmetry, radius error, compactness error, concave points error, symmetry error, fractal dimension error, worst smoothness, worst concavity, worst concave points, worst symmetry, worst fractal dimension


# Bees Algorithm (3/3: Change to number of recruits (nep))

In [8]:
# Set saved variable for checked parameters
bestScore = 0.0
bestCriterion = None
nep_num = 0
nep_features = None

for y in range(1,4):
    print("=" * 50)
    print("ITERATION " + str(y))
    for x in range(1,10,3):
        task = Task(problem, max_iters=200)
        print("-"*50)
        print("Bees algorithm: nep = "+ str(x))

        algorithm = BeesAlgorithm(population_size=40, m=5, e=4, ngh=1, nep=x, nsp=2)

        best_features, best_fitness = algorithm.run(task=task)

        selected_features = best_features > 0.5
        print('Number of selected features:', selected_features.sum())
        print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
        features1 = feature_names[selected_features].tolist()
        model_selected = SVC()
        model_all = SVC()

        model_selected.fit(X_train[:, selected_features], y_train)
        
        subset_acc = model_selected.score(X_test[:, selected_features], y_test)
        print('Subset accuracy:', subset_acc)

        if subset_acc > bestScore:
            bestScore = subset_acc
            bestCriterion = x
            nep_num = selected_features.sum()
            nep_features = feature_names[selected_features].tolist()
        
        model_all.fit(X_train, y_train)
        print('All Features Accuracy:', model_all.score(X_test, y_test))
beedict["Bees-NEP"] = "nep: " + str(bestCriterion) + ", accuracy: " + str(round(bestScore, 5))

ITERATION 1
--------------------------------------------------
Bees algorithm: nep = 1
Number of selected features: 6
Selected features: mean compactness, mean fractal dimension, smoothness error, concavity error, worst texture, worst perimeter
Subset accuracy: 0.9649122807017544
All Features Accuracy: 0.9122807017543859
--------------------------------------------------
Bees algorithm: nep = 4
Number of selected features: 6
Selected features: mean texture, mean concave points, fractal dimension error, worst radius, worst texture, worst perimeter
Subset accuracy: 0.956140350877193
All Features Accuracy: 0.9122807017543859
--------------------------------------------------
Bees algorithm: nep = 7
Number of selected features: 6
Selected features: mean texture, texture error, perimeter error, worst texture, worst perimeter, worst smoothness
Subset accuracy: 0.956140350877193
All Features Accuracy: 0.9122807017543859
ITERATION 2
--------------------------------------------------
Bees algor

# Data Preprocess

In [9]:
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [10]:
beedict

{'Bees-Population': 'pop: 10, accuracy: 0.95614',
 'Bees-M': 'm: 1, accuracy: 0.95614',
 'Bees-NEP': 'nep: 1, accuracy: 0.96491'}

In [11]:
print("pop_num: {}, pop_features: {}\nm_num: {}, m_features: {}\nnep_num: {}, nep_features: {}\n".format(pop_num, pop_features, m_num, m_features, nep_num, nep_features))

pop_num: 7, pop_features: ['mean smoothness', 'texture error', 'compactness error', 'concavity error', 'worst texture', 'worst perimeter', 'worst concavity']
m_num: 5, m_features: ['mean texture', 'mean concave points', 'smoothness error', 'worst texture', 'worst perimeter']
nep_num: 6, nep_features: ['mean compactness', 'mean fractal dimension', 'smoothness error', 'concavity error', 'worst texture', 'worst perimeter']



In [12]:
newdf = df.drop(columns=[col for col in df if col not in pop_features])
newdf.head()

Unnamed: 0,mean smoothness,texture error,compactness error,concavity error,worst texture,worst perimeter,worst concavity
0,0.1184,0.9053,0.04904,0.05373,17.33,184.6,0.7119
1,0.08474,0.7339,0.01308,0.0186,23.41,158.8,0.2416
2,0.1096,0.7869,0.04006,0.03832,25.53,152.5,0.4504
3,0.1425,1.156,0.07458,0.05661,26.5,98.87,0.6869
4,0.1003,0.7813,0.02461,0.05688,16.67,152.2,0.4


In [13]:
y =  dataset.target
X = newdf
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 21)

#Feature Scaling, this is important as itt will allow the algorithm to quickly learn a better solution.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

(569, 7) (569,)


# Naive Bayes Model

In [14]:
def gaussianNB():
    
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    gnb.fit(X_train,y_train)
    y_pred_test = gnb.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(y_test,y_pred_test)
    print("Naive Bayes : ", acc)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred_test, target_names=b_class))
    
    return y_pred_test

# LR Model

In [15]:
def logisticRegression():
    from sklearn.linear_model import LogisticRegression
    logreg = LogisticRegression(solver = 'liblinear',multi_class='auto')
    logreg.fit(X_train,y_train)
    y_pred = logreg.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc1 = accuracy_score(y_test,y_pred)
    print("Logistic Regression: ", acc1)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred, target_names=b_class))
    
    return y_pred

# Decision Tree Model

In [16]:
def decisionTrees():
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier()
    dt.fit(X_train,y_train)
    y_pred2 = dt.predict(X_test)
    acc2 = accuracy_score(y_test,y_pred2)
    print(acc2)
    print("Decision Trees:", acc2)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred2, target_names=b_class))

    return y_pred2

# KNN Model

In [17]:
def knn():
    
    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier(n_neighbors=8,algorithm='ball_tree')
    clf.fit(X_train,y_train)
    y_pred3 = clf.predict(X_test)
    acc3 =   accuracy_score(y_test,y_pred3)
    print("K Nearest Neighbors: ", acc3)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred3, target_names=b_class))  
    
    return y_pred3

# Support Vector Model

In [18]:
def svc():
    
    from sklearn.svm import SVC
    svc1 = SVC(C=50,kernel='rbf',gamma=1)     
    svc1.fit(X_train,y_train)
    y_pred4 = svc1.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc4=    accuracy_score(y_test,y_pred4)
    print("Support Vector Classifier Accuracy", acc4)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred4, target_names=b_class))  
    
    return y_pred4

# Model Results

In [19]:
models = {
        "gaussianNB": gaussianNB(),
        "decisionTrees": decisionTrees() ,
        "logisticRegression": logisticRegression(),
        "svc": svc(),
        "knn": knn()
    }

results = {}
print("\nNow testing algorithms")
for algo in models:

    print("trained model: ", algo )

print("\n")

Naive Bayes :  0.9298245614035088
              precision    recall  f1-score   support

      benign       0.90      0.90      0.90        39
      malign       0.95      0.95      0.95        75

    accuracy                           0.93       114
   macro avg       0.92      0.92      0.92       114
weighted avg       0.93      0.93      0.93       114

0.9035087719298246
Decision Trees: 0.9035087719298246
              precision    recall  f1-score   support

      benign       0.87      0.85      0.86        39
      malign       0.92      0.93      0.93        75

    accuracy                           0.90       114
   macro avg       0.89      0.89      0.89       114
weighted avg       0.90      0.90      0.90       114

Logistic Regression:  0.9736842105263158
              precision    recall  f1-score   support

      benign       1.00      0.92      0.96        39
      malign       0.96      1.00      0.98        75

    accuracy                           0.97       114

In [20]:
beedict

{'Bees-Population': 'pop: 10, accuracy: 0.95614',
 'Bees-M': 'm: 1, accuracy: 0.95614',
 'Bees-NEP': 'nep: 1, accuracy: 0.96491'}