# Feature selection using Bees Algorithm

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from niapy.problems import Problem
from niapy.task import Task

from niapy.algorithms.basic import BeesAlgorithm
import pandas as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
class GaussianNBFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(GaussianNB(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [3]:
class LogisticRegressionFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(LogisticRegression(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [4]:
class DecisionTreeClassifierFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(DecisionTreeClassifier(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [5]:
class KNeighborsClassifierFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(KNeighborsClassifier(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [6]:
class SVMFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(SVC(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [7]:
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
feature_names = dataset.feature_names

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1234)

problem_nb = GaussianNBFeatureSelection(X_train, y_train)
problem_lr = LogisticRegressionFeatureSelection(X_train, y_train)
problem_dt = DecisionTreeClassifierFeatureSelection(X_train, y_train)
problem_knn = KNeighborsClassifierFeatureSelection(X_train, y_train)
problem_svc = SVMFeatureSelection(X_train, y_train)
#task = Task(problem, max_iters=200)

In [8]:
beedict = {}

# Bees Algorithm (1/3: Change to population size)

In [9]:
# Set saved variable for checked parameters
bestScore = 0.0
bestCriterion = None
pop_num = 0
pop_features = None

for x in range(10,100,30):
    # GaussianNB
    task = Task(problem_nb, max_iters=200)
    
    print("-"*50)
    print("Bees algorithm: Population size = "+ str(x))
    algorithm = BeesAlgorithm(population_size=x, m=5, e=4, ngh=1, nep=4, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
    
    model_selected = GaussianNB()
    model_all = GaussianNB()
    model_name = "GaussianNB"

    model_selected.fit(X_train[:, selected_features], y_train)
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print(model_name + ' - Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        pop_num = selected_features.sum()
        pop_features = feature_names[selected_features].tolist()
    model_all.fit(X_train, y_train)
    print(model_name + ' - All Features Accuracy:', model_all.score(X_test, y_test))
    
    # DecisionTreeClassifier
    task = Task(problem_dt, max_iters=200)
    
    print("-"*50)
    print("Bees algorithm: Population size = "+ str(x))
    algorithm = BeesAlgorithm(population_size=x, m=5, e=4, ngh=1, nep=4, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
    
    model_selected = DecisionTreeClassifier()
    model_all = DecisionTreeClassifier()
    model_name = "DecisionTreeClassifier"

    model_selected.fit(X_train[:, selected_features], y_train)
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print(model_name + ' - Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        pop_num = selected_features.sum()
        pop_features = feature_names[selected_features].tolist()
    model_all.fit(X_train, y_train)
    print(model_name + ' - All Features Accuracy:', model_all.score(X_test, y_test))
    
    # KNeighborsClassifier
    task = Task(problem_knn, max_iters=200)
    
    print("-"*50)
    print("Bees algorithm: Population size = "+ str(x))
    algorithm = BeesAlgorithm(population_size=x, m=5, e=4, ngh=1, nep=4, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
    
    model_selected = KNeighborsClassifier()
    model_all = KNeighborsClassifier()
    model_name = "KNeighborsClassifier"

    model_selected.fit(X_train[:, selected_features], y_train)
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print(model_name + ' - Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        pop_num = selected_features.sum()
        pop_features = feature_names[selected_features].tolist()
    model_all.fit(X_train, y_train)
    print(model_name + ' - All Features Accuracy:', model_all.score(X_test, y_test))
    
    # SVC
    task = Task(problem_svc, max_iters=200)
    
    print("-"*50)
    print("Bees algorithm: Population size = "+ str(x))
    algorithm = BeesAlgorithm(population_size=x, m=5, e=4, ngh=1, nep=4, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
    
    model_selected = SVC()
    model_all = SVC()
    model_name = "SVC"

    model_selected.fit(X_train[:, selected_features], y_train)
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print(model_name + ' - Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        pop_num = selected_features.sum()
        pop_features = feature_names[selected_features].tolist()
    model_all.fit(X_train, y_train)
    print(model_name + ' - All Features Accuracy:', model_all.score(X_test, y_test))

beedict["Bees-Population"] = "pop: " + str(bestCriterion) + ", accuracy: " + str(round(bestScore, 5))

--------------------------------------------------
Bees algorithm: Population size = 10
Number of selected features: 8
Selected features: concave points error, worst radius, worst texture, worst perimeter, worst area, worst smoothness, worst compactness, worst symmetry
GaussianNB - Subset accuracy: 0.9736842105263158
GaussianNB - All Features Accuracy: 0.9649122807017544
--------------------------------------------------
Bees algorithm: Population size = 10
Number of selected features: 15
Selected features: mean radius, mean texture, mean smoothness, mean concavity, mean concave points, texture error, perimeter error, area error, compactness error, concavity error, concave points error, worst area, worst compactness, worst concave points, worst fractal dimension
DecisionTreeClassifier - Subset accuracy: 1.0
DecisionTreeClassifier - All Features Accuracy: 0.9736842105263158
--------------------------------------------------
Bees algorithm: Population size = 10
Number of selected feature

In [10]:
beedict

{'Bees-Population': 'pop: 10, accuracy: 1.0'}

# Bees Algorithm (2/3: Change to the number of selected sites (m))

In [11]:
# Set saved variable for checked parameters
bestScore = 0.0
bestCriterion = None
m_num = 0
m_features = None

for x in range(1,10,3):
    # GaussianNB
    task = Task(problem_nb, max_iters=200)
        
    print("-"*50)
    print("Bees algorithm: m = "+ str(x))
    algorithm = BeesAlgorithm(population_size=40, m=x, e=4, ngh=1, nep=4, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
        
    model_selected = GaussianNB()
    model_all = GaussianNB()
    model_name = "GaussianNB"

    model_selected.fit(X_train[:, selected_features], y_train)
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print(model_name + ' - Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        m_num = selected_features.sum()
        m_features = feature_names[selected_features].tolist()
        
    model_all.fit(X_train, y_train)
    print(model_name + ' - All Features Accuracy:', model_all.score(X_test, y_test))
    
    # DecisionTreeClassifier
    task = Task(problem_dt, max_iters=200)
        
    print("-"*50)
    print("Bees algorithm: m = "+ str(x))
    algorithm = BeesAlgorithm(population_size=40, m=x, e=4, ngh=1, nep=4, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
        
    model_selected = DecisionTreeClassifier()
    model_all = DecisionTreeClassifier()
    model_name = "DecisionTreeClassifier"

    model_selected.fit(X_train[:, selected_features], y_train)
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print(model_name + ' - Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        m_num = selected_features.sum()
        m_features = feature_names[selected_features].tolist()
        
    model_all.fit(X_train, y_train)
    print(model_name + ' - All Features Accuracy:', model_all.score(X_test, y_test))
    
    # KNeighborsClassifier
    task = Task(problem_knn, max_iters=200)
        
    print("-"*50)
    print("Bees algorithm: m = "+ str(x))
    algorithm = BeesAlgorithm(population_size=40, m=x, e=4, ngh=1, nep=4, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
        
    model_selected = KNeighborsClassifier()
    model_all = KNeighborsClassifier()
    model_name = "KNeighborsClassifier"

    model_selected.fit(X_train[:, selected_features], y_train)
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print(model_name + ' - Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        m_num = selected_features.sum()
        m_features = feature_names[selected_features].tolist()
        
    model_all.fit(X_train, y_train)
    print(model_name + ' - All Features Accuracy:', model_all.score(X_test, y_test))
    
    # SVC
    task = Task(problem_svc, max_iters=200)
        
    print("-"*50)
    print("Bees algorithm: m = "+ str(x))
    algorithm = BeesAlgorithm(population_size=40, m=x, e=4, ngh=1, nep=4, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
        
    model_selected = SVC()
    model_all = SVC()
    model_name = "SVC"

    model_selected.fit(X_train[:, selected_features], y_train)
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print(model_name + ' - Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        m_num = selected_features.sum()
        m_features = feature_names[selected_features].tolist()
        
    model_all.fit(X_train, y_train)
    print(model_name + ' - All Features Accuracy:', model_all.score(X_test, y_test))
    
beedict["Bees-M"] = "m: " + str(bestCriterion) + ", accuracy: " + str(round(bestScore, 5))

--------------------------------------------------
Bees algorithm: m = 1
Number of selected features: 7
Selected features: mean texture, mean smoothness, mean symmetry, fractal dimension error, worst radius, worst texture, worst concavity
GaussianNB - Subset accuracy: 0.9649122807017544
GaussianNB - All Features Accuracy: 0.9649122807017544
--------------------------------------------------
Bees algorithm: m = 1
Number of selected features: 10
Selected features: mean texture, mean perimeter, mean concave points, perimeter error, compactness error, worst texture, worst area, worst compactness, worst concave points, worst fractal dimension
DecisionTreeClassifier - Subset accuracy: 0.9385964912280702
DecisionTreeClassifier - All Features Accuracy: 0.956140350877193
--------------------------------------------------
Bees algorithm: m = 1
Number of selected features: 7
Selected features: mean texture, mean smoothness, radius error, worst radius, worst perimeter, worst compactness, worst fra

# Bees Algorithm (3/3: Change to number of recruits (nep))

In [12]:
# Set saved variable for checked parameters
bestScore = 0.0
bestCriterion = None
nep_num = 0
nep_features = None

for x in range(1,10,3):
    # GaussianNB
    task = Task(problem_nb, max_iters=200)
    
    print("-"*50)
    print("Bees algorithm: nep = "+ str(x))
    algorithm = BeesAlgorithm(population_size=40, m=5, e=4, ngh=1, nep=x, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
        
    model_selected = GaussianNB()
    model_all = GaussianNB()
    model_name = "GaussianNB"

    model_selected.fit(X_train[:, selected_features], y_train)  
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print('Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        nep_num = selected_features.sum()
        nep_features = feature_names[selected_features].tolist()
    model_all.fit(X_train, y_train)
    print('All Features Accuracy:', model_all.score(X_test, y_test))
    
    # DecisionTreeClassifier
    task = Task(problem_dt, max_iters=200)
    
    print("-"*50)
    print("Bees algorithm: nep = "+ str(x))
    algorithm = BeesAlgorithm(population_size=40, m=5, e=4, ngh=1, nep=x, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
        
    model_selected = DecisionTreeClassifier()
    model_all = DecisionTreeClassifier()
    model_name = "DecisionTreeClassifier"

    model_selected.fit(X_train[:, selected_features], y_train)  
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print('Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        nep_num = selected_features.sum()
        nep_features = feature_names[selected_features].tolist()
    model_all.fit(X_train, y_train)
    print('All Features Accuracy:', model_all.score(X_test, y_test))
    
    # KNeighborsClassifier
    task = Task(problem_knn, max_iters=200)
    
    print("-"*50)
    print("Bees algorithm: nep = "+ str(x))
    algorithm = BeesAlgorithm(population_size=40, m=5, e=4, ngh=1, nep=x, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
        
    model_selected = KNeighborsClassifier()
    model_all = KNeighborsClassifier()
    model_name = "KNeighborsClassifier"

    model_selected.fit(X_train[:, selected_features], y_train)  
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print('Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        nep_num = selected_features.sum()
        nep_features = feature_names[selected_features].tolist()
    model_all.fit(X_train, y_train)
    print('All Features Accuracy:', model_all.score(X_test, y_test))
    
    # SVC
    task = Task(problem_svc, max_iters=200)
    
    print("-"*50)
    print("Bees algorithm: nep = "+ str(x))
    algorithm = BeesAlgorithm(population_size=40, m=5, e=4, ngh=1, nep=x, nsp=2)
    best_features, best_fitness = algorithm.run(task=task)
    selected_features = best_features > 0.5
    print('Number of selected features:', selected_features.sum())
    print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
    features1 = feature_names[selected_features].tolist()
        
    model_selected = SVC()
    model_all = SVC()
    model_name = "SVC"

    model_selected.fit(X_train[:, selected_features], y_train)  
    subset_acc = model_selected.score(X_test[:, selected_features], y_test)
    print('Subset accuracy:', subset_acc)
    if subset_acc > bestScore:
        bestScore = subset_acc
        bestCriterion = x
        nep_num = selected_features.sum()
        nep_features = feature_names[selected_features].tolist()
    model_all.fit(X_train, y_train)
    print('All Features Accuracy:', model_all.score(X_test, y_test))
    
beedict["Bees-NEP"] = "nep: " + str(bestCriterion) + ", accuracy: " + str(round(bestScore, 5))

--------------------------------------------------
Bees algorithm: nep = 1
Number of selected features: 11
Selected features: mean texture, mean area, mean concave points, mean symmetry, texture error, worst texture, worst area, worst smoothness, worst concavity, worst concave points, worst symmetry
Subset accuracy: 0.9649122807017544
All Features Accuracy: 0.9649122807017544
--------------------------------------------------
Bees algorithm: nep = 1
Number of selected features: 10
Selected features: mean texture, mean compactness, mean concave points, mean symmetry, radius error, texture error, area error, compactness error, worst area, worst concave points
Subset accuracy: 0.9649122807017544
All Features Accuracy: 0.9649122807017544
--------------------------------------------------
Bees algorithm: nep = 1
Number of selected features: 10
Selected features: mean radius, mean texture, mean fractal dimension, perimeter error, concavity error, symmetry error, fractal dimension error, wors

# Data Preprocess

In [13]:
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [14]:
beedict

{'Bees-Population': 'pop: 10, accuracy: 1.0',
 'Bees-M': 'm: 4, accuracy: 0.99123',
 'Bees-NEP': 'nep: 1, accuracy: 0.96491'}

In [15]:
print("pop_num: {}, pop_features: {}\nm_num: {}, m_features: {}\nnep_num: {}, nep_features: {}\n".format(pop_num, pop_features, m_num, m_features, nep_num, nep_features))

pop_num: 15, pop_features: ['mean radius', 'mean texture', 'mean smoothness', 'mean concavity', 'mean concave points', 'texture error', 'perimeter error', 'area error', 'compactness error', 'concavity error', 'concave points error', 'worst area', 'worst compactness', 'worst concave points', 'worst fractal dimension']
m_num: 13, m_features: ['mean texture', 'mean concavity', 'mean concave points', 'mean fractal dimension', 'area error', 'compactness error', 'concavity error', 'symmetry error', 'worst texture', 'worst area', 'worst smoothness', 'worst concavity', 'worst concave points']
nep_num: 11, nep_features: ['mean texture', 'mean area', 'mean concave points', 'mean symmetry', 'texture error', 'worst texture', 'worst area', 'worst smoothness', 'worst concavity', 'worst concave points', 'worst symmetry']



In [16]:
newdf = df.drop(columns=[col for col in df if col not in pop_features])
newdf.head()

Unnamed: 0,mean radius,mean texture,mean smoothness,mean concavity,mean concave points,texture error,perimeter error,area error,compactness error,concavity error,concave points error,worst area,worst compactness,worst concave points,worst fractal dimension
0,17.99,10.38,0.1184,0.3001,0.1471,0.9053,8.589,153.4,0.04904,0.05373,0.01587,2019.0,0.6656,0.2654,0.1189
1,20.57,17.77,0.08474,0.0869,0.07017,0.7339,3.398,74.08,0.01308,0.0186,0.0134,1956.0,0.1866,0.186,0.08902
2,19.69,21.25,0.1096,0.1974,0.1279,0.7869,4.585,94.03,0.04006,0.03832,0.02058,1709.0,0.4245,0.243,0.08758
3,11.42,20.38,0.1425,0.2414,0.1052,1.156,3.445,27.23,0.07458,0.05661,0.01867,567.7,0.8663,0.2575,0.173
4,20.29,14.34,0.1003,0.198,0.1043,0.7813,5.438,94.44,0.02461,0.05688,0.01885,1575.0,0.205,0.1625,0.07678


In [17]:
y =  dataset.target
X = newdf
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 21)

#Feature Scaling, this is important as itt will allow the algorithm to quickly learn a better solution.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

(569, 15) (569,)


# Naive Bayes Model

In [18]:
def gaussianNB():
    
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    gnb.fit(X_train,y_train)
    y_pred_test = gnb.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(y_test,y_pred_test)
    print("Naive Bayes : ", acc)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred_test, target_names=b_class))
    
    return y_pred_test

# LR Model

In [19]:
def logisticRegression():
    from sklearn.linear_model import LogisticRegression
    logreg = LogisticRegression(solver = 'liblinear',multi_class='auto')
    logreg.fit(X_train,y_train)
    y_pred = logreg.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc1 = accuracy_score(y_test,y_pred)
    print("Logistic Regression: ", acc1)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred, target_names=b_class))
    
    return y_pred

# Decision Tree Model

In [20]:
def decisionTrees():
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier()
    dt.fit(X_train,y_train)
    y_pred2 = dt.predict(X_test)
    acc2 = accuracy_score(y_test,y_pred2)
    print(acc2)
    print("Decision Trees:", acc2)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred2, target_names=b_class))

    return y_pred2

# KNN Model

In [21]:
def knn():
    
    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier(n_neighbors=8,algorithm='ball_tree')
    clf.fit(X_train,y_train)
    y_pred3 = clf.predict(X_test)
    acc3 =   accuracy_score(y_test,y_pred3)
    print("K Nearest Neighbors: ", acc3)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred3, target_names=b_class))  
    
    return y_pred3

# Support Vector Model

In [22]:
def svc():
    
    from sklearn.svm import SVC
    svc1 = SVC(C=50,kernel='rbf',gamma=1)     
    svc1.fit(X_train,y_train)
    y_pred4 = svc1.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc4=    accuracy_score(y_test,y_pred4)
    print("Support Vector Classifier Accuracy", acc4)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred4, target_names=b_class))  
    
    return y_pred4

# Model Results

In [23]:
models = {
        "gaussianNB": gaussianNB(),
        "decisionTrees": decisionTrees() ,
        "logisticRegression": logisticRegression(),
        "svc": svc(),
        "knn": knn()
    }

results = {}
print("\nNow testing algorithms")
for algo in models:

    print("trained model: ", algo )

print("\n")

Naive Bayes :  0.9122807017543859
              precision    recall  f1-score   support

      benign       0.87      0.87      0.87        39
      malign       0.93      0.93      0.93        75

    accuracy                           0.91       114
   macro avg       0.90      0.90      0.90       114
weighted avg       0.91      0.91      0.91       114

0.9210526315789473
Decision Trees: 0.9210526315789473
              precision    recall  f1-score   support

      benign       0.89      0.87      0.88        39
      malign       0.93      0.95      0.94        75

    accuracy                           0.92       114
   macro avg       0.91      0.91      0.91       114
weighted avg       0.92      0.92      0.92       114

Logistic Regression:  0.9824561403508771
              precision    recall  f1-score   support

      benign       1.00      0.95      0.97        39
      malign       0.97      1.00      0.99        75

    accuracy                           0.98       114

In [24]:
beedict

{'Bees-Population': 'pop: 10, accuracy: 1.0',
 'Bees-M': 'm: 4, accuracy: 0.99123',
 'Bees-NEP': 'nep: 1, accuracy: 0.96491'}