# Feature selection using Artificial Bee Colony Algorithm

In [1]:
#Artificial Bee Colony optimization to find an optimal subset of features for a ML classifier

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from niapy.problems import Problem
from niapy.task import Task
from niapy.algorithms.basic import ArtificialBeeColonyAlgorithm
import pandas as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
class GaussianNBFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(GaussianNB(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [4]:
class LogisticRegressionFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(LogisticRegression(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [5]:
class DecisionTreeClassifierFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(DecisionTreeClassifier(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [6]:
class KNeighborsClassifierFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(KNeighborsClassifier(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [7]:
class SVMFeatureSelection(Problem):
    def __init__(self, X_train, y_train, alpha=0.99):
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train = X_train
        self.y_train = y_train
        self.alpha = alpha

    def _evaluate(self, x):
        selected = x > 0.5
        num_selected = selected.sum()
        if num_selected == 0:
            return 1.0
        accuracy = cross_val_score(SVC(), self.X_train[:, selected], self.y_train, cv=2, n_jobs=-1).mean()
        score = 1 - accuracy
        num_features = self.X_train.shape[1]
        return self.alpha * score + (1 - self.alpha) * (num_selected / num_features)

In [8]:
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
feature_names = dataset.feature_names

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1234)

problem_nb = GaussianNBFeatureSelection(X_train, y_train)
problem_lr = LogisticRegressionFeatureSelection(X_train, y_train)
problem_dt = DecisionTreeClassifierFeatureSelection(X_train, y_train)
problem_knn = KNeighborsClassifierFeatureSelection(X_train, y_train)
problem_svc = SVMFeatureSelection(X_train, y_train)

#500 is the max number of iterations
print("-"*50)
# GaussianNB
task = Task(problem_nb, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=10, limit=100)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features1 = feature_names[selected_features].tolist()

model_selected = GaussianNB()
model_all = GaussianNB()
model_name = "GaussianNB"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))
print("-"*50)
# DecisionTreeClassifier
task = Task(problem_dt, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=10, limit=100)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features1 = feature_names[selected_features].tolist()

model_selected = DecisionTreeClassifier()
model_all = DecisionTreeClassifier()
model_name = "DecisionTreeClassifier"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))
print("-"*50)
# KNeighborsClassifier
task = Task(problem_knn, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=10, limit=100)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features1 = feature_names[selected_features].tolist()

model_selected = KNeighborsClassifier()
model_all = KNeighborsClassifier()
model_name = "KNeighborsClassifier"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))
print("-"*50)
# SVC
task = Task(problem_svc, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=10, limit=100)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features1 = feature_names[selected_features].tolist()

model_selected = SVC()
model_all = SVC()
model_name = "SVC"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

--------------------------------------------------
Number of selected features: 9
Selected features: mean texture, mean smoothness, concavity error, fractal dimension error, worst radius, worst texture, worst perimeter, worst concave points, worst symmetry
GaussianNBSubset accuracy: 0.9649122807017544
GaussianNBAll Features Accuracy: 0.9649122807017544
--------------------------------------------------
Number of selected features: 13
Selected features: mean radius, mean texture, mean smoothness, perimeter error, compactness error, concavity error, concave points error, symmetry error, worst texture, worst area, worst smoothness, worst concavity, worst symmetry
DecisionTreeClassifierSubset accuracy: 0.9649122807017544
DecisionTreeClassifierAll Features Accuracy: 0.956140350877193
--------------------------------------------------
Number of selected features: 5
Selected features: mean texture, mean perimeter, mean concavity, worst texture, worst perimeter
KNeighborsClassifierSubset accur

In [9]:
# GaussianNB
print("-"*50)
task = Task(problem_nb, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=20, limit=200)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features2 = feature_names[selected_features].tolist()

model_selected = GaussianNB()
model_all = GaussianNB()
model_name = "GaussianNB"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))

model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# DecisionTreeClassifier
print("-"*50)
task = Task(problem_dt, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=20, limit=200)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features2 = feature_names[selected_features].tolist()

model_selected = DecisionTreeClassifier()
model_all = DecisionTreeClassifier()
model_name = "DecisionTreeClassifier"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))

model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# KNeighborsClassifier
print("-"*50)
task = Task(problem_knn, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=20, limit=200)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features2 = feature_names[selected_features].tolist()

model_selected = KNeighborsClassifier()
model_all = KNeighborsClassifier()
model_name = "KNeighborsClassifier"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))

model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# SVC
print("-"*50)
task = Task(problem_svc, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=20, limit=200)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features2 = feature_names[selected_features].tolist()

model_selected = SVC()
model_all = SVC()
model_name = "SVC"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))

model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

--------------------------------------------------
Number of selected features: 9
Selected features: mean texture, mean smoothness, mean compactness, fractal dimension error, worst radius, worst texture, worst area, worst smoothness, worst concave points
GaussianNBSubset accuracy: 0.956140350877193
GaussianNBAll Features Accuracy: 0.9649122807017544
--------------------------------------------------
Number of selected features: 17
Selected features: mean radius, mean texture, mean smoothness, mean concave points, mean fractal dimension, radius error, perimeter error, area error, compactness error, concave points error, fractal dimension error, worst texture, worst area, worst smoothness, worst concavity, worst concave points, worst fractal dimension
DecisionTreeClassifierSubset accuracy: 0.9473684210526315
DecisionTreeClassifierAll Features Accuracy: 0.9473684210526315
--------------------------------------------------
Number of selected features: 4
Selected features: mean radius, mean

In [10]:
# GaussianNB
print("-"*50)
task = Task(problem_nb, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=50, limit=300)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features3 = feature_names[selected_features].tolist()

model_selected = GaussianNB()
model_all = GaussianNB()
model_name = "GaussianNB"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# DecisionTreeClassifier
print("-"*50)
task = Task(problem_dt, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=50, limit=300)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features3 = feature_names[selected_features].tolist()

model_selected = DecisionTreeClassifier()
model_all = DecisionTreeClassifier()
model_name = "DecisionTreeClassifier"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# KNeighborsClassifier
print("-"*50)
task = Task(problem_knn, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=50, limit=300)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features3 = feature_names[selected_features].tolist()

model_selected = KNeighborsClassifier()
model_all = KNeighborsClassifier()
model_name = "KNeighborsClassifier"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# SVC
print("-"*50)
task = Task(problem_svc, max_iters=200)

algorithm = ArtificialBeeColonyAlgorithm(population_size=50, limit=300)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features3 = feature_names[selected_features].tolist()

model_selected = SVC()
model_all = SVC()
model_name = "SVC"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

--------------------------------------------------
Number of selected features: 3
Selected features: worst texture, worst area, worst concave points
GaussianNBSubset accuracy: 0.9473684210526315
GaussianNBAll Features Accuracy: 0.9649122807017544
--------------------------------------------------
Number of selected features: 14
Selected features: mean texture, mean perimeter, mean area, mean concavity, mean concave points, mean symmetry, texture error, area error, concave points error, worst radius, worst area, worst smoothness, worst concavity, worst concave points
DecisionTreeClassifierSubset accuracy: 1.0
DecisionTreeClassifierAll Features Accuracy: 0.956140350877193
--------------------------------------------------
Number of selected features: 6
Selected features: mean texture, mean concave points, smoothness error, concave points error, worst radius, worst perimeter
KNeighborsClassifierSubset accuracy: 0.9649122807017544
KNeighborsClassifierAll Features Accuracy: 0.92105263157894

In [11]:
# GaussianNB
print("-"*50)
task = Task(problem_nb, max_iters=400)

algorithm = ArtificialBeeColonyAlgorithm(population_size=80, limit=500)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features3 = feature_names[selected_features].tolist()

model_selected = GaussianNB()
model_all = GaussianNB()
model_name = "GaussianNB"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# DecisionTreeClassifier
print("-"*50)
task = Task(problem_dt, max_iters=400)

algorithm = ArtificialBeeColonyAlgorithm(population_size=80, limit=500)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features3 = feature_names[selected_features].tolist()

model_selected = DecisionTreeClassifier()
model_all = DecisionTreeClassifier()
model_name = "DecisionTreeClassifier"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# KNeighborsClassifier
print("-"*50)
task = Task(problem_knn, max_iters=400)

algorithm = ArtificialBeeColonyAlgorithm(population_size=80, limit=500)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features3 = feature_names[selected_features].tolist()

model_selected = KNeighborsClassifier()
model_all = KNeighborsClassifier()
model_name = "KNeighborsClassifier"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

# SVC
print("-"*50)
task = Task(problem_svc, max_iters=400)

algorithm = ArtificialBeeColonyAlgorithm(population_size=80, limit=500)
best_features, best_fitness = algorithm.run(task)
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(feature_names[selected_features].tolist()))
features3 = feature_names[selected_features].tolist()

model_selected = SVC()
model_all = SVC()
model_name = "SVC"

model_selected.fit(X_train[:, selected_features], y_train)
print(model_name + 'Subset accuracy:', model_selected.score(X_test[:, selected_features], y_test))
model_all.fit(X_train, y_train)
print(model_name + 'All Features Accuracy:', model_all.score(X_test, y_test))

--------------------------------------------------
Number of selected features: 7
Selected features: mean texture, mean concave points, worst texture, worst area, worst smoothness, worst concave points, worst fractal dimension
GaussianNBSubset accuracy: 0.956140350877193
GaussianNBAll Features Accuracy: 0.9649122807017544
--------------------------------------------------
Number of selected features: 10
Selected features: mean area, mean smoothness, mean fractal dimension, perimeter error, area error, smoothness error, concave points error, worst texture, worst area, worst compactness
DecisionTreeClassifierSubset accuracy: 0.9210526315789473
DecisionTreeClassifierAll Features Accuracy: 0.9649122807017544
--------------------------------------------------
Number of selected features: 5
Selected features: mean radius, mean texture, perimeter error, worst radius, worst perimeter
KNeighborsClassifierSubset accuracy: 0.956140350877193
KNeighborsClassifierAll Features Accuracy: 0.92105263157

# Data Preprocess

In [12]:

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target
df.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [14]:

df = df.drop(columns=[col for col in df if col not in features3])
df.head()

Unnamed: 0,mean compactness,mean concavity,concavity error,worst concave points
0,0.2776,0.3001,0.05373,0.2654
1,0.07864,0.0869,0.0186,0.186
2,0.1599,0.1974,0.03832,0.243
3,0.2839,0.2414,0.05661,0.2575
4,0.1328,0.198,0.05688,0.1625


In [15]:
y =  dataset.target
X = df
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 21)

#Feature Scaling, this is important as itt will allow the algorithm to quickly learn a better solution.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


(569, 4) (569,)


## Training a Naive Bayes Algorithm

In [16]:
def gaussianNB():
    
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    gnb.fit(X_train,y_train)
    y_pred_test = gnb.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(y_test,y_pred_test)
    print("Naive Bayes : ", acc)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred_test, target_names=b_class))
    
    return y_pred_test

## Training a Logistic Regression Algorithm

In [17]:
def logisticRegression():
    from sklearn.linear_model import LogisticRegression
    logreg = LogisticRegression(solver = 'liblinear',multi_class='auto')
    logreg.fit(X_train,y_train)
    y_pred = logreg.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc1 = accuracy_score(y_test,y_pred)
    print("Logistic Regression: ", acc1)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred, target_names=b_class))
    
    return y_pred

# Training a Decision Trees

In [18]:
def decisionTrees():
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier()
    dt.fit(X_train,y_train)
    y_pred2 = dt.predict(X_test)
    acc2 = accuracy_score(y_test,y_pred2)
    print(acc2)
    print("Decision Trees:", acc2)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred2, target_names=b_class))

    return y_pred2

In [19]:
def knn():
    
    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier(n_neighbors=8,algorithm='ball_tree')
    clf.fit(X_train,y_train)
    y_pred3 = clf.predict(X_test)
    acc3 =   accuracy_score(y_test,y_pred3)
    print("K Nearest Neighbors: ", acc3)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred3, target_names=b_class))  
    
    return y_pred3

In [20]:
def svc():
    
    from sklearn.svm import SVC
    svc1 = SVC(C=50,kernel='rbf',gamma=1)     
    svc1.fit(X_train,y_train)
    y_pred4 = svc1.predict(X_test)
    from sklearn.metrics import accuracy_score
    acc4=    accuracy_score(y_test,y_pred4)
    print("Support Vector Classifier Accuracy", acc4)
    b_class = ["benign", "malign"]
    print(classification_report(y_test, y_pred4, target_names=b_class))  
    
    return y_pred4

In [21]:

models = {
        "gaussianNB": gaussianNB(),
        "decisionTrees": decisionTrees() ,
        "logisticRegression": logisticRegression(),
        "svc": svc(),
        "knn": knn()
    }

results = {}
print("\nNow testing algorithms")
for algo in models:

    print("trained model: ", algo )

print("\n")

# for key, value in models.items():
#     results[key] = value
#     print(key, value)

Naive Bayes :  0.9122807017543859
              precision    recall  f1-score   support

      benign       0.89      0.85      0.87        39
      malign       0.92      0.95      0.93        75

    accuracy                           0.91       114
   macro avg       0.91      0.90      0.90       114
weighted avg       0.91      0.91      0.91       114

0.956140350877193
Decision Trees: 0.956140350877193
              precision    recall  f1-score   support

      benign       0.95      0.92      0.94        39
      malign       0.96      0.97      0.97        75

    accuracy                           0.96       114
   macro avg       0.95      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

Logistic Regression:  0.9649122807017544
              precision    recall  f1-score   support

      benign       0.97      0.92      0.95        39
      malign       0.96      0.99      0.97        75

    accuracy                           0.96       114
 