In [208]:
import numpy as np
from pandas import read_csv
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [248]:
# evaluating the feature subset
def evaluate(features):
    if(np.count_nonzero(features)==0):
        return 1
    features = [f==1 for f in features]
    X_train, X_test, y_train, y_test = train_test_split(X[:, features], y, test_size=0.2, random_state=42)
    
    # Creating an SVM classifier and finding error rate
    clf = SVC()
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    err_rate = 1 - accuracy_score(y_test, y_pred)
    return err_rate

In [316]:
def fitness_function(features):
    alpha = 0.8
    beta = 0.2
    
    err_rate = evaluate(features)
    
    X = np.count_nonzero(features==1)
    N = len(features)
    
    fitness_X = alpha*err_rate + beta*(X/N)
    return fitness_X

In [293]:
# Transfer function for converting continuous space to discrete space

# Sigmoid family
def tf1(X):
    return 1/(1+np.exp(-X))

def tf2(X):
    return 1/(1+np.exp(-2*X))

def tf3(X):
    return 1/(1+np.exp(-X/2))

# T shaped family
def tf4(X):
    return 1/(1+np.tanh(X))

def tf5(X):
    return np.absolute(np.divide(X,np.sqrt(1+np.square(X))))

TF_list = [tf1,tf2,tf3,tf4,tf5]

In [294]:
# def binary_crow_search(num_features, num_iterations):
#     num_crows = 10
#     AP=0.1  # Awareness probability
    
#     # upper and lower bounds of flight length
#     fl = 2

#     crows = np.random.randint(2, size=(num_crows, num_features))   # N*d 2d binary vector

#     # evaluating crow positions
#     fitness_scores = np.array([fitness_function(crow) for crow in crows]) 

#     # memory initialisation with initial position of crows
#     mem = crows

#     # Perform iterations of the algorithm
#     for iteration in range(num_iterations):
#         for i in range(num_crows):
#             j_rand = np.random.randint(num_crows)  # random crow j

#             ri = np.random.rand()
#             if ri >= AP:
#                 crows[i] = crows[i] + (ri * fl * (mem[j_rand] - crows[i]))
#                 T = transfer_func1(crows[i])
#                 crows[i] = 1*(T>ri) + 0*(T<=ri)   # continuous to discrete
#             else:
#                 crows[i] = np.random.randint(2,size = num_features)

#         # Evaluate new position of crows
#         fitness_scores = np.array([fitness_function(crow) for crow in crows])

#         # Update the memory of each crow
#         for i in range(num_crows):
#             if fitness_scores[i] > fitness_function(mem[i]):
#                 mem[i] = crows[i]

#     # Final feature subset
#     fitness_scores = np.array([fitness_function(features) for features in mem])

#     best_index = np.argmax(fitness_scores)
    
#     best_fitness = fitness_scores[best_index]
#     best_solution = mem[best_index]
#     return best_solution, best_fitness, mem

In [309]:
# Defining the Binary Crow Search Algorithm for feature selection
def binary_crow_search_with_tvfl(num_features, transfer_func):
    num_iterations = 100
    num_crows = 10
    AP=0.1  # Awareness probability
    
    # upper and lower bounds of flight length
    tvfl_max = 4
    tvfl_min = 1

    crows = np.random.randint(2, size=(num_crows, num_features))   # N*d 2d binary vector

    # evaluating crow positions
    fitness_scores = np.array([fitness_function(crow) for crow in crows]) 

    # memory initialisation with initial position of crows
    mem = crows

    r = np.random.randint(2, size=num_crows)
    
    # Perform iterations of the algorithm
    for iteration in range(num_iterations):

        # Update the flight length
        time_ratio = iteration/num_iterations
        tvfl = (1-time_ratio)*tvfl_max + time_ratio*tvfl_min
        for i in range(num_crows):
            j_rand = np.random.randint(num_crows)  # i follows a random crow j
            if r[j_rand] >= AP:
                crows[i] = crows[i] + (r[i] * tvfl * (mem[j_rand] - crows[i]))
                T = transfer_func(crows[i])
                crows[i] = 1*(T>r[i]) + 0*(T<=r[i])   # continuous to discrete
            else:
                crows[i] = np.random.randint(2,size = num_features)
        # Evaluate new position of crows
#         fitness_scores = np.array([fitness_function(crow) for crow in crows])

        # Update the memory of each crow
        for i in range(num_crows):
            if fitness_function(crows[i]) < fitness_function(mem[i]):
                mem[i] = crows[i]

    # Final feature subset
    fitness_scores = np.array([fitness_function(features) for features in mem])

    best_index = np.argmin(fitness_scores)
    
    best_fitness = fitness_scores[best_index]
    best_solution = mem[best_index]
    return best_solution, best_fitness, mem

In [310]:
def find_accuracy(solutions):
    accuracies = []

    for solution in solutions:
        accuracies.append(1 - evaluate(solution))

    accuracies = np.array(accuracies)
    max_accuracy = np.max(accuracies)
    avg_accuracy = np.mean(accuracies)
    return max_accuracy

In [311]:
# breast cancer dataset
cancer= read_csv("dataR2.csv")
cancer

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.500000,70,2.707,0.467409,8.8071,9.702400,7.99585,417.114,0
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,0
2,82,23.124670,91,4.498,1.009651,17.9393,22.432040,9.27715,554.697,0
3,68,21.367521,77,3.226,0.612725,9.8827,7.169560,12.76600,928.220,0
4,86,21.111111,92,3.549,0.805386,6.6994,4.819240,10.57635,773.920,0
...,...,...,...,...,...,...,...,...,...,...
111,45,26.850000,92,3.330,0.755688,54.6800,12.100000,10.96000,268.230,1
112,62,26.840000,100,4.530,1.117400,12.4500,21.420000,7.32000,330.160,1
113,65,32.050000,97,5.730,1.370998,61.4800,22.540000,10.33000,314.050,1
114,72,25.590000,82,2.820,0.570392,24.9600,33.750000,3.27000,392.460,1


In [317]:
y = np.array(cancer['Classification'])
X = np.array(cancer.drop('Classification',axis=1))

In [319]:
num_features = X.shape[1]  # Number of features in the dataset

for trans_fun in TF_list:
    best_solution, best_fitness, solutions = binary_crow_search_with_tvfl(num_features, trans_fun)

    # best_solution, best_fitness = binary_crow_search(num_features, num_iterations)
    max_accuracy = find_accuracy(solutions)

    # Selected features based on the best solution
    selected_features = np.where(best_solution == 1)[0]

    print("Selected Features:", selected_features,trans_fun)
    print("Best Accuracy", max_accuracy)

Selected Features: [2 3] <function tf1 at 0x00000243DEA12820>
Best Accuracy 0.7916666666666666
Selected Features: [0 1 2 3 7] <function tf2 at 0x00000243DEA124C0>
Best Accuracy 0.8333333333333334
Selected Features: [0 2 3 6 7] <function tf3 at 0x00000243DEA12670>
Best Accuracy 0.8333333333333334
Selected Features: [2 3] <function tf4 at 0x00000243DEA12790>
Best Accuracy 0.75
Selected Features: [1 2 3] <function tf5 at 0x00000243DE9B5F70>
Best Accuracy 0.7916666666666666


In [320]:
# Testing on iris dataset

from sklearn.datasets import load_iris
dataset2 = load_iris()

In [321]:
X = dataset2["data"]
y = dataset2["target"]
X.shape[1]

4

In [322]:
num_features = X.shape[1]  # Number of features in the dataset

for trans_fun in TF_list:
    best_solution, best_fitness, solutions = binary_crow_search_with_tvfl(num_features, trans_fun)

    # best_solution, best_fitness = binary_crow_search(num_features, num_iterations)
    max_accuracy = find_accuracy(solutions)

    # Selected features based on the best solution
    selected_features = np.where(best_solution == 1)[0]

    print("Selected Features:", selected_features, trans_fun)
    print("Best Accuracy", max_accuracy)

Selected Features: [3] <function tf1 at 0x00000243DEA12820>
Best Accuracy 1.0
Selected Features: [0 3] <function tf2 at 0x00000243DEA124C0>
Best Accuracy 1.0
Selected Features: [2] <function tf3 at 0x00000243DEA12670>
Best Accuracy 1.0
Selected Features: [2 3] <function tf4 at 0x00000243DEA12790>
Best Accuracy 1.0
Selected Features: [0 3] <function tf5 at 0x00000243DE9B5F70>
Best Accuracy 1.0


In [323]:
# Testing on wine detection dataset

from sklearn.datasets import load_wine
dataset3 = load_wine()

In [324]:
X = dataset3["data"]
y = dataset3["target"]
X.shape[1]

13

In [325]:
num_features = X.shape[1]  # Number of features in the dataset

for trans_fun in TF_list:
    best_solution, best_fitness, solutions = binary_crow_search_with_tvfl(num_features, trans_fun)

    # best_solution, best_fitness = binary_crow_search(num_features, num_iterations)
    max_accuracy = find_accuracy(solutions)

    # Selected features based on the best solution
    selected_features = np.where(best_solution == 1)[0]

    print("Selected Features:", selected_features, trans_fun)
    print("Best Accuracy", max_accuracy)

Selected Features: [ 1  2  3  4  6  8  9 11 12] <function tf1 at 0x00000243DEA12820>
Best Accuracy 0.8055555555555556
Selected Features: [ 5  6  7 12] <function tf2 at 0x00000243DEA124C0>
Best Accuracy 0.8333333333333334
Selected Features: [ 2  6 10] <function tf3 at 0x00000243DEA12670>
Best Accuracy 0.8888888888888888
Selected Features: [0 1 3 5 6 9] <function tf4 at 0x00000243DEA12790>
Best Accuracy 0.8611111111111112
Selected Features: [ 2  5  9 11] <function tf5 at 0x00000243DE9B5F70>
Best Accuracy 0.8611111111111112


In [326]:
haberman = read_csv("haberman.csv")
haberman

Unnamed: 0,30,64,1,1.1
0,30,62,3,1
1,30,65,0,1
2,31,59,2,1
3,31,65,4,1
4,33,58,10,1
...,...,...,...,...
300,75,62,1,1
301,76,67,0,1
302,77,65,3,1
303,78,65,1,2


In [327]:
y = np.array(haberman["1.1"])
X = np.array(haberman.drop("1.1", axis=1))
X.shape[1]

3

In [328]:
num_features = X.shape[1]  # Number of features in the dataset

for trans_fun in TF_list:
    best_solution, best_fitness, solutions = binary_crow_search_with_tvfl(num_features, trans_fun)

    # best_solution, best_fitness = binary_crow_search(num_features, num_iterations)
    max_accuracy = find_accuracy(solutions)

    # Selected features based on the best solution
    selected_features = np.where(best_solution == 1)[0]

    print("Selected Features:", selected_features, trans_fun)
    print("Best Accuracy", max_accuracy)

Selected Features: [0] <function tf1 at 0x00000243DEA12820>
Best Accuracy 0.7704918032786885
Selected Features: [2] <function tf2 at 0x00000243DEA124C0>
Best Accuracy 0.7704918032786885
Selected Features: [2] <function tf3 at 0x00000243DEA12670>
Best Accuracy 0.7704918032786885
Selected Features: [2] <function tf4 at 0x00000243DEA12790>
Best Accuracy 0.7704918032786885
Selected Features: [1] <function tf5 at 0x00000243DE9B5F70>
Best Accuracy 0.7704918032786885
