<a href="https://colab.research.google.com/github/prhvmm/Feature-Selection-of-Breast-Cancer-Diagnostic-using-Genetic-Algorithm/blob/main/Feature_Selection_of_Breast_Cancer_Diagnostic_Dataset_using_Genetic_Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from random import randint
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Variables and Data


In [3]:
# Load the dataset; change the path if data.csv lives in another directory.
data_bc = pd.read_csv(r'data.csv')

# Binary target: 'M' becomes 1, every other diagnosis value becomes 0.
label_bc = np.where(data_bc["diagnosis"] == 'M', 1, 0)

# Keep only the feature columns: drop the identifier, the target and the
# "Unnamed: 32" column.
data_bc = data_bc.drop(columns=["id", "diagnosis", "Unnamed: 32"])

# generations() appends one score here each time it finishes a run.
final_score = []

# Classifier evaluated by fitness_score(); uncomment one of the alternatives
# below to swap it.
#logmodel = svm.SVC(kernel='linear')
#logmodel = svm.SVC(kernel='rbf')
#logmodel = RandomForestClassifier(n_estimators=200, random_state=0)
#logmodel = KNeighborsClassifier()
logmodel = XGBClassifier(objective='reg:logistic', use_label_encoder=False)



# Functions

In [5]:
def split(df, label):
    """Randomly partition features and labels into train and test parts.

    30% of the rows are held out for testing. No ``random_state`` is passed,
    so the partition is not fixed between calls.

    Args:
        df: Feature table.
        label: Labels aligned with the rows of ``df``.

    Returns:
        A tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    parts = train_test_split(df, label, test_size=0.3)
    return tuple(parts)



def initialization_of_population(size, n_feat, select_feature):
    """Create the initial population of random feature masks.

    Each chromosome is a boolean array of length ``n_feat`` in which exactly
    ``select_feature`` distinct positions are True.

    Args:
        size: Number of chromosomes to create.
        n_feat: Total number of features (chromosome length).
        select_feature: Number of features switched on per chromosome.
            Non-positive values yield all-False chromosomes.

    Returns:
        A list of ``size`` boolean NumPy arrays.

    Raises:
        ValueError: If ``select_feature`` is larger than ``n_feat``; that many
            distinct positions do not exist (the previous rejection-sampling
            loop never terminated in this case).
    """
    from random import sample

    if select_feature > n_feat:
        raise ValueError(
            "select_feature (%d) cannot exceed n_feat (%d)" % (select_feature, n_feat)
        )
    n_selected = max(select_feature, 0)

    population = []
    for _ in range(size):
        chromosome = np.zeros(n_feat, dtype=np.bool_)
        # sample() draws distinct indices directly, so no retry loop is needed.
        chosen = sample(range(n_feat), n_selected)
        chromosome[np.asarray(chosen, dtype=np.intp)] = True
        population.append(chromosome)
    return population

def fitness_score(population):
    """Score every chromosome and return the population ranked best-first.

    Reads the module-level ``X_train``, ``Y_train``, ``X_test``, ``Y_test``
    and ``logmodel``. One random subset of the training rows is drawn per
    call (each row is kept when its draw from 1..100 is at most 80) and
    shared by all chromosomes. For each chromosome the model is fitted on the
    selected columns of that subset and its F1 score is computed on the test
    split.

    Args:
        population: Sequence of boolean feature masks.

    Returns:
        Three lists ordered by descending F1 score: the scores, the
        chromosomes, and the test-set predictions.
    """
    # One draw per training row, in row order.
    keep_rows = np.array(
        [randint(1, 100) <= 80 for _ in range(X_train.shape[0])],
        dtype=np.bool_,
    )

    f1_values = []
    test_predictions = []
    for mask in population:
        logmodel.fit(X_train.iloc[keep_rows, mask], Y_train[keep_rows])
        predicted = logmodel.predict(X_test.iloc[:, mask])
        test_predictions.append(predicted)
        f1_values.append(metrics.f1_score(Y_test, predicted))

    f1_values = np.array(f1_values)
    masks = np.array(population)
    test_predictions = np.array(test_predictions)

    # argsort is ascending; reverse the index vector to get best-first order.
    best_first = np.argsort(f1_values)[::-1]

    ranked_scores = list(f1_values[best_first])
    ranked_masks = list(masks[best_first, :])
    ranked_predictions = list(test_predictions[best_first])
    return ranked_scores, ranked_masks, ranked_predictions

def selection(pop_after_fit, n_parents):
    """Pick the parents for the next generation.

    Args:
        pop_after_fit: Population already ordered best-first.
        n_parents: How many leading chromosomes to keep.

    Returns:
        A new list holding the first ``n_parents`` entries of
        ``pop_after_fit``.

    Raises:
        IndexError: If ``n_parents`` exceeds the population size.
    """
    return [pop_after_fit[rank] for rank in range(n_parents)]

def crossover(pop_after_sel):
    """Extend the selected parents with one child per consecutive parent pair.

    Parents are paired as (0, 1), (2, 3), ... Each child takes the first half
    of the first parent and the second half of the second parent. With an odd
    number of parents the last one has no partner; it is carried over but
    produces no child.

    Args:
        pop_after_sel: List of parent chromosomes. It is not modified.

    Returns:
        A new list containing all parents followed by the children.
    """
    parents = list(pop_after_sel)
    # Copy so that appending children does not grow the caller's list.
    pop_nextgen = list(parents)
    # zip() stops at the shorter slice, which skips an unpaired last parent.
    for first, second in zip(parents[0::2], parents[1::2]):
        cut = len(first) // 2
        pop_nextgen.append(np.concatenate((first[:cut], second[cut:])))
    return pop_nextgen

def mutation(pop_after_cross, mutation_rate, n_feat):
    """Return mutated copies of every chromosome in the population.

    For each chromosome, ``int(mutation_rate * n_feat)`` positions are drawn
    uniformly from ``0..n_feat-1`` and the bit at each drawn position is
    flipped. Positions are drawn with replacement, so a position drawn twice
    ends up back at its original value.

    Args:
        pop_after_cross: Sequence of chromosomes. The chromosomes are copied
            before flipping, so the caller's objects are left untouched.
        mutation_rate: Fraction of ``n_feat`` that sets the number of flips.
        n_feat: Chromosome length.

    Returns:
        A new list with one mutated chromosome per input chromosome.
    """
    mutation_range = int(mutation_rate * n_feat)
    pop_next_gen = []
    for chromosome in pop_after_cross:
        # Work on a copy; flipping in place would also alter the caller's data.
        mutated = chromosome.copy()
        positions = [randint(0, n_feat - 1) for _ in range(mutation_range)]
        for pos in positions:
            mutated[pos] = not mutated[pos]
        pop_next_gen.append(mutated)
    return pop_next_gen

def generations(size, n_feat, n_parents, mutation_rate, select_feature):
    """Run the genetic algorithm until the top score stops improving.

    Starting from a random population, each round scores the population and
    compares its top score with the previous round's top score. If the new
    top score is not strictly higher, it is appended to the module-level
    ``final_score`` list and the search stops. Otherwise the next population
    is built by selection, crossover and mutation.

    Args:
        size: Size of the initial population.
        n_feat: Total number of features.
        n_parents: Number of top chromosomes kept each round.
        mutation_rate: Passed through to ``mutation``.
        select_feature: Features switched on per initial chromosome.

    Returns:
        The test-set predictions of the top chromosome of the final round.
    """
    current_population = initialization_of_population(size, n_feat, select_feature)
    # Sentinel below any possible score so the first round never stops here.
    previous_top = -100
    while True:
        scores, ranked_population, pred = fitness_score(current_population)
        top_score = scores[0]
        if top_score <= previous_top:
            final_score.append(top_score)
            return pred[0]
        previous_top = top_score
        parents = selection(ranked_population, n_parents)
        offspring = crossover(parents)
        current_population = mutation(offspring, mutation_rate, n_feat)
   
def cf_matrix_plot(cf_matrix, select_feature):
    """Draw an annotated 2x2 confusion-matrix heatmap and save it as a PNG.

    Each cell shows its name, its count and its share of all samples. The
    figure is written to ``With_<select_feature>_features.png`` at 400 dpi
    and then cleared.

    Args:
        cf_matrix: 2x2 confusion matrix.
        select_feature: Number of features, used in the title and file name.
    """
    cells = cf_matrix.flatten()
    names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    counts = ["{0:0.0f}".format(cell) for cell in cells]
    shares = ["{0:.2%}".format(cell) for cell in cells / np.sum(cf_matrix)]

    annotations = np.asarray(
        [f"{name}\n{count}\n{share}" for name, count, share in zip(names, counts, shares)]
    ).reshape(2, 2)

    axes = sns.heatmap(cf_matrix, annot=annotations, fmt='', cmap='Blues')
    axes.set_title('Number of Features: %i - Classifier: XGB' %select_feature)

    fig = axes.get_figure()
    fig.savefig('With_%i_features.png' %select_feature, dpi=400)
    # Clear the figure so the next heatmap does not draw over this one.
    fig.clf()

# Test for All Features
The `select_feature` parameter sets the number of features we would like to keep.
The `for` loop runs 30 times, once per feature in the dataset, and the F1 score for every selection is printed at the end.

In [None]:


# Hold out a test split; the GA helper functions read these names as globals.
X_train, X_test, Y_train, Y_test = split(data_bc, label_bc)

results = {}
n_feat = data_bc.shape[1]

# Try every subset size from 1 up to the full feature count. Starting at 10
# while still looping n_feat times asked for more features than exist.
for select_feature in range(1, n_feat + 1):
    pred = generations(size=80, n_feat=n_feat, n_parents=64, mutation_rate=0.2, select_feature=select_feature)
    # generations() appends this run's score to final_score, so the newest
    # entry is the right one even when final_score already holds older runs.
    results[select_feature] = final_score[-1]
    cf_matrix = confusion_matrix(Y_test, pred)
    cf_matrix_plot(cf_matrix, select_feature)

for i in results:
    print('with %i features score is %f' %(i,results[i]))
    


# Constant Number of Features


In [None]:
# Draw a fresh train/test split; fitness_score() reads these names as globals.
X_train,X_test, Y_train, Y_test = split(data_bc, label_bc)

# results is reset here but not read again in this cell.
results = {}
# Number of features switched on in each chromosome of the initial population.
select_feature = 10
n_feat = data_bc.shape[1]



# Single GA run; returns the test-set predictions of the final top chromosome.
pred = generations(size=80, n_feat=n_feat, n_parents=64, mutation_rate=0.2, select_feature=select_feature)

# Plot and save the confusion matrix for this run.
cf_matrix = confusion_matrix(Y_test,pred)
cf_matrix_plot(cf_matrix, select_feature)

    
# generations() appended this run's score as the last entry of final_score.
print(final_score[-1])