In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd

In [None]:
# Load the dataset
df = pd.read_csv('TCGA_Labeled.csv', index_col=0)
labels = df['label'].values
features = df.drop(['label','file_id'], axis=1)
feature_arr = features.values
X_train,X_test,y_train,y_test = train_test_split(feature_arr, labels, test_size=0.3)
gene_expression_data = feature_arr

In [14]:
# Define the fitness function
def fitness_function(solution):
    selected_features = []
    for i in range(len(solution)):
        if solution[i] == 1:
            selected_features.append(i)
    X_train_temp = X_train[:, selected_features]
    X_test_temp = X_test[:, selected_features]
    model = SVC()
    model.fit(X_train_temp, y_train)
    return model.score(X_test_temp, y_test)

In [15]:

def initialize_population(population_size, num_features):
    population = np.random.randint(2, size=(population_size, num_features))
    return population

In [16]:
def update_grasshoppers(population, best_position, global_best_position, step_size):
    new_population = np.zeros(population.shape)
    for i, grasshopper in enumerate(population):
        for j, feature in enumerate(grasshopper):
            if np.random.rand() < 0.5:
                new_population[i, j] = feature + step_size * np.random.rand() * (best_position[j] - feature) + step_size * np.random.rand() * (global_best_position[j] - feature)
            else:
                new_population[i, j] = feature - step_size * np.random.rand() * (best_position[j] - feature) - step_size * np.random.rand() * (global_best_position[j] - feature)
    return new_population

In [17]:


# Define the parameters
problem_size = gene_expression_data.shape[1]
epoch = 1
pop_size = 50

def grasshopper_optimization_algorithm(gene_expression_data, labels, population_size, max_iterations):
    population = initialize_population(population_size, gene_expression_data.shape[1])
    fitness = np.zeros(population_size)
    best_fitness = np.zeros(max_iterations)
    best_solution = np.zeros((max_iterations, gene_expression_data.shape[1]))
    best_position = np.zeros(gene_expression_data.shape[1])
    global_best_position = np.zeros(gene_expression_data.shape[1])
    global_best_fitness = 0
    for i in range(population_size):
        fitness[i] = fitness_function(population[i])
        if fitness[i] > global_best_fitness:
            global_best_fitness = fitness[i]
            global_best_position = population[i]
    for iteration in range(max_iterations):
        step_size = 1 / (iteration + 1)
        new_population = update_grasshoppers(population, best_position, global_best_position, step_size)
        for i, grasshopper in enumerate(new_population):
            for j, feature in enumerate(grasshopper):
                if np.random.rand() < 1 / (1 + np.exp(-feature)):
                    new_population[i, j] = 1
                else:
                    new_population[i, j] = 0
        population = new_population
        for i in range(population_size):
            fitness[i] = fitness_function(population[i])
            if fitness[i] > global_best_fitness:
                global_best_fitness = fitness[i]
                global_best_position = population[i]
        best_fitness[iteration] = global_best_fitness
        best_solution[iteration] = global_best_position
        print("Iteration: {0} | Best Fitness: {1}".format(iteration, global_best_fitness))
    return best_fitness, best_solution


In [None]:
# Run the algorithm
best_fitness, best_solution = grasshopper_optimization_algorithm(gene_expression_data, labels, pop_size, epoch)

print(best_solution.shape)
print("Best Fitness: {0}".format(best_fitness[-1]))
print("Best Solution: {0}".format(best_solution[-1]))
print("Number of Selected Features: {0}".format(np.sum(best_solution[-1])))

In [15]:
# Print the selected features
selected_features = []
for i in range(len(best_solution[-1])):
    if best_solution[-1][i] == 1:
        selected_features.append(i)
print("Selected Features: {0}".format(selected_features))


Selected Features: [1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 22, 23, 25, 26, 30, 31, 33, 35, 36, 37, 39, 40, 41, 42, 43, 44, 47, 50, 51, 52, 53, 55, 57, 58, 59, 61, 63, 66, 67, 68, 69, 75, 77, 78, 82, 83, 84, 85, 86, 90, 95, 97, 99, 101, 102, 103, 105, 106, 107, 108, 109, 110, 111, 112, 113, 115, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 129, 131, 132, 133, 135, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 152, 154, 155, 156, 157, 158, 160, 163, 165, 167, 168, 169, 171, 172, 177, 178, 182, 183, 185, 187, 189, 190, 191, 195, 196, 197, 201, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 218, 219, 220, 221, 223, 224, 226, 231, 234, 236, 237, 239, 241, 242, 244, 247, 249, 250, 251, 252, 253, 255, 256, 258, 262, 263, 266, 268, 269, 270, 271, 272, 274, 275, 276, 277, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 294, 295, 296, 297, 299, 300, 301, 302, 303, 308, 309, 310, 314, 315, 316, 317, 319, 321, 328, 329, 335, 337, 338, 339, 341

In [None]:
# Save the dataset with the selected features and labels
selected_features.append(0)
df = pd.read_csv('TCGA_Labeled.csv', index_col=0)
df = df.iloc[:, selected_features]
# add the label column
df['label'] = labels
df.to_csv('TCGA_Labeled_Selected.csv')

In [5]:
df = pd.read_csv('TCGA_Labeled_Selected_GOA.csv', index_col=0)
df

Unnamed: 0_level_0,OR4F29,OR4F16,SAMD11,NOC2L,KLHL17,PERM1,HES4,ISG15,RNF223,C1orf159,...,MT-CO2,MT-ATP8,MT-ATP6,MT-ND3,MT-ND4L,MT-ND5,MT-CYB,Unnamed: 0,OR4F5,label
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4396-01A-21R-1858-07,0,0,106,1418,179,23,161,567,90,323,...,56024,3663,44172,14599,9814,65764,66240,210,0,1
TCGA-05-4405-01A-21R-1858-07,0,0,258,2247,243,12,193,4097,75,313,...,143808,5200,64772,18956,11055,61927,118335,280,0,1
TCGA-05-4410-01A-21R-1858-07,0,0,50,2406,178,3,100,2505,73,201,...,171773,6595,80620,48126,19806,195265,138311,305,0,1
TCGA-05-4415-01A-22R-1858-07,0,0,1217,7965,350,1,165,3623,8,484,...,122266,6479,64313,24838,20018,46187,122685,404,0,1
TCGA-05-4417-01A-22R-1858-07,0,0,75,1900,135,0,184,1896,18,167,...,74169,5186,54226,24466,16561,43740,82328,55,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-L9-A7SV-01A-11R-A39D-07,0,0,20,9577,1841,37,168,2940,27,1143,...,288124,41152,232727,78536,82654,257838,418327,116,0,1
TCGA-MP-A4T8-01A-11R-A24X-07,0,0,818,10550,1937,18,1194,19935,335,1589,...,286644,8265,198207,84707,25941,113166,289595,397,0,1
TCGA-44-2665-01B-06R-A277-07,0,0,7,145,20,1,10,20,2,30,...,20920,2418,23246,5792,4824,26554,21434,544,0,1
TCGA-55-6981-11A-01R-1949-07,0,0,86,1292,178,2,182,917,22,303,...,64629,4595,51172,22890,10936,95249,97723,314,0,0
