<a href="https://colab.research.google.com/github/spozi/gpu-svgpm/blob/main/GPFeatureGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U deap imbalanced-learn



In [2]:
from collections import Counter
from imblearn.datasets import fetch_datasets
ecoli = fetch_datasets()['ecoli']
ecoli.data.shape


(336, 7)

In [8]:
X = ecoli.data
y = ecoli.target

Version 2 GP Feature Generator

In [14]:
import random
import operator
import math
import statistics

import numpy as np

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

# Define new functions
def protectedDiv(left, right):
    try:
        return left / right
    except ZeroDivisionError:
        return 1

nFeatures = ecoli.data.shape[1]
pset = gp.PrimitiveSet("MAIN", nFeatures) 
pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)
pset.addPrimitive(protectedDiv, 2)
pset.addPrimitive(operator.neg, 1)
pset.addPrimitive(math.cos, 1)
pset.addPrimitive(math.sin, 1)
pset.addEphemeralConstant("rand", lambda: round(random.uniform(0.1, 1.0), 10))

creator.create("FitnessMin", base.Fitness, weights=(1.0,))
creator.create("Tree", gp.PrimitiveTree)
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register("main_expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=5)
toolbox.register('MAIN', tools.initIterate, creator.Tree, toolbox.main_expr)

func_cycle = [toolbox.MAIN]

toolbox.register("individual", tools.initCycle, creator.Individual, func_cycle)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)



In [22]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from sklearn.svm import SVC
from sklearn.metrics import f1_score


import warnings
warnings.filterwarnings("ignore")

def evalSymbReg(score):
  return score,

def evalSymbRegPop(population):
  # Generate dataset

  # Evaluate each individual in population
  #1. Compute the expression of every individual
  list_vecs = []
  for individual in population:
    # func = toolbox.compile(expr=individual)

    #The following code should be optimized/vectorized
    #Evaluating expression on each vector
    func = toolbox.compile(expr=individual)
    vec = []
    for x in X: #Iterate every vector x (row) in data (matrix) X
      try:
        val = func(*x)
        vec.append(val)
      except:
        vec.append(0)
    list_vecs.append(vec)

  #2. Convert list_vecs to numpy array
  evaluated_X = np.array(list_vecs).T
  evaluated_X = np.nan_to_num(evaluated_X, copy=True, nan=0.0, posinf=0.0, neginf=0.0)

  #3. Individual (feature) selection
  # https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection-using-selectfrommodel
  clf = ExtraTreesClassifier(n_estimators=50)
  clf = clf.fit(evaluated_X, y)

  #4. Extract features that at top threshold (get the 75 percentile)
  q1 = np.percentile(clf.feature_importances_, 75)  #Get the top 75 percentile features
  features = [True if val >= q1 else False for val in clf.feature_importances_.tolist()] #Get the features indices
  X_new = evaluated_X[:, features]  #Extract the required data

  #4. Use svc to get total nSV
  clf_svc = SVC(C=0.25, gamma="auto") #Smaller C value will increase the margin size of the hyperplane
  clf_svc.fit(X_new, y)
  y_pred = clf_svc.predict(X_new)

  #5. Compute the score
  nSV = clf_svc.support_vectors_.shape[0]
  f1 = f1_score(y, y_pred)
  fitness = f1/nSV

  #6. Output the fitness value
  ind_pop_fitness = []
  for f in features:
    if f is True:
      ind_pop_fitness.append(fitness)
    else:
      ind_pop_fitness.append(0)
  
  return ind_pop_fitness

  #6. N1 Complexity (TBC)

psets = [pset]
toolbox.register("compile", gp.compileADF, psets=psets)
toolbox.register('evaluate', evalSymbReg)
toolbox.register('select', tools.selTournament, tournsize=3)
toolbox.register('mate', gp.cxOnePoint)
toolbox.register("expr", gp.genFull, min_=1, max_=2)
toolbox.register('mutate', gp.mutUniform, expr=toolbox.expr)

In [23]:
def main():
  random.seed(1024)
  ind = toolbox.individual()
  
  pop = toolbox.population(n=100)
  hof = tools.HallOfFame(1)
  stats = tools.Statistics(lambda ind: ind.fitness.values)
  stats.register("avg", np.mean)
  stats.register("std", np.std)
  stats.register("min", np.min)
  stats.register("max", np.max)

  logbook = tools.Logbook()
  logbook.header = "gen", "evals", "std", "min", "avg", "max"

  CXPB, MUTPB, NGEN = 0.5, 0.2, 40

  # # Evaluate the entire population
#################################This is for adhoc code draft##########################################################################
  #0. Test each indvidiual in population

  # list_vecs = []
  # for ind in pop:
  #   func = toolbox.compile(expr=ind)

  #   #The following code should be optimized
  #   #Evaluating expression on each vector
  #   func = toolbox.compile(expr=ind)
  #   vec = []
  #   for x in X: #Iterate every vector x (row) in data (matrix) X
  #     val = func(*x)
  #     # try:
  #     #   val = func(*x)
  #     # except:
  #     #   print(x, [str(eq) for eq in ind])
  #       # print(ind[0])
  #     # print(val)
  #     vec.append(val)
  #   list_vecs.append(vec)

  # #2. Convert list_vecs to numpy array
  # evaluated_X = np.array(list_vecs).T
  # evaluated_X = np.nan_to_num(evaluated_X, copy=True, nan=0.0, posinf=0.0, neginf=0.0)

  # clf = ExtraTreesClassifier(n_estimators=50)
  # clf = clf.fit(evaluated_X, y)

  # #4. Extract features that at top threshold (get the 75 percentile)
  # q1 = np.percentile(clf.feature_importances_, 75)
  # print( clf.feature_importances_.tolist())
  # features = [True if val >= q1 else False for val in clf.feature_importances_.tolist()]
  # print(len(features), evaluated_X.shape)
  # new_X = evaluated_X[:, features]
  # print(new_X.shape)
  # 2/0

  # #4. 



###########################################################################################################
  #1. Compute the metric on the set of individuals
  ind_pop_fitness = evalSymbRegPop(pop)

  #2. Then, determine the best individual using toolbox
  for ind, fitness in zip(pop, ind_pop_fitness):
    ind.fitness.values = toolbox.evaluate(fitness)

  hof.update(pop)
  record = stats.compile(pop)
  logbook.record(gen=0, evals=len(pop), **record)
  print(logbook.stream)

  for g in range(1, NGEN):
    # Select the offspring
    offspring = toolbox.select(pop, len(pop))
    # Clone the offspring
    offspring = [toolbox.clone(ind) for ind in offspring]

    # Apply crossover and mutation
    for ind1, ind2 in zip(offspring[::2], offspring[1::2]):
        for tree1, tree2 in zip(ind1, ind2):
            if random.random() < CXPB:
                toolbox.mate(tree1, tree2)
                del ind1.fitness.values
                del ind2.fitness.values

    for ind in offspring:
        for tree, pset in zip(ind, psets):
            if random.random() < MUTPB:
                toolbox.mutate(individual=tree, pset=pset)
                del ind.fitness.values
                        
    # Evaluate the individuals with an invalid fitness
    invalids = [ind for ind in offspring if not ind.fitness.valid]

    #1. Compute the metric on the set of individuals
    ind_pop_invalid_fitness = evalSymbRegPop(invalids)

    #2. Then, determine the best individual using toolbox
    for ind, fitness in zip(invalids, ind_pop_invalid_fitness):
      ind.fitness.values = toolbox.evaluate(fitness)
            
    # Replacement of the population by the offspring
    pop = offspring
    hof.update(pop)
    record = stats.compile(pop)
    logbook.record(gen=g, evals=len(invalids), **record)
    print(logbook.stream)
  
  print('Best individual : ', hof[0][0], hof[0].fitness)
  return pop, stats, hof

if __name__ == "__main__":
    pop, stats, hof = main()

gen	evals	std	min	avg	max
0  	100  	0  	0  	0  	0  
1  	64   	0  	0  	0  	0  
2  	56   	0  	0  	0  	0  
3  	71   	0  	0  	0  	0  
4  	64   	0  	0  	0  	0  
5  	73   	0  	0  	0  	0  
6  	60   	0.00160019	0  	0.000672215	0.00448143
7  	59   	0.00258019	0  	0.00176876 	0.00641399
8  	68   	0.00253536	0  	0.00140408 	0.00641399
9  	66   	0.00236706	0  	0.00116685 	0.00641399
10 	65   	0.00227724	0  	0.000987588	0.00641399
11 	61   	0.0023147 	0  	0.00100691 	0.00641399
12 	51   	0.00233804	0  	0.00105173 	0.00641399
13 	58   	0.00253396	0  	0.00126347 	0.00641399
14 	73   	0.00229025	0  	0.000962099	0.00641399
15 	57   	0.0020843 	0  	0.000769679	0.00641399
16 	59   	0.00174007	0  	0.00051312 	0.00641399
17 	56   	0.00163651	0  	0.00044898 	0.00641399
18 	61   	0.00163651	0  	0.00044898 	0.00641399
19 	72   	0.0013979 	0  	0.0003207  	0.00641399
20 	59   	0.00183557	0  	0.000577259	0.00641399
21 	57   	0.0020843 	0  	0.000769679	0.00641399
22 	55   	0.00222557	0  	0.000897959	0.00641399
23

In [25]:
for individual in pop:
  for i in individual:
    print(i, individual.fitness )

sub(add(ARG5, ARG0), ARG1) (0.006413994169096209,)
sub(add(ARG5, ARG0), 0.1612495229) (0.0,)
sub(add(ARG1, ARG0), ARG1) (0.0,)
sub(ARG5, ARG1) (0.0,)
sub(sin(ARG3), sin(ARG1)) (0.0,)
sub(add(ARG5, ARG0), sub(sub(ARG0, ARG0), sin(ARG0))) (0.0,)
sub(add(ARG0, ARG1), ARG0) (0.0,)
cos(cos(sub(ARG6, ARG2))) (0.0,)
sub(add(ARG0, ARG0), ARG1) (0.0,)
sub(add(add(ARG5, ARG0), ARG5), sin(ARG1)) (0.0,)
sub(add(ARG4, ARG0), ARG0) (0.0,)
sub(ARG1, ARG1) (0.0,)
sub(add(cos(0.3131798497), ARG0), ARG0) (0.0,)
sub(add(ARG5, ARG1), ARG1) (0.0,)
sub(ARG1, ARG1) (0.0,)
sub(add(ARG5, ARG0), add(cos(mul(ARG2, ARG0)), add(ARG5, ARG0))) (0.0,)
mul(protectedDiv(ARG4, ARG3), ARG0) (0.0,)
sub(add(sin(ARG4), cos(ARG5)), ARG1) (0.0,)
sub(add(ARG5, ARG0), add(ARG5, ARG1)) (0.0,)
sub(ARG1, add(sin(ARG0), ARG1)) (0.0,)
sub(add(add(cos(0.8758890383), sub(0.5250938369, ARG4)), ARG1), ARG6) (0.0,)
sub(add(ARG5, ARG0), ARG1) (0.006413994169096209,)
sub(add(add(add(ARG5, ARG0), ARG0), ARG0), sin(protectedDiv(ARG4, neg(sub