<a href="https://colab.research.google.com/github/spozi/gpu-svgpm/blob/main/GPFeatureGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U deap imbalanced-learn

Collecting deap
  Downloading deap-1.3.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[?25l[K     |██                              | 10 kB 10.6 MB/s eta 0:00:01[K     |████                            | 20 kB 15.1 MB/s eta 0:00:01[K     |██████                          | 30 kB 15.5 MB/s eta 0:00:01[K     |████████▏                       | 40 kB 10.7 MB/s eta 0:00:01[K     |██████████▏                     | 51 kB 5.5 MB/s eta 0:00:01[K     |████████████▏                   | 61 kB 5.5 MB/s eta 0:00:01[K     |██████████████▎                 | 71 kB 5.3 MB/s eta 0:00:01[K     |████████████████▎               | 81 kB 6.0 MB/s eta 0:00:01[K     |██████████████████▎             | 92 kB 5.8 MB/s eta 0:00:01[K     |████████████████████▍           | 102 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████▍         | 112 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████▍       | 122 kB 5.2 MB/s eta 0:00:01

In [28]:
from collections import Counter
from imblearn.datasets import fetch_datasets
ecoli = fetch_datasets()['ecoli']
ecoli.data.shape

X = ecoli.data
y = ecoli.target

In [29]:
#Specify X_train, X_test, y_train, y_test here
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.shape

(225, 7)

Version 2 GP Feature Generator

In [55]:
import random
import operator
import math
import statistics

import numpy as np

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

# Define new functions
def protectedDiv(left, right):
    try:
        return left / right
    except ZeroDivisionError:
        return 1

nFeatures = ecoli.data.shape[1]
pset = gp.PrimitiveSet("MAIN", nFeatures) 
pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)
pset.addPrimitive(protectedDiv, 2)
pset.addPrimitive(operator.neg, 1)
pset.addPrimitive(math.erfc, 1)
pset.addPrimitive(math.erf, 1)
pset.addPrimitive(math.exp, 1)
pset.addPrimitive(math.gamma, 1)
pset.addPrimitive(math.sqrt, 1)
pset.addPrimitive(math.cos, 1)
pset.addPrimitive(math.sin, 1)
pset.addEphemeralConstant("rand5", lambda: round(random.uniform(0.1, 1.0), 10))

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Tree", gp.PrimitiveTree)
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("main_expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=5)
toolbox.register('MAIN', tools.initIterate, creator.Tree, toolbox.main_expr)

func_cycle = [toolbox.MAIN]

toolbox.register("individual", tools.initCycle, creator.Individual, func_cycle)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [66]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from sklearn.svm import SVC
from sklearn.metrics import f1_score


import warnings
warnings.filterwarnings("ignore")

def evalSymbReg(score):
  return score,

def evalSymbRegPop(population):
  # Evaluate each individual in population
  #1. Compute the expression of every individual
  list_vecs = []
  for individual in population:
    #The following code should be optimized/vectorized
    #Evaluating expression on each vector
    func = toolbox.compile(expr=individual)
    vec = []
    for x in X_train: #Iterate every vector x (row) in data (matrix) X
      try:
        val = func(*x)
        vec.append(val)
      except:
        vec.append(0)
    list_vecs.append(vec)

  #2. Convert list_vecs to numpy array
  evaluated_X = np.array(list_vecs).T
  evaluated_X = np.float32(evaluated_X)
  evaluated_X = np.nan_to_num(evaluated_X, copy=True, nan=0.0, posinf=0.0, neginf=0.0)

  #3. Individual (feature) selection
  # https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection-using-selectfrommodel
  # print("Feature selection")
  # print(evaluated_X.dtype)
  clf = ExtraTreesClassifier(n_estimators=50)
  clf = clf.fit(evaluated_X, y_train)

  #4. Extract features that at top threshold (get the 75 percentile)
  # print("Feature extraction")
  q1 = np.percentile(clf.feature_importances_, 50)  #Get the top 75 percentile features
  features = [True if val >= q1 else False for val in clf.feature_importances_.tolist()] #Get the features indices
  X_train_new = evaluated_X[:, features]  #Extract the required data


  #4. Use svc to get total nSV
  # print("SVC")
  clf_svc = SVC(C=1, gamma=1) #Smaller C value will increase the margin size of the hyperplane. The new features should be generate such that it has very high linear seperability
  clf_svc.fit(X_train_new, y_train)
  y_pred = clf_svc.predict(X_train_new)

  #5. Compute the score
  nSV = clf_svc.support_vectors_.shape[0]
  f1 = f1_score(y_train, y_pred)
  fitness = f1/nSV

  #6. Output the fitness value
  ind_pop_fitness = []
  for f in features:
    if f is True:
      ind_pop_fitness.append(fitness)
    else:
      ind_pop_fitness.append(0)
  
  return ind_pop_fitness

psets = [pset]
toolbox.register("compile", gp.compileADF, psets=psets)
toolbox.register('evaluate', evalSymbReg)
toolbox.register('select', tools.selTournament, tournsize=3)
toolbox.register('mate', gp.cxOnePoint)
toolbox.register("expr", gp.genFull, min_=1, max_=2)
toolbox.register('mutate', gp.mutUniform, expr=toolbox.expr)

In [67]:
def main():
  random.seed(1024)
  ind = toolbox.individual()
  
  pop = toolbox.population(n=1000)
  hof = tools.HallOfFame(500)
  stats = tools.Statistics(lambda ind: ind.fitness.values)
  stats.register("avg", np.mean)
  stats.register("std", np.std)
  stats.register("min", np.min)
  stats.register("max", np.max)

  logbook = tools.Logbook()
  logbook.header = "gen", "evals", "std", "min", "avg", "max"

  CXPB, MUTPB, NGEN = 0.5, 0.2, 1000

  # # Evaluate the entire population
#################################This is for adhoc code draft##########################################################################
  #0. Test each indvidiual in population

  # list_vecs = []
  # for ind in pop:
  #   func = toolbox.compile(expr=ind)

  #   #The following code should be optimized
  #   #Evaluating expression on each vector
  #   func = toolbox.compile(expr=ind)
  #   vec = []
  #   for x in X: #Iterate every vector x (row) in data (matrix) X
  #     val = func(*x)
  #     # try:
  #     #   val = func(*x)
  #     # except:
  #     #   print(x, [str(eq) for eq in ind])
  #       # print(ind[0])
  #     # print(val)
  #     vec.append(val)
  #   list_vecs.append(vec)

  # #2. Convert list_vecs to numpy array
  # evaluated_X = np.array(list_vecs).T
  # evaluated_X = np.nan_to_num(evaluated_X, copy=True, nan=0.0, posinf=0.0, neginf=0.0)

  # clf = ExtraTreesClassifier(n_estimators=50)
  # clf = clf.fit(evaluated_X, y)

  # #4. Extract features that at top threshold (get the 75 percentile)
  # q1 = np.percentile(clf.feature_importances_, 75)
  # print( clf.feature_importances_.tolist())
  # features = [True if val >= q1 else False for val in clf.feature_importances_.tolist()]
  # print(len(features), evaluated_X.shape)
  # new_X = evaluated_X[:, features]
  # print(new_X.shape)
  # 2/0

  # #4. 



###########################################################################################################
  #1. Compute the metric on the set of individuals
  ind_pop_fitness = evalSymbRegPop(pop)

  #2. Then, determine the best individual using toolbox
  for ind, fitness in zip(pop, ind_pop_fitness):
    ind.fitness.values = toolbox.evaluate(fitness)

  hof.update(pop)
  record = stats.compile(pop)
  logbook.record(gen=0, evals=len(pop), **record)
  print(logbook.stream)

  for g in range(1, NGEN):
    # Select the offspring
    offspring = toolbox.select(pop, len(pop))
    # Clone the offspring
    offspring = [toolbox.clone(ind) for ind in offspring]

    # Apply crossover and mutation
    for ind1, ind2 in zip(offspring[::2], offspring[1::2]):
        for tree1, tree2 in zip(ind1, ind2):
            if random.random() < CXPB:
                toolbox.mate(tree1, tree2)
                del ind1.fitness.values
                del ind2.fitness.values

    for ind in offspring:
        for tree, pset in zip(ind, psets):
            if random.random() < MUTPB:
                toolbox.mutate(individual=tree, pset=pset)
                del ind.fitness.values
                        
    # Evaluate the individuals with an invalid fitness
    invalids = [ind for ind in offspring if not ind.fitness.valid]

    #1. Compute the metric on the set of individuals
    ind_pop_invalid_fitness = evalSymbRegPop(invalids)

    #2. Then, determine the best individual using toolbox
    for ind, fitness in zip(invalids, ind_pop_invalid_fitness):
      ind.fitness.values = toolbox.evaluate(fitness)
            
    # Replacement of the population by the offspring
    pop = offspring
    hof.update(pop)
    record = stats.compile(pop)
    logbook.record(gen=g, evals=len(invalids), **record)
    print(logbook.stream)
  
  print('Best individual : ', hof[0][0], hof[0].fitness)
  return pop, stats, hof

if __name__ == "__main__":
    pop, stats, hof = main()

gen	evals	std       	min	avg       	max       
0  	1000 	0.00234742	0  	0.00234742	0.00469484
1  	598  	0.00234583	0  	0.00316714	0.00512821
2  	590  	0.00228671	0  	0.00342162	0.00512821
3  	596  	0.00223101	0  	0.0033033 	0.00512821
4  	614  	0.00220177	0  	0.00324761	0.00512821
5  	598  	0.00221667	0  	0.00329945	0.00512821
6  	597  	0.00218548	0  	0.00326985	0.00512821
7  	585  	0.00216756	0  	0.00328063	0.00512821
8  	620  	0.00249167	0  	0.00357462	0.00574713
9  	608  	0.00240652	0  	0.00342801	0.00574713
10 	598  	0.00233074	0  	0.0034429 	0.00574713
11 	597  	0.00233357	0  	0.00341289	0.00574713
12 	592  	0.00230939	0  	0.00340112	0.00574713
13 	573  	0.00230712	0  	0.00346542	0.00574713
14 	593  	0.00231534	0  	0.00340218	0.00574713
15 	607  	0.00230114	0  	0.0033595 	0.00574713
16 	596  	0.00230629	0  	0.00336576	0.00574713
17 	587  	0.00228991	0  	0.0033655 	0.00574713
18 	608  	0.00229668	0  	0.0033159 	0.00574713
19 	604  	0.00228574	0  	0.00330216	0.00574713
20 	605  	0.0

In [23]:
for individual in hof:
  if individual.fitness.values[0] > 0:
    for i in individual:
      print(i, individual.fitness.values)

add(add(sin(sub(add(protectedDiv(mul(ARG5, ARG1), sin(ARG6)), add(mul(ARG3, cos(cos(0.9464188944))), mul(cos(0.779807991), cos(sin(0.5074070346))))), sin(sin(cos(ARG5))))), ARG0), ARG6) (0.004265908282971916,)
protectedDiv(cos(ARG0), cos(neg(protectedDiv(ARG0, ARG2)))) (0.004265908282971916,)
add(add(ARG5, ARG0), ARG6) (0.004265908282971916,)
add(add(neg(ARG0), cos(sub(ARG1, ARG4))), ARG6) (0.004265908282971916,)
protectedDiv(neg(ARG1), sin(ARG3)) (0.004265908282971916,)
add(ARG6, protectedDiv(ARG0, 0.718086486)) (0.004265908282971916,)
add(ARG0, sin(neg(ARG5))) (0.004265908282971916,)
mul(add(add(ARG5, cos(ARG6)), ARG0), ARG0) (0.004265908282971916,)
sin(add(add(ARG6, 0.283493343), mul(sin(neg(protectedDiv(ARG0, ARG2))), add(ARG5, ARG5)))) (0.004265908282971916,)
sub(sub(ARG2, ARG0), mul(sin(sub(add(protectedDiv(mul(sub(ARG2, 0.145793632), ARG1), sin(ARG3)), add(mul(ARG3, cos(cos(ARG6))), mul(cos(ARG6), cos(sub(ARG2, ARG6))))), neg(protectedDiv(ARG1, ARG2)))), ARG6)) (0.00426590828297

**Prediction task**

1.   Create new data from the list of fittest individual (fittest features) for both training and testing data.
2.   Fit the svm with transformed training data
3.   Predict the transformed testing data



In [92]:
from sklearn.model_selection import GridSearchCV

#1. Evaluate the expression
list_vecs = []
for individual in hof:
  if individual.fitness.values[0] > 0:
    for i in individual:
      func = toolbox.compile(expr=individual)
      vec = []
      for x in X_train: #Iterate every vector x (row) in data (matrix) X
        try:
          val = func(*x)
          vec.append(val)
        except:
          vec.append(0)
        list_vecs.append(vec)

#2. Convert list_vecs to numpy array
X_train_new = np.array(list_vecs).T   #Need to refactor X_train g
X_train_new = np.nan_to_num(X_train, copy=True, nan=0.0, posinf=0.0, neginf=0.0)

#3. Fit the SVM
#Grid search
parameters = {'gamma':[2**g for g in range(-4,6)], 'C':[2**c for c in range(-5,6)]}

list_f1_result = []
for c in range(-5,6):
  for g in range(-4,6):
    clf_svc = SVC(C=2**c, gamma=2**g)
    clf_svc.fit(X_train_new, y_train)
    y_pred = clf_svc.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    list_f1_result.append(f1)

print("Highest F1-Score: ", max(list_f1_result))

0.8799999999999999
