# GA for tuning Hyperparameters

In [1]:
import ROOT 
from ROOT import TMVA
from genetic_algo_helpers import *
import random
import copy

In [2]:
# Fetch the TMVA Tools singleton before any Factory/DataLoader is created.
# NOTE(review): presumably the customary TMVA initialisation call; confirm against the TMVA docs.
TMVA.Tools.Instance()

<cppyy.gbl.TMVA.Tools object at 0x6038f2926d20>

In [3]:
# Load the input data.
# tuningData.root supplies the trees registered for training, and
# validData.root supplies the trees registered for testing, i.e. the sample
# every hyperparameter candidate is scored on.
trainingFile = ROOT.TFile("tuningData.root")
testFile = ROOT.TFile("../../modelInputData/validData.root")

# Fail early with a readable message; otherwise a missing or corrupt file
# only shows up later as a null-pointer error on the first tree access.
if trainingFile.IsZombie():
    raise OSError("Could not open ROOT file: tuningData.root")
if testFile.IsZombie():
    raise OSError("Could not open ROOT file: ../../modelInputData/validData.root")

# Get the TTree objects from the input files.
sigTrain = trainingFile.Get("sig")
bkgTrain = trainingFile.Get("bkg")
sigTest = testFile.Get("sig")
bkgTest = testFile.Get("bkg")

# TFile.Get returns a null object when the key does not exist.
for _label, _tree in (
    ("tuningData.root:sig", sigTrain),
    ("tuningData.root:bkg", bkgTrain),
    ("validData.root:sig", sigTest),
    ("validData.root:bkg", bkgTest),
):
    if not _tree:
        raise KeyError(f"Tree not found in input file: {_label}")

nSigTrain = sigTrain.GetEntries()
nBkgTrain = bkgTrain.GetEntries()
nSigTest = sigTest.GetEntries()
nBkgTest = bkgTest.GetEntries()

# Global event weights: background is scaled by nSig/nBkg of the training
# sample so that both classes carry the same total weight there.
if nBkgTrain == 0:
    raise ValueError("Background training tree is empty; cannot compute bkgWeight")
sigWeight = 1.0
bkgWeight = float(nSigTrain) / float(nBkgTrain)

In [4]:
# Objective function of the GA: trains one BDT and returns its ROC AUC.
def evaluate(individual, float_prec=2):
    """
    Train and test a TMVA BDT for one individual and return its ROC integral.

    Uses the module-level trees sigTrain/bkgTrain (training) and
    sigTest/bkgTest (testing) together with sigWeight/bkgWeight.

    Parameters:
    - individual : hyperparameter set, turned into a TMVA option string by
                   indTMVAOpt
    - float_prec : int, precision forwarded to indTMVAOpt (default 2, the
                   value that used to be hard-coded)

    Returns:
    - auc : value of factory.GetROCIntegral for the booked "BDT" method

    Raises:
    - OSError : if the throw-away output file cannot be opened
    """
    option_string = indTMVAOpt(individual, float_prec=float_prec)

    #Output discarded to avoid file creation
    outputFile = ROOT.TFile.Open("/dev/null", "RECREATE")
    if not outputFile or outputFile.IsZombie():
        raise OSError("Could not open /dev/null as TMVA output file")

    # try/finally so the output file is closed even when booking, training
    # or evaluation raises (e.g. on a rejected option string).
    try:
        factory = ROOT.TMVA.Factory("TMVAHypertuning", outputFile, "!V:!Silent")
        dataLoader = ROOT.TMVA.DataLoader("datasetTuning")

        #add trees and variables
        dataLoader.AddSignalTree(sigTrain, sigWeight, TMVA.Types.kTraining)
        dataLoader.AddBackgroundTree(bkgTrain, bkgWeight, TMVA.Types.kTraining)
        dataLoader.AddSignalTree(sigTest, sigWeight, TMVA.Types.kTesting)
        dataLoader.AddBackgroundTree(bkgTest, bkgWeight, TMVA.Types.kTesting)
        input_variables = (
            "ADC_mean",
            "nhits_min",
            "entry_dist",
            "exit_dist",
            "docasqrx_max",
            "docasqry_max",
        )
        for variable in input_variables:
            dataLoader.AddVariable(variable, 'F')

        #TrainBDT
        factory.BookMethod(dataLoader, TMVA.Types.kBDT, "BDT", option_string)
        factory.TrainAllMethods()
        factory.TestAllMethods()
        factory.EvaluateAllMethods()

        #Returning AUC output
        auc = factory.GetROCIntegral("datasetTuning", "BDT")
    finally:
        outputFile.Close()

    return auc

In [5]:
# Search space handed to the genetic algorithm: one two-element tuple per
# hyperparameter name.
# NOTE(review): tuples are presumably (lower, upper) bounds, with int bounds
# marking integer-valued parameters and float bounds continuous ones;
# confirm against createIndividual/mutate in genetic_algo_helpers.
parameter_ranges = {
    "NTrees": (100, 800),
    "MaxDepth": (2, 5),
    "MinNodeSize": (1.0, 5.0),
    "Shrinkage": (0.01, 0.1),
    "nCuts": (20, 100),
    "BaggedSampleFraction": (0.5, 1.0),
}

In [6]:
# Selection helper used by the genetic algorithm.
def tournament_selection(population, scores, k):
    """
    Pick one individual by tournament selection.

    Draws k distinct (individual, score) pairs at random and returns the
    individual with the highest score among them (higher AUC wins). On a
    tie the pair drawn first wins.

    Parameters:
    - population : sequence of individuals
    - scores : sequence of scores, scores[i] belonging to population[i]
    - k : int, tournament size with 1 <= k <= len(population)

    Returns:
    - the winning individual (the object stored in population, not a copy)

    Raises:
    - ValueError : if population and scores differ in length, or k is out
                   of range
    """
    population = list(population)
    scores = list(scores)

    # zip() would silently drop the unmatched tail and misalign the pairing.
    if len(population) != len(scores):
        raise ValueError(
            f"population ({len(population)}) and scores ({len(scores)}) differ in length"
        )
    if not 1 <= k <= len(population):
        raise ValueError(
            f"tournament size k={k} must be between 1 and {len(population)}"
        )

    contenders = random.sample(list(zip(population, scores)), k)
    # max() returns the first maximal element, matching a stable
    # descending sort followed by taking element 0.
    winner, _ = max(contenders, key=lambda pair: pair[1])
    return winner

In [7]:
#GA driver
def genetic_algorithm(parameter_ranges, pop_size, generations, crossover_rate, mutation_rate, float_prec, tournament_size):
    """
    Genetic algorithm that searches TMVA BDT hyperparameters for maximal AUC.

    Every generation each individual is scored with evaluate(); the next
    generation is then built from tournament-selected parents via
    crossOver() and mutate(). The best individual seen in any generation
    is tracked separately, so it is returned even when it does not survive
    into later populations.

    Parameters:
    - parameter_ranges : dict of parameter ranges
    - pop_size : int, number of individuals per generation (>= 1)
    - generations : int, number of generations
    - crossover_rate : float, forwarded to crossOver
    - mutation_rate : float, forwarded to mutate
    - float_prec : int, accepted for interface compatibility but not used
                   by this function
    - tournament_size : int, individuals drawn per tournament
                        (1 <= tournament_size <= pop_size)

    Returns:
    - best_individual : best hyperparameter set found (None if
                        generations is 0)
    - best_auc : float, corresponding AUC (-1.0 if nothing was evaluated)

    Raises:
    - ValueError : if pop_size or tournament_size is out of range
    """
    # Validate before any training: each evaluation is expensive, and a bad
    # tournament size would otherwise only fail after a whole generation.
    if pop_size < 1:
        raise ValueError(f"pop_size must be at least 1, got {pop_size}")
    if not 1 <= tournament_size <= pop_size:
        raise ValueError(
            f"tournament_size must be between 1 and pop_size ({pop_size}), got {tournament_size}"
        )

    #Initialize populations
    population = [createIndividual(parameter_ranges) for _ in range(pop_size)]

    best_individual = None
    best_auc = -1.0

    for gen in range(generations):
        print(f"\n Generation {gen + 1}/{generations}")

        #Evaluate individuals
        scores = []
        for i, ind in enumerate(population):
            # flush so the progress line is visible while training runs
            print(f" -> Evaluating individual {i+1}/{pop_size}...", end="", flush=True)
            auc = evaluate(ind)
            print(f" AUC = {auc:.5f}")
            scores.append(auc)

            if auc > best_auc:
                best_auc = auc
                # Snapshot, so later changes to the individual cannot
                # alter the recorded best.
                best_individual = copy.deepcopy(ind)

        #Logging best individual of current generation
        print(f"Best AUc this generation : {max(scores):.5f}")

        # Offspring of the final generation would never be evaluated.
        if gen == generations - 1:
            break

        #Create new generation
        new_population = []
        while len(new_population) < pop_size:
            parent1 = tournament_selection(population, scores, tournament_size)
            parent2 = tournament_selection(population, scores, tournament_size)
            # Parents are copied in case crossOver/mutate work in place;
            # a parent can be selected again and must keep matching its score.
            child = crossOver(copy.deepcopy(parent1), copy.deepcopy(parent2), crossover_rate)
            child = mutate(child, mutation_rate, parameter_ranges)
            new_population.append(child)

        population = new_population

    print("\n GENETIC ALGORITHM COMPLETED.")
    print(f"Best AUC : {best_auc:.5f}")
    print(f"Best Hyperparameters: {best_individual}")

    return best_individual, best_auc

In [8]:
# Run a small search: 10 individuals over 2 generations.
best_params, best_auc = genetic_algorithm(
    parameter_ranges,
    pop_size=10,
    generations=2,
    crossover_rate=0.7,
    mutation_rate=0.2,
    float_prec=2,
    tournament_size=3,
)


 Generation 1/2
 -> Evaluating individual 1/10... AUC = 0.99952
 -> Evaluating individual 2/10... AUC = 0.99951
 -> Evaluating individual 3/10... AUC = 0.99957
 -> Evaluating individual 4/10... AUC = 0.99951
 -> Evaluating individual 5/10... AUC = 0.99956
 -> Evaluating individual 6/10... AUC = 0.99955
 -> Evaluating individual 7/10... AUC = 0.99927
 -> Evaluating individual 8/10... AUC = 0.99954
 -> Evaluating individual 9/10... AUC = 0.99941
 -> Evaluating individual 10/10... AUC = 0.99952
Best AUc this generation : 0.99957

 Generation 2/2
 -> Evaluating individual 1/10...

: 