In [91]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np

df = pd.read_csv('bruh.csv')


X = df.drop(['ra','dec','redshift','class','hlo'], axis=1) 
y = df['class']

In [92]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"Original unique labels: {le.classes_}")
print(f"Encoded numerical labels: {np.unique(y_encoded)}")

Original unique labels: ['GALAXY' 'STAR']
Encoded numerical labels: [0 1]


In [93]:
X.shape

(8304, 5)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,  # X are features, y_encoded are the 0/1 labels
    test_size=0.2,
    stratify=y_encoded,  # Use the encoded labels for stratification
    random_state=42
)

In [143]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6643, 5)
(1661, 5)
(6643,)
(1661,)


## Defining Hyperparameter space

In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

hyperparameter_space = {
    'criterion': ['gini', 'entropy'],  # Splitting criterion
    'splitter': ['best', 'random'],  # Strategy to choose the split at each node
    'max_depth': [None, 5, 10, 15, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2', None],  # Number of features to consider for the best split
    'max_leaf_nodes': [None, 10, 20, 30],  # Grow a tree with max_leaf_nodes in best-first fashion
    'min_impurity_decrease': [0.0, 0.1, 0.2],  # A node will be split if this split induces a decrease of the impurity greater than or equal to this value
    'ccp_alpha': [0.0, 0.1, 0.2]  # Complexity parameter used for Minimal Cost-Complexity Pruning
}


## Defining sub functions

In [97]:
import random

def initialize_population(population_size):
    return [{key: random.choice(values) for key, values in hyperparameter_space.items()} for _ in range(population_size)]

In [98]:
def crossover(parent1, parent2):
    # Generating a random crossover point
    crossover_point = np.random.randint(1, len(parent1))
    # Altering the parent choromosomes
    child1 = dict(list(parent1.items())[:crossover_point] + list(parent2.items())[crossover_point:])
    child2 = dict(list(parent2.items())[:crossover_point] + list(parent1.items())[crossover_point:])
    return child1, child2

In [99]:
def mutate(individual, mutation_rate):
    # Randomly selecting mutating genes
    mask = np.random.rand(len(individual)) < mutation_rate
    # Labeling the genes
    genes = [item for i, item in enumerate(individual.items()) if mask[i]]
    # Creating new variations for labeled genes
    new_genes = [(param, random.choice(hyperparameter_space[param])) for param, value in genes]
    for param, value in new_genes:
        individual[param] = value
    return individual

In [100]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def calculate_fitness(y_test, parameters):
    # Create a Decision Tree model
    dt_model = DecisionTreeClassifier(random_state=42, **parameters)
    # Fit the model on the training data
    dt_model.fit(X_train, y_train)
    # Make predictions on the testing data
    y_pred = dt_model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

## Defining main Genetic Algorithm function

In [136]:
def genetic_algorithm(y , population_size=20, generations=10, mutation_rate=0.4):
    # Initializing the first population
    population = initialize_population(population_size)
    # Looping through each generation
    for generation in range(generations):
        # Calculating the fitness scores of first population
        fitness_scores = [calculate_fitness(y, parameters) for parameters in population]
        # Selecting the top 2 performing parents
        idx_best_2 = np.argsort(fitness_scores)[::-1][:2]
        # Adding parents to the new population
        new_population = [population[i] for i in idx_best_2]    
        # Creating the offsprings
        for _ in range(int((len(population) / 2) - 1)):
            parent1 = new_population[0]
            parent2 = new_population[1]
            # Crossing over the parent choromosomes
            child1, child2 = crossover(parent1, parent2)
            # Mutating the genes
            child1 = mutate(child1, mutation_rate)
            child2 = mutate(child2, mutation_rate)
            # Adding new offsprings to population
            new_population.extend([child1, child2])
    
        
        population = np.array(new_population)
        new_scores = [calculate_fitness(y, parameters) for parameters in population]    
        print(generation, new_scores[0])   
    # Selecting the best performing choromosome
    best_parameters = population[np.argmax(new_scores)]
    best_score = max(new_scores)
    return best_parameters, best_score

## Genetic search with 10 iterations
#### Fitness = Train test split Accuracy

In [139]:
%%time
best_parameters, best_score = genetic_algorithm(y_test)

print("Best Parameters:", best_parameters)
print("Best Score:", best_score)

0 0.7441300421432872
1 0.8675496688741722
2 0.8940397350993378
3 0.9042745334136063
4 0.9042745334136063
5 0.9042745334136063
6 0.9084888621312462
7 0.9084888621312462
8 0.9084888621312462
9 0.9139072847682119
Best Parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}
Best Score: 0.9139072847682119
CPU times: user 8.07 s, sys: 3.98 ms, total: 8.08 s
Wall time: 8.08 s


## Genetic search with 15 iterations
#### Fitness = Train test split Accuracy

In [140]:
%%time
best_parameters, best_score = genetic_algorithm(y_test,generations=15)

print("Best Parameters:", best_parameters)
print("Best Score:", best_score)

0 0.8290186634557496
1 0.8579169175195666
2 0.8579169175195666
3 0.8964479229379891
4 0.9024683925346177
5 0.9024683925346177
6 0.9139072847682119
7 0.9139072847682119
8 0.9139072847682119
9 0.9163154726068633
10 0.9193257074051776
11 0.9193257074051776
12 0.9193257074051776
13 0.9193257074051776
14 0.9193257074051776
Best Parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}
Best Score: 0.9193257074051776
CPU times: user 9.26 s, sys: 4.98 ms, total: 9.27 s
Wall time: 9.27 s


In [126]:
from itertools import product

# Generate all possible hyperparameter combinations
all_combinations = list(product(*hyperparameter_space.values()))

# Convert the combinations into dictionaries
hyperparameter_combinations = [dict(zip(hyperparameter_space.keys(), values)) for values in all_combinations]

# Print the number of combinations and a sample combination
print("Number of combinations:", len(hyperparameter_combinations))
print("Sample combination:", hyperparameter_combinations[0])

Number of combinations: 19440
Sample combination: {'criterion': 'gini', 'splitter': 'best', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}


## Grid search of all combinations
#### Fitness = Train test split Accuracy 

In [129]:
%%time
accuracies = []
for i,parameters in enumerate(hyperparameter_combinations):
    if i%500 == 0: print(i)
    accuracies.append(calculate_fitness(y_test, parameters))

best_score = np.argmax(accuracies)
print("Best Parameters:", hyperparameter_combinations[best_score])
print("Best Score:", accuracies[best_score])

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
Best Parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}
Best Score: 0.9193257074051776
CPU times: user 2min 26s, sys: 247 ms, total: 2min 26s
Wall time: 2min 26s


## Random search of 2000 combinations
#### Fitness = Train test split Accuracy

In [138]:
%%time
random.shuffle(hyperparameter_combinations)
accuracies = []
for parameters in hyperparameter_combinations[:2000]:
    accuracies.append(calculate_fitness(y_test, parameters))

best_score = np.argmax(accuracies)
print("Best Parameters:", hyperparameter_combinations[best_score])
print("Best Score:",accuracies[best_score])

Best Parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}
Best Score: 0.9114990969295605
CPU times: user 15.2 s, sys: 36.9 ms, total: 15.2 s
Wall time: 15.2 s


In [144]:
def calculate_fitness_cv(X, y, parameters, cv_splits=5):
    
    # 1. Create the Decision Tree model
    dt_model = DecisionTreeClassifier(random_state=42, **parameters)
    
    # 2. Define the Cross-Validation strategy
    # StratifiedKFold is essential to maintain label proportions
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    
    # 3. Calculate the cross-validation scores
    scores = cross_val_score(dt_model, X, y, cv=cv, scoring='accuracy')
    
    # 4. Fitness is the mean accuracy across all folds
    mean_accuracy = scores.mean()
    
    # For a GA, fitness is usually returned as a single float (or a tuple)
    return mean_accuracy

In [151]:
def genetic_algorithm_cv(X,y , population_size=20, generations=10, mutation_rate=0.4):
    # Initializing the first population
    population = initialize_population(population_size)
    # Looping through each generation
    for generation in range(generations):
        # Calculating the fitness scores of first population
        fitness_scores = [calculate_fitness_cv(X,y, parameters) for parameters in population]
        # Selecting the top 2 performing parents
        idx_best_2 = np.argsort(fitness_scores)[::-1][:2]
        # Adding parents to the new population
        new_population = [population[i] for i in idx_best_2]    
        # Creating the offsprings
        for _ in range(int((len(population) / 2) - 1)):
            parent1 = new_population[0]
            parent2 = new_population[1]
            # Crossing over the parent choromosomes
            child1, child2 = crossover(parent1, parent2)
            # Mutating the genes
            child1 = mutate(child1, mutation_rate)
            child2 = mutate(child2, mutation_rate)
            # Adding new offsprings to population
            new_population.extend([child1, child2])
    
        
        population = np.array(new_population)
        new_scores = [calculate_fitness_cv(X, y, parameters) for parameters in population]    
        print(generation, new_scores[0])   
    # Selecting the best performing choromosome
    best_parameters = population[np.argmax(new_scores)]
    best_score = max(new_scores)
    return best_parameters, best_score

## Genetic search with 10 iterations
## Fitness = Cross Validation Accuracy 

In [148]:
%%time
best_parameters, best_score = genetic_algorithm_cv(X,y)

print("Best Parameters:", best_parameters)
print("Best Score:", best_score)

0 0.8074463779259119
1 0.8108161000413453
2 0.8943879068350465
3 0.8973995923489261
4 0.8973995923489261
5 0.9167869551656354
6 0.9167869551656354
7 0.9167869551656354
8 0.9167869551656354
9 0.9167869551656354
Best Parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}
Best Score: 0.9167869551656354
CPU times: user 54.7 s, sys: 1e+03 ns, total: 54.7 s
Wall time: 54.7 s


## Genetic search with 15 iterations
#### Fitness = Cross Validation Accuracy 

In [153]:
%%time
best_parameters, best_score = genetic_algorithm_cv(X,y,generations=15)

print("Best Parameters:", best_parameters)
print("Best Score:", best_score)

0 0.7415686587409238
1 0.8517588475515548
2 0.9081161733024814
3 0.9081161733024814
4 0.9167869551656354
5 0.9167869551656354
6 0.9167869551656354
7 0.9167869551656354
8 0.9167869551656354
9 0.9167869551656354
10 0.9167869551656354
11 0.9205205167448843
12 0.9216041287364993
13 0.9216041287364993
14 0.9216041287364993
Best Parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}
Best Score: 0.9216041287364993
CPU times: user 1min 35s, sys: 69.6 ms, total: 1min 35s
Wall time: 1min 36s


## Grid search of all combinations
#### Fitness = Cross Validation Accuracy 

In [154]:
%%time
accuracies = []
for i,parameters in enumerate(hyperparameter_combinations):
    if i%500 == 0: print(i)
    accuracies.append(calculate_fitness_cv(X, y, parameters))

best_score = np.argmax(accuracies)
print("Best Parameters:", hyperparameter_combinations[best_score])
print("Best Score:", accuracies[best_score])

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
Best Parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}
Best Score: 0.9216041287364993
CPU times: user 24min 55s, sys: 1.12 s, total: 24min 56s
Wall time: 24min 57s


## Random search of 2000 combinations
#### Fitness = Cross Validation Accuracy 

In [157]:
%%time
random.shuffle(hyperparameter_combinations)
accuracies = []
for i,parameters in enumerate(hyperparameter_combinations[:2000]):
    if i%100 == 0: print(i)
    accuracies.append(calculate_fitness_cv(X, y, parameters))

best_score = np.argmax(accuracies)
print("Best Parameters:", hyperparameter_combinations[best_score])
print("Best Score:",accuracies[best_score])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
Best Parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0}
Best Score: 0.9163053175979051
CPU times: user 2min 51s, sys: 84.9 ms, total: 2min 51s
Wall time: 2min 52s


## Bayesian search
#### Fitness = Cross Validation Accuracy 

In [160]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
Downloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.7.0 scikit-optimize-0.10.2


In [163]:
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real
from sklearn.model_selection import StratifiedKFold

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)

# 2. Initialize the Bayesian Search object
bayes_search = BayesSearchCV(
    estimator=dt_model,
    search_spaces=hyperparameter_space,
    n_iter=50,             # Number of trials (iterations) the search will run
    cv=cv_strategy,        # Uses 5-fold CV to calculate fitness
    scoring='accuracy',    # The fitness metric to maximize
    n_jobs=-1,             # Use all CPU cores
    verbose=1,             # Print status updates
    random_state=42
)

In [166]:
%%time
bayes_search.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,search_spaces,"{'ccp_alpha': [0.0, 0.1, ...], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 5, ...], 'max_features': ['sqrt', 'log2', ...], ...}"
,optimizer_kwargs,
,n_iter,50
,scoring,'accuracy'
,fit_params,
,n_jobs,-1
,n_points,1
,iid,'deprecated'
,refit,True

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [167]:
max_accuracy = bayes_search.best_score_ 
print(f"ðŸ¥‡ Maximum CV Accuracy Found: {max_accuracy:.4f}")

ðŸ¥‡ Maximum CV Accuracy Found: 0.9216
