In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import *

from sklearn.model_selection import cross_val_score

import pygad

from tqdm import tqdm

In [2]:
X_train = np.load('X_train.npy')
X_test  = np.load('X_test.npy')
y_train = np.load('y_train.npy')
y_test  = np.load('y_test.npy')

# Классический генетический алгоритм

Оптимизация гипер параметров с помощью PyGad

## KNN

In [4]:
from sklearn.neighbors import KNeighborsClassifier

In [40]:
num_generations = 20
sol_per_pop     = 10

In [41]:
total_evals = num_generations * sol_per_pop
pbar = tqdm(total=total_evals, desc="GA evaluations")

GA evaluations:   0%|          | 1/200 [00:11<36:29, 11.00s/it]


In [42]:
def fitness_func(ga_instance, solution, solution_idx):
    pbar.update(1)
    
    model = KNeighborsClassifier(
        n_neighbors=int(solution[0]),
        p=int(solution[1]),
        n_jobs=-1
    )

    score = cross_val_score(
        model,
        X_train, 
        y_train, 
        cv=3,
        scoring='f1_macro',
        n_jobs=-1
    ).mean()

    return score

In [43]:
gene_space = [
    {'low': 1, 'high': 100},  # n_neighbours
    {'low': 1, 'high': 10}     # p
]

In [44]:
ga_instance = pygad.GA(
    num_generations=num_generations,
    sol_per_pop=sol_per_pop,
    num_parents_mating=5,
    num_genes=2,
    fitness_func=fitness_func,
    gene_space=gene_space,
    parent_selection_type="rank",
    keep_parents=2,
    mutation_type="random",
    mutation_percent_genes=50,
    random_seed=42
)

In [45]:
ga_instance.run()
# pbar.close()
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print(f"Лучшие гиперпараметры: n_neighbors={int(solution[0])}, p={int(solution[1])}")
print(f"Лучшее значение F1-macro: {solution_fitness:.4f}")

GA evaluations:  98%|█████████▊| 195/200 [2:04:23<02:13, 26.61s/it]  

Лучшие гиперпараметры: n_neighbors=2, p=7
Лучшее значение F1-macro: 0.9943


### Лучшие значения

Лучшие гиперпараметры: n_neighbors=2, p=7

Лучшее значение F1-macro: 0.9943

## SVC (отброшен ввиду слишком долгих вычислений)

In [13]:
from sklearn.svm import SVC

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [15]:
populationa_size = 20
num_generations = 50

In [16]:
kernel_options = ['linear', 'poly', 'rbf', 'sigmoid']
gamma_type_options = ['scale', 'value']

In [17]:
pbar2 = tqdm(total=populationa_size * num_generations, desc="GA iterations")

GA iterations:   0%|          | 1/1000 [00:18<5:04:53, 18.31s/it]


In [18]:
def decode_solution(solution):
    """
    Преобразует вектор решения (фиксированной длины) в словарь гиперпараметров для SVC.
    Ожидается, что решение имеет длину 6:
      0: log10(C) -> C = 10^(solution[0])
      1: kernel_idx (0 до 3)
      2: gamma_type_idx (0 или 1): 0 -> 'scale', 1 -> 'value'
      3: log10(gamma) -> используется если kernel в ['poly', 'rbf', 'sigmoid'] и gamma_type=='value'
      4: degree -> используется только если kernel=='poly'
      5: coef0  -> используется если kernel in ['poly', 'sigmoid']
    """
    params = {
        'C': 10 ** solution[0],
        'kernel': kernel_options[int(solution[1])],
        'gamma': 'scale',  # по умолчанию
        'degree': 3,       # дефолтное для poly
        'coef0': 0.0       # дефолтное для poly и sigmoid
    }
    
    # Определение gamma_type
    gamma_type = gamma_type_options[int(solution[2])]
    
    # Для некоторых ядер может использоваться дополнительный параметр gamma
    if params['kernel'] in ['poly', 'rbf', 'sigmoid'] and gamma_type == 'value':
        params['gamma'] = 10 ** solution[3]
    # Иначе оставляем 'scale'
    
    # Если ядро poly, то берем значение для degree
    if params['kernel'] == 'poly':
        params['degree'] = int(solution[4])
    
    # Если ядро poly или sigmoid, то берем значение для coef0
    if params['kernel'] in ['poly', 'sigmoid']:
        params['coef0'] = solution[5]
    
    return params


In [19]:
def fitness_func(ga_instance, solution, solution_idx):
    pbar2.update(1)

    params = decode_solution(solution)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svc',    SVC(**params, random_state=42))
    ])

    
    score = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=3,
        scoring='f1_macro',
        n_jobs=-1
    ).mean()

    return score


In [20]:
# Явное перечисление вариантов для целочисленных генов:
gene_space = [
    {'low': -3, 'high': 3},                    # log10(C)
    [0, 1, 2, 3],                             # kernel_idx
    [0, 1],                                   # gamma_type_idx
    {'low': -5, 'high': 2},                    # log10(gamma)
    [2, 3, 4, 5],                             # degree
    {'low': -1.0, 'high': 1.0}                 # coef0
]


In [21]:
ga_instance = pygad.GA(
    num_generations=num_generations,
    sol_per_pop=populationa_size,
    num_parents_mating=5,
    fitness_func=fitness_func,
    num_genes=len(gene_space),
    gene_space=gene_space,
    parent_selection_type="sss",
    keep_parents=2,
    crossover_type="single_point",
    mutation_type="random",
    mutation_percent_genes=20
)

In [None]:
ga_instance.run()

In [None]:
best_solution, best_solution_fitness, best_idx = ga_instance.best_solution()
best_params = decode_solution(best_solution)

In [None]:
print("Лучшая приспособленность (f1_macro):", best_solution_fitness)
print("Лучшие гиперпараметры:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

## DCT

In [3]:
from sklearn.tree import DecisionTreeClassifier as DTC

In [4]:
total_evals = 30*20
pbar3 = tqdm(total=total_evals, desc="GA evaluations")

GA evaluations:   0%|          | 0/600 [00:00<?, ?it/s]

In [5]:
gene_space = [
    [0, 1, 2],       # criterion
    [0, 1],             # splitter
    list(range(1, 16)),             # max_depth от 1 до 15
    list(range(2, 21)),             # min_samples_split от 2 до 20
    list(range(1, 21)),             # min_samples_leaf от 1 до 20
    [0, 1, 2],         # max_features
]

In [6]:
def decode_solution(solution):

    params = {
        'criterion': ["gini", "entropy", "log_loss"][int(solution[0])],
        'splitter': ["best", "random"][int(solution[1])],
        'max_depth': int(solution[2]),
        'min_samples_split': int(solution[3]),  
        'min_samples_leaf':  int(solution[4]),
        'max_features': ["sqrt", "log2", None][int(solution[5])]
    }
    
    return params

In [None]:
def fitness_func(s,solution, solution_idx):
    pbar3.update(1)

    params = decode_solution(solution=solution)

    model = DTC(**params, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="f1_macro").mean()
    return score

In [12]:
ga = pygad.GA(
    num_generations=30,            # число поколений
    num_parents_mating=10,         # число родителей для скрещивания
    fitness_func=fitness_func,
    sol_per_pop=20,                # размер популяции
    num_genes=len(gene_space),
    gene_space=gene_space,
    parent_selection_type="rank",
    crossover_type="single_point",
    mutation_type="random",
    mutation_percent_genes=20,
    keep_parents=5,
    on_generation=lambda ga: print(f"Поколение {ga.generations_completed}: "
                                  f"лучш. fitness = {ga.best_solution()[1]:.4f}")
)

In [13]:
ga.run()

GA evaluations:  10%|▉         | 59/600 [02:48<19:35,  2.17s/it] 

Поколение 1: лучш. fitness = 0.9714


GA evaluations:  16%|█▌        | 95/600 [03:24<06:43,  1.25it/s]

Поколение 2: лучш. fitness = 0.9714


GA evaluations:  22%|██▏       | 129/600 [04:08<08:11,  1.04s/it]

Поколение 3: лучш. fitness = 0.9714


GA evaluations:  27%|██▋       | 163/600 [04:37<06:07,  1.19it/s]

Поколение 4: лучш. fitness = 0.9714


GA evaluations:  34%|███▎      | 201/600 [05:18<07:14,  1.09s/it]

Поколение 5: лучш. fitness = 0.9714


GA evaluations:  40%|███▉      | 239/600 [05:54<05:10,  1.16it/s]

Поколение 6: лучш. fitness = 0.9714


GA evaluations:  46%|████▌     | 275/600 [06:37<04:41,  1.15it/s]

Поколение 7: лучш. fitness = 0.9726


GA evaluations:  52%|█████▏    | 313/600 [07:11<04:00,  1.19it/s]

Поколение 8: лучш. fitness = 0.9726


GA evaluations:  58%|█████▊    | 349/600 [07:45<05:03,  1.21s/it]

Поколение 9: лучш. fitness = 0.9726


GA evaluations:  64%|██████▍   | 385/600 [08:21<02:41,  1.33it/s]

Поколение 10: лучш. fitness = 0.9726


GA evaluations:  70%|██████▉   | 419/600 [08:55<02:21,  1.28it/s]

Поколение 11: лучш. fitness = 0.9726


GA evaluations:  76%|███████▌  | 455/600 [09:24<01:50,  1.32it/s]

Поколение 12: лучш. fitness = 0.9726


GA evaluations:  82%|████████▏ | 491/600 [10:05<01:32,  1.17it/s]

Поколение 13: лучш. fitness = 0.9726


GA evaluations:  87%|████████▋ | 523/600 [10:42<01:25,  1.12s/it]

Поколение 14: лучш. fitness = 0.9726


GA evaluations:  93%|█████████▎| 557/600 [11:11<00:42,  1.00it/s]

Поколение 15: лучш. fitness = 0.9726


GA evaluations:  99%|█████████▉| 595/600 [11:43<00:04,  1.10it/s]

Поколение 16: лучш. fitness = 0.9726


GA evaluations: 629it [12:25,  1.05it/s]                         

Поколение 17: лучш. fitness = 0.9756


GA evaluations: 667it [12:55,  1.25it/s]

Поколение 18: лучш. fitness = 0.9756


GA evaluations: 705it [13:29,  1.06s/it]

Поколение 19: лучш. fitness = 0.9756


GA evaluations: 743it [14:09,  1.10it/s]

Поколение 20: лучш. fitness = 0.9756


GA evaluations: 779it [14:57,  1.26s/it]

Поколение 21: лучш. fitness = 0.9817


GA evaluations: 817it [15:42,  1.05s/it]

Поколение 22: лучш. fitness = 0.9817


GA evaluations: 853it [16:23,  1.23s/it]

Поколение 23: лучш. fitness = 0.9817


GA evaluations: 891it [17:18,  1.75s/it]

Поколение 24: лучш. fitness = 0.9817


GA evaluations: 923it [18:07,  1.15s/it]

Поколение 25: лучш. fitness = 0.9817


GA evaluations: 959it [18:52,  1.41s/it]

Поколение 26: лучш. fitness = 0.9817


GA evaluations: 993it [19:40,  1.03it/s]

Поколение 27: лучш. fitness = 0.9817


GA evaluations: 1027it [20:15,  1.79s/it]

Поколение 28: лучш. fitness = 0.9817


GA evaluations: 1063it [20:55,  1.04s/it]

Поколение 29: лучш. fitness = 0.9817


GA evaluations: 1100it [21:51,  1.54s/it]

Поколение 30: лучш. fitness = 0.9817


In [15]:
best_solution, best_fitness, _ = ga.best_solution()
best_params = decode_solution(best_solution)

print("Лучшая точность:", best_fitness)
print("Лучшие гиперпараметры:", best_params)

GA evaluations: 1120it [46:11,  1.49s/it]

Лучшая точность: 0.9816791615907189
Лучшие гиперпараметры: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 15, 'min_samples_split': 11, 'min_samples_leaf': 2, 'max_features': None}


### Лучшие значения

Лучшая точность: 0.9816791615907189

Лучшие гиперпараметры: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 15, 'min_samples_split': 11, 'min_samples_leaf': 2, 'max_features': None}