In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

In [2]:
mnist = fetch_openml('mnist_784', version=1, parser='auto')
X, y = mnist.data, mnist.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [3]:
def fitness_rf_classifier(particle, X_train, X_test, y_train, y_test):
    rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
    rf_classifier.fit(X_train.iloc[:, particle], y_train)

    y_pred = rf_classifier.predict(X_test.iloc[:, particle])

    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [4]:
def differential_evolution(X_train, X_test, y_train, y_test, pop_size, totalfeat, max_iterations):
    # Initialization
    num_particles, col_len = pop_size
    max_value = 783
    std_dev = 150
    mean = max_value / 2

    # Initialize population using normal distribution
    population = np.random.normal(mean, std_dev, (num_particles, col_len)).astype(int)
    population[population > max_value] = max_value
    population[population < 0] = 0

    for iteration in tqdm(range(max_iterations)):
        for i in range(num_particles):
            # Select three random individuals
            candidates = np.random.choice(num_particles, 3, replace=False)
            a, b, c = population[candidates]

            # Mutation
            mutant = np.clip(a + 0.5 * (b - c), 0, max_value).astype(int)

            # Crossover
            crossover_mask = np.random.rand(col_len) < 0.7
            trial = np.where(crossover_mask, mutant, population[i])

            # Selection
            if fitness_rf_classifier(trial, X_train, X_test, y_train, y_test) > fitness_rf_classifier(population[i], X_train, X_test, y_train, y_test):
                population[i] = trial

    # Find the best individual
    best_individual = population[np.argmax([fitness_rf_classifier(ind, X_train, X_test, y_train, y_test) for ind in population])]
    
    return best_individual

In [5]:
num_features = 28  # Change this according to your actual number of features
pop_size = 16
max_iter = 10

num_particles = 16
col_len = 28
max_value = 783

best_solution = differential_evolution(X_train, X_test, y_train, y_test, (num_particles, col_len), max_value, max_iter)

100%|██████████| 10/10 [10:05<00:00, 60.59s/it]


In [6]:
print(best_solution)

[459 371 409 514 305 262 295 439 326 291 588 365 307 290 302 254 515 324
 219 348 290 149 119 483 406 466 358 213]


In [7]:
X_train_subset = X_train.iloc[:, best_solution]
X_test_subset = X_test.iloc[:, best_solution]

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_subset, y_train)
y_pred = clf.predict(X_test_subset)
acc = accuracy_score(y_test, y_pred)

print(acc)

0.8682857142857143


In [8]:
num_features = 28  # Change this according to your actual number of features
max_iter = 10

num_particles = 16
col_len = 28
max_value = 783

max_value = 783
std_dev = 150
mean = max_value / 2

# Initialize population using normal distribution
population = np.random.normal(mean, std_dev, (num_particles, col_len)).astype(int)
population[population > max_value] = max_value
population[population < 0] = 0

In [9]:
population[0]

array([429, 200, 515, 353, 438, 320, 359, 242, 409, 517, 566, 170, 400,
       431, 482, 509, 594, 343, 672, 407, 356, 278, 570, 526, 305, 436,
       521, 386])

In [16]:
X_train_subset = X_train.iloc[:, population[0]]
X_test_subset = X_test.iloc[:, population[0]]

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_subset, y_train)
y_pred = clf.predict(X_test_subset)
acc = accuracy_score(y_test, y_pred)

In [17]:
print(acc)

0.6915238095238095


In [3]:
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

new_population = [459 ,371 ,409 ,514 ,305 ,262 ,295, 439, 326, 291, 588 ,365 ,307, 290, 302 ,254, 515, 324,
 219 ,348 ,290 ,149 ,119, 483, 406, 466 ,358 ,213]

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train.iloc[:,new_population], y_train)

y_pred = rf_classifier.predict(X_test.iloc[:,new_population])
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.8682857142857143
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      2058
           1       0.93      0.97      0.95      2364
           2       0.83      0.83      0.83      2133
           3       0.87      0.86      0.86      2176
           4       0.81      0.80      0.81      1936
           5       0.88      0.87      0.88      1915
           6       0.89      0.93      0.91      2088
           7       0.89      0.84      0.86      2248
           8       0.85      0.80      0.83      1992
           9       0.78      0.81      0.79      2090

    accuracy                           0.87     21000
   macro avg       0.87      0.87      0.87     21000
weighted avg       0.87      0.87      0.87     21000

