In [1]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the "mnist_784" dataset from OpenML.
mnist = fetch_openml("mnist_784", version=1, parser="auto")

# Keep the feature table and the labels under the short names used by later cells.
X = mnist.data
y = mnist.target

In [10]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [62]:
# --- PSO configuration -------------------------------------------------------
SEED = 42            # fixed seed so the initial swarm (and later np.random draws) can be reproduced
num_particles = 32   # number of rows in `population`
num_iterations = 10  # iteration count handed to pso_feature_selection
num_features = 28    # width used for the velocity matrix; keep equal to col_len
col_len = 28         # number of column positions stored in each particle
max_value = 783      # largest column position a particle may hold
                     # (presumably 784 MNIST pixel columns, 0..783 -- confirm against X.shape[1])
std_dev = 150        # spread of the initial column positions around `mean`
mean = max_value / 2
pop_size = (num_particles, col_len)

# Seed the legacy global generator: without this every kernel restart produced a
# different swarm and the search result could not be reproduced.
np.random.seed(SEED)

# Draw positions from a normal distribution centred on the middle column,
# truncate to integers, then clamp into the valid range [0, max_value].
population = np.clip(
    np.random.normal(mean, std_dev, pop_size).astype(int),
    0,
    max_value,
)

In [55]:
# Sanity check: number of rows (particles) in the swarm.
population.shape[0]

32

In [56]:
def _rf_subset_accuracy(X_train, y_train, X_test, y_test, columns):
    """Fit a 10-tree random forest on the given column positions and return its accuracy on (X_test, y_test)."""
    rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
    rf_classifier.fit(X_train.iloc[:, columns], y_train)
    y_pred = rf_classifier.predict(X_test.iloc[:, columns])
    return accuracy_score(y_test, y_pred)


def pso_feature_selection(X_train, y_train, X_test, y_test, population, num_iterations,
                          fitness_fn=None, verbose=True):
    """Search for a good set of column positions with a particle-swarm style update.

    Each particle is a 1-D integer vector of column positions into ``X_train``.
    On every iteration each particle is scored, personal and global bests are
    updated, and all particles are moved towards those bests.

    Parameters
    ----------
    X_train, y_train : training features (indexed with ``.iloc``) and labels.
    X_test, y_test : features and labels the default fitness is scored on.
        NOTE(review): because these rows drive the search, pass a validation
        split here; accuracy later reported on the same rows will be
        optimistically biased.
    population : 2-D integer array-like, shape (num_particles, particle_len).
        It is copied; the caller's array is never modified.
    num_iterations : number of score/move rounds to run.
    fitness_fn : optional callable ``fitness_fn(columns) -> float`` (higher is
        better). Defaults to the accuracy of a 10-tree random forest trained on
        the selected columns.
    verbose : when true, print one progress line per iteration.

    Returns
    -------
    The best-scoring particle seen (1-D integer array), or ``None`` if no
    particle was ever scored (zero iterations or an empty population).

    NOTE(review): the velocity update has no random r1/r2 factors, unlike the
    textbook PSO rule; that is kept as in the original implementation.
    """
    # Work on a private integer copy so re-running the search from the same
    # `population` always starts from the same swarm.
    positions = np.array(population, dtype=int)
    num_particles, particle_len = positions.shape
    max_value = X_train.shape[1] - 1  # largest valid column position

    c1 = c2 = 1.49            # pull towards the personal / global best
    w_max, w_min = 0.41, 0.4  # inertia decays linearly from w_max towards w_min

    if fitness_fn is None:
        def fitness_fn(columns):
            return _rf_subset_accuracy(X_train, y_train, X_test, y_test, columns)

    # Start from -inf so a score of exactly 0.0 still registers as a best;
    # otherwise global_best could stay None and break the update below.
    global_best = None
    global_best_accuracy = -np.inf
    p_best_positions = positions.copy()
    p_best_fitness_values = np.full(num_particles, -np.inf)

    # Velocity is sized from the population itself (not a notebook global) and
    # stored as float so fractional updates are not truncated on assignment.
    # The 0..27 starting range is kept from the original implementation.
    velocity = np.random.randint(0, 28, size=(num_particles, particle_len)).astype(float)

    for iteration in range(1, num_iterations + 1):
        w = w_max - ((w_max - w_min) * iteration) / num_iterations

        # Score every particle and refresh personal / global bests.
        for i in range(num_particles):
            particle = positions[i]
            accuracy = fitness_fn(particle.copy())

            if accuracy > p_best_fitness_values[i]:
                p_best_positions[i] = particle
                p_best_fitness_values[i] = accuracy

            if accuracy > global_best_accuracy:
                global_best = particle.copy()
                global_best_accuracy = accuracy

        if global_best is None:
            # Nothing was scored (empty swarm or no comparable score): nothing to move towards.
            break

        # Move all particles at once; each row only depends on its own state
        # and the (now fixed) global best, so this matches a per-particle loop.
        velocity = (
            w * velocity
            + c1 * (p_best_positions - positions)
            + c2 * (global_best - positions)
        )
        positions = np.clip(np.round(positions + velocity).astype(int), 0, max_value)

        if verbose:
            print(f"iteration {iteration}/{num_iterations}: best score so far = {global_best_accuracy:.4f}")

    return global_best

In [None]:
# Run the swarm search; `result` receives the best particle the function returns.
result = pso_feature_selection(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    population=population,
    num_iterations=num_iterations,
)

In [58]:
from sklearn.metrics import classification_report
# Re-split with a 30% hold-out. This rebinds X_train / X_test / y_train / y_test,
# replacing whatever split those names held before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [72]:
# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Restrict both splits to the column positions stored in `result`.
X_train_selected = X_train.iloc[:, result]
X_test_selected = X_test.iloc[:, result]

# Train a 100-tree forest on the selected columns only.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train_selected, y_train)

# Score the forest on the held-out rows.
y_pred = rf_classifier.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8570952380952381


In [59]:
# Hard-coded list of 28 column positions
# (presumably the best particle from an earlier PSO run -- confirm its origin).
new_population = [
    150, 572, 658, 213, 262, 435, 268,
    319, 485, 543, 315, 322, 405, 158,
    265, 520, 487, 244, 183, 374, 352,
    270, 327, 400, 517, 78, 180, 539,
]

# Train a 100-tree forest on just those columns of the current training split.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train.iloc[:, new_population], y_train)

# Score it on the matching columns of the current test split.
y_pred = rf_classifier.predict(X_test.iloc[:, new_population])
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9301904761904762
