In [0]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import random
import math
from sklearn.utils import shuffle


def GenerateGaussianData(means, stds, datapoints, random_state=None):
    """Sample labelled points from axis-aligned Gaussian clusters.

    Class ``i`` contributes ``datapoints[i]`` samples.  Feature ``k`` of each
    of those samples is drawn independently from a normal distribution with
    mean ``means[i][k]`` and standard deviation ``stds[i][k]``.  The samples
    of all classes are finally shuffled together so that the class labels
    are not ordered.

    Parameters
    ----------
    means : array-like of shape (n_classes, n_features)
        Per-class, per-feature means.
    stds : array-like of shape (n_classes, n_features)
        Per-class, per-feature standard deviations (used as ``scale``).
    datapoints : sequence of int of length n_classes
        Number of samples to draw for each class.
    random_state : int or None, default=None
        Seed for a private ``np.random.RandomState``.  With ``None`` the
        global ``np.random`` state is used, so ``np.random.seed`` still
        controls the output.

    Returns
    -------
    Xdata : ndarray of shape (n_samples, n_features)
        The sampled feature vectors.
    Ydata : ndarray of shape (n_samples,)
        Integer class index of every row of ``Xdata``.

    Raises
    ------
    ValueError
        If the shapes of ``means``, ``stds`` and ``datapoints`` disagree.
    """
    means = np.asarray(means, dtype=float)
    stds = np.asarray(stds, dtype=float)
    counts = np.asarray(datapoints, dtype=int)

    if means.ndim != 2:
        raise ValueError("means must have shape (n_classes, n_features)")
    if stds.shape != means.shape:
        raise ValueError("stds must have the same shape as means")
    nrClasses = means.shape[0]
    if counts.shape != (nrClasses,):
        raise ValueError("datapoints must contain one count per class")

    # Either the shared global generator or a private, seeded one; both
    # expose the same ``normal`` / ``permutation`` methods.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # One label per sample: class i repeated datapoints[i] times.
    Ydata = np.repeat(np.arange(nrClasses), counts)

    # A single vectorized draw replaces one Python-level call per scalar:
    # row r is sampled with the mean/std vectors of its own class.
    Xdata = rng.normal(loc=means[Ydata], scale=stds[Ydata])

    # Shuffle features and labels with the same permutation.
    order = rng.permutation(len(Ydata))
    return Xdata[order], Ydata[order]


def gaussian_clusters(nr_features=2, nr_classes=4, nr_data_points=10,
                      random_diag=False, super_mega_random=False, variance_size=1):
    """Generate a shuffled data set of axis-aligned Gaussian clusters.

    Class means are drawn from ``N(0, 0.2)`` for every class/feature pair.
    The per-class, per-feature scale handed to ``GenerateGaussianData`` is
    chosen by the flags (``super_mega_random`` wins over ``random_diag``):

    * ``super_mega_random``: uniform in ``[0, variance_size)``.
    * ``random_diag``: each entry is ``variance_size`` or ``0.01``, chosen by
      comparing a uniform ``[0, 1)`` draw against 0.5.
    * neither: the first ``nr_features // 2`` features get scale ``1`` and
      the remaining ones ``0.01`` for every class (``variance_size`` is not
      used in this mode).

    Note that the values are used as the ``scale`` (standard deviation) of
    the normal distribution despite the ``variance`` in the parameter name.

    Parameters
    ----------
    nr_features : int
        Number of features per sample.
    nr_classes : int
        Number of clusters / class labels.
    nr_data_points : int
        Number of samples generated for each class.
    random_diag : bool
        Pick the scale of every class/feature at random from two levels.
    super_mega_random : bool
        Draw the scale of every class/feature uniformly at random.
    variance_size : float
        Upper scale level / multiplier, see above.

    Returns
    -------
    Xdat, Ydat
        Whatever ``GenerateGaussianData`` returns for the sampled means and
        scales: the feature matrix and the matching class labels.
    """
    if super_mega_random:
        sigma = np.random.rand(nr_classes, nr_features) * variance_size
    elif random_diag:
        # Classify the uniform draws once and build the result from the
        # mask.  Assigning the high level in place and then testing
        # ``sigma <= 0.5`` again would also overwrite the freshly written
        # entries whenever ``variance_size <= 0.5``.
        is_wide = np.random.rand(nr_classes, nr_features) > 0.5
        sigma = np.where(is_wide, variance_size, 0.01)
    else:
        nr_wide = nr_features // 2
        row = np.append([1] * nr_wide, [0.01] * (nr_features - nr_wide))
        sigma = [row] * nr_classes

    means = np.random.normal(0, 0.2, (nr_classes, nr_features))
    dataPoints = [nr_data_points] * nr_classes
    Xdat, Ydat = GenerateGaussianData(means, sigma, dataPoints)

    return Xdat, Ydat

Generate data

In [0]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG, which gaussian_clusters draws from, so that the
# synthetic data set is the same on every Restart & Run All.
np.random.seed(42)

X, y = gaussian_clusters(nr_features=100, nr_classes=200,
                         nr_data_points=100, random_diag=True, variance_size=1)

# Hold out a third of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

Apply RFE model

In [0]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE

model = LinearDiscriminantAnalysis()

# Train model: recursive feature elimination down to 50 features.
# n_features_to_select is passed by keyword because recent scikit-learn
# releases no longer accept it positionally.
rfe = RFE(model, n_features_to_select=50)
rfe = rfe.fit(X_train, y_train)

# Baseline: the same LDA estimator fitted on all features.
orig = model.fit(X_train, y_train)

# After fitting, rfe.support_ and rfe.ranking_ describe the selected
# features and the ranking of the unselected ones.

Evaluate RFE model

In [4]:
# Score on the held-out test split: the RFE-wrapped LDA first, then the
# LDA baseline that was fitted on all features.
score_rfe, score_orig = (clf.score(X_test, y_test) for clf in (rfe, orig))

# Print scores, one per line
print(score_rfe, score_orig, sep="\n")

0.23742424242424243
0.42424242424242425


Filter with variance threshold + evaluation

In [5]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

# Filter out low-variance features; the threshold (0.55) is learned against
# the training split only and re-applied unchanged to the test split.
selector = VarianceThreshold(threshold=0.55)
X_high_variance = selector.fit_transform(X_train, y_train)

# Train model: one 30-nearest-neighbour classifier, fitted twice.
neigh = KNeighborsClassifier(n_neighbors=30)

# 1) on the variance-filtered features
score_vt = neigh.fit(X_high_variance, y_train).score(
    selector.transform(X_test), y_test)

# 2) refitted on the full feature set as the baseline
score_orig = neigh.fit(X_train, y_train).score(X_test, y_test)

# Print scores, one per line
print(score_vt, score_orig, sep="\n")


0.13893939393939395
0.2912121212121212
