In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import os
import h5py
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool

def opti_func(train_i,test_i,crt_clf,X,Y):
    """Train and score a single cross-validation fold.

    Parameters
    ----------
    train_i, test_i : index arrays
        Indices selecting the training rows and the held-out rows of the fold.
    crt_clf : callable
        Zero-argument factory returning a ``(filter, classifier)`` pair. It is
        called once per invocation, so every fold works on its own objects.
    X, Y : indexable
        Samples and labels; both are indexed with ``train_i`` and ``test_i``.

    Returns
    -------
    The classifier's ``score`` on the held-out rows, multiplied by 100.
    """
    feature_filter, classifier = crt_clf()
    train_labels = Y[train_i]

    # The filter is fitted on the training rows only; the held-out rows are
    # just pushed through the already-fitted transform.
    reduced_train = feature_filter.fit_transform(X[train_i], train_labels)
    reduced_test = feature_filter.transform(X[test_i])

    classifier.fit(reduced_train, train_labels)
    fold_score = classifier.score(reduced_test, Y[test_i])
    return fold_score*100
    
    
data_path = os.path.join('.','tests','test_output','data.hdf5')

# Read both datasets fully into memory inside a context manager so the file
# handle is released even when one of the reads raises.
with h5py.File(data_path, 'r') as dat:
    # Index by key instead of using .get(): a missing dataset then fails here
    # with a KeyError rather than silently becoming np.array(None).
    X = np.array(dat['data'])
    Y = np.array(dat['data_label'])

# Number of features to take in each experiment
# (used as k for SelectKBest and as n_components for PCA in the cells below).

featurenum_list = [10, 100, 1000, 10000, 100000]
    

In [None]:
#### Decision Tree ####

# Chi-squared feature selection followed by a decision tree, scored with
# 10 x 10 repeated k-fold cross-validation for every feature count.
for n in featurenum_list:
    rkf = RepeatedKFold(n_splits=10, n_repeats=10)

    def crt_clsf(k_features=n):
        """Return a fresh (feature filter, classifier) pair for one fold."""
        return (SelectKBest(chi2, k=k_features), DecisionTreeClassifier())

    fold_args = [(i_train, i_test, crt_clsf, X, Y) for i_train, i_test in rkf.split(X)]

    # starmap hands every fold to the pool at once and returns the scores in
    # fold order; pool.apply would block on each call and run the folds serially.
    # The with-block shuts the worker threads down before the next feature count
    # instead of leaving 100 idle threads behind per iteration.
    # NOTE: up to 100 folds may hold their training copy of X at the same time;
    # lower the pool size if memory is tight.
    with ThreadPool(100) as pool:
        acc = pool.starmap(opti_func, fold_args)

    print("For SELKB+DT n = {}, mean = {}, var = {}".format(n,np.mean(acc),np.var(acc)))

In [None]:
# PCA projection followed by a decision tree, scored with 10 x 10 repeated
# k-fold cross-validation for every number of components.
# NOTE(review): PCA is expected to reject n_components larger than what the
# training fold supports (samples / features) -- confirm that the larger
# entries of featurenum_list are valid for this dataset.
for n in featurenum_list:
    rkf = RepeatedKFold(n_splits=10, n_repeats=10)

    def crt_clsf(n_components=n):
        """Return a fresh (feature filter, classifier) pair for one fold."""
        return (PCA(n_components=n_components), DecisionTreeClassifier())

    fold_args = [(i_train, i_test, crt_clsf, X, Y) for i_train, i_test in rkf.split(X)]

    # starmap hands every fold to the pool at once and returns the scores in
    # fold order; pool.apply would block on each call and run the folds serially.
    # The with-block shuts the worker threads down before the next iteration
    # instead of leaving 100 idle threads behind each time.
    # NOTE: up to 100 folds may hold their training copy of X at the same time;
    # lower the pool size if memory is tight.
    with ThreadPool(100) as pool:
        acc = pool.starmap(opti_func, fold_args)

    print("For PCA+DT n = {}, mean = {}, var = {}".format(n,np.mean(acc),np.var(acc)))

In [None]:
#### KNN ####
# Neighbour counts tried for every feature count.
Ks = [1, 3, 5, 11]

# Chi-squared feature selection followed by k-nearest-neighbours, scored with
# 10 x 10 repeated k-fold cross-validation for every (feature count, k) pair.
for n in featurenum_list:
    for k in Ks:
        rkf = RepeatedKFold(n_splits=10, n_repeats=10)

        def crt_clsf(k_features=n, n_neighbors=k):
            """Return a fresh (feature filter, classifier) pair for one fold."""
            return (SelectKBest(chi2, k=k_features),
                    KNeighborsClassifier(n_neighbors=n_neighbors))

        fold_args = [(i_train, i_test, crt_clsf, X, Y) for i_train, i_test in rkf.split(X)]

        # starmap hands every fold to the pool at once and returns the scores
        # in fold order; pool.apply would block on each call and run the folds
        # serially. The with-block shuts the worker threads down before the
        # next configuration instead of leaking 100 threads per iteration.
        # NOTE: up to 100 folds may hold their training copy of X at the same
        # time; lower the pool size if memory is tight.
        with ThreadPool(100) as pool:
            Decacc = pool.starmap(opti_func, fold_args)

        # Report the scores computed in this iteration (Decacc). Reading `acc`
        # here would show stale numbers from another cell or raise NameError.
        print("For SELKB+knn n = {}, k = {}, mean = {}, var = {}".format(n,k,np.mean(Decacc),np.var(Decacc)))

In [None]:
# PCA projection followed by k-nearest-neighbours, scored with 10 x 10 repeated
# k-fold cross-validation for every (number of components, k) pair.
# NOTE(review): PCA is expected to reject n_components larger than what the
# training fold supports (samples / features) -- confirm that the larger
# entries of featurenum_list are valid for this dataset.
for n in featurenum_list:
    for k in Ks:
        rkf = RepeatedKFold(n_splits=10, n_repeats=10)

        def crt_clsf(n_components=n, n_neighbors=k):
            """Return a fresh (feature filter, classifier) pair for one fold."""
            return (PCA(n_components=n_components),
                    KNeighborsClassifier(n_neighbors=n_neighbors))

        fold_args = [(i_train, i_test, crt_clsf, X, Y) for i_train, i_test in rkf.split(X)]

        # starmap hands every fold to the pool at once and returns the scores
        # in fold order; pool.apply would block on each call and run the folds
        # serially. The with-block shuts the worker threads down before the
        # next configuration instead of leaking 100 threads per iteration.
        # NOTE: up to 100 folds may hold their training copy of X at the same
        # time; lower the pool size if memory is tight.
        with ThreadPool(100) as pool:
            Decacc = pool.starmap(opti_func, fold_args)

        # Report the scores computed in this iteration (Decacc). Reading `acc`
        # here would show stale numbers from another cell or raise NameError.
        print("For PCA+knn n = {}, k = {}, mean = {}, var = {}".format(n,k,np.mean(Decacc),np.var(Decacc)))