In [None]:
def feature_selection_knn(X_train, X_test, y_train, y_test, categorical, non_categorical):
    '''Perform online feature selection for the KNN algorithm. Default params are used due for time constraints
    Inputs:
        - categorical (list): list of categorical features
        - non_categorical (list): list of numerical, non categorical features
        - X_train, X_test (Pandas dataframe): feature matrix splitted in train and test
        - y_train, y_test (numpy array): label for train and test
    Output: 
        - best_cat (list): list of best categorical features
        - best_non_cat (list): list of best non categorical features
    '''
 
    # Best parameter to search
    best_acc = 0
    best_cat = []
    best_non_cat = []

    # Define array of total possible features
    poss_features = np.array(categorical + non_categorical)
    
    # Do all the possible combinations of features (from 1 features to all)
    for k in range(1, 18):
        print("Starting k =",k)
        
        # All combinations with this fixed k
        poss_comb = list(itertools.combinations(range(0,17),k))
        for c in poss_comb:
            cat = []
            non_cat = []
        
        # First 12 features are categorical
        for i in list(c):
            if i in range(0, 12):
                cat.append(poss_features[i])
            else:
                non_cat.append(poss_features[i])

        # Compute distance matrix and KNN
        len_X_train = len(X_train)
        X_train_new, X_test_new = compute_distance_matrix(X_train.append(X_test), len_X_train, cat,non_cat, alpha = 1)
        
        neigh = KNeighborsClassifier(metric = 'precomputed')
        neigh.fit(X_train_new, y_train.ravel())
        y_pred = neigh.predict(X_test_new)
        
        # Find accuracy
        acc = accuracy_score(y_test, y_pred)

        # If improvement, save parameters
        if acc>best_acc:
            best_acc = acc
            best_cat = cat
            best_non_cat = non_cat
            print("Best combination found! Acc: {}, features: cat: {}, non_cat:{}".format(best_acc, best_cat, best_non_cat))
        
        # Clean variables
        del X_train_new, X_test_new
    
    return best_cat, best_non_cat

In [1]:
def feature_selection_knn(X, y, ham, pub):
    print(ctime())
    best_acc = 0
    best_cat = []
    best_non_cat = []
    
    categorical = ['class', 'tax_order', 'family', 'genus', "species", 'control_type', 'media_type',
                   'application_freq_unit',"exposure_type", "conc1_type", 'obs_duration_mean']

    non_categorical = ['ring_number', 'tripleBond', 'doubleBond', 'alone_atom_number', 'oh_count',
                       'atom_number', 'bonds_number', 'Mol', 'MorganDensity', 'LogP']
    feats = categorical + non_categorical

    X_train, _, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 43)
    del _
    len_X_train = len(X_train)
    print('Pubchem')
    pub_matrix = pub * pubchem2d_matrix(X)

    for k in range(1,len(feats)+1):
        print("Starting k =",k)
        poss_comb = list(itertools.combinations(feats, k))

        for c in poss_comb:
            cat = []
            num = []
            for var in list(c):
                if var in categorical:
                    cat.append(var)
                else:
                    num.append(var)
        
            dist_matr = basic_matrix(X, cat, num, ham)
            dist_matr += pub_matrix
            dist_matr = pd.DataFrame(dist_matr)

            neigh = KNeighborsClassifier(metric = 'precomputed', n_neighbors=1, n_jobs=-2, leaf_size=40)
            neigh.fit(dist_matr.iloc[:len_X_train, :len_X_train], y_train.ravel())
            y_pred = neigh.predict(dist_matr.iloc[len_X_train:, :len_X_train]) 

            acc = accuracy_score(y_test, y_pred)

            # If improvement, save parameters
            if acc>best_acc:
                best_acc = acc
                best_cat = cat
                best_non_cat = num
                print("Best combination found! Acc: {}, features: cat: {}, non_cat:{}".format(best_acc, best_cat,
                                                                                              best_non_cat))

    print(ctime())
    return cat, num   

In [2]:
import itertools
from helper_knn import *

_, X_train, _, y_train, _, _ = load_data_knn('data/lc_db_processed.csv', encoding = 'binary')

del _

ham = 0.009473684210526315
pub = 0.007105263157894737

cat, num = feature_selection_knn(X_train, y_train, ham, pub)

Sat Dec  5 14:20:52 2020
Pubchem
Starting k = 1
Best combination found! Acc: 0.528283530229326, features: cat: ['class'], non_cat:[]
Best combination found! Acc: 0.5310632383599722, features: cat: ['family'], non_cat:[]
Best combination found! Acc: 0.5317581653926338, features: cat: ['obs_duration_mean'], non_cat:[]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').