# Feature Selection

40% - for two or more filters  

20% - for embedded method

40% - for wrapper method

In [3]:
from __future__ import division

import numpy as np
np.warnings.filterwarnings('ignore')

import pandas as pd
from os.path import join

In [4]:
class DataProcessor:
    def __init__(self, data_filename, label_filename):
        self.data_filename = data_filename
        self.label_filename = label_filename
        
        self.process_data()
        self.X = np.array(self.X)
        self.Y = np.array(self.Y)
        
    
    def process_data(self):
        self.X = pd.read_csv(self.data_filename, delim_whitespace=True)
        self.Y = pd.read_csv(self.label_filename, delim_whitespace=True)
        self.X = self.X.dropna(how='any', axis=1)
        self.remove_with_one_value()
    
    def remove_with_one_value(self):
        for col in self.X.columns:
            if len(self.X[col].unique()) == 1:
                self.X.drop(col,inplace=True,axis=1)

    

In [5]:
data_folder = 'Task9_DataSet'
filename_data_train = join(data_folder, 'arcene_train.data')
filename_label_train = join(data_folder, 'arcene_train.labels')

In [6]:
data = DataProcessor(filename_data_train, filename_label_train)

In [7]:
print(data.X.shape)
print(data.X[:5])


(99, 9919)
[[  0  41  82 ...   0 284 423]
 [  0   0   1 ...   0  34 508]
 [  0  56  44 ...   0   0 469]
 [105   0 141 ...   0   0 354]
 [ 38  62   0 ...  18  59 340]]


## Filters:
### 1. FRatio Filter

##### https://stats.stackexchange.com/questions/277123/fisher-score-feature-selection-implementation

In [8]:
class FRatioFilter:
    def __init__(self, n=10):
        self.n = n
    
    def __calculate_F_ratio__(self,row, y_data):
        Mu = np.mean(row)
        inter_class = 0.0
        intra_class = 0.0
        for value in np.unique(y_data):
            index_for_this_value = np.where(y_data == value)[0]
            n = np.sum(row[index_for_this_value])
            mu = np.mean(row[index_for_this_value])
            var = np.var(row[index_for_this_value])
            inter_class += n * np.power(( mu - Mu),2)
            intra_class += (n - 1) * var
        
        f_ratio = inter_class/intra_class
        return f_ratio
            
    
    
    def fit(self, x_data, y_data):
        self.x_data, self.y_data = x_data, y_data
        f_ratios = []
        for feature in x_data.T:
            f_ratio = self.__calculate_F_ratio__(feature, y_data.T)
            f_ratios.append(f_ratio)
        self.f_ratios = np.array(f_ratios)
        # return top n f_ratios
        self.selection_indexes = np.argpartition(self.f_ratios, -self.n)[-self.n:]
        
    def transform(self, x_data, y_data):
        return x_data[:,self.selection_indexes]
    
    def fit_and_transform(self, x_data, y_data):
        self.fit(x_data, y_data)
        return self.transform(x_data, y_data)
        

In [9]:
fs = FRatioFilter()
new_X = fs.fit_and_transform(data.X, data.Y)
indexes = fs.selection_indexes
print(indexes)

[2193 6165 6155 6163 6162 2201 2202 6157 6156    0]


## 2. SpearmanCorrelation Filter

In [29]:
class SpearmanRankCorrelationFilter:
    def __init__(self,n=10):
        self.n = n
    
    def __calculate_spearmancorrelation__(self,row, y_data):
        y_data = np.reshape(y_data, len(y_data[0]))
        temp_data = {'row' : row,
                     'y_data' : y_data
                    }
        temp_frame = pd.DataFrame(temp_data)
        temp_frame['row_rank'] = temp_frame['row'].rank()
        temp_frame['y_data_rank'] = temp_frame['y_data'].rank()
        temp_frame['rank_diff_sq'] = (temp_frame['row_rank'] - temp_frame['y_data_rank']).apply(lambda x: np.power(x,2))        
        correlation_rho = 1 -  ( 6 * np.sum(temp_frame['rank_diff_sq']) /  ( len(row) * np.power(len(row), 2)  - 1 ) )
        return correlation_rho
    
    def fit(self, x_data, y_data):
        correlations = []
        for feature in x_data.T:
            correlation = self.__calculate_spearmancorrelation__(feature, y_data.T)
            correlations.append(correlation)
        self.correlations = np.array(correlations)
        self.selection_indexes = np.argpartition(self.correlations, -self.n)[-self.n:]
    
    def transform(self, x_data, y_data):
        return x_data[:,self.selection_indexes]
    
    def fit_and_transform(self, x_data, y_data):
        self.fit(x_data, y_data)
        return self.transform(x_data, y_data)

In [30]:
sfc = SpearmanRankCorrelationFilter()
sfc.fit(data.X, data.Y)

In [31]:
new_X = sfc.transform(data.X, data.Y)
indexes = sfc.selection_indexes
print(indexes)

[9881 1854 6229 6734 4642 6883 8406 3938 7323 6930]


# Wrapper

In [63]:

class SequentialForwardSelection:
    def __init__(self, model, n=10):
        self.n = n
        self.model = model
        self.selection_indexes = []
    
    
    def __calculate_accuracy__(self, predicted, actual ):
        correct = 0.0
        for i in range(len(predicted)):
            if predicted[i] == actual[i]:
                correct += 1
        
        return correct/ len(predicted)
    
    
    
    def __fit_model__(self, dataset, y_data):
        self.model.fit(dataset, y_data)
        predicted_y = self.model.predict(dataset)
        accuracy = self.__calculate_accuracy__(predicted_y, y_data)
        return accuracy
        
    
    def fit(self, x_data, y_data):
        number_of_selected_feature = 0
        selected_dataset = []
        best_accuracy = 0
        self.accuracy = []
        for i, feature in enumerate(x_data.T):
            temp_dataset = [column for column in selected_dataset]
            temp_dataset.append(feature)
            
            current_accuracy = self.__fit_model__(np.array(temp_dataset).T, y_data)
            self.accuracy.append(current_accuracy)
            if current_accuracy > best_accuracy:
                number_of_selected_feature += 1
                best_accuracy = current_accuracy
                selected_dataset = temp_dataset
                self.selection_indexes.append(i)
                if number_of_selected_feature >= self.n:
                    break

                    
    def transform(self, x_data, y_data):
        return x_data[:,np.array(self.selection_indexes)]

    
    def fit_and_transform(self, x_data, y_data):
        self.fit(x_data, y_data)
        return self.transform(x_data, y_data)

Testing Wrapper On Logistic Regression

In [64]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

In [65]:
sfs = SequentialForwardSelection(lg)

In [66]:
sfs.fit(data.X, data.Y)

In [67]:
sfs.selection_indexes

[0, 2, 3, 4, 7, 19, 33, 44, 61, 63]

In [69]:
new_X = sfs.transform(data.X, data.Y)

In [70]:
print(sfs.accuracy)

[0.5656565656565656, 0.5656565656565656, 0.5757575757575758, 0.5959595959595959, 0.7373737373737373, 0.7171717171717171, 0.7373737373737373, 0.7676767676767676, 0.7070707070707071, 0.7272727272727273, 0.7373737373737373, 0.7676767676767676, 0.7474747474747475, 0.7373737373737373, 0.696969696969697, 0.7171717171717171, 0.7474747474747475, 0.7676767676767676, 0.7272727272727273, 0.7777777777777778, 0.7474747474747475, 0.7272727272727273, 0.7575757575757576, 0.7272727272727273, 0.6868686868686869, 0.7575757575757576, 0.7474747474747475, 0.7676767676767676, 0.7373737373737373, 0.7373737373737373, 0.7575757575757576, 0.7777777777777778, 0.7676767676767676, 0.797979797979798, 0.7878787878787878, 0.7777777777777778, 0.7777777777777778, 0.7777777777777778, 0.7878787878787878, 0.7777777777777778, 0.7777777777777778, 0.797979797979798, 0.7777777777777778, 0.7777777777777778, 0.8080808080808081, 0.7878787878787878, 0.7777777777777778, 0.7777777777777778, 0.797979797979798, 0.797979797979798, 0.79