# Feature Selection

40% - for two or more filters  

20% - for embedded method

40% - for wrapper method

In [1]:
from __future__ import division

import numpy as np
np.warnings.filterwarnings('ignore')

import pandas as pd
from os.path import join
from sklearn.linear_model import LogisticRegression


In [2]:
# Number of features to evaulate :
N = [5, 20, 50]

In [3]:
def calculate_accuracy(predicted, actual ):
        correct = 0.0
        for i in range(len(predicted)):
            if predicted[i] == actual[i]:
                correct += 1
        
        return correct/ len(predicted)

In [4]:
header = {'filter_method' : ['FRatioFilter', 'SpearmanRankCorrelationFilter', 'SequentialForwardSelection' , 'LassoEmbedded' ]}

In [5]:
final_matrix = { 5 : [np.nan, np.nan, np.nan, np.nan] , 20 : [np.nan, np.nan, np.nan, np.nan] , 50 : [np.nan, np.nan , np.nan, np.nan]} 

In [6]:
final_matrix = pd.DataFrame(data=final_matrix, index=header['filter_method'])

In [7]:
final_matrix.head()

Unnamed: 0,5,20,50
FRatioFilter,,,
SpearmanRankCorrelationFilter,,,
SequentialForwardSelection,,,
LassoEmbedded,,,


In [8]:
class DataProcessor:
    def __init__(self, data_filename, label_filename):
        self.data_filename = data_filename
        self.label_filename = label_filename
        
        self.process_data()
        self.X = np.array(self.X)
        self.Y = np.array(self.Y)
        
    
    def process_data(self):
        self.X = pd.read_csv(self.data_filename, delim_whitespace=True)
        self.Y = pd.read_csv(self.label_filename, delim_whitespace=True)
        self.X = self.X.dropna(how='any', axis=1)
        self.remove_with_one_value()
    
    def remove_with_one_value(self):
        for col in self.X.columns:
            if len(self.X[col].unique()) == 1:
                self.X.drop(col,inplace=True,axis=1)

    

In [9]:
data_folder = 'Task9_DataSet'
filename_data_train = join(data_folder, 'arcene_train.data')
filename_label_train = join(data_folder, 'arcene_train.labels')

In [10]:
data = DataProcessor(filename_data_train, filename_label_train)

In [11]:
print(data.X.shape)
print(data.X[:5])


(99, 9919)
[[  0  41  82 ...   0 284 423]
 [  0   0   1 ...   0  34 508]
 [  0  56  44 ...   0   0 469]
 [105   0 141 ...   0   0 354]
 [ 38  62   0 ...  18  59 340]]


## Filters:
### 1. FRatio Filter

In [12]:
class FRatioFilter:
    def __init__(self, n=10):
        self.n = n
    
    def __calculate_F_ratio__(self,row, y_data):
        Mu = np.mean(row)
        inter_class = 0.0
        intra_class = 0.0
        for value in np.unique(y_data):
            index_for_this_value = np.where(y_data == value)[0]
            n = np.sum(row[index_for_this_value])
            mu = np.mean(row[index_for_this_value])
            var = np.var(row[index_for_this_value])
            inter_class += n * np.power(( mu - Mu),2)
            intra_class += (n - 1) * var
        
        f_ratio = inter_class/intra_class
        return f_ratio
            
    
    
    def fit(self, x_data, y_data):
        self.x_data, self.y_data = x_data, y_data
        f_ratios = []
        for feature in x_data.T:
            f_ratio = self.__calculate_F_ratio__(feature, y_data.T)
            f_ratios.append(f_ratio)
        self.f_ratios = np.array(f_ratios)
        # return top n f_ratios
        self.selection_indexes = np.argpartition(self.f_ratios, -self.n)[-self.n:]
        
    def transform(self, x_data, y_data):
        return x_data[:,self.selection_indexes]
    
    def fit_and_transform(self, x_data, y_data):
        self.fit(x_data, y_data)
        return self.transform(x_data, y_data)
        

In [13]:
fs = FRatioFilter()
new_X = fs.fit_and_transform(data.X, data.Y)
indexes = fs.selection_indexes
print(indexes)

[2193 6165 6155 6163 6162 2201 2202 6157 6156    0]


In [14]:
# Calculation for evaluation metrix in the end

for n in N:
    fs = FRatioFilter(n=n)
    new_X = fs.fit_and_transform(data.X, data.Y)
    logistic_regression = LogisticRegression()
    logistic_regression.fit(new_X, data.Y)
    prediction = logistic_regression.predict(new_X)
    accuracy = '{}%'.format(np.round(calculate_accuracy(prediction, data.Y) * 100 , 2))
    final_matrix.loc['FRatioFilter', n] = accuracy


In [15]:
final_matrix.head()

Unnamed: 0,5,20,50
FRatioFilter,69.7%,78.79%,100.0%
SpearmanRankCorrelationFilter,,,
SequentialForwardSelection,,,
LassoEmbedded,,,


## 2. SpearmanCorrelation Filter

In [18]:
class SpearmanRankCorrelationFilter:
    def __init__(self):
        self.n = 10
    
    def __calculate_spearmancorrelation__(self,row, y_data):
        y_data = np.reshape(y_data, len(y_data[0]))
        temp_data = {'row' : row,
                     'y_data' : y_data
                    }
        temp_frame = pd.DataFrame(temp_data)
        temp_frame['row_rank'] = temp_frame['row'].rank()
        temp_frame['y_data_rank'] = temp_frame['y_data'].rank()
        temp_frame['rank_diff_sq'] = (temp_frame['row_rank'] - temp_frame['y_data_rank']).apply(lambda x: np.power(x,2))        
        correlation_rho = 1 -  ( 6 * np.sum(temp_frame['rank_diff_sq']) /  ( len(row) * np.power(len(row), 2)  - 1 ) )
        return correlation_rho
    
    def fit(self, x_data, y_data):
        self.n = n
        correlations = []
        for feature in x_data.T:
            correlation = self.__calculate_spearmancorrelation__(feature, y_data.T)
            correlations.append(correlation)
        self.correlations = np.array(correlations)
    
    def transform(self, x_data, y_data,n = 10):
        self.n = n
        self.selection_indexes = np.argpartition(self.correlations, -self.n)[-self.n:]
        return x_data[:,self.selection_indexes]
    
    def fit_and_transform(self, x_data, y_data, n = 10):
        self.fit(x_data, y_data)
        return self.transform(x_data, y_data, n)

In [19]:
sfc = SpearmanRankCorrelationFilter()
sfc.fit(data.X, data.Y)

In [20]:
new_X = sfc.transform(data.X, data.Y)
indexes = sfc.selection_indexes
print(indexes)

[9881 1854 6229 6734 4642 6883 8406 3938 7323 6930]


In [21]:
# Calculation for evaluation metrix in the end

for n in N:
    new_X = sfc.transform(data.X, data.Y, n)
    logistic_regression = LogisticRegression()
    logistic_regression.fit(new_X, data.Y)
    prediction = logistic_regression.predict(new_X)
    accuracy = '{}%'.format(np.round(calculate_accuracy(prediction, data.Y) * 100 , 2))
    final_matrix.loc['SpearmanRankCorrelationFilter', n] = accuracy

In [22]:
final_matrix.head()

Unnamed: 0,5,20,50
FRatioFilter,69.7%,78.79%,100.0%
SpearmanRankCorrelationFilter,63.64%,72.73%,83.84%
SequentialForwardSelection,,,
LassoEmbedded,,,


# Wrapper

In [23]:

class SequentialForwardSelection:
    def __init__(self, model, n=10):
        self.n = n
        self.model = model
        self.selection_indexes = []
    
    
    def __calculate_accuracy__(self, predicted, actual ):
        correct = 0.0
        for i in range(len(predicted)):
            if predicted[i] == actual[i]:
                correct += 1
        
        return correct/ len(predicted)
    
    
    
    def __fit_model__(self, dataset, y_data):
        self.model.fit(dataset, y_data)
        predicted_y = self.model.predict(dataset)
        accuracy = self.__calculate_accuracy__(predicted_y, y_data)
        return accuracy
        
    
    def fit(self, x_data, y_data):
        number_of_selected_feature = 0
        selected_dataset = []
        best_accuracy = 0
        self.accuracy = []
        for i, feature in enumerate(x_data.T):
            temp_dataset = [column for column in selected_dataset]
            temp_dataset.append(feature)
            
            current_accuracy = self.__fit_model__(np.array(temp_dataset).T, y_data)
            self.accuracy.append(current_accuracy)
            if current_accuracy > best_accuracy:
                number_of_selected_feature += 1
                best_accuracy = current_accuracy
                selected_dataset = temp_dataset
                self.selection_indexes.append(i)
                if number_of_selected_feature >= self.n:
                    break

                    
    def transform(self, x_data, y_data):
        return x_data[:,np.array(self.selection_indexes)]

    
    def fit_and_transform(self, x_data, y_data):
        self.fit(x_data, y_data)
        return self.transform(x_data, y_data)

Testing Wrapper On Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

In [25]:
sfs = SequentialForwardSelection(lg)

In [26]:
sfs.fit(data.X, data.Y)

In [27]:
sfs.selection_indexes

[0, 2, 3, 4, 7, 19, 33, 44, 61, 63]

In [28]:
new_X = sfs.transform(data.X, data.Y)

In [29]:
for n in N:
    fs = SequentialForwardSelection(LogisticRegression(), n=n)
    new_X = fs.fit_and_transform(data.X, data.Y)
    logistic_regression = LogisticRegression()
    logistic_regression.fit(new_X, data.Y)
    prediction = logistic_regression.predict(new_X)
    accuracy = '{}%'.format(np.round(calculate_accuracy(prediction, data.Y) * 100 , 2))
    final_matrix.loc['SequentialForwardSelection', n] = accuracy

# Embedded Method
Lasso Selection

In [87]:
class LassoRegression:
    def __init__(self,epochs = 1000, lr = 1e-15, reg_param = 1e-12):
        self.lr = lr
        self.reg_param = reg_param
        self.epochs = epochs
        self.cost_function = lambda X,Y,B: np.sum(np.square(X.dot(B) - Y)) / (2.0 * len(Y))
        self.B = []
        self.cost_history = []
    
    def __sigmoid__(self, z):
        return 1 / (1 + np.exp(-z))
    
    def fit(self, X, Y):
        self.B = np.zeros(len(X[0]))
        size = len(Y)
        for i in range(self.epochs):
            h = self.__sigmoid__(X.dot(self.B))
            loss =  (-Y * np.log(h) - (1 - Y) * np.log(1 - h)).mean()
            gradient = np.dot(X.T, (h - Y)) / Y.shape[0]
            self.B = self.B - self.lr * gradient - self.reg_param*np.sign(self.B)
            self.B[self.B < 1e-15 ] = 0
            cost = self.cost_function(X,Y,self.B)
            if i % 10 == 0:
                self.cost_history.append(cost)
    def predict_probs(self, X):
        return self.__sigmoid__(np.dot(X, self.B))

    def predict(self, X, threshold=0.5):
        return self.predict_probs(X) >= threshold
    
    
    def rmse(self,Y, Y_pred):
        s = np.array(Y - Y_pred)
        s = np.array(s**2)
        s = np.sum(s)/ len(Y)
        return s

In [88]:
from sklearn.model_selection import train_test_split
data_y = data.Y.reshape(data.Y.shape[0])
data_y[data_y == -1 ] = 0
data_y[data_y == 1] = 1
x_train, x_test, y_train, y_test = train_test_split(data.X, data_y)

In [89]:
ls = LassoRegression()
ls.fit(x_train, y_train)

In [90]:
print(ls.B)
print(ls.B.nonzero())
print(len(ls.B))
print(len(ls.B.nonzero()[0]))

[0. 0. 0. ... 0. 0. 0.]
(array([], dtype=int64),)
9919
0


In [91]:
print('So, Out of {} features only {} were selected after Lasso Embedded Feature Selection Method'.format(len(ls.B), len(ls.B.nonzero()[0])))

So, Out of 9919 features only 0 were selected after Lasso Embedded Feature Selection Method


In [92]:
prediction = ls.predict(x_test)
print(prediction[:5], y_test[:5])

[ True  True  True  True  True] [1 1 1 1 0]


In [97]:
# Picking With maximum n features

for n in N:
    selected_indeces = np.argpartition(ls.B, -n)[n:]
    logistic_regression = LogisticRegression()
    new_x = data.X[:, selected_indeces]
    x_train, x_test, y_train, y_test = train_test_split(new_x, data_y)
    logistic_regression.fit(x_train, y_train)
    prediction = logistic_regression.predict(x_test)
    accuracy = '{}%'.format(np.round(calculate_accuracy(prediction, y_test) * 100 , 2))
    final_matrix.loc['LassoEmbedded', n] = accuracy


Evaluation Metrix

In [98]:
final_matrix.head()

Unnamed: 0,5,20,50
FRatioFilter,69.7%,78.79%,100.0%
SpearmanRankCorrelationFilter,63.64%,72.73%,83.84%
SequentialForwardSelection,76.77%,93.94%,100.0%
LassoEmbedded,84.0%,84.0%,88.0%
