<a href="https://colab.research.google.com/github/siman-giri/Machine_learning_LABS/blob/master/Feature_Selection_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from __future__ import division

np.warnings.filterwarnings('ignore')


file_feature = 'arcene_train.data'
file_label = 'arcene_train.labels'
feature = pd.read_csv(file_feature, delim_whitespace=True)
label = pd.read_csv(file_label, delim_whitespace=True)

In [0]:

#remove any NAN values from Feature
feature = feature.dropna(how='any', axis=1)
#print(feature.shape)
# remove redundant_values(unique values_features appearing only one time)
for col in feature.columns:
    if len(feature[col].unique()) == 1:
        feature.drop(col, inplace=True, axis=1)

In [0]:
feature = np.array(feature)
label = np.array(label)
#print(label)

In [0]:
header = {'filter_method': ['Wrapper_Filter(SFF)', 'Embedded_Filter(Lasso)',
                            'Filter(corelation)' , 'Filter(FRation)' , ]}
final_matrix = {5: [np.nan, np.nan, np.nan, np.nan ],
                20: [np.nan, np.nan, np.nan, np.nan],
                50: [np.nan, np.nan , np.nan, np.nan]}

final_matrix = pd.DataFrame(data=final_matrix, index=header['filter_method'])
print(final_matrix.head())

                        5   20  50
Wrapper_Filter(SFF)    NaN NaN NaN
Embedded_Filter(Lasso) NaN NaN NaN
Filter(corelation)     NaN NaN NaN
Filter(FRation)        NaN NaN NaN


In [0]:
class seqfeedforward:
    def __init__(self, n = 10):
        self.n  = n
        self.selected_feature = []
        self.accuracy = []

    def __fit_model__(self, feature, target):
        from sklearn.linear_model import LogisticRegression
        from sklearn.metrics import accuracy_score
        self.model = LogisticRegression()
        #self.accuracy_score = accuracy_score(pred, actual)
        self.model.fit(feature, target)
        predicted = self.model.predict(feature)
        accuracy = accuracy_score(predicted,target)
        return accuracy
    
    def fit_sfs(self, feature, target):
        import numpy as np
        number_selected_feature = 0
        data_selected_feature = []
        best_accuracy = 0

        for i, data in enumerate(feature.T):
            temp_feature_set = [col for col in data_selected_feature]
            temp_feature_set.append(data)
            current_accuracy = self.__fit_model__(np.array(temp_feature_set).T, target)
            self.accuracy.append(current_accuracy)
            if current_accuracy > best_accuracy:
                number_selected_feature += 1
                best_accuracy = current_accuracy
                data_selected_feature = temp_feature_set
                self.selected_feature.append(i)
                if number_selected_feature >= self.n:
                    break

    def sel_index(self, feature):
        import numpy as np
        return feature[:, (np.array(self.selected_feature))]










In [0]:
N = [5, 20, 50]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
for n in N:
    filt_wrapper = seqfeedforward(n = n)
    selected_index = filt_wrapper.fit_sfs(feature, label)
    new_feature = filt_wrapper.sel_index(feature)
    logistic_regression = LogisticRegression()
    logistic_regression.fit(new_feature, label)
    prediction = logistic_regression.predict(new_feature)
    accuracy = '{}%'.format(np.round(accuracy_score(label, prediction) * 100 , 2))
    final_matrix.loc['Wrapper_Filter(SFF)', n] = accuracy

print(final_matrix)

                            5       20      50
Wrapper_Filter(SFF)     76.77%  93.94%  100.0%
Embedded_Filter(Lasso)     NaN     NaN     NaN
Filter(corelation)         NaN     NaN     NaN
Filter(FRation)            NaN     NaN     NaN


In [0]:
class Embedded_Method:
    def __init__(self):
        import numpy as np
        self.n_iterattions = 1000
        self.l_r = 0.0001
        self.alpha  = 1e-12   # regularization parameter
        self.weights = []
        self.loss_history = []
        self.loss_function = lambda X,Y,W : np.sum(np.square(X.dot(W)-Y)/(2.0*len(Y)))
    def sigmoid_activation(self, x):
        import numpy as np
        return 1/(1+ np.exp(-x))
    def __fit__(self, feature, target):
        import numpy as np
        self.W = np.zeros(len(feature[0]))
        for i in range(self.n_iterattions):
            h = self.sigmoid_activation(feature.dot(self.W))
            loss = (-target * np.log(h) - (1 - target) * np.log(1 - h)).mean()
            gradient = np.dot(feature.T, (h - target)) / target.shape[0]
            self.W = self.W - self.l_r * gradient - self.alpha * np.sign(self.W)
            self.W[self.W < 1e-15] = 0
            cost = self.loss_function(feature, target, self.W)
            if i % 10 == 0:
                self.loss_history.append(cost)

    def class_prob(self, feature):
        import numpy as np
        return self.sigmoid_activation(np.dot(feature, self.W))

    def predict(self, feature, threshold=0.5):
        return self.class_prob(feature) >= threshold
    
    def root_mean_square(self, Y, Pred_Y):
        import numpy as np
        score = np.sum(np.square(np.array(Y-Pred_Y)))/len(Y)
        return score
        '''score = np.array(Y -Pred_Y)
        score = np.array(score**2)
        score = np.sum(score)/len(Y)
        return score'''
        


In [0]:
from sklearn.model_selection import train_test_split
label_ = label.reshape(label.shape[0])
label_[label_ == -1 ] = 0
label_[label_ == 1] = 1
x_train, x_test, y_train, y_test = train_test_split(feature, label_)

In [0]:
embed_filter = Embedded_Method()
embed_filter.__fit__(x_train, y_train)

In [0]:
N = [5, 20, 50]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
for n in N:
    selected_indexes = np.argpartition(embed_filter.W, -n)[n:]
    model = LogisticRegression()
    new_feature = feature[:, selected_indexes]
    x_train, x_test, y_train, y_test = train_test_split(new_feature, label_)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    accuracy = '{}%'.format(np.round(accuracy_score(prediction, y_test) * 100 , 2))
    final_matrix.loc['Embedded_Filter(Lasso)', n] = accuracy

In [0]:
print(final_matrix)

                            5       20      50
Wrapper_Filter(SFF)     76.77%  93.94%  100.0%
Embedded_Filter(Lasso)   92.0%   84.0%   72.0%
Filter(corelation)         NaN     NaN     NaN
Filter(FRation)            NaN     NaN     NaN


In [0]:
file_feature = 'arcene_train.data'
file_label = 'arcene_train.labels'
feature_filter = pd.read_csv(file_feature, delim_whitespace=True)
label_filter = pd.read_csv(file_label, delim_whitespace=True)

In [0]:
 #remove any NAN values from Feature
feature_filter = feature_filter.dropna(how='any', axis=1)
#print(feature.shape)
# remove redundant_values(unique values_features appearing only one time)
for col in feature_filter.columns:
    if len(feature_filter[col].unique()) == 1:
        feature_filter.drop(col, inplace=True, axis=1)

In [0]:
feature_filter = np.array(feature_filter)
label_filter = np.array(label_filter)

In [0]:
class filter_on_corealtion_2:
    def __init__(self, feature, label):
        self.X = feature
        self.Y = label.T[0]
        self.n = len(self.X.T)

    def find_covariance(self):
        import numpy as np
        mean_x = np.expand_dims(np.mean(self.X, axis=0), 1).T
        diff_x = self.X - mean_x
        mean_y = np.mean(self.Y)
        diff_y = self.Y - mean_y
        cov = np.sum(diff_x.T * diff_y, axis=1) / (len(self.Y) - 1)
        return cov

    def pearson_coeff(self):
        import numpy as np
        covariance = self.find_covariance()
        std_deviation_x = np.std(self.X.T, axis=1)
        std_deviation_y = np.std(self.Y)
        pear_coef = covariance / (std_deviation_x * std_deviation_y)
        return pear_coef

    def corelation_filter(self, k=20):
        import numpy as np
        pearson = self.pearson_coeff()
        pearson = np.where(np.isnan(pearson), 0, pearson)
        selected_index = np.flip(np.argsort(np.absolute(pearson)))[:k]
        #selected_feature = np.take()
        return selected_index

    def selected_feature(self,feature, n):
      import numpy as np
      sel_index  = self.corelation_filter(n)
      sel_feature = np.take(feature.T,sel_index, axis = 0).T
      return sel_feature
    

    









In [0]:
N = [5, 20, 50]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
for n in N:
    filter1 = filter_on_corealtion_2(feature_filter, label_filter)
    selected_feature = filter1.selected_feature(feature_filter, n=n)
    logistic_regression = LogisticRegression()
    logistic_regression.fit(selected_feature, label_filter)
    prediction = logistic_regression.predict(selected_feature)
    accuracy = '{}%'.format(np.round(accuracy_score(label_filter, prediction) * 100 , 2))
    final_matrix.loc['Filter(corelation)', n] = accuracy

print(final_matrix)

                            5       20      50
Wrapper_Filter(SFF)     76.77%  93.94%  100.0%
Embedded_Filter(Lasso)   92.0%   84.0%   72.0%
Filter(corelation)       69.7%  90.91%  93.94%
Filter(FRation)            NaN     NaN     NaN


In [0]:
class FRatio_Filter:

    def __init__(self, n = 10):
        self.n = n

    def calculate_fratio(self, row, target):
        import numpy as np
        mean_ = np.mean(row)
        var_same_class = 0.0
        var_diff_class = 0.0
        for val in np.unique(target):   # calculates the f_ratio for every row of the feature:
            current_index = np.where(target == val)[0]
            n = np.sum(row[current_index])
            mu = np.mean(row[current_index])
            var = np.mean(row[current_index])
            var_same_class += n * np.power((mu - mean_),2)
            var_diff_class += (n-1) * var
        f_ratio = var_same_class/var_diff_class
        return f_ratio


    def __fit__(self, feature, target):
        import numpy as np
        self.X = feature
        self.Y = target
        fratio = []
        for i in feature.T:
            fratio.append(self.calculate_fratio(i, target.T))
        self.fratio = np.array(fratio)
        self.selected_index = np.argpartition(self.fratio, -self.n)[-self.n:]

        return feature[:, self.selected_index]


In [0]:
N = [5, 20, 50]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
for n in N:
    filter1 = FRatio_Filter(n = n)
    selected_feature = filter1.__fit__(feature, label)
    logistic_regression = LogisticRegression()
    logistic_regression.fit(selected_feature, label)
    prediction = logistic_regression.predict(selected_feature)
    accuracy = '{}%'.format(np.round(accuracy_score(label, prediction) * 100 , 2))
    final_matrix.loc['Filter(FRation)', n] = accuracy

print(final_matrix)


                            5       20      50
Wrapper_Filter(SFF)     76.77%  93.94%  100.0%
Embedded_Filter(Lasso)   92.0%   84.0%   72.0%
Filter(corelation)       69.7%  90.91%  93.94%
Filter(FRation)          59.6%  78.79%  100.0%
Filter(FRatio)           59.6%  78.79%  100.0%
