In [165]:
%config Completer.use_jedi = False

In [166]:
import numpy as np
import os
import csv
from sklearn import svm
import time
import pandas as pd
from sklearn.utils import shuffle
from random import sample 

In [167]:
class Kernel():
    def linear():
        def result(x, y):
            return np.dot(x.T, y)
        return result
    
    def polynomial(degree, gamma=1.0, intercept=1.0):
        def result(x,y):
            return (gamma * (x.T @ y) + intercept) ** degree
        return result

    def gaussian(sigma = 1.):
        def result(x,y):
            return np.exp(-np.linalg.norm(x-y)/(2 * sigma ** 2))   
        return result
    
    def sigmoid(gamma = 1., intercept = 1.):
        def result(x,y):
            return np.tanh(gamma * np.dot(x.T, y) + intercept)
        return result

In [169]:
class SimpleMkl():
    def __init__(self, kernels, C = 1.):
        '''
        kernels: list of kernel functions
        C: regularization parameter used in SVM
        '''
        self.kernel_functions = kernels
        self.C = C
    
    def fit(self, X, y, verbose = False):
        '''
        X: training points
        y: label value (1 or -1) for each training point
        '''
        self.verbose = verbose
        
        self.x_train = X
        self.y_train = y
        self.y_matrix_outer = np.outer(self.y_train, self.y_train)
        
        self.get_kernel_matrices()
        
        self.epsilon_d = 1e-6
        self.duality_threshold = 0.02  
        self.epsilon_D = 1e-6
        
        
        d = np.repeat(1./self.num_of_kernels, self.num_of_kernels)
        self.model_svm = svm.SVC(C = self.C, kernel = 'precomputed')
             
        outer_iteration = 0
        outer_loop_stop = False
        
        if verbose:
            print('start loop ...')
        prev_dual_gap = None
        
        while not outer_loop_stop:
            # compute J and optimal alpha
            combined_kernel_matrix = self.get_weighted_kernel_matrix(d)
            J_d, alpha = self.get_J(combined_kernel_matrix)
            
            # compute dJ (gradient of J with respect to d)
            dJ = self.get_derivative_J(alpha)
            
            if verbose:
                print('outer iteration: ', outer_iteration)
                print('J_d: ', J_d)
                print('dJ: ', dJ)
                print('alpha: ', alpha)
                
            # change outer_loop_stop            
            duality_gap = self.get_duality_gap(J_d, alpha, dJ)
            
            if verbose:
                print('duality gap: ', duality_gap)
                print('prev dual gap: ', prev_dual_gap)
                
            if outer_iteration == 0:
                prev_dual_gap = duality_gap
            else:    
                if np.abs(duality_gap - prev_dual_gap) < self.duality_threshold:
                    if verbose:
                        print('reach dual gap thershold: ', np.abs(duality_gap - prev_dual_gap))
                    outer_loop_stop = True
                    break
                else:
                    prev_dual_gap = duality_gap
#             kkt_condition = self.get_KKT_condition(outer_iteration, dJ, d)
#             print('kkt', kkt_condition)
#             if kkt_condition:
#                 outer_loop_stop = True
#                 break
            outer_iteration += 1    
            
            # set mu
            # mu is index of the largest component in d for better numerical stability
            mu = np.argmax(d)
            
            # compute D direction
            D = self.get_D_direction(d, dJ, mu)
            ''''''
            D = self.l1_norm_on_sum(D, 0)
            
            if verbose:
                print('mu: ', mu)
                print('D: ', D)
                
            # set parameters for inner loop
            J_prim = 0
            d_prim = np.copy(d)
            D_prim = np.copy(D)
            
            inner_iteration = 0
            # inner loop : descent direction update
            # J_prim < J_d - epsilon
            # and  J_d - J_prim > self.J_thershold
            while J_prim < J_d:
                inner_iteration += 1
                
                if verbose:
                    print('inner iteration: ', inner_iteration)
                
                # set parameters
                d = np.copy(d_prim)
                D = np.copy(D_prim)
                
                # check d
                flag = False
                for i in d:
                    if i == 1:
                        flag = True
                        break
                if flag == True:
                    break
                  
                # find gamma_max, v
                gamma_max, v = self.get_gamma_max_and_index(d, D)
                if gamma_max == None:
                    gamma_max, v = 0, 0
                    
                if verbose:
                    print('d: ', d)
                    print('D: ', D)
                    print('gamma max: ', gamma_max)
                    
                # update d_prim
                d_prim = d + gamma_max * D
                
                # update D_prim direction
                D_prim[mu] = D[mu] + D[v]
                D_prim[v] = 0
                
                ''''''
                d_prim = self.l1_norm_on_sum(d_prim, 1)
                D_prim = self.l1_norm_on_sum(D_prim, 0)
                
                if verbose:
                    print('d_prim: ', d_prim)
                    print('D_prim: ', D_prim)
                
                # update J_prim
                combined_kernel_matrix = self.get_weighted_kernel_matrix(d_prim)
                J_prim, alpha = self.get_J(combined_kernel_matrix)

                if verbose:
                    print('J_prim: ', J_prim)
                    print('alpha_prim: ', alpha)
                    
                # update J_d with inner updated d
                # some use d_prim for J_d
                combined_kernel_matrix = self.get_weighted_kernel_matrix(d)
                J_d, alpha = self.get_J(combined_kernel_matrix)
                
                if verbose:
                    print('J_d: ', J_d)
                    print('alpha: ', alpha)
                    
            # line search
            gamma = self.armijo_rule(gamma_max, d, D, J_d)
            
            if verbose:
                print('armijo gamma: ', gamma)
                print('d: ', d)
                print('D: ', D)
            
            # check d
            flag = False
            for i in d:
                if i == 1:
                    flag = True
                    break
            if flag == False:
                # update d
                d = d + gamma * D
                ''''''
                d = self.l1_norm_on_sum(d, 1)
            
            if verbose:
                print('final d: ', d)
                
        self.kernel_weights = np.copy(d)    
        self.fitted_combined_kernel_matrix = self.get_weighted_kernel_matrix(d)
        self.model_svm.fit(self.fitted_combined_kernel_matrix, self.y_train)
        return self.kernel_weights
        
        
    def predict(self, x_test):
        test_size = x_test.shape[0]
        train_size = self.x_train.shape[0]
        kernel_matrix = np.zeros((test_size, train_size))
        for i in range(test_size):
            for j in range(train_size):
                kernel_matrix[i,j] = sum([self.kernel_weights[m] * self.kernel_functions[m](x_test[i, ], self.x_train[j,]) for m in range(len(self.kernel_functions))])
        
        self.predicted = np.array(self.model_svm.predict(kernel_matrix))
        return self.predicted
    
    
    def score(self, X, y):
        predicted_y = self.predict(X)
        return float(sum(y == predicted_y))/len(y)
    
    
    def matrix_pos_dif(self, matrix, etha):
        # costraint: etha > 0
        return np.all(np.linalg.eigvals(matrix) > etha)

    
    def get_kernel_matrices(self):
        X = self.x_train
        n = self.x_train.shape[0]
        kernels = self.kernel_functions
        M = len(kernels)
        
        kernel_matrices = [np.matrix(np.zeros((n,n))) for i in range(M)]
        for m in range(M):
            for i in range(n):
                for j in range(n):
                    kernel_matrices[m][i,j] = kernels[m](X[i], X[j])
        
        
        # kernel_matrices must be positive definite
        # all eigenvalues greater than some η > 0
        # to enforce a small ridge may be added to the diagonal of the Kernel matrices
        constraint = [self.matrix_pos_dif(kernel_mat, 0) for kernel_mat in kernel_matrices]
#         if self.verbose:
#             print('pos dif: ', constraint)
            
        epsilon = 1e-2
        for i, k_martix in enumerate(kernel_matrices):
            if constraint[i] == False:
                new_kernel = k_martix + epsilon* np.eye(k_martix.shape[0],k_martix.shape[1])
                if self.matrix_pos_dif(new_kernel, 0) == False:
                    pass
#                     print('!!!!!!!!!!! Something wrong here!')
                kernel_matrices[i] = new_kernel
        
        constraint = [self.matrix_pos_dif(kernel_mat, 0) for kernel_mat in kernel_matrices]
#         if self.verbose:
#             print('pos dif: ', constraint)
            
        # with using tuple we can't change it by mistake
        # kernel_matrices ---> (m, n, n)
        self.kernel_matrices = tuple(kernel_matrices)   
        self.num_of_kernels = len(self.kernel_matrices)
        
        return self.kernel_matrices
    
    
    def get_weighted_kernel_matrix(self, d):
        M = self.num_of_kernels
        n = self.kernel_matrices[0].shape[0]

        # combined_kernel_matrix ---> (n,n)
        combined_kernel_matrix = sum([d[m] * self.kernel_matrices[m] for m in range(M)])
        return combined_kernel_matrix
    
    
    def get_J(self, kernel_matrix):
        if self.verbose:
            print('enter J...')
            
        self.model_svm.fit(kernel_matrix, self.y_train)
        
        if self.verbose:
            print('finish svm fit.')
        
        # geting alphas and thier indices in support vectors
        # alpha ---> (num_support_vectors,)
        alpha = self.model_svm.dual_coef_[0]
        indices = self.model_svm.support_
        
        # all_alpha ---> (n,)
        all_alpha = np.zeros((self.y_train.shape[0]))
        n = all_alpha.shape[0]
        for i in range(n):
            if i in indices:  # we reach one of the support vectors
                index = np.where(indices == i)
                index = index[0][0]
                # fill all_alpha
                all_alpha[i] = alpha[index]
            else:
                continue  # it had been filled with zero
                
        # equation 10
        # J ---> scaler (cost function)
        # we should multiply pairwise because of sum_ij(alpa_i* alpha_i * y_i * y_j * kernel_ij)
        J = -0.5*(np.absolute(all_alpha)@(kernel_matrix * self.y_matrix_outer)@(np.absolute(all_alpha.T)))\
               + np.sum(np.absolute(all_alpha))

        # another calculation:
        # alpha_matrix = np.outer(np.absolute(all_alpha),np.absolute(all_alpha))
        # J = -0.5* (sum(sum(alpha_matrix * kernel_matrix * self.y_matrix_outer))) + np.sum(np.absolute(all_alpha))
        
        return J, all_alpha
          
        
    def get_derivative_J(self, alpha):
        # equation 11
        M = self.num_of_kernels
        
        # dJ ---> (M,)
        dJ = np.array([(-0.5 *(alpha@(self.kernel_matrices[m]*self.y_matrix_outer)@alpha.T)) for m in range(M)])
        return dJ
    
    
    def get_D_direction(self, d, dJ, mu):
        # equation 12
        
        #TODO: is it nesscery to normlize dJ or D??
        D = np.zeros(self.num_of_kernels)
        D_mu = 0
        for i in range(self.num_of_kernels):
            if (d[i] < self.epsilon_d or d[i] == 0) and ((dJ[i] - dJ[mu]) > 0):
                D[i] = 0.
            elif i != mu:
                D[i] = -dJ[i] + dJ[mu]
                D_mu -= D[i]
        D[mu] = D_mu
        return D
    
    
    def get_gamma_max_and_index(self, d, D):
        gamma_max = None
        v = None
        for m in range(self.num_of_kernels):
            if (gamma_max == None and D[m] < 0) or (D[m] < 0 and -d[m]/D[m] < gamma_max):
                v = m
                gamma_max = -d[m]/D[m]
        return gamma_max, v
    
    
    def armijo_rule(self, gamma_max, d, D, J_d, beta = 0.9, sigma = 0.01):
        gamma = gamma_max
        armijo_terminated = False
        while not armijo_terminated:
            new_d = d + gamma*D
            
            combined_kernel_matrix = self.get_weighted_kernel_matrix(new_d)
            new_J_d, alpha = self.get_J(combined_kernel_matrix)
            
            new_dJ = self.get_derivative_J(alpha)
            
            if (J_d - new_J_d) >= sigma * np.sum(new_dJ * gamma * D):
                # TODO: new_J_d <= J_d + gamma * armijo_sigma * D.T.dot(dJ)
                armijo_terminated = True
            else:
                gamma *= beta
        return gamma    
   
    
    def get_duality_gap(self, J_d, alpha, dJ):      
#         duality_gap = (J_d - np.sum(np.absolute(alpha)) + np.max(-dJ)) * 2
        duality_gap = (J_d - np.sum(np.absolute(alpha)) + np.max(-dJ)) / J_d
        return duality_gap[0,0]


    def get_KKT_condition(self, iteration, dJ, d, epsilon = 0.01):
        M = len(dJ)
        if iteration == 0:
            return False
        else:
            dJ_min = 1e+4
            dJ_max = -1e+4
            dm0_min = 1e+5
            for m in range(M):
                if d[m] > 0:
                    if dJ[m] < dJ_min:
                        dJ_min = dJ[m]

                    if dJ[m] > dJ_max:
                        dJ_max = dJ[m]
                else:
                    if dJ[m] < dm0_min:
                        dm0_min = dJ[m]
            result = (dJ_max - dJ_min < epsilon) and dm0_min >= dJ_max
            return result

        
    def l1_norm_on_sum(self, input_list, sum_):
        for i in range(len(input_list)):
            input_list[i] = round(input_list[i], 8)

        u = np.argmax(abs(input_list))
        diff = sum_ - sum(input_list)
        if diff <= 0.0001:
            input_list[u] += diff
            return input_list


In [257]:
class DataSet:               
    def load_datasets(self, dataset_path):
        self.dataframe = pd.read_csv(dataset_path, sep=',', header=None)
        index_label = self.dataframe.columns[-1]
#         print(self.dataframe[index_label])
#         self.dataframe = shuffle(self.dataframe)
        
        self.train_size = int(len(self.dataframe))
        self.x_train = self.dataframe.copy()
        self.x_train = self.x_train.drop([index_label], axis=1).to_numpy(dtype='float64')

        self.y_train = self.dataframe[index_label]
        self.y_train = self.y_train.to_numpy()
        
        print("Finished reading dataset ...")
        
    def __init__(self, dataset_path):
        self.load_datasets(dataset_path)

In [291]:
kernel_1 = Kernel.gaussian(1.0)
kernel_2 = Kernel.gaussian(0.5)
kernel_3 = Kernel.gaussian(2.0)
kernel_4 = Kernel.gaussian(0.1)
kernel_5 = Kernel.linear()
kernel_6 = Kernel.polynomial(2, 1)
kernel_7 = Kernel.polynomial(3, 1)
kernel_8 = Kernel.sigmoid()

In [175]:
class Call_on_datasets():
    def __init__(self, data_path_train, data_path_test, flag=False):
        self.data_train = DataSet(dataset_path= data_path_train)
        self.data_test = DataSet(dataset_path= data_path_test)

        print('Dataset shape: ', self.data_train.x_train.shape, self.data_train.y_train.shape)
#         print('Data Test: ', self.data_test.x_train.shape, self.data_test.y_train.shape)
        
        if flag:
            sampling_size = self.data_train.x_train.shape[0]
            tmp_y_train = self.data_test.y_train.reshape(-1, 1)

            data_t = np.append(self.data_test.x_train, tmp_y_train, axis=1)
            _data_train = sample(list(data_t), sampling_size)
            _data_train = np.asarray(_data_train)
            _y_train = _data_train[:, -1]
            _x_train = np.delete(_data_train, -1, axis=1)
            
            self.data_test.x_train = _x_train
            self.data_test.y_train = _y_train
#             print('Data Test: ', self.data_test.x_train.shape, self.data_test.y_train.shape)


    def evalue(self, list_kernels, verbose=True):
        list_kernels = tuple(list_kernels)
        if verbose:
            print('\nSimpleMKL..')
        start = time.time()
        simpleMkl_model = SimpleMkl(C=1., kernels=list_kernels)
        simpleMkl_model.fit(self.data_train.x_train, self.data_train.y_train, verbose=False)
        end = time.time()
        processing_time = end - start
        if verbose:
            print('Running Time:  ', processing_time,'s')
            print('weights: ', simpleMkl_model.kernel_weights)

            print('\n')
            print('Train Accuracy: ')
        start = time.time()
        self.train_acc = simpleMkl_model.score(self.data_train.x_train, self.data_train.y_train)* 100
        end = time.time()
        processing_time = end - start
        if verbose:
            print(self.train_acc, '%')
            print('Testing execution time: ', processing_time,'s')

            print('\n')
            print('Test Accuracy: ')
        start = time.time()
        self.test_acc = simpleMkl_model.score(self.data_test.x_train, self.data_test.y_train)* 100
        end = time.time()
        processing_time = end - start
        if verbose:
            print(self.test_acc, '%')
            print('Testing execution time: ', processing_time,'s')


# Breast-w

In [8]:
breast_w = Call_on_datasets(data_path_train= './NewDatasets/new_breast_w_train.data', \
                           data_path_test= './NewDatasets/new_breast_w_test.data', flag=True)

list_kernels = [kernel_8, kernel_1, kernel_2, kernel_3, kernel_4, kernel_5]   
# list_kernels = [kernel_6, kernel_1, kernel_2]   

breast_w.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (25, 10) (25,)

SimpleMKL..
Running Time:   43.650038957595825 s
weights:  [0. 0. 0. 0. 0. 1.]


Train Accuracy: 
44.0 %
Testing execution time:  0.040164947509765625 s


Test Accuracy: 
64.0 %
Testing execution time:  0.040081024169921875 s


# Messidor

In [60]:
messidor = Call_on_datasets(data_path_train= './NewDatasets/new_messidor_train.data', \
                           data_path_test= './NewDatasets/new_messidor_test.data', flag=True)

list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4]   
# list_kernels = [kernel_6, kernel_1]   
list_kernels = [kernel_6, kernel_1, kernel_2]   

messidor.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (40, 19) (40,)

SimpleMKL..
Running Time:   1.349165916442871 s
weights:  [1. 0. 0.]


Train Accuracy: 
100.0 %
Testing execution time:  0.11119771003723145 s


Test Accuracy: 
70.0 %
Testing execution time:  0.16142964363098145 s


# Car

In [211]:
car = Call_on_datasets(data_path_train= './NewDatasets/new_car_train.data', \
                           data_path_test= './NewDatasets/new_car_test.data', flag=True)

# list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4]   
list_kernels = [kernel_8, kernel_1, kernel_2]   

car.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (62, 6) (62,)

SimpleMKL..
Running Time:   0.5553033351898193 s
weights:  [0. 1. 0.]


Train Accuracy: 
100.0 %
Testing execution time:  0.36735081672668457 s


Test Accuracy: 
98.38709677419355 %
Testing execution time:  0.2961091995239258 s


In [176]:
car = Call_on_datasets(data_path_train= './NewDatasets/new_car_train.data', \
                           data_path_test= './NewDatasets/new_car_test.data', flag=True)

list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4]   
# list_kernels = [kernel_6, kernel_1, kernel_2]   

car.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (62, 6) (62,)

SimpleMKL..
Running Time:   0.6819605827331543 s
weights:  [1. 0. 0. 0.]


Train Accuracy: 
100.0 %
Testing execution time:  0.5845355987548828 s


Test Accuracy: 
91.93548387096774 %
Testing execution time:  0.47055482864379883 s


# SpamBase

In [73]:
spambase = Call_on_datasets(data_path_train= './NewDatasets/new_Spambase_train.data', \
                           data_path_test= './NewDatasets/new_Spambase_test.data', flag=True)

list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4]   
list_kernels = [kernel_5, kernel_1, kernel_2]   

spambase.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (26, 57) (26,)

SimpleMKL..
Running Time:   13.975008487701416 s
weights:  [1. 0. 0.]


Train Accuracy: 
96.15384615384616 %
Testing execution time:  0.049661874771118164 s


Test Accuracy: 
88.46153846153845 %
Testing execution time:  0.040589332580566406 s


# Coil2000

In [77]:
coil = Call_on_datasets(data_path_train= './NewDatasets/new_coil2000_train.data', \
                           data_path_test= './NewDatasets/new_coil2000_test.data', flag=True)

list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4]   
# list_kernels = [kernel_5, kernel_6, kernel_2]   

coil.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)

SimpleMKL..
Running Time:   1.5594041347503662 s
weights:  [0. 0. 1. 0.]


Train Accuracy: 
95.23809523809523 %
Testing execution time:  0.9004759788513184 s


Test Accuracy: 
96.42857142857143 %
Testing execution time:  0.8556923866271973 s


# Bank

In [81]:
bank = Call_on_datasets(data_path_train= './NewDatasets/new_bank_train.data', \
                           data_path_test= './NewDatasets/new_bank_test.data', flag=True)

# list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4]   
# list_kernels = [kernel_5, kernel_1, kernel_2]
list_kernels = [kernel_8, kernel_1]

bank.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)

SimpleMKL..
Running Time:   0.06968379020690918 s
weights:  [0.48609358 0.51390642]


Train Accuracy: 
92.3076923076923 %
Testing execution time:  0.017798900604248047 s


Test Accuracy: 
100.0 %
Testing execution time:  0.017586469650268555 s


# Skin

In [93]:
skin = Call_on_datasets(data_path_train= './NewDatasets/new_skin_train.data', \
                           data_path_test= './NewDatasets/new_skin_test.data', flag=True)

# list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4]   
list_kernels = [kernel_3, kernel_1, kernel_8, kernel_2, kernel_4]   

skin.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)

SimpleMKL..
Running Time:   0.040435791015625 s
weights:  [0.31158425 0.19471775 0.10434062 0.19467869 0.19467869]


Train Accuracy: 
100.0 %
Testing execution time:  0.0974891185760498 s


Test Accuracy: 
100.0 %
Testing execution time:  0.11562156677246094 s


# covertype

In [289]:
covtype = Call_on_datasets(data_path_train= './NewDatasets/new_covtype_train.data', \
                           data_path_test= './NewRandomSelectionDatasets_COVT/new_covtype_test_0.data', flag=True)

list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4, kernel_5, kernel_6, kernel_7, kernel_8]   
# list_kernels = [kernel_8, kernel_6, kernel_2]   

covtype.evalue(list_kernels)

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)

SimpleMKL..
Running Time:   0.02194070816040039 s
weights:  [0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125]


Train Accuracy: 
100.0 %
Testing execution time:  0.017953157424926758 s


Test Accuracy: 
93.33333333333333 %
Testing execution time:  0.017946958541870117 s


## compare with random

In [239]:
import Datasets_final as Datasets_F

In [108]:
class RandomSelection:
    def __init__(self, dataset, sampling_size=25):
        self.x_train = dataset.x_train
        self.y_train = dataset.y_train
        
        data_t = np.append(self.x_train, self.y_train, axis=1)
        
        self.data_train = sample(list(data_t), sampling_size)
        self.data_train = np.asarray(self.data_train)
        self.y_train = self.data_train[:, -1]
        self.x_train = np.delete(self.data_train, -1, axis=1)

In [97]:
class NewDatasetHumanLabeling:
    def __init__(self, dataset, new_x_train, output_dataset_path):
        start = time.time()

        new_y_train = self.LabelingNewDataset(dataset, new_x_train)
        self.create_new_dataset_csv_file(new_x_train, new_y_train, output_dataset_path)

        end = time.time()
        self.processing_time = end - start
    
    def LabelingNewDataset(self, dataset, new_x_train):
        new_y_train = []
        i = 0
        for new_x in new_x_train:
            new_y_train.append(dataset.y_train[(dataset.x_train==new_x).all(axis=1).nonzero()[0][0]][0])
        return new_y_train

    def create_new_dataset_csv_file(self, new_x_train, new_y_train, output_dataset_path):
        df = pd.DataFrame(new_x_train)
        df[len(df.columns)] = new_y_train
        # create a new csv file
        df.to_csv("test.csv", index=False)
        
        # remove first line of csv file which is the header of each coloumn
        with open("test.csv",'r') as f:
            with open(output_dataset_path,'w') as f1:
                next(f) # skip header line
                for line in f:
                    f1.write(line)
        os.remove("test.csv")

In [116]:
def save_data_test(x_test, y_test, output_dataset_path):
    df = pd.DataFrame(x_test)
    df[len(df.columns)] = y_test
    # create a new csv file
    df.to_csv("test.csv", index=False)

    # remove first line of csv file which is the header of each coloumn
    with open("test.csv",'r') as f:
        with open(output_dataset_path,'w') as f1:
            next(f) # skip header line
            for line in f:
                f1.write(line)
    os.remove("test.csv")

#  coil

In [137]:
class run_n_algo():
    def __init__(self, method):
        self.method = method
        
    def get_random_dataset(self, dataset, sampling_size):
#         print(dataset.x_train.shape, dataset.y_train.shape)
        random_selection = RandomSelection(dataset, sampling_size=sampling_size)
#         print('random: ', random_selection.x_train.shape, random_selection.y_train.shape)
        return random_selection
        
    def save_coil(self):
        dataset = Datasets_F.Coil2000_Dataset(dataset_path='./Datasets/coil2000.dat',dataset_name='Coil2000', 
                            train_size=0.02, normalization_method='None')
        random_selection = self.get_random_dataset(dataset, 68)
        save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_coil2000_train.data")
        save_data_test(dataset.x_test, dataset.y_test, "./NewRandomSelectionDatasets/new_coil2000_test.data")
        
    def call_coil(self):
        self.train_acc = []
        self.test_acc = []
        for i in range(15):
            if self.method=='random':
                self.save_coil()
                coil = Call_on_datasets(data_path_train= "./NewRandomSelectionDatasets/new_coil2000_train.data", \
                                   data_path_test= "./NewRandomSelectionDatasets/new_coil2000_test.data", flag=True)
            else: 
                coil = Call_on_datasets(data_path_train= './NewDatasets/new_coil2000_train.data', \
                           data_path_test= './NewDatasets/new_coil2000_test.data', flag=True)

            list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4]   

            coil.evalue(list_kernels, verbose=False)
            self.train_acc.append(coil.train_acc)
            self.test_acc.append(coil.test_acc)
        print('all accuracies: ', self.test_acc)
        print('min: ', min(self.test_acc))
        print('max: ', max(self.test_acc))
        print('mean: ', np.mean(self.test_acc))

In [136]:
random_method = run_n_algo('random')
random_method.call_coil()

Started reading dataset  Coil2000 ...
Finished reading dataset  Coil2000 ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (68, 85) (68,)
Started reading dataset  Coil2000 ...
Finished reading dataset  Coil2000 ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (68, 85) (68,)
Started reading dataset  Coil2000 ...
Finished reading dataset  Coil2000 ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (68, 85) (68,)
Started reading dataset  Coil2000 ...
Finished reading dataset  Coil2000 ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (68, 85) (68,)
Started reading dataset  Coil2000 ...
Finished reading dataset  Coil2000 ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (68, 85) (68,)
Started reading dataset  Coil2000 ...
Finished reading dataset  Coil2000 ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (68, 85) (68,)
Star

In [138]:
random_method = run_n_algo('selection')
random_method.call_coil()

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (84, 85) (84,)
Finished reading data

# bank

In [240]:
class run_n_bank():
    def __init__(self, method):
        self.method = method
        
    def get_random_dataset(self, dataset, sampling_size):
#         print(dataset.x_train.shape, dataset.y_train.shape)
        random_selection = RandomSelection(dataset, sampling_size=sampling_size)
#         print('random: ', random_selection.x_train.shape, random_selection.y_train.shape)
        return random_selection
        
    def save_bank(self):
        dataset = Datasets_F.Bank_Marketing_Dataset('./Datasets/bank-full.csv', "Bank Marketing", 'y', 
                                      train_size=0.1, normalization_method="None", balance=True)
        random_selection = self.get_random_dataset(dataset, 13)
        save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_bank_train.data")
        save_data_test(dataset.x_test, dataset.y_test, "./NewRandomSelectionDatasets/new_bank_test.data")
        
    def call_bank(self):
        self.train_acc = []
        self.test_acc = []
        for i in range(15):
            if self.method=='random':
                self.save_bank()
                bank = Call_on_datasets(data_path_train= "./NewRandomSelectionDatasets/new_bank_train.data", \
                                   data_path_test= "./NewRandomSelectionDatasets/new_bank_test.data", flag=True)
            else: 
                bank = Call_on_datasets(data_path_train= './NewDatasets/new_bank_train.data', \
                           data_path_test= './NewDatasets/new_bank_test.data', flag=True)

            list_kernels = [kernel_8, kernel_1]

            bank.evalue(list_kernels, verbose=False)
            self.train_acc.append(bank.train_acc)
            self.test_acc.append(bank.test_acc)
        print('all accuracies: ', self.test_acc)
        print('min: ', min(self.test_acc))
        print('max: ', max(self.test_acc))
        print('mean: ', np.mean(self.test_acc))

In [241]:
random_method = run_n_bank('random')
random_method.call_bank()

Started reading dataset  Bank Marketing ...
Finished reading dataset  Bank Marketing ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Started reading dataset  Bank Marketing ...
Finished reading dataset  Bank Marketing ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Started reading dataset  Bank Marketing ...
Finished reading dataset  Bank Marketing ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Started reading dataset  Bank Marketing ...
Finished reading dataset  Bank Marketing ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Started reading dataset  Bank Marketing ...
Finished reading dataset  Bank Marketing ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Started reading dataset  Bank Marketing ...
Finished reading dataset  Bank Marketing ...
Finished reading data

In [143]:
select_method = run_n_bank('selection')
select_method.call_bank()

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (13, 16) (13,)
Finished reading data

# skin

In [146]:
class run_n_skin():
    def __init__(self, method):
        self.method = method
        
    def get_random_dataset(self, dataset, sampling_size):
#         print(dataset.x_train.shape, dataset.y_train.shape)
        random_selection = RandomSelection(dataset, sampling_size=sampling_size)
#         print('random: ', random_selection.x_train.shape, random_selection.y_train.shape)
        return random_selection
        
    def save_skin(self):
        dataset = Datasets_F.Skin_NonSkin_Dataset('./Datasets/Skin_NonSkin.txt', "Skin Segmentation",
                                             train_size=0.0005, normalization_method="None")
        random_selection = self.get_random_dataset(dataset, 18)
        save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_skin_train.data")
        save_data_test(dataset.x_test, dataset.y_test, "./NewRandomSelectionDatasets/new_skin_test.data")
        
    def call_skin(self):
        self.train_acc = []
        self.test_acc = []
        for i in range(15):
            if self.method=='random':
                self.save_skin()
                skin = Call_on_datasets(data_path_train= "./NewRandomSelectionDatasets/new_skin_train.data", \
                                   data_path_test= "./NewRandomSelectionDatasets/new_skin_test.data", flag=True)
            else: 
                skin = Call_on_datasets(data_path_train= './NewDatasets/new_skin_train.data', \
                           data_path_test= './NewDatasets/new_skin_test.data', flag=True)

            list_kernels = [kernel_3, kernel_1, kernel_8, kernel_2, kernel_4]   

            skin.evalue(list_kernels, verbose=False)
            self.train_acc.append(skin.train_acc)
            self.test_acc.append(skin.test_acc)
        print('all accuracies: ', self.test_acc)
        print('min: ', min(self.test_acc))
        print('max: ', max(self.test_acc))
        print('mean: ', np.mean(self.test_acc))

In [147]:
random_method = run_n_skin('random')
random_method.call_skin()

Started reading dataset  Skin Segmentation ...
Finished reading dataset  Skin Segmentation ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Started reading dataset  Skin Segmentation ...
Finished reading dataset  Skin Segmentation ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Started reading dataset  Skin Segmentation ...
Finished reading dataset  Skin Segmentation ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Started reading dataset  Skin Segmentation ...
Finished reading dataset  Skin Segmentation ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Started reading dataset  Skin Segmentation ...
Finished reading dataset  Skin Segmentation ...
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Started reading dataset  Skin Segmentation ...
Finished reading dataset  Skin Segment

In [150]:
select_method = run_n_skin('selection')
select_method.call_skin()

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Fin

In [155]:
select_method = run_n_skin('selection')
select_method.call_skin()

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (18, 3) (18,)
Finished reading dataset ...
Fin

# covertype

In [254]:
class run_n_covt():
    def __init__(self, method):
        self.method = method
        
    def get_random_dataset(self, dataset, sampling_size):
#         print(dataset.x_train.shape, dataset.y_train.shape)
        random_selection = RandomSelection(dataset, sampling_size=sampling_size)
#         print('random: ', random_selection.x_train.shape, random_selection.y_train.shape)
        return random_selection
        
    def save_covt(self, iteration):
        dataset = Datasets_F.Covertype_Dataset('./Datasets/covtype.data', "Covertype", 
                                               train_size=0.02, normalization_method="None")
        random_selection = self.get_random_dataset(dataset, 50)
        save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_covtype_train_"+str(iteration)+".data")
        save_data_test(dataset.x_test, dataset.y_test, "./NewRandomSelectionDatasets/new_covtype_test_"+str(iteration)+".data")
        
    def call_covt(self):
        self.train_acc = []
        self.test_acc = []
        for i in range(15):
            if self.method=='random':
#                 self.save_covt(i)
                covt = Call_on_datasets(data_path_train= "./NewRandomSelectionDatasets_COVT/new_covtype_train_"+str(i)+".data", \
                                   data_path_test= "./NewRandomSelectionDatasets_COVT/new_covtype_test_"+str(i)+".data", flag=True)
            else: 
                covt = Call_on_datasets(data_path_train= './NewDatasets/new_covtype_train.data', \
                           data_path_test= './NewRandomSelectionDatasets_COVT/new_covtype_test_0.data', flag=True)

#             list_kernels = [kernel_3, kernel_1, kernel_8, kernel_2, kernel_4]   
            list_kernels = [kernel_1, kernel_2, kernel_3, kernel_4, kernel_5, kernel_6, kernel_7, kernel_8]   

            covt.evalue(list_kernels, verbose=False)
            self.train_acc.append(covt.train_acc)
            self.test_acc.append(covt.test_acc)
        print('all accuracies: ', self.test_acc)
        print('min: ', min(self.test_acc))
        print('max: ', max(self.test_acc))
        print('mean: ', np.mean(self.test_acc))

In [255]:
random_method = run_n_covt('random')
random_method.call_covt()

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (50, 54) (50,)
Finished reading data

In [256]:
select_method = run_n_covt('selection')
select_method.call_covt()

Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading dataset ...
Finished reading dataset ...
Dataset shape:  (15, 54) (15,)
Finished reading data