# Training & Testing selected data using LibSVM

In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.svm import SVC

In [2]:
import Datasets

## Defining the ReadDataset class to read the datasets created in the NewDatasets folder after instance selection

In [3]:
class ReadDataset:
    """Load a pre-split, comma-separated dataset into NumPy arrays.

    Each non-blank line of both files is parsed as comma-separated floats.
    The last column is used as the class label, the remaining columns as
    features. Rows are shuffled on load with ``np.random.shuffle``.

    Attributes set by the constructor:
        x_train, y_train: features / labels read from ``data_train_path``.
        x_test, y_test: features / labels read from ``data_test_path``.
        data_: the full array (features + label column) of the file read last.

    ``normalization_method`` is only stored; the constructor does not apply
    it. Call ``normalize`` explicitly to compute the normalized copies.
    """

    def __init__(self, dataset_name, data_train_path, data_test_path, normalization_method='none'):
        """Store the configuration and read both files immediately.

        Args:
            dataset_name: label used only in the progress messages.
            data_train_path: path of the comma-separated training file.
            data_test_path: path of the comma-separated test file.
            normalization_method: stored on the instance, not applied here.
        """
        self.dataset_name = dataset_name
        self.data_train_path = data_train_path
        self.data_test_path = data_test_path
        self.normalization_method = normalization_method

        self.read_data_train()
        self.read_data_test()

    def _read_shuffled(self, path):
        """Parse ``path`` into a 2-D float array and shuffle its rows."""
        rows = []
        with open(path) as data_file_pointer:
            for line in data_file_pointer:
                # A blank (e.g. trailing) line would make float() raise.
                if not line.strip():
                    continue
                rows.append([float(field) for field in line.split(',')])

        data = np.array(rows)
        np.random.shuffle(data)
        return data

    def read_data_train(self):
        """Read the training file and set ``self.x_train`` / ``self.y_train``."""
        print("Started reading data train of dataset ", self.dataset_name, "...")

        self.data_ = self._read_shuffled(self.data_train_path)

        # separate class labels from data
        self.y_train = self.data_[:, -1]      # last column
        self.x_train = self.data_[:, :-1]     # all but last column

        print("Finished reading data train of dataset ", self.dataset_name, "...")

    def read_data_test(self):
        """Read the test file and set ``self.x_test`` / ``self.y_test``."""
        print("Started reading data test of dataset ", self.dataset_name, "...")

        self.data_ = self._read_shuffled(self.data_test_path)

        # separate class labels from data
        self.y_test = self.data_[:, -1]       # last column
        self.x_test = self.data_[:, :-1]      # all but last column

        print("Finished reading data test of dataset ", self.dataset_name, "...")

    def normalize(self, normalization_method):
        """Compute normalized copies of the train and test features.

        Normalizing data improves the convergence of the learning model and
        lets features with a small range also affect the model parameters.

        The results are stored in ``self.x_train_without_x0`` and
        ``self.x_test_without_x0``; ``self.x_train`` / ``self.x_test`` are
        left untouched.

        Args:
            normalization_method: ``'none'``, ``'zero_mean_unit_var'`` or
                ``'scale_0_1'`` (compared case-insensitively, so ``'None'``
                and ``None`` also mean no normalization).
        """
        method = str(normalization_method).lower()

        if method == 'none':
            print("No normalization.")
            return

        # NOTE(review): the test split is normalized with its own statistics,
        # not with those of the training split - confirm this is intended.
        if method == 'zero_mean_unit_var':
            print("zero-mean & unit_variance normalization.")
            self.x_train_without_x0 = self.zero_mean_unit_variance(self.x_train)
            self.x_test_without_x0 = self.zero_mean_unit_variance(self.x_test)
        elif method == 'scale_0_1':
            print("scaling to [0, 1] normalization.")
            self.x_train_without_x0 = self.scaling_between_0_1(self.x_train)
            self.x_test_without_x0 = self.scaling_between_0_1(self.x_test)
        else:
            print("Unknown normalization method:", normalization_method, "- data left unchanged.")

    def scaling_between_0_1(self, numpy_array):
        """Min-max scale every column of ``numpy_array`` to [0, 1].

        A constant column has a range of 0; it is mapped to 0 instead of
        producing nan through a 0/0 division.
        """
        column_min = numpy_array.min(axis=0)
        column_range = numpy_array.max(axis=0) - column_min
        column_range = np.where(column_range == 0, 1, column_range)
        return (numpy_array - column_min) / column_range

    def zero_mean_unit_variance(self, numpy_array):
        """Standardize every column to zero mean and unit variance.

        A constant column has a standard deviation of 0; it is mapped to 0
        instead of producing nan through a 0/0 division.
        """
        column_std = numpy_array.std(axis=0)
        column_std = np.where(column_std == 0, 1, column_std)
        return (numpy_array - numpy_array.mean(axis=0)) / column_std


## Defining the *LibSVM_Method* class with training & testing functions

In [4]:
class LibSVM_Method:
    """Wrapper around scikit-learn's ``SVC`` that also records timings.

    Attributes set while the object is used:
        svm_model: the fitted ``SVC``; created by ``train``.
        train_processing_time: seconds spent in the last ``train`` call.
        prediction_accuracy_processing_time: seconds spent in the last
            ``svm_mean_accuracy`` call.
    """

    def __init__(self, C=1, kernel='linear', svm_library='sklearn'):
        """Store the SVC hyper-parameters.

        Args:
            C: passed to ``SVC`` as ``C``.
            kernel: passed to ``SVC`` as ``kernel``.
            svm_library: stored only; nothing in this class reads it.
        """
        self.C = C
        self.kernel = kernel
        self.svm_library = svm_library

    def accuracy(self, y, y_pred):
        """Return the percentage of positions where ``y_pred`` equals ``y``.

        Raises:
            ValueError: if ``y`` is empty or the two sequences differ in
                length (previously a ZeroDivisionError, an IndexError, or a
                silently ignored tail of ``y_pred``).
        """
        total = len(y)
        if total == 0:
            raise ValueError("accuracy is undefined for an empty label sequence")
        if len(y_pred) != total:
            raise ValueError("y and y_pred must have the same length")

        hits = sum(1 for truth, predicted in zip(y, y_pred) if predicted == truth)
        return (hits / float(total)) * 100

    # returns the score|mean accuracy of the svm model
    def svm_mean_accuracy(self, data_x, data_y, dataset_name):
        """Score the fitted model on ``data_x``/``data_y``, in percent.

        ``train`` must have been called first so that ``self.svm_model``
        exists. The elapsed time is stored in
        ``self.prediction_accuracy_processing_time``.
        """
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()

        model_score = self.svm_model.score(data_x, data_y)

        self.prediction_accuracy_processing_time = time.perf_counter() - start
        print("Calculating accuracy of LibSVM on dataset ", dataset_name, " Finished after ", self.prediction_accuracy_processing_time)

        return model_score*100

    def train(self, x_train, y_train, dataset_name):
        """Fit a fresh ``SVC`` on the given data, replacing any earlier model.

        The elapsed time (model construction plus fitting) is stored in
        ``self.train_processing_time``.
        """
        print("Started training LibSVM on dataset " + dataset_name + " ...")
        start = time.perf_counter()
        self.svm_model = SVC(C=self.C, kernel=self.kernel)
        self.svm_model.fit(x_train, y_train)
        self.train_processing_time = time.perf_counter() - start
        print("Training LibSVM on dataset ", dataset_name, " Finished after ", self.train_processing_time)

In [5]:
# Shared classifier wrapper built with the constructor defaults (C=1, kernel='linear').
# The experiment cells reuse this one instance; every train() call replaces its fitted model.
libsvm = LibSVM_Method()

# Testing LibSVM with NewDatasets created by instance selection

## Breast-W

In [16]:
# Load the instance-selected Breast-W train/test files (normalization_method left at its default 'none').
breast_dataset = ReadDataset('Breast-w', './NewDatasets/new_breast_w_train.data', './NewDatasets/new_breast_w_test.data')


Started reading data train of dataset  Breast-w ...
Finished reading data train of dataset  Breast-w ...
Started reading data test of dataset  Breast-w ...
Finished reading data test of dataset  Breast-w ...


In [9]:
# Fit on the instance-selected Breast-W training split, then print the accuracy (percent)
# and the scoring time for the train and the test split.
# prediction_accuracy_processing_time is overwritten by every svm_mean_accuracy call; print
# arguments are evaluated left to right, so each line shows the time of its own scoring call.
libsvm.train(breast_dataset.x_train, breast_dataset.y_train, "Breast-w")
print("\n")
print("Accuracy on data train: ", libsvm.svm_mean_accuracy(breast_dataset.x_train, breast_dataset.y_train, "Breast-w"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(breast_dataset.x_test, breast_dataset.y_test, "Breast-w"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Breast-w ...
Training LibSVM on dataset  Breast-w  Finished after  0.4678378105163574


Calculating accuracy of LibSVM on dataset  Breast-w  Finished after  0.001997232437133789
Accuracy on data train:  40.0  | processing time:  0.001997232437133789
Calculating accuracy of LibSVM on dataset  Breast-w  Finished after  0.0020105838775634766
Accuracy on data test:  34.146341463414636  | processing time:  0.0020105838775634766


## Messidor

In [10]:
# Load the instance-selected Messidor train/test files (normalization_method left at its default 'none').
messidor_dataset = ReadDataset('Messidor', './NewDatasets/new_messidor_train.data', './NewDatasets/new_messidor_test.data')


Started reading data train of dataset  Messidor ...
Finished reading data train of dataset  Messidor ...
Started reading data test of dataset  Messidor ...
Finished reading data test of dataset  Messidor ...


In [11]:
# Fit on the instance-selected Messidor training split, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(messidor_dataset.x_train, messidor_dataset.y_train, "Messidor")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(messidor_dataset.x_train, messidor_dataset.y_train, "Messidor"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(messidor_dataset.x_test, messidor_dataset.y_test, "Messidor"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Messidor ...
Training LibSVM on dataset  Messidor  Finished after  0.0830233097076416
Calculating accuracy of LibSVM on dataset  Messidor  Finished after  0.0010020732879638672
Accuracy on data train:  90.69767441860465  | processing time:  0.0010020732879638672
Calculating accuracy of LibSVM on dataset  Messidor  Finished after  0.001990795135498047
Accuracy on data test:  66.79536679536679  | processing time:  0.001990795135498047


## Car

In [125]:
# Load the instance-selected Car train/test files (normalization_method left at its default 'none').
car_dataset = ReadDataset('Car', './NewDatasets/new_car_train.data', './NewDatasets/new_car_test.data')


Started reading data train of dataset  Car ...
Finished reading data train of dataset  Car ...
Started reading data test of dataset  Car ...
Finished reading data test of dataset  Car ...


In [126]:
# Fit on the instance-selected Car training split, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(car_dataset.x_train, car_dataset.y_train, "Car")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(car_dataset.x_train, car_dataset.y_train, "Car"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(car_dataset.x_test, car_dataset.y_test, "Car"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Car ...
Training LibSVM on dataset  Car  Finished after  0.013097524642944336
Calculating accuracy of LibSVM on dataset  Car  Finished after  0.007983207702636719
Accuracy on data train:  81.25  | processing time:  0.007983207702636719
Calculating accuracy of LibSVM on dataset  Car  Finished after  0.04876089096069336
Accuracy on data test:  74.87146529562982  | processing time:  0.04876089096069336


## Spambase

In [12]:
# Load the instance-selected Spambase train/test files (normalization_method left at its default 'none').
spambase_dataset = ReadDataset('Spambase', './NewDatasets/new_Spambase_train.data', './NewDatasets/new_Spambase_test.data')


Started reading data train of dataset  Spambase ...
Finished reading data train of dataset  Spambase ...
Started reading data test of dataset  Spambase ...
Finished reading data test of dataset  Spambase ...


In [13]:
# Fit on the instance-selected Spambase training split, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(spambase_dataset.x_train, spambase_dataset.y_train, "Spambase")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(spambase_dataset.x_train, spambase_dataset.y_train, "Spambase"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(spambase_dataset.x_test, spambase_dataset.y_test, "Spambase"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Spambase ...
Training LibSVM on dataset  Spambase  Finished after  0.0020003318786621094
Calculating accuracy of LibSVM on dataset  Spambase  Finished after  0.001999378204345703
Accuracy on data train:  100.0  | processing time:  0.001999378204345703
Calculating accuracy of LibSVM on dataset  Spambase  Finished after  0.00799870491027832
Accuracy on data test:  38.145823273780785  | processing time:  0.00799870491027832


## Coil2000 dataset

In [14]:
# Load the instance-selected Coil2000 train/test files. The first argument is only used in
# the reader's progress messages; it previously said 'Breast-w' (copy-paste slip), which
# mislabelled those messages.
coil2000_dataset = ReadDataset('Coil2000', './NewDatasets/new_coil2000_train.data', './NewDatasets/new_coil2000_test.data')


Started reading data train of dataset  Breast-w ...
Finished reading data train of dataset  Breast-w ...
Started reading data test of dataset  Breast-w ...
Finished reading data test of dataset  Breast-w ...


In [15]:
# Fit on the instance-selected Coil2000 training split, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(coil2000_dataset.x_train, coil2000_dataset.y_train, "Coil2000")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(coil2000_dataset.x_train, coil2000_dataset.y_train, "Coil2000"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(coil2000_dataset.x_test, coil2000_dataset.y_test, "Coil2000"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Coil2000 ...
Training LibSVM on dataset  Coil2000  Finished after  0.0030014514923095703
Calculating accuracy of LibSVM on dataset  Coil2000  Finished after  0.0009999275207519531
Accuracy on data train:  100.0  | processing time:  0.0009999275207519531
Calculating accuracy of LibSVM on dataset  Coil2000  Finished after  0.0279998779296875
Accuracy on data test:  57.64595886141699  | processing time:  0.0279998779296875


## Bank Marketing

In [132]:
# Load the instance-selected Bank Marketing train/test files (normalization_method left at its default 'none').
bank_dataset = ReadDataset('Bank', './NewDatasets/new_bank_train.data', './NewDatasets/new_bank_test.data')


Started reading data train of dataset  Bank ...
Finished reading data train of dataset  Bank ...
Started reading data test of dataset  Bank ...
Finished reading data test of dataset  Bank ...


In [133]:
# Fit on the instance-selected Bank training split, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(bank_dataset.x_train, bank_dataset.y_train, "Bank")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(bank_dataset.x_train, bank_dataset.y_train, "Bank"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(bank_dataset.x_test, bank_dataset.y_test, "Bank"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Bank ...
Training LibSVM on dataset  Bank  Finished after  1.7501986026763916
Calculating accuracy of LibSVM on dataset  Bank  Finished after  0.0029973983764648438
Accuracy on data train:  100.0  | processing time:  0.0029973983764648438
Calculating accuracy of LibSVM on dataset  Bank  Finished after  0.07204341888427734
Accuracy on data test:  74.1140329319243  | processing time:  0.07204341888427734


## Skin Segmentation

In [62]:
# Load the instance-selected Skin Segmentation train/test files (normalization_method left at its default 'none').
skin_dataset = ReadDataset('Skin', './NewDatasets/new_skin_train.data', './NewDatasets/new_skin_test.data')


Started reading data train of dataset  Skin ...
Finished reading data train of dataset  Skin ...
Started reading data test of dataset  Skin ...
Finished reading data test of dataset  Skin ...


In [64]:
# Fit on the instance-selected Skin training split, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(skin_dataset.x_train, skin_dataset.y_train, "Skin")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(skin_dataset.x_train, skin_dataset.y_train, "Skin"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(skin_dataset.x_test, skin_dataset.y_test, "Skin"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Skin ...
Training LibSVM on dataset  Skin  Finished after  0.002995729446411133
Calculating accuracy of LibSVM on dataset  Skin  Finished after  0.0014698505401611328
Accuracy on data train:  100.0  | processing time:  0.0014698505401611328
Calculating accuracy of LibSVM on dataset  Skin  Finished after  0.36596083641052246
Accuracy on data test:  94.38381611447934  | processing time:  0.36596083641052246


## Covertype

In [18]:
# Load the instance-selected Covertype train/test files (normalization_method left at its default 'none').
covertype_dataset = ReadDataset('Covertype', './NewDatasets/new_covtype_train.data', './NewDatasets/new_covtype_test.data')


Started reading data train of dataset  Covertype ...
Finished reading data train of dataset  Covertype ...
Started reading data test of dataset  Covertype ...
Finished reading data test of dataset  Covertype ...


In [19]:
# Fit on the instance-selected Covertype training split, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(covertype_dataset.x_train, covertype_dataset.y_train, "Covertype")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(covertype_dataset.x_train, covertype_dataset.y_train, "Covertype"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(covertype_dataset.x_test, covertype_dataset.y_test, "Covertype"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Covertype ...
Training LibSVM on dataset  Covertype  Finished after  0.0020017623901367188
Calculating accuracy of LibSVM on dataset  Covertype  Finished after  0.0010004043579101562
Accuracy on data train:  100.0  | processing time:  0.0010004043579101562
Calculating accuracy of LibSVM on dataset  Covertype  Finished after  0.7012355327606201
Accuracy on data test:  82.34028577851463  | processing time:  0.7012355327606201


In [20]:
# Rebind covertype_dataset to the 'sklearn_new_covtype_*' files, a second reduced Covertype
# variant (how these files were produced is not shown in this notebook).
covertype_dataset = ReadDataset('Covertype', './NewDatasets/sklearn_new_covtype_train.data', './NewDatasets/sklearn_new_covtype_test.data')


Started reading data train of dataset  Covertype ...
Finished reading data train of dataset  Covertype ...
Started reading data test of dataset  Covertype ...
Finished reading data test of dataset  Covertype ...


In [21]:
# Fit on whatever covertype_dataset currently holds (the 'sklearn_new_covtype_*' files when the
# cells are run in order), then print the accuracy (percent) and the scoring time for the
# train and the test split.
libsvm.train(covertype_dataset.x_train, covertype_dataset.y_train, "Covertype")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(covertype_dataset.x_train, covertype_dataset.y_train, "Covertype"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(covertype_dataset.x_test, covertype_dataset.y_test, "Covertype"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Covertype ...
Training LibSVM on dataset  Covertype  Finished after  0.001999378204345703
Calculating accuracy of LibSVM on dataset  Covertype  Finished after  0.00200653076171875
Accuracy on data train:  100.0  | processing time:  0.00200653076171875
Calculating accuracy of LibSVM on dataset  Covertype  Finished after  0.6383273601531982
Accuracy on data test:  65.87008598645573  | processing time:  0.6383273601531982


# Testing LibSVM with the original datasets split 80%-20% into training & test data

## Breast-W

In [16]:
# Load the original Breast-W data through the project's Datasets module.
# NOTE(review): train_size is 0.69 here although the section heading says 80%-20% - confirm.
breast_dataset = Datasets.Breast_W_Dataset('./Datasets/breast-cancer-wisconsin.data', "Breast-W", 
                                    train_size=0.69, normalization_method='None')

# Flatten the label arrays to 1-D before they are passed to the classifier
# (presumably Datasets returns them as column vectors - verify in Datasets).
breast_dataset.y_train = breast_dataset.y_train.reshape(-1,)
breast_dataset.y_test = breast_dataset.y_test.reshape(-1,)

Started reading dataset  Breast-W ...
Finished reading dataset  Breast-W ...


In [17]:
# Fit on the training split of the original Breast-W data, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(breast_dataset.x_train, breast_dataset.y_train, "Breast-w")
print("\n")
print("Accuracy on data train: ", libsvm.svm_mean_accuracy(breast_dataset.x_train, breast_dataset.y_train, "Breast-w"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(breast_dataset.x_test, breast_dataset.y_test, "Breast-w"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Breast-w ...
Training LibSVM on dataset  Breast-w  Finished after  14.974852323532104


Calculating accuracy of LibSVM on dataset  Breast-w  Finished after  0.004967451095581055
Accuracy on data train:  61.78343949044586  | processing time:  0.004967451095581055
Calculating accuracy of LibSVM on dataset  Breast-w  Finished after  0.005017995834350586
Accuracy on data test:  94.33962264150944  | processing time:  0.005017995834350586


## Messidor

In [20]:
# Load the original Messidor data through the project's Datasets module (train_size=0.8).
messidor_dataset = Datasets.Messidor_Dataset('./Datasets/messidor_features.arff', "Messidor", 
                                     train_size=0.8, normalization_method='None', 
                                     is_class_label_a_feature=False)
# Flatten the label arrays to 1-D before they are passed to the classifier
# (presumably Datasets returns them as column vectors - verify in Datasets).
messidor_dataset.y_train = messidor_dataset.y_train.reshape(-1,)
messidor_dataset.y_test = messidor_dataset.y_test.reshape(-1,)

Started reading dataset  Messidor ...
Finished reading dataset  Messidor ...


In [21]:
# Fit on the training split of the original Messidor data, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(messidor_dataset.x_train, messidor_dataset.y_train, "Messidor")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(messidor_dataset.x_train, messidor_dataset.y_train, "Messidor"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(messidor_dataset.x_test, messidor_dataset.y_test, "Messidor"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Messidor ...
Training LibSVM on dataset  Messidor  Finished after  2.5814099311828613
Calculating accuracy of LibSVM on dataset  Messidor  Finished after  0.014998912811279297
Accuracy on data train:  76.52173913043478  | processing time:  0.014998912811279297
Calculating accuracy of LibSVM on dataset  Messidor  Finished after  0.006006717681884766
Accuracy on data test:  75.75757575757575  | processing time:  0.006006717681884766


## Car

In [29]:
# Load the original Car data through the project's Datasets module.
# NOTE(review): train_size is 0.59 here although the section heading says 80%-20% - confirm.
car_dataset = Datasets.Car_Dataset('./Datasets/car.data', "Car", 'Class', 
                                   train_size=0.59)
# Flatten the label arrays to 1-D before they are passed to the classifier
# (presumably Datasets returns them as column vectors - verify in Datasets).
car_dataset.y_train = car_dataset.y_train.reshape(-1,)
car_dataset.y_test = car_dataset.y_test.reshape(-1,)

Started reading dataset  Car ...
Finished reading dataset  Car ...


In [30]:
# Fit on the training split of the original Car data, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(car_dataset.x_train, car_dataset.y_train, "Car")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(car_dataset.x_train, car_dataset.y_train, "Car"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(car_dataset.x_test, car_dataset.y_test, "Car"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Car ...
Training LibSVM on dataset  Car  Finished after  0.03501558303833008
Calculating accuracy of LibSVM on dataset  Car  Finished after  0.017028093338012695
Accuracy on data train:  85.37782139352305  | processing time:  0.017028093338012695
Calculating accuracy of LibSVM on dataset  Car  Finished after  0.01100015640258789
Accuracy on data test:  84.4851904090268  | processing time:  0.01100015640258789


## Spambase

In [36]:
# Load the original Spambase data through the project's Datasets module.
# NOTE(review): train_size is 0.78 here although the section heading says 80%-20% - confirm.
spambase_dataset = Datasets.Spambase_Dataset('./Datasets/spambase.data', "Spambase", 
                                    train_size=0.78, normalization_method='None')
# Flatten the label arrays to 1-D before they are passed to the classifier
# (presumably Datasets returns them as column vectors - verify in Datasets).
spambase_dataset.y_train = spambase_dataset.y_train.reshape(-1,)
spambase_dataset.y_test = spambase_dataset.y_test.reshape(-1,)

Started reading dataset  Spambase ...
Finished reading dataset  Spambase ...


In [37]:
# Fit on the training split of the original Spambase data, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(spambase_dataset.x_train, spambase_dataset.y_train, "Spambase")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(spambase_dataset.x_train, spambase_dataset.y_train, "Spambase"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(spambase_dataset.x_test, spambase_dataset.y_test, "Spambase"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Spambase ...
Training LibSVM on dataset  Spambase  Finished after  222.11860513687134
Calculating accuracy of LibSVM on dataset  Spambase  Finished after  0.20221185684204102
Accuracy on data train:  92.7536231884058  | processing time:  0.20221185684204102
Calculating accuracy of LibSVM on dataset  Spambase  Finished after  0.05599808692932129
Accuracy on data test:  93.78084896347482  | processing time:  0.05599808692932129


## Coil2000 dataset

In [14]:
# Load the original Coil2000 data through the project's Datasets module (train_size=0.8).
coil2000_dataset = Datasets.Coil2000_Dataset('./Datasets/coil2000.dat', "Coil2000", 
                            train_size=0.8, normalization_method='None')
# Flatten the label arrays to 1-D before they are passed to the classifier
# (presumably Datasets returns them as column vectors - verify in Datasets).
coil2000_dataset.y_train = coil2000_dataset.y_train.reshape(-1,)
coil2000_dataset.y_test = coil2000_dataset.y_test.reshape(-1,)

Started reading data train of dataset  Breast-w ...
Finished reading data train of dataset  Breast-w ...
Started reading data test of dataset  Breast-w ...
Finished reading data test of dataset  Breast-w ...


In [15]:
# Fit on the training split of the original Coil2000 data, then print the accuracy (percent)
# and the scoring time for the train and the test split.
libsvm.train(coil2000_dataset.x_train, coil2000_dataset.y_train, "Coil2000")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(coil2000_dataset.x_train, coil2000_dataset.y_train, "Coil2000"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(coil2000_dataset.x_test, coil2000_dataset.y_test, "Coil2000"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Coil2000 ...
Training LibSVM on dataset  Coil2000  Finished after  0.0030014514923095703
Calculating accuracy of LibSVM on dataset  Coil2000  Finished after  0.0009999275207519531
Accuracy on data train:  100.0  | processing time:  0.0009999275207519531
Calculating accuracy of LibSVM on dataset  Coil2000  Finished after  0.0279998779296875
Accuracy on data test:  57.64595886141699  | processing time:  0.0279998779296875


## Bank Marketing

In [132]:
# Load the original Bank Marketing data through the project's Datasets module (train_size=0.8).
bank_dataset = Datasets.Bank_Marketing_Dataset('./Datasets/bank-full.csv', "Bank Marketing", 'y', 
                                      train_size=0.8, normalization_method="None")
# Flatten the label arrays to 1-D before they are passed to the classifier
# (presumably Datasets returns them as column vectors - verify in Datasets).
bank_dataset.y_train = bank_dataset.y_train.reshape(-1,)
bank_dataset.y_test = bank_dataset.y_test.reshape(-1,)

Started reading data train of dataset  Bank ...
Finished reading data train of dataset  Bank ...
Started reading data test of dataset  Bank ...
Finished reading data test of dataset  Bank ...


In [133]:
# Fit on the training split of the original Bank Marketing data, then print the accuracy
# (percent) and the scoring time for the train and the test split.
libsvm.train(bank_dataset.x_train, bank_dataset.y_train, "Bank")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(bank_dataset.x_train, bank_dataset.y_train, "Bank"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(bank_dataset.x_test, bank_dataset.y_test, "Bank"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Bank ...
Training LibSVM on dataset  Bank  Finished after  1.7501986026763916
Calculating accuracy of LibSVM on dataset  Bank  Finished after  0.0029973983764648438
Accuracy on data train:  100.0  | processing time:  0.0029973983764648438
Calculating accuracy of LibSVM on dataset  Bank  Finished after  0.07204341888427734
Accuracy on data test:  74.1140329319243  | processing time:  0.07204341888427734


## Skin Segmentation

In [62]:
# Load the original Skin Segmentation data through the project's Datasets module (train_size=0.8).
skin_dataset = Datasets.Skin_NonSkin_Dataset('./Datasets/Skin_NonSkin.txt', "Skin Segmentation",
                                             train_size=0.8, normalization_method="None")
# Flatten the label arrays to 1-D before they are passed to the classifier
# (presumably Datasets returns them as column vectors - verify in Datasets).
skin_dataset.y_train = skin_dataset.y_train.reshape(-1,)
skin_dataset.y_test = skin_dataset.y_test.reshape(-1,)

Started reading data train of dataset  Skin ...
Finished reading data train of dataset  Skin ...
Started reading data test of dataset  Skin ...
Finished reading data test of dataset  Skin ...


In [64]:
# Fit on the training split of the original Skin Segmentation data, then print the accuracy
# (percent) and the scoring time for the train and the test split.
libsvm.train(skin_dataset.x_train, skin_dataset.y_train, "Skin")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(skin_dataset.x_train, skin_dataset.y_train, "Skin"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(skin_dataset.x_test, skin_dataset.y_test, "Skin"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Skin ...
Training LibSVM on dataset  Skin  Finished after  0.002995729446411133
Calculating accuracy of LibSVM on dataset  Skin  Finished after  0.0014698505401611328
Accuracy on data train:  100.0  | processing time:  0.0014698505401611328
Calculating accuracy of LibSVM on dataset  Skin  Finished after  0.36596083641052246
Accuracy on data test:  94.38381611447934  | processing time:  0.36596083641052246


## Covertype

In [6]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

class My_Covertype_Dataset:
    """Covertype loader reduced to a binary Aspen-vs-rest problem.

    The file is parsed as comma-separated floats, its rows are shuffled, the
    last column is converted to +1 (class 5, Aspen) / -1 (everything else)
    and the data is split with ``train_test_split`` (``random_state=42``).

    Attributes set by the constructor:
        x_train, x_test: feature arrays.
        y_train, y_test: label arrays of shape (n, 1) holding +1 / -1.
        data_: the shuffled full array; its last column holds the +1 / -1 labels.
        x_train_without_x0, x_test_without_x0: normalized feature copies,
            only present when a normalization method was applied.
    """

    def __init__(self, dataset_path, dataset_name, output_coloumn_name='44', 
                 train_size=0.1, normalization_method='zero_mean_unit_var'):
        """Read, binarize, split and (optionally) normalize the dataset.

        Args:
            dataset_path: path of the comma-separated data file.
            dataset_name: label used only in the progress messages.
            output_coloumn_name: accepted for interface compatibility; it is
                not used, the label is always taken from the last column.
            train_size: fraction kept for training; ``1 - train_size`` is
                passed to ``train_test_split`` as ``test_size``.
            normalization_method: forwarded to ``normalize``.
        """
        print("Started reading dataset ", dataset_name, "...")

        self.dataset_path = dataset_path
        self.train_size = train_size
        self.normalization_method = normalization_method

        rows = []
        with open(self.dataset_path) as data_file_pointer:
            for line in data_file_pointer:
                # A blank (e.g. trailing) line would make float() raise.
                if not line.strip():
                    continue
                rows.append([float(field) for field in line.split(",")])

        self.data_ = np.array(rows)
        np.random.shuffle(self.data_)

        # separate class labels from data
        # In the article, only Aspen vs. the other classes is considered;
        # per the dataset description, Aspen is class label 5.
        labels = self.data_[:, -1]                     # view on the last column
        labels[:] = self._aspen_vs_rest(labels)        # written back into data_
        data_x = self.data_[:, :-1]                    # all but last column
        data_y = labels.reshape(-1, 1)

        # split data test & train; the label arrays keep their (n, 1) shape
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            data_x, data_y, test_size=1-self.train_size, random_state=42)

        # Normalize data
        self.normalize(self.normalization_method)

        print("Finished reading dataset ", dataset_name, "...")

    @staticmethod
    def _aspen_vs_rest(labels):
        """Map class label 5 to +1 and every other label to -1."""
        return np.where(labels == 5, 1, -1)

    def normalize(self, normalization_method):
        """Compute normalized copies of the train and test features.

        Normalizing data improves the convergence of the learning model and
        lets features with a small range also affect the model parameters.

        The results are stored in ``self.x_train_without_x0`` and
        ``self.x_test_without_x0``; ``self.x_train`` / ``self.x_test`` are
        left untouched.

        Args:
            normalization_method: ``'none'``, ``'zero_mean_unit_var'`` or
                ``'scale_0_1'`` (compared case-insensitively, so ``'None'``
                and ``None`` also mean no normalization).
        """
        method = str(normalization_method).lower()

        if method == 'none':
            print("No normalization.")
            return

        # NOTE(review): the test split is normalized with its own statistics,
        # not with those of the training split - confirm this is intended.
        if method == 'zero_mean_unit_var':
            print("zero-mean & unit_variance normalization.")
            self.x_train_without_x0 = self.zero_mean_unit_variance(self.x_train)
            self.x_test_without_x0 = self.zero_mean_unit_variance(self.x_test)
        elif method == 'scale_0_1':
            print("scaling to [0, 1] normalization.")
            self.x_train_without_x0 = self.scaling_between_0_1(self.x_train)
            self.x_test_without_x0 = self.scaling_between_0_1(self.x_test)
        else:
            print("Unknown normalization method:", normalization_method, "- data left unchanged.")

    def scaling_between_0_1(self, numpy_array):
        """Min-max scale every column of ``numpy_array`` to [0, 1].

        A constant column has a range of 0; it is mapped to 0 instead of
        producing nan through a 0/0 division.
        """
        column_min = numpy_array.min(axis=0)
        column_range = numpy_array.max(axis=0) - column_min
        column_range = np.where(column_range == 0, 1, column_range)
        return (numpy_array - column_min) / column_range

    def zero_mean_unit_variance(self, numpy_array):
        """Standardize every column to zero mean and unit variance.

        A constant column has a standard deviation of 0; it is mapped to 0
        instead of producing nan through a 0/0 division.
        """
        column_std = numpy_array.std(axis=0)
        column_std = np.where(column_std == 0, 1, column_std)
        return (numpy_array - numpy_array.mean(axis=0)) / column_std

# Load the full Covertype data with the loader class of this cell (train_size=0.8).
# Passing normalization_method="None" results in no feature normalization.
covertype_dataset = My_Covertype_Dataset('./Datasets/covtype.data', "Covertype", 
                                               train_size=0.8, normalization_method="None")
# The loader stores labels with shape (n, 1); flatten them to 1-D before they are passed to the classifier.
covertype_dataset.y_train = covertype_dataset.y_train.reshape(-1,)
covertype_dataset.y_test = covertype_dataset.y_test.reshape(-1,)

Started reading dataset  Covertype ...
Finished reading dataset  Covertype ...


In [None]:
# Fit on the training split of the full Covertype data, then print the accuracy (percent)
# and the scoring time for the train and the test split.
# NOTE(review): the recorded output of this cell stops after the "Started training" message,
# so no result for the full Covertype data is recorded in this notebook.
libsvm.train(covertype_dataset.x_train, covertype_dataset.y_train, "Covertype")

print("Accuracy on data train: ", libsvm.svm_mean_accuracy(covertype_dataset.x_train, covertype_dataset.y_train, "Covertype"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

print("Accuracy on data test: ", libsvm.svm_mean_accuracy(covertype_dataset.x_test, covertype_dataset.y_test, "Covertype"), 
      " | processing time: ", libsvm.prediction_accuracy_processing_time)

Started training LibSVM on dataset Covertype ...
