## Selecting representative instances of dataset using KMeans clustering method

In [1]:
# imports

import os
import numpy as np
import pandas as pd
import time
import sys


In [2]:
import Datasets

In [3]:
class My_KMeans:
    def __init__(self, dataset, num_of_clusters, initialization_method="kmeans++", max_iter=300):
        self.X = dataset
        self.k = num_of_clusters
        self.centroids = None
        self.clusters = None
        self.max_iter = max_iter
        self.initialization_method = initialization_method
        
        self.initialize_centroids()
        
    # distance of all datas in X with each centroid
    def distance_fn(self, X, centroids):
        distances = np.sqrt(np.power(np.subtract(X, centroids),2).sum(axis=2))
        return distances
    
    # distance of two points i & j
    def two_points_distance_fn(self, p1, p2):
        return np.sum((p1 - p2)**2)
  
    # randomly initiate k-clusters from X as centroid
    def initialize_centroids(self):
        if self.initialization_method == "random":
            self.centroids = self.X.copy()
            np.random.shuffle(self.centroids)
            self.centroids = self.centroids[:self.k]
        elif self.initialization_method == "kmeans++":
            self.centroids = []
            # add first random centroid
            random_index = np.random.randint(0, len(self.X))
            self.centroids.append(self.X[random_index])
            
            for c_id in range(self.k - 1):
                ## initialize a list to store distances of data points from nearest centroid
                dist = []
                for i in range(self.X.shape[0]):
                    point = self.X[i, :]
                    d = sys.maxsize

                    ## compute distance of point from each of the previously selected centroid and store the minimum distance
                    for j in range(len(self.centroids)):
                        temp_dist = self.two_points_distance_fn(point, self.centroids[j])
                        d = min(d, temp_dist)
                    dist.append(d)

                ## select data point with maximum distance as our next centroid
                # dist = np.asarray(dist)
                next_centroid = self.X[np.argmax(dist), :]
                self.centroids.append(next_centroid)
                dist = []
            self.centroids = np.asarray(self.centroids)
            
        return self.centroids

    def find_closest_centroids(self):
        """
        find the cluster label for each point
        """
        distances = self.distance_fn(self.X, self.centroids[:, np.newaxis])
        return np.argmin(distances, axis=0)
    
    def closest_farthest_of_each_cluster(self):
        res = []
        for k in range(self.centroids.shape[0]):
            dists = []
            for x in self.X[self.closest_centroids==k]:
                dist = np.linalg.norm(self.centroids[k] - x)
                dists.append((x, dist))
            # sort dists and pick the first and last elements
            dists = sorted(dists, key=lambda x: x[1])
            res.append(dists[0][0])
            res.append(dists[-1][0])
        return res
    
    def move_centroids(self, closest_centroids):
        # create array of new centroids
        new_centroids = np.asarray([self.X[closest_centroids==k].mean(axis=0) for k in range(self.centroids.shape[0])])
        return new_centroids
    
    def main_loop(self):
        # while centroids do not change
        i = 0
        self.closest_centroids = self.find_closest_centroids()
        self.new_centroids = self.move_centroids(self.closest_centroids)
        while np.array_equal(self.centroids, self.new_centroids) == False:
            if i > self.max_iter:
                break
            i += 1
            self.centroids = self.new_centroids
            self.closest_centroids = self.find_closest_centroids()
            self.new_centroids = self.move_centroids(self.closest_centroids)
        return self.closest_centroids

In [4]:
from sklearn.cluster import KMeans

class InstanceSelection:
    def __init__(self, dataset, num_of_clusters, 
                 repeating_time, KMeans_Model="My_Kmeans", 
                 kmeans_initialization_method="kmeans++", kmeans_max_iter=300):
        self.X = dataset
        self.k = num_of_clusters
        self.RT = repeating_time
        self.new_X = []
        
        self.kmeans_initialization_method = kmeans_initialization_method
        self.kmeans_max_iter = kmeans_max_iter
        self.KMeans_Model = KMeans_Model
        
        self.processing_time = 0
    
    def My_Kmeans_main_loop(self):
        start = time.time()
        for i in range(self.RT):
            kmeans = My_KMeans(self.X, self.k, initialization_method=self.kmeans_initialization_method, max_iter=self.kmeans_max_iter)
            kmeans.main_loop()
            # select farthest and closest instances of each cluster to the new dataset
            closest_farthest_instances_of_clusters = kmeans.closest_farthest_of_each_cluster()
            for instance in closest_farthest_instances_of_clusters:
                self.new_X.append(instance)
        self.new_X = np.asarray(self.new_X)
        end = time.time()
        self.processing_time = end - start
        return self.new_X
    
    def Sklearn_Kmeans_main_loop(self):
        start = time.time()
        if self.kmeans_initialization_method == "kmeans++":
            self.kmeans_initialization_method = "k-means++"
        for i in range(self.RT):
            kmeans = KMeans(n_clusters=self.k, init=self.kmeans_initialization_method, random_state=0).fit(self.X)
            
            # select farthest and closest instances of each cluster to the new dataset
            for l in np.unique(kmeans.labels_):
                cluster_datas = self.X[kmeans.labels_==l]
                distance = []
                for data in cluster_datas:
                    distance.append((data, np.linalg.norm(data - kmeans.cluster_centers_[l])))
      
                distance = sorted(distance, key=lambda x: x[1])
                self.new_X.append(distance[0][0])
                self.new_X.append(distance[-1][0])
        
        self.new_X = np.asarray(self.new_X)
        end = time.time()
        self.processing_time = end - start
        return self.new_X

    def main_loop(self):
        if self.KMeans_Model == "My_KMeans":
            return self.My_Kmeans_main_loop()
        elif self.KMeans_Model == "sklearn_KMeans":
            return self.Sklearn_Kmeans_main_loop()

## Outlier Detection based on KSE statistic

In [5]:
from statsmodels.distributions.empirical_distribution import ECDF

class OutlierDetectionReductionKSE:
    def __init__(self, x_train, repeating_time, KS_type="euclidean_distance", ROT_type="run_1_time"):
        """
        ROT_type -> shows which kind of outloer detection is used
        1 -> run KSE for ROT times and each time remove the biggest outlier score
        2 -> run KSE once and remove the top ROT-instances with highest outlier score
        """
        self.x_train = x_train
        self.ROT = repeating_time
        self.KS_type = KS_type
        self.ROT_type =  ROT_type

        start = time.time()
        
        self.outlier_detection_main_loop()

        end = time.time()
        self.processing_time = end - start
    
    # eulicidin distance
    def distance_fn_2_points(self, x, p):
        return np.linalg.norm(x - p)

    def distance_fn_x_train_vs_point(self, x_train, p):
        return np.linalg.norm(x_train - p, axis=1)      # find euclidean distance of all data in x_train with point p
    
    def KS_euclidean_distance(self, pj, pi):
        distance_arr = [abs(self.distance_fn_2_points(pj, x)-self.distance_fn_2_points(pi, x)) for x in self.x_train]
        distance_arr.sort()     # sort array ascending
        return distance_arr[-1]

    def KS_distribution_of_euclidean_distance(self, pj, pi):
        # find ecdf of pj
        pj_dists = self.distance_fn_x_train_vs_point(self.x_train, pj)
        ecdf_pj_obj = ECDF(pj_dists)
        ecdf_pj = ecdf_pj_obj(pj_dists)
        
        # find ecdf of pi
        pi_dists = self.distance_fn_x_train_vs_point(self.x_train, pi)
        ecdf_pi_obj = ECDF(pi_dists)
        ecdf_pi = ecdf_pi_obj(pi_dists)
            
        # find ecdf_pj - ecdf_pi
        total_ecdf = ecdf_pj - ecdf_pi
        max_total_ecdf = max(total_ecdf)
        return max_total_ecdf

    def KSE(self, pj):
        if self.KS_type == "euclidean_distance":
            res = 0
            for x in self.x_train:
                res += self.KS_euclidean_distance(pj, x)
            return float(res)/(len(self.x_train)-1)
        
        elif self.KS_type == "distribution_of_euclidean_distance":
            res = 0
            for x in self.x_train:
                if (x==pj).all() == False:
                    res += self.KS_distribution_of_euclidean_distance(pj, x)
            return float(res)/(len(self.x_train)-1)
        
    def outlier_detection_main_loop(self):
        # compute outlier score for all x in x_train
        # delete the instance with biggest outlier score
        # repeat this step for ROT times
        
        if self.ROT_type == "run_ROT_times":
            # ~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
            # run KSE for ROT times and remove the instance with maximum outlier score each time
            for i in range(self.ROT):
                outlier_scores_array = []
                for x in self.x_train:
                    outlier_scores_array.append(self.KSE(x))
                remove_index = np.argmax(outlier_scores_array)
                
                # remove last element of outlier_scores_array from x_train
                self.x_train = np.delete(self.x_train, remove_index, axis=0)   # delete instance on remove-index of x_train in axis row
            # ~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.

        if self.ROT_type == "run_1_time":
            # ~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
            # run KSE once and remove the ROT-instances with maximum outlier score
            
            outlier_scores_array = []
            for x in self.x_train:
                outlier_scores_array.append(self.KSE(x))
            # sort indices based on value
            temp = np.argpartition(-np.asarray(outlier_scores_array), self.ROT)
            max_ROT_args = temp[:self.ROT]

            # remove last ROT-elements of outlier_scores_array from x_train
            self.x_train = np.delete(self.x_train, max_ROT_args, axis=0)      # delete instance on remove-index of x_train in axis row
            # ~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.
                
        return self.x_train
                
    

## Stage-1 & Stage-2

* ### K-Means
* ### Outlier Detection

Now, We're going to run these two steps for each dataset for diffrent values for k, RT, ROT to see how many instances we will have at the end.

> - k -> 5-30
> 
> - RT -> 5-30
> 
> - ROT -> 5-30


In [6]:
def Stage_1_and_2(dataset, dataset_name, k, RT, ROT, 
                  KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                  KS_type="euclidean_distance", ROT_type="run_1_time"):
    
    print("x_train shape: ", dataset.x_train.shape)
    print("Started Stage I & II on dataset ", dataset_name, "...")
                  
    # do instance selection using K-Means
    print("Instance selection started ...")
    instance_selection = InstanceSelection(dataset.x_train, num_of_clusters=k, repeating_time=RT, 
                                           KMeans_Model=KMeans_Model, kmeans_initialization_method=kmeans_initialization_method, 
                                           kmeans_max_iter=kmeans_max_iter)
    new_x_train = instance_selection.main_loop()
    print("Instance selection finished after ", instance_selection.processing_time, "...")
    print("x_train shape kmeans after instance selection: ", new_x_train.shape, "\n")
    
    
    # remove duplicate instances
    print("Started removing duplicate instances ...", dataset_name, "...")
    new_x_train = np.unique(new_x_train, axis=0)
    print("x_train shape after duplication deleting: ", new_x_train.shape, "\n")
    
    # do outlier detection and deletion using KSE
    print("Outlier Detection started ...")
    outlier_detection = OutlierDetectionReductionKSE(new_x_train, repeating_time=ROT, KS_type=KS_type, ROT_type=ROT_type)
    print("x_train shape after outlier detection: ", outlier_detection.x_train.shape)
    print("Outlier Detection finished after ", outlier_detection.processing_time, "...\n")
    print("Finished.")
    print("~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.")
    
    return outlier_detection.x_train

## Labeling new dataset (representative datas)

In [7]:
class NewDatasetHumanLabeling:
    def __init__(self, dataset, new_x_train, output_dataset_path):
        start = time.time()

        new_y_train = self.LabelingNewDataset(dataset, new_x_train)
        self.create_new_dataset_csv_file(new_x_train, new_y_train, output_dataset_path)

        end = time.time()
        self.processing_time = end - start
    
    def LabelingNewDataset(self, dataset, new_x_train):
        new_y_train = []
        i = 0
        for new_x in new_x_train:
            new_y_train.append(dataset.y_train[(dataset.x_train==new_x).all(axis=1).nonzero()[0][0]][0])
        return new_y_train

    def create_new_dataset_csv_file(self, new_x_train, new_y_train, output_dataset_path):
        df = pd.DataFrame(new_x_train)
        df[len(df.columns)] = new_y_train
        # create a new csv file
        df.to_csv("test.csv", index=False)
        
        # remove first line of csv file which is the header of each coloumn
        with open("test.csv",'r') as f:
            with open(output_dataset_path,'w') as f1:
                next(f) # skip header line
                for line in f:
                    f1.write(line)
        os.remove("test.csv")

## Saving data test in a new csv file individualy

In [8]:
def save_data_test(x_test, y_test, output_dataset_path):
    df = pd.DataFrame(x_test)
    df[len(df.columns)] = y_test
    # create a new csv file
    df.to_csv("test.csv", index=False)

    # remove first line of csv file which is the header of each coloumn
    with open("test.csv",'r') as f:
        with open(output_dataset_path,'w') as f1:
            next(f) # skip header line
            for line in f:
                f1.write(line)
    os.remove("test.csv")

## Breast-w Instance Selection + Outlier Detection + Human Labeling

In [11]:
breast_w_dataset = Datasets.Breast_W_Dataset('./Datasets/breast-cancer-wisconsin.data', "Breast-W", 
                                    train_size=0.1, normalization_method='None')

Started reading dataset  Breast-W ...
Finished reading dataset  Breast-W ...


In [20]:
new_x_train = Stage_1_and_2(breast_w_dataset, "Breast-W", k=9, RT=12, ROT=10, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (68, 10)
Started Stage I & II on dataset  Breast-W ...
Instance selection started ...
Instance selection finished after  0.27941107749938965 ...
x_train shape kmeans after instance selection:  (216, 10) 

Started removing duplicate instances ... Breast-W ...
x_train shape after duplication deleting:  (31, 10) 

Outlier Detection started ...
x_train shape after outlier detection:  (21, 10)
Outlier Detection finished after  0.18903779983520508 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(21, 10)

In [21]:
breast_w_dataset_human_labling = NewDatasetHumanLabeling(breast_w_dataset, new_x_train, "./NewDatasets/new_breast_w_train.data")
save_data_test(breast_w_dataset.x_test, breast_w_dataset.y_test, "./NewDatasets/new_breast_w_test.data")

## Messidor Instance Selection + Outlier Detection + Human Labeling

In [15]:
messidor = Datasets.Messidor_Dataset('./Datasets/messidor_features.arff', "Messidor", 
                                     train_size=0.1, normalization_method='None', 
                                     is_class_label_a_feature=False)

Started reading dataset  Messidor ...
Finished reading dataset  Messidor ...


In [18]:
new_x_train = Stage_1_and_2(messidor, "Messidor", k=10, RT=12, ROT=5, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (115, 19)
Started Stage I & II on dataset  Messidor ...
Instance selection started ...
Instance selection finished after  0.6965482234954834 ...
x_train shape kmeans after instance selection:  (240, 19) 

Started removing duplicate instances ... Messidor ...
x_train shape after duplication deleting:  (41, 19) 

Outlier Detection started ...
x_train shape after outlier detection:  (36, 19)
Outlier Detection finished after  0.45304059982299805 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(36, 19)

In [19]:
messidor_dataset_human_labling = NewDatasetHumanLabeling(messidor, new_x_train, "./NewDatasets/new_messidor_train.data")
save_data_test(messidor.x_test, messidor.y_test, "./NewDatasets/new_messidor_test.data")

## Car Instance Selection + Outlier Detection + Human Labeling

In [22]:
car_dataset = Datasets.Car_Dataset('./Datasets/car.data', "Car", 'Class', 
                                   train_size=0.1)

Started reading dataset  Car ...
Finished reading dataset  Car ...


In [25]:
new_x_train = Stage_1_and_2(car_dataset, "Car", k=9, RT=10, ROT=10, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (172, 6)
Started Stage I & II on dataset  Car ...
Instance selection started ...
Instance selection finished after  0.558758020401001 ...
x_train shape kmeans after instance selection:  (180, 6) 

Started removing duplicate instances ... Car ...
x_train shape after duplication deleting:  (72, 6) 

Outlier Detection started ...
x_train shape after outlier detection:  (62, 6)
Outlier Detection finished after  1.3460261821746826 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(62, 6)

In [26]:
car_dataset_human_labling = NewDatasetHumanLabeling(car_dataset, new_x_train, "./NewDatasets/new_car_train.data")
save_data_test(car_dataset.x_test, car_dataset.y_test, "./NewDatasets/new_car_test.data")

## Spambase Instance Selection + Outlier Detection + Human Labeling

In [32]:
spambase_dataset = Datasets.Spambase_Dataset('./Datasets/spambase.data', "Spambase", 
                                    train_size=0.1, normalization_method='None')


Started reading dataset  Spambase ...
Finished reading dataset  Spambase ...


In [35]:
new_x_train = Stage_1_and_2(spambase_dataset, "Spambase", k=13, RT=15, ROT=20, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (459, 57)
Started Stage I & II on dataset  Spambase ...
Instance selection started ...
Instance selection finished after  7.445483207702637 ...
x_train shape kmeans after instance selection:  (390, 57) 

Started removing duplicate instances ... Spambase ...
x_train shape after duplication deleting:  (44, 57) 

Outlier Detection started ...
x_train shape after outlier detection:  (24, 57)
Outlier Detection finished after  0.4269986152648926 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(24, 57)

In [36]:
spambase_dataset_human_labling = NewDatasetHumanLabeling(spambase_dataset, new_x_train, "./NewDatasets/new_Spambase_train.data")
save_data_test(spambase_dataset.x_test, spambase_dataset.y_test, "./NewDatasets/new_Spambase_test.data")

## Coil2000 Instance Selection + Outlier Detection + Human Labeling

In [45]:
coil2000_dataset = Datasets.Coil2000_Dataset('./Datasets/coil2000.dat', "Coil2000", 
                            train_size=0.02, normalization_method='None')
coil2000_dataset.x_train.shape

Started reading dataset  Coil2000 ...
Finished reading dataset  Coil2000 ...


(196, 85)

In [46]:
new_x_train = Stage_1_and_2(coil2000_dataset, "Coil2000", k=26, RT=12, ROT=26, 
                            KMeans_Model="sklearn_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (196, 85)
Started Stage I & II on dataset  Coil2000 ...
Instance selection started ...
Instance selection finished after  3.67228627204895 ...
x_train shape kmeans after instance selection:  (624, 85) 

Started removing duplicate instances ... Coil2000 ...
x_train shape after duplication deleting:  (51, 85) 

Outlier Detection started ...
x_train shape after outlier detection:  (25, 85)
Outlier Detection finished after  0.8941731452941895 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(25, 85)

In [42]:
new_x_train = Stage_1_and_2(coil2000_dataset, "Coil2000", k=26, RT=12, ROT=26, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="random", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (196, 85)
Started Stage I & II on dataset  Coil2000 ...
Instance selection started ...
Instance selection finished after  1.8234508037567139 ...
x_train shape kmeans after instance selection:  (624, 85) 

Started removing duplicate instances ... Coil2000 ...
x_train shape after duplication deleting:  (153, 85) 

Outlier Detection started ...
x_train shape after outlier detection:  (127, 85)
Outlier Detection finished after  9.597731590270996 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(127, 85)

In [47]:
new_x_train = Stage_1_and_2(coil2000_dataset, "Coil2000", k=26, RT=12, ROT=26, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (196, 85)
Started Stage I & II on dataset  Coil2000 ...
Instance selection started ...
Instance selection finished after  6.308869361877441 ...
x_train shape kmeans after instance selection:  (624, 85) 

Started removing duplicate instances ... Coil2000 ...
x_train shape after duplication deleting:  (110, 85) 

Outlier Detection started ...
x_train shape after outlier detection:  (84, 85)
Outlier Detection finished after  3.311028242111206 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(84, 85)

In [49]:
coil2000_dataset_human_labling = NewDatasetHumanLabeling(coil2000_dataset, new_x_train, "./NewDatasets/new_coil2000_train.data")
save_data_test(coil2000_dataset.x_test, coil2000_dataset.y_test, "./NewDatasets/new_coil2000_test.data")

## Bank Marketing Instance Selection + Outlier Detection + Human Labeling

In [50]:
bank_dataset = Datasets.Bank_Marketing_Dataset('./Datasets/bank-full.csv', "Bank Marketing", 'y', 
                                      train_size=0.1, normalization_method="None")


Started reading dataset  Bank Marketing ...
Finished reading dataset  Bank Marketing ...


In [51]:
new_x_train = Stage_1_and_2(bank_dataset, "Bank Marketing", k=9, RT=15, ROT=5, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (4521, 16)
Started Stage I & II on dataset  Bank Marketing ...
Instance selection started ...
Instance selection finished after  39.0008819103241 ...
x_train shape kmeans after instance selection:  (270, 16) 

Started removing duplicate instances ... Bank Marketing ...
x_train shape after duplication deleting:  (18, 16) 

Outlier Detection started ...
x_train shape after outlier detection:  (13, 16)
Outlier Detection finished after  0.07222437858581543 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(13, 16)

In [52]:
bank_dataset_human_labling = NewDatasetHumanLabeling(bank_dataset, new_x_train, "./NewDatasets/new_bank_train.data")
save_data_test(bank_dataset.x_test, bank_dataset.y_test, "./NewDatasets/new_bank_test.data")

## Skin Segmentation Instance Selection + Outlier Detection + Human Labeling

In [60]:
skin_dataset = Datasets.Skin_NonSkin_Dataset('./Datasets/Skin_NonSkin.txt', "Skin Segmentation",
                                             train_size=0.0005, normalization_method="None")

Started reading dataset  Skin Segmentation ...
Finished reading dataset  Skin Segmentation ...


In [61]:
new_x_train = Stage_1_and_2(skin_dataset, "Skin Segmentation", k=5, RT=15, ROT=5, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (122, 3)
Started Stage I & II on dataset  Skin Segmentation ...
Instance selection started ...
Instance selection finished after  0.19309711456298828 ...
x_train shape kmeans after instance selection:  (150, 3) 

Started removing duplicate instances ... Skin Segmentation ...
x_train shape after duplication deleting:  (24, 3) 

Outlier Detection started ...
x_train shape after outlier detection:  (19, 3)
Outlier Detection finished after  0.138962984085083 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(19, 3)

In [63]:
skin_dataset_human_labling = NewDatasetHumanLabeling(skin_dataset, new_x_train, "./NewDatasets/new_skin_train.data")
save_data_test(skin_dataset.x_test, skin_dataset.y_test, "./NewDatasets/new_skin_test.data")

## Covertype Instance Selection + Outlier Detection + Human Labeling

In [64]:
covertype_dataset = Datasets.Covertype_Dataset('./Datasets/covtype.data', "Covertype", 
                                               train_size=0.02, normalization_method="None")

Started reading dataset  Covertype ...
Finished reading dataset  Covertype ...


In [37]:
new_x_train = Stage_1_and_2(covertype_dataset, "Covertype", k=10, RT=30, ROT=5, 
                            KMeans_Model="My_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (11620, 54)
Started Stage I & II on dataset  Covertype ...
Instance selection started ...
Instance selection finished after  845.946711063385 ...
x_train shape kmeans after instance selection:  (600, 54) 

Started removing duplicate instances ... Covertype ...
x_train shape after duplication deleting:  (20, 54) 

Outlier Detection started ...
x_train shape after outlier detection:  (15, 54)
Outlier Detection finished after  0.09199357032775879 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(15, 54)

In [38]:
covertype_dataset_human_labling = NewDatasetHumanLabeling(covertype_dataset, new_x_train, "./NewDatasets/new_covtype_train.data")
save_data_test(covertype_dataset.x_test, covertype_dataset.y_test, "./NewDatasets/new_covtype_test.data")

In [39]:
new_x_train = Stage_1_and_2(covertype_dataset, "Covertype", k=10, RT=30, ROT=5, 
                            KMeans_Model="sklearn_KMeans", kmeans_initialization_method="kmeans++", kmeans_max_iter=300, 
                            KS_type="distribution_of_euclidean_distance", ROT_type="run_1_time")
new_x_train.shape

x_train shape:  (11620, 54)
Started Stage I & II on dataset  Covertype ...
Instance selection started ...
Instance selection finished after  39.954097747802734 ...
x_train shape kmeans after instance selection:  (600, 54) 

Started removing duplicate instances ... Covertype ...
x_train shape after duplication deleting:  (20, 54) 

Outlier Detection started ...
x_train shape after outlier detection:  (15, 54)
Outlier Detection finished after  0.08603334426879883 ...

Finished.
~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.


(15, 54)

In [40]:
covertype_dataset_human_labling = NewDatasetHumanLabeling(covertype_dataset, new_x_train, "./NewDatasets/sklearn_new_covtype_train.data")
save_data_test(covertype_dataset.x_test, covertype_dataset.y_test, "./NewDatasets/sklearn_new_covtype_test.data")