# Create new representative datasets using the RandomSelection method

In [38]:
import os
import time
from random import sample
from random import shuffle

import numpy as np
import pandas as pd

In [14]:
import Datasets

In [32]:
class RandomSelection:
    """Pick a random subset of a dataset's training rows.

    Features and labels are joined side by side, whole rows are drawn without
    replacement, and the result is split back into features and labels, so
    every sampled row keeps its own label.

    Attributes set on the instance:
        data_train: 2-D array of the sampled rows, label in the last column.
        x_train: ``data_train`` without its last column.
        y_train: 1-D array holding the last column of ``data_train``.

    Because features and labels travel in one array, both come back in the
    common NumPy dtype of the two inputs.
    """

    def __init__(self, dataset, dataset_name, sampling_size=25, random_state=None):
        """Sample ``sampling_size`` rows from ``dataset``.

        Args:
            dataset: Object exposing ``x_train`` (2-D, one row per sample) and
                ``y_train`` (either an ``(n, 1)`` column or a flat vector of
                length ``n``).
            dataset_name: Not used by the sampling; kept so existing calls
                keep working.
            sampling_size: Number of rows to draw.
            random_state: Optional seed for a private random generator, which
                makes the draw reproducible. With ``None`` (the default) the
                module-level ``random.sample`` is used.

        Raises:
            ValueError: Raised by ``random.sample`` when ``sampling_size`` is
                negative or larger than the number of training rows.
        """
        from random import Random  # only needed for the seeded code path

        features = np.asarray(dataset.x_train)
        labels = np.asarray(dataset.y_train)
        if labels.ndim == 1:
            # np.append(..., axis=1) needs two 2-D operands, so a flat label
            # vector is promoted to a single column first.
            labels = labels.reshape(-1, 1)

        combined = np.append(features, labels, axis=1)

        draw = sample if random_state is None else Random(random_state).sample
        # Sampling row positions instead of a list of row arrays yields the
        # same kind of selection and also works for an empty draw.
        picked = draw(range(len(combined)), sampling_size)

        self.data_train = combined[np.asarray(picked, dtype=int)]
        self.y_train = self.data_train[:, -1]
        self.x_train = np.delete(self.data_train, -1, axis=1)

## Labeling the new dataset (representative data)

In [11]:
class NewDatasetHumanLabeling:
    """Attach the known labels to a set of selected rows and save them as CSV.

    Constructing an instance does all the work: every row of ``new_x_train``
    is looked up in ``dataset.x_train``, the label stored at the same position
    in ``dataset.y_train`` is taken, and features plus label are written to
    ``output_dataset_path`` without a header line. The elapsed time in seconds
    is kept in ``processing_time``.
    """

    def __init__(self, dataset, new_x_train, output_dataset_path):
        """Label ``new_x_train`` from ``dataset`` and write the result.

        Args:
            dataset: Object exposing ``x_train`` (2-D) and ``y_train`` (indexed
                as ``y_train[row][0]``).
            new_x_train: Iterable of feature rows that occur in
                ``dataset.x_train``.
            output_dataset_path: Destination path of the header-less CSV file.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start = time.perf_counter()

        new_y_train = self.LabelingNewDataset(dataset, new_x_train)
        self.create_new_dataset_csv_file(new_x_train, new_y_train, output_dataset_path)

        self.processing_time = time.perf_counter() - start

    def LabelingNewDataset(self, dataset, new_x_train):
        """Return the label of each row in ``new_x_train``.

        A row's label is ``dataset.y_train[k][0]`` where ``k`` is the first
        position at which ``dataset.x_train`` equals that row in every column.

        Raises:
            IndexError: If a row of ``new_x_train`` has no exact match in
                ``dataset.x_train``.
        """
        x_train = dataset.x_train
        y_train = dataset.y_train

        new_y_train = []
        for position, new_x in enumerate(new_x_train):
            matches = (x_train == new_x).all(axis=1).nonzero()[0]
            if len(matches) == 0:
                raise IndexError(
                    f"row {position} of new_x_train does not occur in dataset.x_train"
                )
            new_y_train.append(y_train[matches[0]][0])
        return new_y_train

    def create_new_dataset_csv_file(self, new_x_train, new_y_train, output_dataset_path):
        """Write features plus a trailing label column as a header-less CSV."""
        df = pd.DataFrame(new_x_train)
        # The labels become one extra column after the existing ones.
        df[len(df.columns)] = new_y_train
        # Writing without header directly avoids a temporary "test.csv" in the
        # working directory that would overwrite and then delete a file of
        # that name.
        df.to_csv(output_dataset_path, index=False, header=False)

## Saving the test data individually in a new CSV file

In [12]:
def save_data_test(x_test, y_test, output_dataset_path):
    """Write features plus a trailing label column as a header-less CSV.

    Args:
        x_test: 2-D feature data accepted by ``pandas.DataFrame``.
        y_test: Labels, one per row of ``x_test``; stored as the last column.
        output_dataset_path: Destination file. Its directory must already
            exist.
    """
    df = pd.DataFrame(x_test)
    # The labels become one extra column after the existing ones.
    df[len(df.columns)] = y_test
    # Writing without header directly avoids a temporary "test.csv" in the
    # working directory that would overwrite and then delete a file of that
    # name.
    df.to_csv(output_dataset_path, index=False, header=False)

## Breast-w Random Instance Selection + Human Labeling

In [33]:
# Load Breast-W through the project-local Datasets module.
# NOTE(review): train_size=0.1 is presumably the training fraction and
# normalization_method='None' presumably disables scaling — confirm in Datasets.
breast_w_dataset = Datasets.Breast_W_Dataset('./Datasets/breast-cancer-wisconsin.data', "Breast-W", 
                                    train_size=0.1, normalization_method='None')

Started reading dataset  Breast-W ...
Finished reading dataset  Breast-W ...


In [34]:
# Draw 19 random rows from the Breast-W training data and print the shape of
# the sampled feature matrix.
random_selection = RandomSelection(breast_w_dataset, "Breast-W", sampling_size=19)
print("x_train shape: ", random_selection.x_train.shape)

x_train shape:  (19, 10)


In [39]:
# Write the sampled rows and the dataset's test split (features + label) to
# header-less CSV files; save_data_test is reused for the training file too.
# The output directory must already exist.
save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_breast_w_train.data")
save_data_test(breast_w_dataset.x_test, breast_w_dataset.y_test, "./NewRandomSelectionDatasets/new_breast_w_test.data")

## Messidor Random Instance Selection + Human Labeling

In [40]:
# Load Messidor through the project-local Datasets module.
# NOTE(review): train_size=0.1 is presumably the training fraction; the meaning
# of normalization_method and is_class_label_a_feature is defined in Datasets.
messidor = Datasets.Messidor_Dataset('./Datasets/messidor_features.arff', "Messidor", 
                                     train_size=0.1, normalization_method='None', 
                                     is_class_label_a_feature=False)

Started reading dataset  Messidor ...
Finished reading dataset  Messidor ...


In [41]:
# Draw 36 random rows from the Messidor training data and print the shape of
# the sampled feature matrix.
random_selection = RandomSelection(messidor, "Messidor", sampling_size=36)
print("x_train shape: ", random_selection.x_train.shape)

x_train shape:  (36, 19)


In [42]:
# Write the sampled rows and the dataset's test split (features + label) to
# header-less CSV files. The output directory must already exist.
save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_messidor_train.data")
save_data_test(messidor.x_test, messidor.y_test, "./NewRandomSelectionDatasets/new_messidor_test.data")

## Car Random Instance Selection + Human Labeling

In [43]:
# Load Car through the project-local Datasets module.
# NOTE(review): 'Class' is presumably the name of the label column and
# train_size=0.1 the training fraction — confirm in Datasets.
car_dataset = Datasets.Car_Dataset('./Datasets/car.data', "Car", 'Class', 
                                   train_size=0.1)

Started reading dataset  Car ...
Finished reading dataset  Car ...


In [44]:
# Draw 56 random rows from the Car training data and print the shape of the
# sampled feature matrix.
random_selection = RandomSelection(car_dataset, "Car", sampling_size=56)
print("x_train shape: ", random_selection.x_train.shape)

x_train shape:  (56, 6)


In [45]:
# Write the sampled rows and the dataset's test split (features + label) to
# header-less CSV files. The output directory must already exist.
save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_car_train.data")
save_data_test(car_dataset.x_test, car_dataset.y_test, "./NewRandomSelectionDatasets/new_car_test.data")

## Spambase Random Instance Selection + Human Labeling

In [47]:
# Load Spambase through the project-local Datasets module.
# NOTE(review): train_size=0.1 is presumably the training fraction and
# normalization_method='None' presumably disables scaling — confirm in Datasets.
spambase_dataset = Datasets.Spambase_Dataset('./Datasets/spambase.data', "Spambase", 
                                    train_size=0.1, normalization_method='None')


Started reading dataset  Spambase ...
Finished reading dataset  Spambase ...


In [48]:
# Draw 26 random rows from the Spambase training data and print the shape of
# the sampled feature matrix.
random_selection = RandomSelection(spambase_dataset, "Spambase", sampling_size=26)
print("x_train shape: ", random_selection.x_train.shape)

x_train shape:  (26, 57)


In [49]:
# Write the sampled rows and the dataset's test split (features + label) to
# header-less CSV files. The output directory must already exist.
save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_Spambase_train.data")
save_data_test(spambase_dataset.x_test, spambase_dataset.y_test, "./NewRandomSelectionDatasets/new_Spambase_test.data")

## Coil2000 Random Instance Selection + Human Labeling

In [50]:
# Load Coil2000 through the project-local Datasets module and display the
# shape of its training features (the notebook output below shows (196, 85)).
# NOTE(review): train_size=0.02 is presumably the training fraction — confirm
# in Datasets.
coil2000_dataset = Datasets.Coil2000_Dataset('./Datasets/coil2000.dat', "Coil2000", 
                            train_size=0.02, normalization_method='None')
coil2000_dataset.x_train.shape

Started reading dataset  Coil2000 ...
Finished reading dataset  Coil2000 ...


(196, 85)

In [52]:
# Draw 68 random rows from the Coil2000 training data and print the shape of
# the sampled feature matrix.
random_selection = RandomSelection(coil2000_dataset, "Coil2000", sampling_size=68)
print("x_train shape: ", random_selection.x_train.shape)

x_train shape:  (68, 85)


In [53]:
# Write the sampled rows and the dataset's test split (features + label) to
# header-less CSV files. The output directory must already exist.
save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_coil2000_train.data")
save_data_test(coil2000_dataset.x_test, coil2000_dataset.y_test, "./NewRandomSelectionDatasets/new_coil2000_test.data")

## Bank Marketing Random Instance Selection + Human Labeling

In [54]:
# Load Bank Marketing through the project-local Datasets module.
# NOTE(review): 'y' is presumably the name of the label column and
# train_size=0.1 the training fraction — confirm in Datasets.
bank_dataset = Datasets.Bank_Marketing_Dataset('./Datasets/bank-full.csv', "Bank Marketing", 'y', 
                                      train_size=0.1, normalization_method="None")


Started reading dataset  Bank Marketing ...
Finished reading dataset  Bank Marketing ...


In [56]:
# Draw 13 random rows from the Bank Marketing training data and print the
# shape of the sampled feature matrix.
random_selection = RandomSelection(bank_dataset, "Bank", sampling_size=13)
print("x_train shape: ", random_selection.x_train.shape)

x_train shape:  (13, 16)


In [57]:
# Write the sampled rows and the dataset's test split (features + label) to
# header-less CSV files. The output directory must already exist.
save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_bank_train.data")
save_data_test(bank_dataset.x_test, bank_dataset.y_test, "./NewRandomSelectionDatasets/new_bank_test.data")

## Skin Segmentation Random Instance Selection + Human Labeling

In [58]:
# Load Skin Segmentation through the project-local Datasets module.
# NOTE(review): train_size=0.0005 is presumably the training fraction —
# confirm in Datasets.
skin_dataset = Datasets.Skin_NonSkin_Dataset('./Datasets/Skin_NonSkin.txt', "Skin Segmentation",
                                             train_size=0.0005, normalization_method="None")

Started reading dataset  Skin Segmentation ...
Finished reading dataset  Skin Segmentation ...


In [60]:
# Draw 18 random rows from the Skin Segmentation training data and print the
# shape of the sampled feature matrix.
random_selection = RandomSelection(skin_dataset, "Skin", sampling_size=18)
print("x_train shape: ", random_selection.x_train.shape)

x_train shape:  (18, 3)


In [61]:
# Write the sampled rows and the dataset's test split (features + label) to
# header-less CSV files. The output directory must already exist.
save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_skin_train.data")
save_data_test(skin_dataset.x_test, skin_dataset.y_test, "./NewRandomSelectionDatasets/new_skin_test.data")

## Covertype Random Instance Selection + Human Labeling

In [62]:
# Load Covertype through the project-local Datasets module.
# NOTE(review): train_size=0.02 is presumably the training fraction — confirm
# in Datasets.
covertype_dataset = Datasets.Covertype_Dataset('./Datasets/covtype.data', "Covertype", 
                                               train_size=0.02, normalization_method="None")

Started reading dataset  Covertype ...
Finished reading dataset  Covertype ...


In [63]:
# Draw 50 random rows from the Covertype training data and print the shape of
# the sampled feature matrix.
random_selection = RandomSelection(covertype_dataset, "Covertype", sampling_size=50)
print("x_train shape: ", random_selection.x_train.shape)

x_train shape:  (50, 54)


In [64]:
# Write the sampled rows and the dataset's test split (features + label) to
# header-less CSV files. The output directory must already exist.
save_data_test(random_selection.x_train, random_selection.y_train, "./NewRandomSelectionDatasets/new_covtype_train.data")
save_data_test(covertype_dataset.x_test, covertype_dataset.y_test, "./NewRandomSelectionDatasets/new_covtype_test.data")