In [1]:
from collections import defaultdict
import random
import numpy as np

from utils import one_hot_encoding

def dim(array):
    if not type(array) == list:
        return []
    return [len(array)] + dim(array[0])

def integerize(values):
    def integerize_list(values):
        distinct = {}
        latest_integer = 0
        result = []
        
        for value in values:
            if value not in distinct:
                distinct[value] = latest_integer
                result.append(latest_integer)
                latest_integer += 1
            else:
                result.append(distinct[value])
        result = np.array(result)
        
        return result, distinct
        
    if len(dim(values)) == 0:
        # values is a integer value
        
        raise ValueError("values must be a list")
        
    elif len(dim(values)) == 1:
        # values is a list
        
        return integerize_list(values)

    elif len(dim(values)) == 2:
        # values is a list of lists
        
        # Transpose values variable
        values_T = list(zip(*values))
        
        distincts = []
        results = []
        for value in values_T:
            result, distinct = integerize_list(value)
            results.append(result)
            distincts.append(distinct)
        results = np.array(results).T
        
        return results, distincts

ModuleNotFoundError: No module named 'utils'

In [147]:
def one_hot_encoding(values, templates=None):
    def one_hot_encoding_1d(values, template):
        '''
        Parameter:
        values: numpy array
            The 1d numpy array needed to be one hot encoded
        template: numpy array
            The 1d numpy array specify the template of encoding
            
        Return: numpy array
            1d one hot encoded numpy array
        '''
        values = np.array(values)
        template = np.array(template)
        
        # If the template has less than two values
        if template.shape[0] < 3:
            results = np.array([np.where(template == value)[0][0] for value in values])
        else:            
            results = np.zeros((values.shape[0], template.shape[0]), dtype=int)
            for value, result in zip(values, results):
                index = np.where(template == value)[0][0]
                result[index] = 1
            results = results.reshape((-1))
        return results
        
    if len(values.shape) == 0:
        # values variable is an integer value
        # return a 1d array
        
        # template variable must be a 1d array if the values is an integer
        if templates == None:
            raise ValueError('The templates variable must be a 1d array if the values variable is an integer')
        elif len(templates.shape) != 1:
            raise ValueError('The templates variable must be a 1d array if the values variable is an integer')
        
        results = one_hot_encoding_1d(np.array([values]), templates)
            
    elif len(values.shape) == 1:
        # values variable is a 1d array
        # return a 1d array
        
        # template variable is a 1d array
        if templates == None:
            templates = np.sort(np.unique(values))
        elif len(templates.shape) != 1:
            raise ValueError('The templates variable must be a 1d array if the values variable is a 1d array')

        results = one_hot_encoding_1d(values, templates)
            
    elif len(values.shape) == 2:
        # values variable is a 2d array
        # return a 2d array
        # assume to one hot encoding for each column of values variable
        
        # template variable is a 2d array. Each row of template variable is for each column of values variable
        if templates == None:
            templates = np.array([np.sort(np.unique(values[:, i])) for i in range(values.shape[1])])
        elif len(templates.shape) != 2:
            raise ValueError('The templates variable must be a 2d array if the values variable is a 2d array')
        
        results = np.zeros((values.shape[0], 0), dtype=int)
        for value, template in zip(values.T, templates):
            num_col = template.shape[0]
            if template.shape[0] < 3:
                num_col = 1
                
            results = np.concatenate((results, one_hot_encoding_1d(value, template)
                                      .reshape((values.shape[0], num_col))), axis=1)
        
    return results

array([[0, 0, 0],
       [1, 1, 1]], dtype=int64)

In [184]:
class BinaryDataset:
    def __init__(self, path, label_index, transform, unknown):
        '''
        Parameters:
        path: String
            The path to the dataset file
        label_index: int
            The label index of the dataset
        transform: Map
            How to transform the features and labels. The key of the map should be index and the value 
            is a function of the transformation. The return value of the transformation function is 
            expected to be a description. 
        '''

        # Transform features and labels
        self.features = []
        self.labels = []
        with open(path, "r") as file:
            for line in file:
                row = line.strip().split(',')

                feature = []
                for index, field in enumerate(row):
                    if index == label_index:
                        transformed_label = transform[index](field.strip())
                        self.labels.append(transformed_label)
                    elif index in transform:
                        transformed_feature = transform[index](field.strip())
                        feature.append(transformed_feature)
                self.features.append(feature)
        
        # Integerize the features and labels
        self.X, self.x_feature_map = integerize(self.features)
        self.Y, self.y_label_map = integerize(self.labels)
            
        # Binarize the features and labels
        self.bX = one_hot_encoding(self.X)
        self.bY = one_hot_encoding(self.Y)
               
#     def get_dataset(self, percentage):
#         indices = random.sample(range(self.X.shape[0]), self.X.shape[0])
#         self.train_indices = sorted(indices[:int(percentage * self.X.shape[0])])
#         self.test_indices = sorted(indices[int(percentage * self.X.shape[0]):])

#         self.train_X = self.X[self.train_indices]
#         self.test_X = self.X[self.test_indices]
#         self.train_Y = self.Y[self.train_indices]
#         self.test_Y = self.Y[self.test_indices]

In [185]:
path = 'zoo.data'
label_index = 17
transform = {
    1: lambda x: {'1': 'hair', '0': 'NOT_hair'}[x],
    2: lambda x: {'1': 'feathers', '0': 'NOT_feathers'}[x],
    3: lambda x: {'1': 'eggs', '0': 'NOT_eggs'}[x],
    4: lambda x: {'1': 'milk', '0': 'NOT_milk'}[x],
    5: lambda x: {'1': 'airborne', '0': 'NOT_airborne'}[x],
    6: lambda x: {'1': 'aquatic', '0': 'NOT_aquatic'}[x],
    7: lambda x: {'1': 'predator', '0': 'NOT_predator'}[x],
    8: lambda x: {'1': 'toothed', '0': 'NOT_toothed'}[x],
    9: lambda x: {'1': 'backbone', '0': 'NOT_backbone'}[x],
    10: lambda x: {'1': 'breathes', '0': 'NOT_breathes'}[x],
    11: lambda x: {'1': 'venomous', '0': 'NOT_venomous'}[x],
    12: lambda x: {'1': 'fins', '0': 'NOT_fins'}[x],
    13: lambda x: {'0': 'legs_0', '2': 'legs_2', '4': 'legs_4', '5': 'legs_5', '6': 'legs_6', '8': 'legs_8'}[x],
    14: lambda x: {'1': 'tail', '0': 'NOT_tail'}[x],
    15: lambda x: {'1': 'domestic', '0': 'NOT_domestic'}[x],
    16: lambda x: {'1': 'catsize', '0': 'NOT_catsize'}[x],
    17: lambda x: {'1': 'Mammal', '2': 'Bird', '3': 'Reptile', '4': 'Fish', '5': 'Amphibian', '6': 'Bug', '7': 'Invertebrate'}[x]
}

dataset = BinaryDataset(path, label_index, transform)

In [187]:
def divide_value(x, thresholds):
    thresholds = sorted(thresholds)
    
    for i in range(len(thresholds)):
        description = str(thresholds[-1]) + "<="
        if x < thresholds[i]:
            description = ""
            if i != 0:
                description = str(thresholds[i - 1]) + "<= AND "
            description += "<" + str(thresholds[i])
            break
            
    return description
            
path = 'adult.data'
label_index = 14
transform = {
    0: lambda x: divide_value(int(x), [28, 37, 48]), 
    1: lambda x: {'Private': 'private', 'Self-emp-not-inc': 'self_employed', 'Self-emp-inc': 'self_employed', 
                  'Federal-gov': 'government', 'Local-gov': 'government', 'State-gov': 'government', 
                  'Without-pay': 'no_work', 'Never-worked': 'no_work'}[x], 
    3: lambda x: {'10th': 'dropout', '11th': 'dropout', '12th': 'dropout', '1st-4th':'dropout', '5th-6th': 'dropout', 
                  '7th-8th': 'dropout', '9th': 'dropout', 'Preschool': 'dropout', 'HS-grad': 'high_school_grad',
                  'Some-college': 'high_school_grad', 'Masters': 'college', 'Prof-school': 'prof_school', 
                  'Assoc-acdm': 'associates', 'Assoc-voc': 'associates', 'Bachelors': 'college', 'Doctorate': 'phd'}[x], 
    5: lambda x: {'Never-married': 'never_married', 'Married-AF-spouse': 'married','Married-civ-spouse': 'married', 
                  'Married-spouse-absent': 'separated', 'Separated': 'separated', 'Divorced':'separated', 
                  'Widowed': 'widowed'}[x], 
    6: lambda x: {"Adm-clerical": "admin", "Armed-Forces": "military", "Craft-repair": "blue_collar", 
                  "Exec-managerial": "white_collar", "Farming-fishing": "blue_collar", "Handlers-cleaners": "blue_collar", 
                  "Machine-op-inspct": "blue_collar", "Other-service": "service", "Priv-house-serv": "service", 
                  "Prof-specialty": "professional", "Protective-serv": "other", "Sales":"sales", "Tech-support": "other", 
                  "Transport-moving": "blue_collar"}[x], 
    8: lambda x: {'White': 'white', 'Asian-Pac-Islander': 'asian_pac_islander', 'Amer-Indian-Eskimo': 'amer_indian_eskimo',
                  'Other': 'other', 'Black': 'black'}[x],
    9: lambda x: {'Female': 'female', 'Male': 'male'}[x], 
    12: lambda x: divide_value(int(x), [40, 45]),
    14: lambda x: x
}

dataset = BinaryDataset(path, label_index, transform)

KeyError: '?'