## Preentrenamiento - Wang - bits

#### Lectura de datasets

In [1]:
import re

class Dataset:
    """Reader for the stemmed 20 Newsgroups and the SearchSnippets corpora.

    Both corpora are plain-text files with one document per line. A
    20 Newsgroups line starts with the category name followed by the text;
    a SearchSnippets line ends with the category name. Lines with a single
    token (no text) or with more than 301 whitespace-separated tokens
    (label included) are skipped by the readers.
    """

    # (pattern, replacement) pairs applied in order by ``clean_str``.
    # NOTE: the replacements for "(", ")" and "?" deliberately keep the
    # backslash (" \( " etc.): ``re.sub`` leaves unknown non-letter escapes in
    # the replacement untouched, so the emitted tokens are "\(", "\)" and "\?",
    # exactly as in the reference implementation. They are written as raw
    # strings so that Python does not warn about invalid escape sequences.
    _CLEAN_RULES = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )

    def clean_str(self, string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

        Returns the cleaned text, stripped and lower-cased.
        """
        for pattern, replacement in self._CLEAN_RULES:
            string = re.sub(pattern, replacement, string)

        return string.strip().lower()

    def _read_labeled_file(self, path, target_names, label_first):
        """Read one corpus file and return ``(data, target)``.

        ``data`` is the list of cleaned texts and ``target`` the list of
        indices of each line's category inside ``target_names``.
        ``label_first`` tells whether the category is the first token of the
        line (20 Newsgroups) or the last one (SearchSnippets).

        Raises ``ValueError`` (from ``list.index``) when a line carries a
        category that is not listed in ``target_names``.
        """
        data = []
        target = []

        with open(path, encoding="utf8") as file:
            for line in file:
                tokens_count = len(line.split())

                # Skip label-only lines and over-long documents.
                if not 1 < tokens_count <= 301:
                    continue

                if label_first:
                    category, text = line.split(None, 1)
                else:
                    text, category = line.rsplit(None, 1)

                data.append(self.clean_str(text))
                target.append(target_names.index(category))

        return data, target

    def _fetch(self, subset, reader, path_train, path_test, target_names):
        """Assemble the dataset dict for ``subset`` using ``reader``.

        Raises ``ValueError`` for a subset other than 'train', 'test' or
        'all' instead of silently returning a dict whose data is ``None``.
        """
        if subset == 'train':
            data, target = reader(path_train)
        elif subset == 'test':
            data, target = reader(path_test)
        elif subset == 'all':
            data_train, target_train = reader(path_train)
            data_test, target_test = reader(path_test)
            data, target = data_train + data_test, target_train + target_test
        else:
            raise ValueError(
                "subset must be 'train', 'test' or 'all', got {!r}".format(subset))

        return {'data': data, 'target': target, 'target_names': target_names}

    #################################################################
    ##################### 20 Newsgroups #############################
    #################################################################

    path_train_20newsgroups = "../20NewsGroup/20ng-train-stemmed.txt"
    path_test_20newsgroups = "../20NewsGroup/20ng-test-stemmed.txt"

    target_names_20newsgroups = [
        "alt.atheism",
        "comp.graphics",
        "comp.os.ms-windows.misc",
        "comp.sys.ibm.pc.hardware",
        "comp.sys.mac.hardware",
        "comp.windows.x",
        "misc.forsale",
        "rec.autos",
        "rec.motorcycles",
        "rec.sport.baseball",
        "rec.sport.hockey",
        "sci.crypt",
        "sci.electronics",
        "sci.med",
        "sci.space",
        "soc.religion.christian",
        "talk.politics.guns",
        "talk.politics.mideast",
        "talk.politics.misc",
        "talk.religion.misc"
    ]

    def read_20newsgroups_file(self, path_test_20newsgroups):
        """Read a 20 Newsgroups file ("<category> <text>" per line).

        Despite its name, ``path_test_20newsgroups`` is simply the path of
        the file to read (train or test). Returns ``(data, target)``.
        """
        return self._read_labeled_file(
            path_test_20newsgroups, self.target_names_20newsgroups, label_first=True)

    def fetch_20newsgroups(self, subset = "train"):
        """Return the 20 Newsgroups ``subset`` ('train', 'test' or 'all').

        The result is a dict with the keys 'data', 'target' and
        'target_names'. Raises ``ValueError`` for any other subset name.
        """
        return self._fetch(subset, self.read_20newsgroups_file,
                           self.path_train_20newsgroups, self.path_test_20newsgroups,
                           self.target_names_20newsgroups)

    #################################################################
    ##################### SearchSnippets ############################
    #################################################################

    path_train_search_snippets = "../SearchSnippets/train.txt"
    path_test_search_snippets = "../SearchSnippets/test.txt"

    target_names_search_snippets = [
        "business",
        "computers",
        "culture-arts-entertainment",
        "education-science",
        "engineering",
        "health",
        "politics-society",
        "sports"
    ]

    def read_search_snippets_file(self, path_test_search_snippets):
        """Read a SearchSnippets file ("<text> <category>" per line).

        Despite its name, ``path_test_search_snippets`` is simply the path of
        the file to read (train or test). Returns ``(data, target)``.
        """
        return self._read_labeled_file(
            path_test_search_snippets, self.target_names_search_snippets, label_first=False)

    def fetch_search_snippets(self, subset = "train"):
        """Return the SearchSnippets ``subset`` ('train', 'test' or 'all').

        The result is a dict with the keys 'data', 'target' and
        'target_names'. Raises ``ValueError`` for any other subset name.
        """
        return self._fetch(subset, self.read_search_snippets_file,
                           self.path_train_search_snippets, self.path_test_search_snippets,
                           self.target_names_search_snippets)

#### Métodos de Preentrenamiento

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from pprint import pprint
import numpy as np
import threading
import math as mt
import multiprocessing
import time

class binaryCodes:
    """Binary hash codes derived from a TF-IDF similarity graph.

    The constructor turns ``dataset['data']`` into a TF-IDF matrix and a dense
    pairwise Euclidean distance matrix. ``binary_codes`` then builds an
    affinity matrix W (see ``parallel_operation``), the diagonal matrix D of
    W's row sums, takes the eigenvectors of ``D - W`` with the smallest
    eigenvalues (skipping the very smallest one) and thresholds them at zero.

    Supported ``method`` values:
      * 'weiss': ``exp(-d**2 / sigma**2)`` for every pair.
      * 'chinese_weiss': ``c_coeff * exp(-d**2 / (2 * sigma**2))`` when one
        document is among the ``k_neighbors`` nearest of the other, else 0.
      * 'chinese_weiss_supervised': as above, but scaled by ``a_equal_coeff``
        when both labels match and by ``b_unequal_coeff`` otherwise.
    """

    # Shared by all instances; it is re-fitted on every ``tf_idf`` call.
    vectorizer = TfidfVectorizer()

    _METHODS = ('weiss', 'chinese_weiss', 'chinese_weiss_supervised')

    # Errors that mean "no usable cache file" when calling ``np.load``.
    _CACHE_MISS = (OSError, ValueError, EOFError)

    def __init__(self, dataset, sigma = 2, method = 'weiss', k_neighbors = 7, c_coeff = 1,
                 a_equal_coeff = 1, b_unequal_coeff = 0.1):
        """Vectorize ``dataset['data']`` and store the hyper-parameters.

        ``dataset`` must provide the keys 'data' (texts) and 'target'
        (labels, only read by the supervised method).
        """
        self.tf_idf_features_matrix = self.tf_idf(dataset['data'])
        self.distance_matrix = self.euclidean_distance(self.tf_idf_features_matrix)
        self.labels = dataset['target']
        self.sigma = sigma
        self.method = method
        self.k_neighbors = k_neighbors
        self.c_coeff = c_coeff
        self.a_equal_coeff = a_equal_coeff
        self.b_unequal_coeff = b_unequal_coeff

    def tf_idf(self, data):
        """Fit the shared vectorizer on ``data`` and return its TF-IDF matrix."""
        return self.vectorizer.fit_transform(data)

    def euclidean_distance(self, tf_idf_features):
        """Return the pairwise Euclidean distances between all rows."""
        return euclidean_distances(tf_idf_features, tf_idf_features)

    def _check_method(self):
        """Raise ``ValueError`` when ``self.method`` is not a supported name."""
        if self.method not in self._METHODS:
            raise ValueError("unknown method {!r}; expected one of {}".format(
                self.method, ', '.join(self._METHODS)))

    def _knn_sets(self):
        """Return, for every row, the set of its ``k_neighbors`` nearest rows.

        Position 0 of each sorted row is skipped, as in the original
        ``argsort()[1:k+1]`` (it is presumably the row itself at distance 0).
        The table is cached and rebuilt when the distance matrix or
        ``k_neighbors`` changes, so the sort runs once per row instead of
        twice per matrix entry.
        """
        cache = getattr(self, '_knn_cache', None)
        if (cache is None or cache[0] is not self.distance_matrix
                or cache[1] != self.k_neighbors):
            k = self.k_neighbors
            sets = [frozenset(distances.argsort()[1:k + 1].tolist())
                    for distances in self.distance_matrix]
            cache = (self.distance_matrix, k, sets)
            self._knn_cache = cache
        return cache[2]

    @staticmethod
    def _row_bounds(data_size):
        """Return the exclusive upper row bound of each worker's chunk.

        One chunk per CPU core, the last one absorbing the remainder. The
        worker count is capped at ``data_size`` so that the chunk size never
        drops to zero for very small datasets.
        """
        if data_size <= 0:
            return []
        workers = max(1, min(multiprocessing.cpu_count(), data_size))
        step = data_size // workers
        bounds = [step * (position + 1) for position in range(workers)]
        bounds[-1] = data_size
        return bounds

    def parallel_operation(self, row_interval, w):
        """Fill rows ``row_interval[0]`` .. ``row_interval[1] - 1`` of ``w``.

        Only the upper triangle is computed; every value is mirrored into the
        lower triangle so that ``w`` ends up symmetric. ``w`` is modified in
        place.
        """
        self._check_method()
        distances = self.distance_matrix
        first_row, last_row = row_interval[0], row_interval[1]

        if self.method == 'weiss':
            for row in range(first_row, last_row):
                values = np.exp(-distances[row, row:w.shape[1]]**2/self.sigma**2)
                w[row, row:] = values
                w[row:, row] = values
            return

        neighbors = self._knn_sets()
        supervised = self.method == 'chinese_weiss_supervised'

        for row in range(first_row, last_row):
            row_neighbors = neighbors[row]

            for column in range(row, w.shape[1]):
                if column in row_neighbors or row in neighbors[column]:
                    kernel = np.exp(-distances[row, column]**2/(2*self.sigma**2))

                    if not supervised:
                        value = self.c_coeff*kernel
                    elif self.labels[row] == self.labels[column]:
                        value = self.a_equal_coeff*kernel
                    else:
                        value = self.b_unequal_coeff*kernel
                else:
                    value = 0

                w[row, column] = value

                if row != column:
                    w[column, row] = value

    def parallel_operation2(self, row_interval, w_matrix, d):
        """Write the row sums of ``w_matrix`` onto the diagonal of ``d``.

        Handles rows ``row_interval[0]`` .. ``row_interval[1] - 1``. Kept for
        backward compatibility; ``d_matrix`` no longer needs it.
        """
        for row in range(row_interval[0], row_interval[1]):
            sum_row = np.sum(w_matrix[row])
            d[row, row] = sum_row

    def w_matrix(self):
        """Compute and return the dense symmetric affinity matrix W.

        The rows are split into one chunk per CPU core and each chunk is
        handled by a thread running ``parallel_operation``.

        Raises ``ValueError`` when ``self.method`` is not supported.
        """
        # Fail here rather than inside the worker threads, where the
        # exception would only be printed and W silently left at zero.
        self._check_method()

        print("Calculating W matrix using {} method ...".format(self.method))

        data_size, _ = self.tf_idf_features_matrix.shape

        start_time = time.time()

        w = np.zeros((data_size, data_size))

        if self.method != 'weiss':
            # Build the neighbour table once before the workers start.
            self._knn_sets()

        threads = list()
        lower_bound = 0

        for upper_bound in self._row_bounds(data_size):
            threads.append(threading.Thread(target = self.parallel_operation, args=([lower_bound, upper_bound], w)))
            lower_bound = upper_bound

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()

        elapsed_time = time.time() - start_time

        print("Time to compute: {} min".format(elapsed_time/60))

        print("Nonzero elements count: {}".format(np.count_nonzero(w)))
        print("Ratio: {}".format(np.count_nonzero(w)/data_size**2))

        return w

    def d_matrix(self, w_matrix):
        """Return the diagonal matrix whose entries are the row sums of ``w_matrix``."""

        print("Calculating D matrix using {} method ...".format(self.method))

        data_size, _ = self.tf_idf_features_matrix.shape

        start_time = time.time()

        d = np.zeros((data_size, data_size))
        diagonal = np.arange(data_size)
        d[diagonal, diagonal] = np.asarray(w_matrix)[:data_size].sum(axis=1)

        elapsed_time = time.time() - start_time

        print("Time to compute: {} min".format(elapsed_time/60))

        return d

    def _cache_suffix(self):
        """Return the hyper-parameter part shared by all cache file names."""
        sigma_str = str(self.sigma).replace('.', '')
        c_coeff_str = str(self.c_coeff).replace('.', '')
        a_equal_coeff_str = str(self.a_equal_coeff).replace('.', '')
        b_unequal_coeff_str = str(self.b_unequal_coeff).replace('.', '')

        return '{}_{}_{}_{}_{}_{}'.format(self.method, sigma_str, c_coeff_str,
                                          a_equal_coeff_str, b_unequal_coeff_str, self.k_neighbors)

    def binary_codes(self, hash_len = 64):
        """Return an array of 0/1 codes with one row per document and ``hash_len`` columns.

        Results are cached as ``.npy`` files in the working directory: the
        final codes ('hash_codes-...') and the eigen decomposition
        ('eigen_values-...' / 'eigen_vectors-...'). An existing cache file is
        used instead of recomputing.

        NOTE(review): the cache file names only encode the dataset size and
        the hyper-parameters, so two different datasets of the same size
        share cache files - confirm this is acceptable.

        Raises ``ValueError`` when ``hash_len`` exceeds the number of
        available eigenvectors minus one.
        """
        data_size, _ = self.tf_idf_features_matrix.shape
        suffix = self._cache_suffix()

        hash_codes_path = 'hash_codes-{}_{}_{}.npy'.format(data_size, hash_len, suffix)
        eigen_values_path = 'eigen_values-{}_{}.npy'.format(data_size, suffix)
        eigen_vectors_path = 'eigen_vectors-{}_{}.npy'.format(data_size, suffix)

        try:
            hash_codes = np.load(hash_codes_path)
        except self._CACHE_MISS:
            pass
        else:
            print("Using preexisting hash codes")
            return hash_codes

        print("Calculating hash codes")
        global_start_time = time.time()

        try:
            eigen_values = np.load(eigen_values_path)
            eigen_vectors = np.load(eigen_vectors_path)

            print("Using preexisting eigen vectors and values")

        except self._CACHE_MISS:

            w_matrix = self.w_matrix()
            d_matrix = self.d_matrix(w_matrix)

            laplacian = d_matrix-w_matrix

            print("Calculating eigen vectors")

            # D - W is symmetric by construction, so use the symmetric
            # solver: it returns real eigenvalues in ascending order.
            eigen_values, eigen_vectors = np.linalg.eigh(laplacian)

            np.save(eigen_values_path, eigen_values)
            np.save(eigen_vectors_path, eigen_vectors)

        if hash_len > len(eigen_values) - 1:
            raise ValueError("hash_len={} exceeds the {} usable eigenvectors".format(
                hash_len, len(eigen_values) - 1))

        # Sort first and only then drop the smallest eigenpair. Sorting
        # explicitly also keeps eigen decompositions cached by earlier runs
        # (which may be unsorted) usable.
        selected = np.argsort(eigen_values)[1:hash_len + 1]

        # One bit per selected eigenvector: 1 where the component is >= 0.
        hash_codes = (np.real(eigen_vectors[:, selected]) >= 0).astype(int)

        elapsed_time = time.time() - global_start_time

        print("Total Time: {} min \n".format(elapsed_time/60))

        np.save(hash_codes_path, hash_codes)

        return hash_codes

#### Evaluación mediante clusters

In [3]:
%matplotlib inline

from sklearn.neighbors import DistanceMetric
import ipython_genutils
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display

class evaluate:
    """Cluster-based diagnostics for hash codes.

    Documents are grouped by their label (``dataset['target']``) and the
    normalized Hamming distance is measured inside each group and between
    every pair of groups. All methods are static and are called on the class,
    e.g. ``evaluate.intra_cluster_distance(hash_codes, dataset)``.
    """

    @staticmethod
    def _group_by_label(hash_codes, labels):
        """Return ``{label: [codes...]}`` keeping the order of first appearance."""
        clusters = dict()

        for index, value in enumerate(labels):
            clusters.setdefault(value, []).append(hash_codes[index])

        return clusters

    @staticmethod
    def _plot_heatmap(matrix, names, title, vmax):
        """Draw ``matrix`` as an annotated heatmap on a new figure."""
        df = pd.DataFrame(matrix, columns = names, index = names)
        fig, ax = plt.subplots(figsize=(12,10))

        plt.title(title, fontsize=18)
        ax.title.set_position([0.5, 1.01])

        sns.heatmap(df, annot=True, vmin=0, vmax=vmax, ax=ax)

    @staticmethod
    def intra_cluster_distance(hash_codes, dataset):
        """Display per-label statistics of the pairwise Hamming distances.

        For every label a table row is built with the cluster size, the
        number of distinct pairs and the median/mean/std/min/max of their
        distances; the table and its ``describe()`` summary are displayed.
        A cluster with a single document has no pairs and reports NaN.

        Returns the pairwise distances of the LAST cluster processed only.
        """
        categories_names = dataset['target_names']
        clusters = evaluate._group_by_label(hash_codes, dataset['target'])

        distance_matrix = DistanceMetric.get_metric('hamming')
        table = {'name': [], 'size': [], 'elements': [], 'median': [], 'mean': [], 'std': [], 'min': [], 'max': []}

        for label, members in clusters.items():
            inner_distances = distance_matrix.pairwise(members)
            # Keep each unordered pair once (strict upper triangle).
            inner_distances = inner_distances[np.triu_indices(inner_distances.shape[0], 1)]

            if inner_distances.size:
                median, mean, std = np.median(inner_distances), np.mean(inner_distances), np.std(inner_distances)
                smallest, largest = np.amin(inner_distances), np.amax(inner_distances)
            else:
                # np.amin/np.amax raise on an empty array.
                median = mean = std = smallest = largest = np.nan

            table['name'].append(categories_names[label])
            table['size'].append(len(members))
            table['elements'].append(len(inner_distances))
            table['median'].append(median)
            table['mean'].append(mean)
            table['std'].append(std)
            table['min'].append(smallest)
            table['max'].append(largest)

        df = pd.DataFrame(data=table)
        display(df)
        display(df.describe())

        return inner_distances

    @staticmethod
    def inter_cluster_distance(hash_codes, dataset):
        """Plot heatmaps of the Hamming distances between every pair of labels.

        Five symmetric label-by-label matrices are filled (median, mean, std,
        min and max of the cross distances) and each is drawn on its own
        figure. The diagonal stays at zero. Labels are used directly as
        matrix indices. Returns ``None``.
        """
        categories_names = dataset['target_names']
        clusters = evaluate._group_by_label(hash_codes, dataset['target'])

        distance_matrix = DistanceMetric.get_metric('hamming')

        side = len(categories_names)
        median_matrix = np.zeros((side, side))
        mean_matrix = np.zeros((side, side))
        std_matrix = np.zeros((side, side))
        min_distance_matrix = np.zeros((side, side))
        max_distance_matrix = np.zeros((side, side))

        for label_a, label_b in itertools.combinations(clusters.keys(), 2):
            inter_distances = distance_matrix.pairwise(clusters[label_a], clusters[label_b])

            statistics = (
                (median_matrix, np.median(inter_distances)),
                (mean_matrix, inter_distances.mean()),
                (std_matrix, inter_distances.std()),
                (min_distance_matrix, inter_distances.min()),
                (max_distance_matrix, inter_distances.max()),
            )

            for matrix, statistic in statistics:
                matrix[label_a, label_b] = statistic
                matrix[label_b, label_a] = statistic

        # The std plot uses a tighter colour range than the others.
        evaluate._plot_heatmap(median_matrix, categories_names, "Median", 1)
        evaluate._plot_heatmap(mean_matrix, categories_names, "Mean", 1)
        evaluate._plot_heatmap(std_matrix, categories_names, "Std", 0.1)
        evaluate._plot_heatmap(min_distance_matrix, categories_names, "Min Distance", 1)
        evaluate._plot_heatmap(max_distance_matrix, categories_names, "Max Distance", 1)

        return

#### Evaluación mediante information retrieval

In [4]:
from sklearn.neighbors import DistanceMetric

class hashingDatabase:
    """In-memory store of binary codes with label-based precision queries."""

    def __init__(self):
        """Create an empty database."""
        self.database = []
        self.labels = []

    def add_element(self, element, label):
        """Append one code and its label to the database."""
        self.database.append(element)
        self.labels.append(label)

    def find_neighbours_position(self, element):
        """Return the database positions ordered by Hamming distance to ``element``.

        The distance is the fraction of positions in which two codes differ.
        The result has shape ``(1, len(database))``; row 0 lists the positions
        from nearest to farthest. The order among equally distant codes is
        whatever ``np.argsort`` yields.
        """
        codes = np.asarray(self.database)
        query = np.asarray(element)
        distance_vector = np.mean(codes != query, axis=1)[np.newaxis, :]
        return np.argsort(distance_vector)

    def evaluate_P_K(self, element, label, k = None):
        """Return the precision at ``k`` of ``element`` against the database.

        That is the number of the ``k`` nearest stored codes whose label
        equals ``label``, divided by ``k``. With ``k=None`` every stored code
        is scored and the count is divided by the database size.
        """
        neighbours_position = self.find_neighbours_position(element)[0][:k]
        matched = sum(1 for neighbour_position in neighbours_position
                      if self.labels[neighbour_position] == label)

        # ``matched/k`` would raise TypeError for the default k=None.
        denominator = len(neighbours_position) if k is None else k
        return matched/denominator

### 20 Newsgroup

In [5]:
# Load the 20 Newsgroups training split and print a short overview of it:
# the dict keys, the first document, the split size and the document lengths
# measured in whitespace-separated tokens.
dataset = Dataset()

print("Dataset structure:")
train_dataset = dataset.fetch_20newsgroups(subset='train')
print(train_dataset.keys())
print("\n")

print("Dataset example:")
print(train_dataset['data'][0])
print("\n")

print("Train dataset:")
print(f"Set size: {len(train_dataset['data'])}")
print("\n")

complete_dataset = train_dataset['data']

total_tokens = sum(len(document.split()) for document in complete_dataset)
mean_length = total_tokens/len(complete_dataset)
max_length = max(len(document.split()) for document in complete_dataset)

print(f"Mean Lenght: {round(mean_length, 1)}")
print(f"Max Lenght: {max_length}")
print("\n")

# Disabled: vocabulary size count (quadratic, membership test on a list).
#vocabulary = list()
#for document in complete_dataset:
#    for word in document.split():
#        if word not in vocabulary:
#            vocabulary.append(word)
#
#print('Vocabulary Size: {}'.format(len(vocabulary)))

Dataset structure:
dict_keys(['data', 'target', 'target_names'])


Dataset example:
univers violat separ church state dmn kepler unh edu king becom philosoph philosoph becom king write recent ra order and resist care appar post religi flyer entitl soul scroll thought religion spiritu and matter soul insid bathroom stall door school univers hampshir sort newslett assembl hall director campu pose question spiritu each issu and solicit respons includ issu pretti vagu assum put christian care not mention jesu bibl heard defend doesn support religion thi state univers and strong support separ church and state enrag can thi sound scream for parodi give copi your friendli neighbourhood subgeniu preacher luck run mental mincer and hand you back outrag offens and gut bustingli funni parodi you can past origin can stool scroll thought religion spiritu and matter colon you can us thi text wipe mathew


Train dataset:
Set size: 10443


Mean Lenght: 94.3
Max Lenght: 300




### Wang

#### Parámetro $n$ bits

In [None]:
import random
import copy
import math 

# Precision@k of the hash codes for several code lengths. For each length the
# codes are scored on three random 20% validation splits (one per seed): the
# validation codes are the queries and the remaining 80% form the database.
k = 100
parameters = [16, 32, 64]
seeds = [1, 4 ,7]

precision_mean = []
precision_std = []

# The TF-IDF and distance matrices do not depend on the number of bits, so
# the encoder is built once instead of once per code length.
hash_codes = binaryCodes(train_dataset, sigma = 1, k_neighbors = 7 , c_coeff = 1, method = 'chinese_weiss')

targets = train_dataset['target']
sample_count = len(train_dataset['data'])

for parameter in parameters:
    print('Parametro n bits {}'.format(parameter))
    binary_codes = hash_codes.binary_codes(hash_len = parameter)

    precision_val = []

    for count, seed in enumerate(seeds):
        print('    Conjunto de validacion {}'.format(count+1))
        random.seed(seed)

        validation_sample = random.sample(range(sample_count), math.ceil(sample_count*0.2))
        # Set membership is O(1); the sampled list would be scanned per document.
        validation_indices = set(validation_sample)
        validation_split = {'binary_code': [], 'target': []}

        database = hashingDatabase()

        for index in range(sample_count):
            if index in validation_indices:
                validation_split['binary_code'].append(binary_codes[index])
                validation_split['target'].append(targets[index])
            else:
                database.add_element(binary_codes[index], targets[index])

        precisions = [
            database.evaluate_P_K(code, target, k)
            for code, target in zip(validation_split['binary_code'], validation_split['target'])
        ]

        print("    --->n bits {} precision at {} mean precision {}".format(parameter, k, np.mean(precisions)))

        precision_val.append(np.mean(precisions))

    precision_mean.append(np.mean(precision_val))
    precision_std.append(np.std(precision_val))

    print("Global Mean Precision {} with std {}\n".format(np.mean(precision_val), np.std(precision_val)))

print("\n")
print("Mean: {}".format(precision_mean))
print("Std: {}".format( precision_std))

Parametro n bits 16
Calculating hash codes
Calculating W matrix using chinese_weiss method ...


In [None]:
# Plot the mean precision@100 (in percent) against the number of bits.
abscissa = parameters
ordinate = list(map(lambda x: x*100, precision_mean))
error = list(map(lambda x: x*100, precision_std))  # only used by the disabled error-bar variants below
 
plt.style.use('default') #seaborn-paper
#plt.errorbar(abscissa, ordinate, error, capsize=4, capthick=0.8, ecolor='black', elinewidth=0.8, 
#             marker = 'o', color='g', label=r'parametro $\epsilon$')
plt.plot(abscissa, ordinate, marker = 'o', color='g', label=r'parametro $n$ bits')
# Matplotlib renamed the 'basex' keyword to 'base'; try the current name
# first and fall back to the old one on versions that reject it.
try:
    plt.xscale('log', base=2)
except (TypeError, ValueError):
    plt.xscale('log', basex=2)
plt.xticks(parameters, parameters)
plt.xlabel(r'$n$ bits')
plt.ylabel('mP@100(%)')
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('20 Newsgroups (Wang)')
#plt.rc('grid', linestyle=":", color='grey')
#plt.grid(True)

#for i in range(len(ordinate)):
    # Create a formatted string with three spaces, one newline
#    ann = r'{0:.2f}$\pm${0:.2f}'.format(ordinate[i], error[i])
#    plt.annotate(ann, xy=(abscissa[i], ordinate[i]), xycoords='data')

plt.show()

### SearchSnippets

In [None]:
# Load the SearchSnippets training split and print a short overview of it:
# the dict keys, the first document, the split size and the document lengths
# measured in whitespace-separated tokens.
dataset = Dataset()

print("Dataset structure:")
train_dataset = dataset.fetch_search_snippets(subset='train')
print(train_dataset.keys())
print("\n")

print("Dataset example:")
print(train_dataset['data'][0])
print("\n")

print("Train dataset:")
print(f"Set size: {len(train_dataset['data'])}")
print("\n")

complete_dataset = train_dataset['data']

total_tokens = sum(len(document.split()) for document in complete_dataset)
mean_length = total_tokens/len(complete_dataset)
max_length = max(len(document.split()) for document in complete_dataset)

print(f"Mean Lenght: {round(mean_length, 1)}")
print(f"Max Lenght: {max_length}")
print("\n")

# Disabled: vocabulary size count (quadratic, membership test on a list).
#vocabulary = list()
#for document in complete_dataset:
#    for word in document.split():
#        if word not in vocabulary:
#            vocabulary.append(word)
#
#print('Vocabulary Size: {}'.format(len(vocabulary)))

### Wang

#### Parámetro $n$ bits

In [None]:
import random
import copy
import math 

# Precision@k of the hash codes for several code lengths. For each length the
# codes are scored on three random 20% validation splits (one per seed): the
# validation codes are the queries and the remaining 80% form the database.
k = 100
parameters = [16, 32, 64]
seeds = [1, 4 ,7]

precision_mean = []
precision_std = []

# The TF-IDF and distance matrices do not depend on the number of bits, so
# the encoder is built once instead of once per code length.
hash_codes = binaryCodes(train_dataset, sigma=1, k_neighbors = 7 , c_coeff = 1, method = 'chinese_weiss')

targets = train_dataset['target']
sample_count = len(train_dataset['data'])

for parameter in parameters:
    print('Parametro n bits {}'.format(parameter))
    binary_codes = hash_codes.binary_codes(hash_len = parameter)

    precision_val = []

    for count, seed in enumerate(seeds):
        print('    Conjunto de validacion {}'.format(count+1))
        random.seed(seed)

        validation_sample = random.sample(range(sample_count), math.ceil(sample_count*0.2))
        # Set membership is O(1); the sampled list would be scanned per document.
        validation_indices = set(validation_sample)
        validation_split = {'binary_code': [], 'target': []}

        database = hashingDatabase()

        for index in range(sample_count):
            if index in validation_indices:
                validation_split['binary_code'].append(binary_codes[index])
                validation_split['target'].append(targets[index])
            else:
                database.add_element(binary_codes[index], targets[index])

        precisions = [
            database.evaluate_P_K(code, target, k)
            for code, target in zip(validation_split['binary_code'], validation_split['target'])
        ]

        print("    --->n bits {} precision at {} mean precision {}".format(parameter, k, np.mean(precisions)))

        precision_val.append(np.mean(precisions))

    precision_mean.append(np.mean(precision_val))
    precision_std.append(np.std(precision_val))

    print("Global Mean Precision {} with std {}\n".format(np.mean(precision_val), np.std(precision_val)))

print("\n")
print("Mean: {}".format(precision_mean))
print("Std: {}".format( precision_std))

In [None]:
# Plot the mean precision@100 (in percent) against the number of bits.
abscissa = parameters
ordinate = list(map(lambda x: x*100, precision_mean))
error = list(map(lambda x: x*100, precision_std))  # only used by the disabled error-bar variants below
 
plt.style.use('default') #seaborn-paper
#plt.errorbar(abscissa, ordinate, error, capsize=4, capthick=0.8, ecolor='black', elinewidth=0.8, 
#             marker = 'o', color='g', label=r'parametro $\epsilon$')
plt.plot(abscissa, ordinate, marker = 'o', color='g', label=r'parametro $n$ bits')
# Matplotlib renamed the 'basex' keyword to 'base'; try the current name
# first and fall back to the old one on versions that reject it.
try:
    plt.xscale('log', base=2)
except (TypeError, ValueError):
    plt.xscale('log', basex=2)
plt.xticks(parameters, parameters)
plt.xlabel(r'$n$ bits')
plt.ylabel('mP@100(%)')
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# This cell plots the SearchSnippets results; the title had been copied
# unchanged from the 20 Newsgroups cell.
plt.title('SearchSnippets (Wang)')
#plt.rc('grid', linestyle=":", color='grey')
#plt.grid(True)

#for i in range(len(ordinate)):
    # Create a formatted string with three spaces, one newline
#    ann = r'{0:.2f}$\pm${0:.2f}'.format(ordinate[i], error[i])
#    plt.annotate(ann, xy=(abscissa[i], ordinate[i]), xycoords='data')

plt.show()