**Instalación de librerías necesarias**

In [43]:
!pip install librosa==0.9.2 
!pip install numpy
!pip install IPython
!pip install --upgrade setuptools
!pip install pandas
!pip install matplotlib
!pip install rtree
!pip install faiss-cpu

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


**Importación de librerías**:

In [11]:
import librosa
import IPython.display as ipd
import os
import numpy as np
import pandas as pd
import heapq
import time
import matplotlib.pyplot as plt
import concurrent.futures
import rtree
import faiss

**Definición de funciones**:

In [12]:
# Funciones de extraccion de caracteristicas
def get_audios(n):
    audios = []
    for i in range(1, 156):
        num = ""
        if i < 10:
            num = "00" + str(i)
        elif i < 100:
            num = "0" + str(i)
        else:
            num = str(i)

        for file in os.listdir('fma_medium/' + num):
            if file.endswith('.mp3'):
                audios.append('fma_medium/' + num + '/' + file)
            if len(audios) == n:
                return audios

def features_extraction(file_path, dimensions):
    #load the audio file
    x, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    #extract features from the audio file
    mfcc = np.mean(librosa.feature.mfcc(y=x, sr=sample_rate, n_mfcc=dimensions).T, axis=0) # axis = 1
    
    return mfcc

def extract_n_features(n, dimensions, threads):
    audios = get_audios(n)
    features = []
    results = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        future_to_data = {executor.submit(features_extraction, item, dimensions): item for item in audios }
        for future in concurrent.futures.as_completed(future_to_data):
            data_item = future_to_data[future]
            try:
                result = future.result()
                # Assign the result to the input string in the dictionary
                results[data_item] = result
            except Exception as exc:
                print(f'{data_item} generated an exception: {exc}')
    return results

# Funciones de KNN Range priority queue
def euclidean_distances(query, data):
    return np.sqrt(((data - query) ** 2).sum(axis=1))

def KNNPriorityQueue(query, K, chars):
    data = np.loadtxt(chars, delimiter=',')
    
    if len(data) < K:
        raise ValueError("El número de datos es menor que K")
    
    distances = euclidean_distances(query, data)
    pq = []
    
    # Insertamos los primeros K elementos de la base de datos en la cola de prioridad
    for i in range(K):
        heapq.heappush(pq, (-distances[i], i))  # Distancia negativa para simular una cola de prioridad máxima

    for i in range(K, len(data)):
        # Si la distancia entre el objeto de consulta y el objeto i es menor que la mayor distancia en la cola de prioridad
        if -pq[0][0] > distances[i]: # Eliminamos el objeto en la cola de prioridad con mayor distancia
            heapq.heappop(pq)
            heapq.heappush(pq, (-distances[i], i))
            
    return [(data[i], -distance) for distance, i in sorted(pq, reverse=True)]

def knnSearch(collection, query, k):
    heap = [] # heap vacio
    ED = lambda P, Q: np.sqrt((P-Q)**2)

    for i in range(len(collection)):
        dist = np.sqrt((ED(collection[i], query) ** 2).sum()) # porque es una serie de pandas y no un array de numpy
        # Use negative distance because heapq is a min heap
        if len(heap) < k:
            heapq.heappush(heap, (-dist, i)) # si no se ha llenado el heap, se inserta
        else:
            heapq.heappushpop(heap, (-dist, i)) # si ya se lleno el heap, se inserta y se elimina el menor

    # Return indices and distances, reversing the order so that the closest is first
    indices_and_distances = [(i, -d) for d, i in sorted(heap, reverse=True)]
    return indices_and_distances
 
# Funciones para KNN Range radio
def genDistancias(data, N):
    ED = lambda P, Q: np.sqrt(sum((P-Q)**2))
    v = np.zeros(N)
    for i in range(N):
        ind = np.random.choice(data.shape[0], size=2, replace=False)
        P = data[ind[0], :]
        Q = data[ind[1], :]
        #v[i] = distance.euclidean(P, Q)
        v[i] = np.sqrt((ED(P, Q) ** 2).sum())
    return v

def rangeSearch(collection, query, r, query_index):
    ED = lambda P, Q: np.sqrt(sum((P-Q)**2))
    result = []
    heap = []
    for i in range(len(collection)):
        dist = np.sqrt((ED(collection[i], query) ** 2).sum())
        if (dist < r).all():  # Check if all elements in the Series are less than r
            result.append(i)
            #sort by distance to r
            heapq.heappush(heap, (dist, i))
    sorted_result = [index for _, index in heapq.nsmallest(len(heap), heap)]
    return sorted_result # result

# Funciones para KNN Rtree
def knn_rtree(collection, query, k, dimensions):
    # Create a new RTree index
    prop = rtree.index.Property()
    prop.dimension = dimensions   # dimension del vector caracteristico
    #prop.buffering_capacity = 8    # Cantidad maxima de MBRs en un nodo 
    ind = rtree.index.Index(properties = prop)

    # insertar los puntos                
    for i in range(collection.shape[0]):
        ind.insert(i, collection[i].tolist() + collection[i].tolist())     

    # Promedio
    avg = 0
    # aplicar la consulta indexada (correr 10 veces y tomar el promedio)
    for i in range(10):
        start_time = time.time()
        ind.nearest(query, num_results=k) # cambiar a k+1 porque se cuenta a si mismo
        avg += time.time() - start_time

    avg = avg/10
    # Obtener los k vecinos más cercanos (k+1 para excluir el punto de consulta si está presente)
    k_nearest = list(ind.nearest(query.tolist() + query.tolist(), num_results=k))
    ind.close()
    
    neighbors = collection[k_nearest]

    # Calcular distancias y ordenar
    distances = []
    for idx in k_nearest:
        dist = np.linalg.norm(collection[idx] - query)
        distances.append((dist, idx))
            
    distances.sort(key=lambda x: x[0])  # Ordenar por distancia
    
    # Extraer índices ordenados
    sorted_indices = [idx for _, idx in distances]

    print(f'La consulta tomó en promedio {avg} segundos')
    #print(f'Los k={k} más cercanos a la consulta son:{neighbors_without_query}')
    print(f'Las k:{k} canciones más cercanas a la consulta son (contar la query): {sorted_indices}')
    #print(f'Las k:{k} canciones más cercanas a la consulta son (sin contar la query): {nearest_songs}')

# Funciones para HighD
def knn_faiss(data, query, k, num_trials=10):
    total_time = 0  # Acumulador para el tiempo total
    for _ in range(num_trials):
        start_time = time.time()  # Inicio del temporizador
        d = len(data[0])  # Dimensionality of the feature vectors
        index = faiss.IndexFlatL2(d)  # L2 distance
        index.add(np.array(data, dtype='float32'))  # Add data to the index
        D, I = index.search(np.array([query], dtype='float32'), k)  # Perform the search k+1 para no contar la query
        end_time = time.time()  # Fin del temporizador
        total_time += end_time - start_time  # Acumular el tiempo de ejecución
    average_time = total_time / num_trials  # Calcular el promedio
    return I[0], D[0], average_time

# Testbench 1k

**Extracción de características**

In [13]:
features = extract_n_features(1000, 64, 8)

# Guardar las características en un archivo CSV
output_file = 'caracteristicas_1000.txt'
with open(output_file, 'w') as f:
    for audio_path, feature_vector in features.items():
        feature_str = ','.join(map(str, feature_vector))
        f.write(feature_str + '\n')

print(f'Características guardadas en {output_file}')

# Variables para tests
datatrain = pd.read_csv('caracteristicas_1000.txt', delimiter=',',header=None)
k = 8
ind_q = 5

Características guardadas en caracteristicas_1000.txt


**Range KNN Priority queue**

In [14]:
query = datatrain.iloc[ind_q]
#dataT = datatrain.drop([ind_q], axis=0, inplace=False)
dataT = datatrain
result = knnSearch(dataT.values, query, k)
# Promedio de tiempo
total_time = 0
for _ in range(10):
    start_time = time.time()
    result = knnSearch(dataT.values, query, k)
    end_time = time.time()
    total_time += end_time - start_time
average_time = total_time

print(f'Resultado para k={k}: {result}')
print(f'Tiempo promedio: {average_time} segundos') 

Resultado para k=8: [(5, 0.0), (33, 21.27868951349949), (10, 26.562299151551848), (202, 34.62912923822806), (631, 46.58289851191702), (720, 51.487770242024624), (43, 51.938023758369816), (22, 52.98509342871366)]
Tiempo promedio: 3.2795703411102295 segundos


**Range KNN Radio**

In [15]:
D = genDistancias(datatrain.values, 5000)

# Calcular percentiles para elegir r1, r2, y r3
percentiles = np.percentile(D, [10, 20, 30]) # tmb puede ser [25, 50, 75]
r1, r2, r3 = percentiles[0], percentiles[1], percentiles[2]

# calcular la busqueda por rango
radios_obtenidos = [r1, r2, r3]

for radio in radios_obtenidos:
    query = datatrain.iloc[ind_q]
    dataT = datatrain
    # Promedio
    total_time = 0
    for _ in range(10):
        start_time = time.time()
        result = rangeSearch(dataT.values, query, radio, ind_q)
        end_time = time.time()
        total_time += end_time - start_time
    average_time = total_time / 10

    print(f'Resultado para radio={radio}: {result}')
    print(f'Tiempo promedio: {average_time} segundos')

Resultado para radio=58.15379315967679: [5, 33, 10, 202, 631, 720, 43, 22, 177, 241, 145, 162, 127, 587, 658]
Tiempo promedio: 0.17390449047088624 segundos
Resultado para radio=72.75107689270303: [5, 33, 10, 202, 631, 720, 43, 22, 177, 241, 145, 162, 127, 587, 658, 373, 119, 68, 394, 14, 478, 826, 161, 524, 212, 309, 188, 200, 38, 183, 881, 491, 727, 88, 146, 704, 89, 777, 740, 332, 904, 66, 59, 861, 764, 45, 18, 630, 143, 385, 468, 951, 488, 962, 500, 138, 301]
Tiempo promedio: 0.1724404811859131 segundos
Resultado para radio=86.82332392370189: [5, 33, 10, 202, 631, 720, 43, 22, 177, 241, 145, 162, 127, 587, 658, 373, 119, 68, 394, 14, 478, 826, 161, 524, 212, 309, 188, 200, 38, 183, 881, 491, 727, 88, 146, 704, 89, 777, 740, 332, 904, 66, 59, 861, 764, 45, 18, 630, 143, 385, 468, 951, 488, 962, 500, 138, 301, 187, 749, 813, 12, 82, 585, 656, 746, 971, 150, 382, 513, 148, 555, 281, 698, 275, 600, 537, 64, 380, 62, 870, 687, 110, 794, 773, 474, 593, 884, 49, 325, 55, 80, 959, 318, 988,

**RTree KNN**

In [16]:
D = len(datatrain.columns)  

# Create a new RTree index
prop = rtree.index.Property()
prop.dimension = D    # dimension del vector caracteristico
prop.buffering_capacity = 8    # Cantidad maxima de MBRs en un nodo 
ind = rtree.index.Index(properties = prop)

#Generar los datos    
data = datatrain.values
query = data[ind_q]

# insertar los puntos                
for i in range(data.shape[0]):
    ind.insert(i, data[i])    

knn_rtree(data, query, k, 64)

La consulta tomó en promedio 0.00041570663452148435 segundos
Las k:8 canciones más cercanas a la consulta son (contar la query): [5, 33, 10, 202, 631, 720, 43, 22]


**HighD**

In [17]:
#Generar los datos    
data = datatrain.values
query = data[5]

faiss_knn_results, faiss_knn_distances, average_time = knn_faiss(data, query, k=8)

print(f"Tiempo de ejecución promedio de knn_faiss: {average_time} segundos")
print(f'Los k={k} más cercanos a la consulta son:{faiss_knn_results}') # faiss_knn_results[1:]
print(f'Las distancias a los k={k} vecinos más cercanos son:{faiss_knn_distances}') # faiss_knn_results[1:]

Tiempo de ejecución promedio de knn_faiss: 0.00039606094360351564 segundos
Los k=8 más cercanos a la consulta son:[  5  33  10 202 631 720  43  22]
Las distancias a los k=8 vecinos más cercanos son:[   0.       452.78265  705.5557  1199.1769  2169.966   2650.9902
 2697.558   2807.42   ]


# Testbench 2k

**Extracción de características**

In [18]:
features = extract_n_features(2000, 64, 8)

# Guardar las características en un archivo CSV
output_file = 'caracteristicas_2000.txt'
with open(output_file, 'w') as f:
    for audio_path, feature_vector in features.items():
        feature_str = ','.join(map(str, feature_vector))
        f.write(feature_str + '\n')

print(f'Características guardadas en {output_file}')

# Variables para tests
datatrain = pd.read_csv('caracteristicas_2000.txt', delimiter=',',header=None)
k = 8
ind_q = 5

[src/libmpg123/layer3.c:INT123_do_layer3():1841] error: dequantization failed!


Características guardadas en caracteristicas_2000.txt


**Range KNN Priority queue**

In [19]:
query = datatrain.iloc[ind_q]
#dataT = datatrain.drop([ind_q], axis=0, inplace=False)
dataT = datatrain
result = knnSearch(dataT.values, query, k)
# Promedio de tiempo
total_time = 0
for _ in range(10):
    start_time = time.time()
    result = knnSearch(dataT.values, query, k)
    end_time = time.time()
    total_time += end_time - start_time
average_time = total_time

print(f'Resultado para k={k}: {result}')
print(f'Tiempo promedio: {average_time} segundos') 

Resultado para k=8: [(5, 0.0), (578, 42.18969438056244), (1955, 44.73881159385126), (475, 44.92502947385553), (4, 45.16398389591528), (929, 45.395156714320734), (505, 45.71342807329074), (96, 45.74680786205768)]
Tiempo promedio: 6.101983547210693 segundos


**Range KNN Radio**

In [20]:
D = genDistancias(datatrain.values, 8000)

# Calcular percentiles para elegir r1, r2, y r3
percentiles = np.percentile(D, [10, 20, 30]) # tmb puede ser [25, 50, 75]
r1, r2, r3 = percentiles[0], percentiles[1], percentiles[2]

# calcular la busqueda por rango
radios_obtenidos = [r1, r2, r3]

for radio in radios_obtenidos:
    query = datatrain.iloc[ind_q]
    dataT = datatrain
    # Promedio
    total_time = 0
    for _ in range(10):
        start_time = time.time()
        result = rangeSearch(dataT.values, query, radio, ind_q)
        end_time = time.time()
        total_time += end_time - start_time
    average_time = total_time / 10

    print(f'Resultado para radio={radio}: {result}')
    print(f'Tiempo promedio: {average_time} segundos')

Resultado para radio=60.40488682964991: [5, 578, 1955, 475, 4, 929, 505, 96, 1920, 493, 1857, 548, 85, 255, 1091, 482, 407, 692, 1496, 461, 789, 428, 362, 945, 662, 447, 559, 788, 1238, 628, 156, 1164, 456, 1521, 487, 529, 552, 491, 919, 439, 315, 829, 1052, 1124, 611, 1942, 1772, 308, 304, 815, 556, 397, 28, 182, 195, 329, 236, 566, 858, 438, 1717, 402, 1965, 717, 874, 897, 335, 921, 1706, 1611, 1738, 276, 1928, 1956, 298, 1637, 1922, 1497, 535, 863, 1064, 1987, 374, 616, 896, 991, 1211, 949, 927, 1061, 151, 1683, 431, 806, 440, 627, 1914, 401, 1791, 131, 1483, 1730, 361, 1189, 905]
Tiempo promedio: 0.3452352285385132 segundos
Resultado para radio=76.06264876876502: [5, 578, 1955, 475, 4, 929, 505, 96, 1920, 493, 1857, 548, 85, 255, 1091, 482, 407, 692, 1496, 461, 789, 428, 362, 945, 662, 447, 559, 788, 1238, 628, 156, 1164, 456, 1521, 487, 529, 552, 491, 919, 439, 315, 829, 1052, 1124, 611, 1942, 1772, 308, 304, 815, 556, 397, 28, 182, 195, 329, 236, 566, 858, 438, 1717, 402, 1965, 7

**RTree KNN**

In [21]:
D = len(datatrain.columns)  

# Create a new RTree index
prop = rtree.index.Property()
prop.dimension = D    # dimension del vector caracteristico
prop.buffering_capacity = 8    # Cantidad maxima de MBRs en un nodo 
ind = rtree.index.Index(properties = prop)

#Generar los datos    
data = datatrain.values
query = data[ind_q]

# insertar los puntos                
for i in range(data.shape[0]):
    ind.insert(i, data[i])    

knn_rtree(data, query, k, 64)

La consulta tomó en promedio 0.0011132717132568359 segundos
Las k:8 canciones más cercanas a la consulta son (contar la query): [5, 578, 1955, 475, 4, 929, 505, 96]


**HighD**

In [22]:
#Generar los datos    
data = datatrain.values
query = data[5]

faiss_knn_results, faiss_knn_distances, average_time = knn_faiss(data, query, k=8)

print(f"Tiempo de ejecución promedio de knn_faiss: {average_time} segundos")
print(f'Los k={k} más cercanos a la consulta son:{faiss_knn_results}') # faiss_knn_results[1:]
print(f'Las distancias a los k={k} vecinos más cercanos son:{faiss_knn_distances}') # faiss_knn_results[1:]

Tiempo de ejecución promedio de knn_faiss: 0.0007781505584716797 segundos
Los k=8 más cercanos a la consulta son:[   5  578 1955  475    4  929  505   96]
Las distancias a los k=8 vecinos más cercanos son:[   0.     1779.9702 2001.5612 2018.2582 2039.7855 2060.7205 2089.7178
 2092.7705]


# Testbench 4k

**Extracción de características**

In [23]:
features = extract_n_features(4000, 64, 8)

# Guardar las características en un archivo CSV
output_file = 'caracteristicas_4000.txt'
with open(output_file, 'w') as f:
    for audio_path, feature_vector in features.items():
        feature_str = ','.join(map(str, feature_vector))
        f.write(feature_str + '\n')

print(f'Características guardadas en {output_file}')

# Variables para tests
datatrain = pd.read_csv('caracteristicas_4000.txt', delimiter=',',header=None)
k = 8
ind_q = 5

[src/libmpg123/layer3.c:INT123_do_layer3():1841] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (3264) too large for available bit count (3224)
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!


Características guardadas en caracteristicas_4000.txt


**Range KNN Priority queue**

In [24]:
query = datatrain.iloc[ind_q]
#dataT = datatrain.drop([ind_q], axis=0, inplace=False)
dataT = datatrain
result = knnSearch(dataT.values, query, k)
# Promedio de tiempo
total_time = 0
for _ in range(10):
    start_time = time.time()
    result = knnSearch(dataT.values, query, k)
    end_time = time.time()
    total_time += end_time - start_time
average_time = total_time

print(f'Resultado para k={k}: {result}')
print(f'Tiempo promedio: {average_time} segundos') 

Resultado para k=8: [(5, 0.0), (930, 36.72489783893421), (578, 40.403514219396385), (3421, 41.562003368466094), (3024, 44.8618206449251), (0, 45.16398389591528), (1125, 45.244057322367745), (184, 45.43412030859485)]
Tiempo promedio: 12.977379083633423 segundos


**Range KNN Radio**

In [25]:
D = genDistancias(datatrain.values, 12000)

# Calcular percentiles para elegir r1, r2, y r3
percentiles = np.percentile(D, [10, 20, 30]) # tmb puede ser [25, 50, 75]
r1, r2, r3 = percentiles[0], percentiles[1], percentiles[2]

# calcular la busqueda por rango
radios_obtenidos = [r1, r2, r3]

for radio in radios_obtenidos:
    query = datatrain.iloc[ind_q]
    dataT = datatrain
    # Promedio
    total_time = 0
    for _ in range(10):
        start_time = time.time()
        result = rangeSearch(dataT.values, query, radio, ind_q)
        end_time = time.time()
        total_time += end_time - start_time
    average_time = total_time / 10

    print(f'Resultado para radio={radio}: {result}')
    print(f'Tiempo promedio: {average_time} segundos')

Resultado para radio=63.426783318861716: [5, 930, 578, 3421, 3024, 0, 1125, 184, 1954, 3566, 1496, 693, 83, 2054, 2999, 492, 3905, 3528, 922, 2818, 89, 3877, 1919, 3870, 1523, 454, 2852, 3412, 817, 1060, 2900, 450, 3973, 581, 2415, 3181, 529, 827, 3758, 1166, 2400, 950, 3676, 1186, 3894, 28, 473, 2035, 3942, 2941, 1093, 3409, 3434, 2569, 894, 948, 789, 1241, 477, 310, 663, 68, 1925, 788, 3729, 1705, 546, 3091, 1520, 837, 2008, 249, 610, 575, 506, 2031, 360, 1857, 3931, 460, 858, 3456, 547, 861, 320, 720, 3068, 1413, 2128, 2440, 3748, 920, 924, 2591, 2411, 314, 195, 37, 3873, 3823, 3696, 2321, 1904, 428, 811, 2433, 2195, 274, 1938, 751, 155, 1613, 2055, 2300, 2892, 1746, 301, 3826, 3952, 329, 2387, 3242, 550, 1327, 3087, 3186, 3807, 3718, 3733, 2665, 895, 1922, 3974, 407, 3187, 3398, 1939, 340, 3911, 1482, 2731, 1858, 3726, 807, 3090, 1078, 1873, 397, 1063, 2409, 1686, 2737, 1894, 1951, 801, 3179, 1639, 2018, 3944, 466, 1774, 439, 3472, 1094, 1717]
Tiempo promedio: 0.6834704875946045 se

**RTree KNN**

In [26]:
D = len(datatrain.columns)  

# Create a new RTree index
prop = rtree.index.Property()
prop.dimension = D    # dimension del vector caracteristico
prop.buffering_capacity = 8    # Cantidad maxima de MBRs en un nodo 
ind = rtree.index.Index(properties = prop)

#Generar los datos    
data = datatrain.values
query = data[ind_q]

# insertar los puntos                
for i in range(data.shape[0]):
    ind.insert(i, data[i])    

knn_rtree(data, query, k, 64)

La consulta tomó en promedio 0.005534076690673828 segundos
Las k:8 canciones más cercanas a la consulta son (contar la query): [5, 930, 578, 3421, 3024, 0, 1125, 184]


**HighD**

In [27]:
#Generar los datos    
data = datatrain.values
query = data[5]

faiss_knn_results, faiss_knn_distances, average_time = knn_faiss(data, query, k=8)

print(f"Tiempo de ejecución promedio de knn_faiss: {average_time} segundos")
print(f'Los k={k} más cercanos a la consulta son:{faiss_knn_results}') # faiss_knn_results[1:]
print(f'Las distancias a los k={k} vecinos más cercanos son:{faiss_knn_distances}') # faiss_knn_results[1:]

Tiempo de ejecución promedio de knn_faiss: 0.0010005712509155273 segundos
Los k=8 más cercanos a la consulta son:[   5  930  578 3421 3024    0 1125  184]
Las distancias a los k=8 vecinos más cercanos son:[   0.     1348.7183 1632.444  1727.4    2012.583  2039.7855 2047.0249
 2064.2593]


# Testbench 8k

**Extracción de características**

In [28]:
features = extract_n_features(8000, 64, 8)

# Guardar las características en un archivo CSV
output_file = 'caracteristicas_8000.txt'
with open(output_file, 'w') as f:
    for audio_path, feature_vector in features.items():
        feature_str = ','.join(map(str, feature_vector))
        f.write(feature_str + '\n')

print(f'Características guardadas en {output_file}')

# Variables para tests
datatrain = pd.read_csv('caracteristicas_8000.txt', delimiter=',',header=None)
k = 8
ind_q = 5

[src/libmpg123/layer3.c:INT123_do_layer3():1841] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (3264) too large for available bit count (3224)
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!


Características guardadas en caracteristicas_8000.txt


**Range KNN Priority queue**

In [29]:
query = datatrain.iloc[ind_q]
#dataT = datatrain.drop([ind_q], axis=0, inplace=False)
dataT = datatrain
result = knnSearch(dataT.values, query, k)
# Promedio de tiempo
total_time = 0
for _ in range(10):
    start_time = time.time()
    result = knnSearch(dataT.values, query, k)
    end_time = time.time()
    total_time += end_time - start_time
average_time = total_time

print(f'Resultado para k={k}: {result}')
print(f'Tiempo promedio: {average_time} segundos') 

Resultado para k=8: [(5, 0.0), (784, 29.24923060155461), (980, 29.42602781609264), (639, 30.116354291957794), (765, 30.265184933752206), (544, 31.792580509416787), (1014, 32.51851985615925), (6861, 32.93304487231995)]
Tiempo promedio: 25.526981830596924 segundos


**Range KNN Radio**

In [30]:
D = genDistancias(datatrain.values, 15000)

# Calcular percentiles para elegir r1, r2, y r3
percentiles = np.percentile(D, [10, 20, 30]) # tmb puede ser [25, 50, 75]
r1, r2, r3 = percentiles[0], percentiles[1], percentiles[2]

# calcular la busqueda por rango
radios_obtenidos = [r1, r2, r3]

for radio in radios_obtenidos:
    query = datatrain.iloc[ind_q]
    dataT = datatrain
    # Promedio
    total_time = 0
    for _ in range(10):
        start_time = time.time()
        result = rangeSearch(dataT.values, query, radio, ind_q)
        end_time = time.time()
        total_time += end_time - start_time
    average_time = total_time / 10

    print(f'Resultado para radio={radio}: {result}')
    print(f'Tiempo promedio: {average_time} segundos')

Resultado para radio=63.86916065497923: [5, 784, 980, 639, 765, 544, 1014, 6861, 105, 1006, 6315, 6850, 73, 7022, 5255, 1232, 761, 3047, 1157, 6115, 1574, 215, 4548, 206, 6274, 3066, 1201, 227, 2612, 7349, 5167, 312, 2716, 5284, 1221, 151, 7650, 2393, 1150, 2532, 6959, 209, 6541, 1184, 1143, 1472, 6549, 12, 5175, 1167, 7278, 2956, 1187, 6897, 2742, 6004, 168, 1411, 356, 6143, 6758, 1176, 3238, 7612, 5198, 4060, 54, 1612, 5950, 1142, 1214, 7132, 3658, 113, 220, 3717, 35, 1145, 6858, 1170, 6477, 5057, 7872, 3321, 3561, 7662, 678, 7621, 2280, 5271, 6943, 3477, 3551, 3473, 6757, 7618, 1092, 1656, 6916, 868, 190, 39, 4544, 6836, 3497, 6822, 1128, 841, 7240, 2150, 6823, 107, 79, 6410, 4922, 4450, 4138, 4122, 7016, 174, 6333, 4992, 3503, 7704, 1205, 1491, 1228, 3195, 5251, 3727, 4989, 5020, 1695, 5383, 5393, 3856, 6940, 1394, 1072, 1514, 2771, 2240, 1778, 1869, 4164, 5350, 3178, 48, 3393, 453, 6863, 5033, 5079, 6835, 6036, 6226, 7341, 6326, 4793, 1829, 6473, 4649, 6781, 4966, 13, 484, 2114, 1

**RTree KNN**

In [31]:
D = len(datatrain.columns)  

# Create a new RTree index
prop = rtree.index.Property()
prop.dimension = D    # dimension del vector caracteristico
prop.buffering_capacity = 8    # Cantidad maxima de MBRs en un nodo 
ind = rtree.index.Index(properties = prop)

#Generar los datos    
data = datatrain.values
query = data[ind_q]

# insertar los puntos                
for i in range(data.shape[0]):
    ind.insert(i, data[i])    

knn_rtree(data, query, k, 64)

La consulta tomó en promedio 0.0014316558837890625 segundos
Las k:8 canciones más cercanas a la consulta son (contar la query): [5, 784, 980, 639, 765, 544, 1014, 6861]


**HighD**

In [32]:
#Generar los datos    
data = datatrain.values
query = data[5]

faiss_knn_results, faiss_knn_distances, average_time = knn_faiss(data, query, k=8)

print(f"Tiempo de ejecución promedio de knn_faiss: {average_time} segundos")
print(f'Los k={k} más cercanos a la consulta son:{faiss_knn_results}') # faiss_knn_results[1:]
print(f'Las distancias a los k={k} vecinos más cercanos son:{faiss_knn_distances}') # faiss_knn_results[1:]

Tiempo de ejecución promedio de knn_faiss: 0.0022679805755615235 segundos
Los k=8 más cercanos a la consulta son:[   5  784  980  639  765  544 1014 6861]
Las distancias a los k=8 vecinos más cercanos son:[   0.       855.5176   865.8911   906.99475  915.9813  1010.76825
 1057.4541  1084.5854 ]


# Testbench 16k

**Extracción de características**

In [33]:
features = extract_n_features(16000, 64, 8)

# Guardar las características en un archivo CSV
output_file = 'caracteristicas_16000.txt'
with open(output_file, 'w') as f:
    for audio_path, feature_vector in features.items():
        feature_str = ','.join(map(str, feature_vector))
        f.write(feature_str + '\n')

print(f'Características guardadas en {output_file}')

# Variables para tests
datatrain = pd.read_csv('caracteristicas_16000.txt', delimiter=',',header=None)
k = 8
ind_q = 5

[src/libmpg123/layer3.c:INT123_do_layer3():1841] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (3264) too large for available bit count (3224)
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (3328) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (3360) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
  return f(

fma_medium/065/065753.mp3 generated an exception: 


[src/libmpg123/layer3.c:INT123_do_layer3():1841] error: dequantization failed!


fma_medium/080/080391.mp3 generated an exception: 




fma_medium/098/098558.mp3 generated an exception: 


[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


fma_medium/098/098571.mp3 generated an exception: 


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 106439.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


fma_medium/098/098568.mp3 generated an exception: 


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 187493.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


fma_medium/098/098559.mp3 generated an exception: 


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


fma_medium/098/098560.mp3 generated an exception: 




fma_medium/099/099134.mp3 generated an exception: 
Características guardadas en caracteristicas_16000.txt


**Range KNN Priority queue**

In [34]:
query = datatrain.iloc[ind_q]
#dataT = datatrain.drop([ind_q], axis=0, inplace=False)
dataT = datatrain
result = knnSearch(dataT.values, query, k)
# Promedio de tiempo
total_time = 0
for _ in range(10):
    start_time = time.time()
    result = knnSearch(dataT.values, query, k)
    end_time = time.time()
    total_time += end_time - start_time
average_time = total_time

print(f'Resultado para k={k}: {result}')
print(f'Tiempo promedio: {average_time} segundos') 

Resultado para k=8: [(5, 0.0), (5127, 40.70284258279437), (9424, 41.13096139850029), (13180, 41.59691419033624), (576, 42.18969438056244), (6648, 44.35453305583134), (9973, 44.45156853851012), (1956, 44.73881159385126)]
Tiempo promedio: 49.964767932891846 segundos


**Range KNN Radio**

In [35]:
D = genDistancias(datatrain.values, 24000)

# Calcular percentiles para elegir r1, r2, y r3
percentiles = np.percentile(D, [10, 20, 30]) # tmb puede ser [25, 50, 75]
r1, r2, r3 = percentiles[0], percentiles[1], percentiles[2]

# calcular la busqueda por rango
radios_obtenidos = [r1, r2, r3]

for radio in radios_obtenidos:
    query = datatrain.iloc[ind_q]
    dataT = datatrain
    # Promedio
    total_time = 0
    for _ in range(10):
        start_time = time.time()
        result = rangeSearch(dataT.values, query, radio, ind_q)
        end_time = time.time()
        total_time += end_time - start_time
    average_time = total_time / 10

    print(f'Resultado para radio={radio}: {result}')
    print(f'Tiempo promedio: {average_time} segundos')

Resultado para radio=62.45545408578145: [5, 5127, 9424, 13180, 576, 6648, 9973, 1956, 472, 10824, 6, 14542, 929, 505, 93, 6362, 10265, 1918, 3904, 493, 3877, 12069, 14468, 3000, 5806, 13606, 1856, 550, 3181, 5910, 3821, 5977, 10529, 85, 2038, 3676, 3876, 6029, 3024, 9922, 8940, 252, 5908, 13720, 4035, 9476, 4550, 8970, 3944, 2902, 2853, 11550, 1093, 2570, 480, 8841, 6486, 7317, 6685, 407, 11312, 3093, 2007, 2440, 3758, 8929, 11462, 693, 9988, 10812, 1496, 11234, 9131, 461, 2892, 789, 7201, 5547, 3806, 427, 9892, 362, 8263, 5930, 10507, 10360, 8968, 5092, 4498, 948, 11776, 6588, 15966, 664, 14361, 7488, 5640, 12058, 15835, 3070, 445, 6484, 14782, 13896, 8835, 15432, 10594, 5948, 13678, 8784, 10250, 2303, 3185, 14319, 560, 787, 6573, 13174, 4616, 11429, 1238, 9900, 11506, 628, 2433, 156, 3180, 1163, 10281, 12314, 452, 7325, 1524, 486, 5756, 9819, 9099, 526, 3187, 5025, 552, 2398, 492, 14235, 7601, 15239, 922, 439, 315, 3419, 2817, 2054, 828, 1052, 6592, 12006, 13580, 13875, 3733, 7829, 2

**RTree KNN**

In [36]:
D = len(datatrain.columns)  

# Create a new RTree index
prop = rtree.index.Property()
prop.dimension = D    # dimension del vector caracteristico
prop.buffering_capacity = 8    # Cantidad maxima de MBRs en un nodo 
ind = rtree.index.Index(properties = prop)

#Generar los datos    
data = datatrain.values
query = data[ind_q]

# insertar los puntos                
for i in range(data.shape[0]):
    ind.insert(i, data[i])    

knn_rtree(data, query, k, 64)

La consulta tomó en promedio 0.023974895477294922 segundos
Las k:8 canciones más cercanas a la consulta son (contar la query): [5, 5127, 9424, 13180, 576, 6648, 9973, 1956]


**HighD**

In [37]:
#Generar los datos    
data = datatrain.values
query = data[5]

faiss_knn_results, faiss_knn_distances, average_time = knn_faiss(data, query, k=8)

print(f"Tiempo de ejecución promedio de knn_faiss: {average_time} segundos")
print(f'Los k={k} más cercanos a la consulta son:{faiss_knn_results}') # faiss_knn_results[1:]
print(f'Las distancias a los k={k} vecinos más cercanos son:{faiss_knn_distances}') # faiss_knn_results[1:]

Tiempo de ejecución promedio de knn_faiss: 0.004761791229248047 segundos
Los k=8 más cercanos a la consulta son:[    5  5127  9424 13180   576  6648  9973  1956]
Las distancias a los k=8 vecinos más cercanos son:[   0.     1656.7214 1691.756  1730.3032 1779.9702 1967.3242 1975.942
 2001.5612]


# Testbench 24k

**Extracción de características**

In [38]:
features = extract_n_features(24000, 64, 8)

# Guardar las características en un archivo CSV
output_file = 'caracteristicas_24000.txt'
with open(output_file, 'w') as f:
    for audio_path, feature_vector in features.items():
        feature_str = ','.join(map(str, feature_vector))
        f.write(feature_str + '\n')

print(f'Características guardadas en {output_file}')

# Variables para tests
datatrain = pd.read_csv('caracteristicas_24000.txt', delimiter=',',header=None)
k = 8
ind_q = 5

**Range KNN Priority queue**

In [None]:
query = datatrain.iloc[ind_q]
#dataT = datatrain.drop([ind_q], axis=0, inplace=False)
dataT = datatrain
result = knnSearch(dataT.values, query, k)
# Promedio de tiempo
total_time = 0
for _ in range(10):
    start_time = time.time()
    result = knnSearch(dataT.values, query, k)
    end_time = time.time()
    total_time += end_time - start_time
average_time = total_time

print(f'Resultado para k={k}: {result}')
print(f'Tiempo promedio: {average_time} segundos') 

Resultado para k=8: [(5, 0.0), (31, 21.27868951349949), (9, 26.562299151551848), (200, 34.62912923822806), (2993, 36.522520758226264), (631, 46.58289851191702), (2873, 47.80771715933784), (3059, 47.908723321964956)]
Tiempo promedio: 12.864285469055176 segundos


**Range KNN Radio**

In [None]:
D = genDistancias(datatrain.values, 30000)

# Calcular percentiles para elegir r1, r2, y r3
percentiles = np.percentile(D, [10, 20, 30]) # tmb puede ser [25, 50, 75]
r1, r2, r3 = percentiles[0], percentiles[1], percentiles[2]

# calcular la busqueda por rango
radios_obtenidos = [r1, r2, r3]

for radio in radios_obtenidos:
    query = datatrain.iloc[ind_q]
    dataT = datatrain
    # Promedio
    total_time = 0
    for _ in range(10):
        start_time = time.time()
        result = rangeSearch(dataT.values, query, radio, ind_q)
        end_time = time.time()
        total_time += end_time - start_time
    average_time = total_time / 10

    print(f'Resultado para radio={radio}: {result}')
    print(f'Tiempo promedio: {average_time} segundos')

Resultado para radio=62.631407362456756: [5, 31, 9, 200, 2993, 631, 2873, 3059, 2277, 2707, 3681, 1274, 2036, 2180, 719, 1661, 43, 2621, 1307, 22, 2194, 1430, 3652, 3637, 174, 1023, 238, 3395, 145, 162, 128, 587, 1673, 3440, 1406, 1323, 2848, 658, 1114, 1766, 2487, 1967, 372, 1250, 3193, 2023, 3197, 3150, 3123, 3054, 3137, 119, 2365, 68, 3625, 1630, 2254, 2549, 3755, 3626, 2840, 1741, 393, 14, 1066, 1791]
Tiempo promedio: 0.6736599922180175 segundos
Resultado para radio=79.35393718762391: [5, 31, 9, 200, 2993, 631, 2873, 3059, 2277, 2707, 3681, 1274, 2036, 2180, 719, 1661, 43, 2621, 1307, 22, 2194, 1430, 3652, 3637, 174, 1023, 238, 3395, 145, 162, 128, 587, 1673, 3440, 1406, 1323, 2848, 658, 1114, 1766, 2487, 1967, 372, 1250, 3193, 2023, 3197, 3150, 3123, 3054, 3137, 119, 2365, 68, 3625, 1630, 2254, 2549, 3755, 3626, 2840, 1741, 393, 14, 1066, 1791, 2287, 478, 3276, 3678, 2702, 1105, 1382, 3222, 1537, 2540, 827, 3363, 159, 2567, 3697, 523, 2109, 3664, 212, 1301, 3501, 310, 1446, 1059, 

**RTree KNN**

In [None]:
D = len(datatrain.columns)  

# Create a new RTree index
prop = rtree.index.Property()
prop.dimension = D    # dimension del vector caracteristico
prop.buffering_capacity = 8    # Cantidad maxima de MBRs en un nodo 
ind = rtree.index.Index(properties = prop)

#Generar los datos    
data = datatrain.values
query = data[ind_q]

# insertar los puntos                
for i in range(data.shape[0]):
    ind.insert(i, data[i])    

knn_rtree(data, query, k, 64)

La consulta tomó en promedio 0.002820467948913574 segundos
Las k:8 canciones más cercanas a la consulta son (contar la query): [5, 31, 9, 200, 2993, 631, 2873, 3059]


**HighD**

In [None]:
#Generar los datos    
data = datatrain.values
query = data[5]

faiss_knn_results, faiss_knn_distances, average_time = knn_faiss(data, query, k=8)

print(f"Tiempo de ejecución promedio de knn_faiss: {average_time} segundos")
print(f'Los k={k} más cercanos a la consulta son:{faiss_knn_results}') # faiss_knn_results[1:]
print(f'Las distancias a los k={k} vecinos más cercanos son:{faiss_knn_distances}') # faiss_knn_results[1:]

Tiempo de ejecución promedio de knn_faiss: 0.0010390996932983399 segundos
Los k=8 más cercanos a la consulta son:[   5   31    9  200 2993  631 2873 3059]
Las distancias a los k=8 vecinos más cercanos son:[   0.       452.78265  705.5557  1199.1769  1333.8945  2169.966
 2285.578   2295.246  ]
