In [7]:
import os
import glob
import numpy as np
from annoy import AnnoyIndex
import heapq
import shutil
from tqdm import tqdm

In [8]:
def get_embedding_dim(folder):
    for filename in os.listdir(folder):
        if filename.endswith(".npy"):
            emb = np.load(os.path.join(folder, filename)).squeeze(0)
            return emb.shape[0]
    raise ValueError("No .npy files found in the folder.")

In [9]:
def build_annoy_index(folder, embedding_dim, n_trees=10):
    index = AnnoyIndex(embedding_dim, "angular")  # Use 'angular' for cosine similarity
    idx = 0
    for filename in tqdm(os.listdir(folder)):
        if filename.endswith(".npy"):
            emb = np.load(os.path.join(folder, filename)).squeeze(0)
            index.add_item(idx, emb)
            idx += 1
    index.build(n_trees)
    return index

In [10]:
def count_embeddings(folder):
    return sum(1 for filename in os.listdir(folder) if filename.endswith(".npy"))

In [11]:
def select_embeddings(first_folder, second_folder, n=None, k=None):
    if k is None and n is None:
        raise ValueError("either `n` of `k` should be specified")
    elif k is not None and n is not None:
        raise ValueError("`n` of `k` cannot both be specified")

    embedding_dim = get_embedding_dim(first_folder)
    print(f"Embedding dimension: {embedding_dim}")

    print("Building Annoy index from the first folder...")
    index = build_annoy_index(first_folder, embedding_dim, n_trees=10)
    print("Annoy index built.")

    total_embeddings = count_embeddings(second_folder)
    print(f"Total embeddings in the second folder: {total_embeddings}")

    k = total_embeddings // n if n is not None else k
    print(f"Selecting top {k} embeddings with the largest distances.")

    heap = []  # Min-heap to keep track of top-k largest distances

    for filename in tqdm(os.listdir(second_folder)):
        if filename.endswith(".npy"):
            emb_path = os.path.join(second_folder, filename)
            emb = np.load(emb_path).squeeze(0)

            # Find the nearest neighbor in the first folder
            nearest_idxs, distances = index.get_nns_by_vector(
                emb, 1, include_distances=True
            )
            min_distance = distances[0]

            # Use negative distance for max-heap behavior in min-heap
            neg_distance = -min_distance

            if len(heap) < k:
                heapq.heappush(heap, (neg_distance, emb_path))
            else:
                if neg_distance > heap[0][0]:
                    heapq.heappushpop(heap, (neg_distance, emb_path))

    print("Selection completed.")

    # Extract the embeddings from the heap
    top_embeddings = [heapq.heappop(heap) for _ in range(len(heap))]
    top_embeddings.reverse()  # Largest distances first

    # Get the file paths
    selected_emb_paths = [item[1] for item in top_embeddings]

    return selected_emb_paths

In [13]:
# Specify your folders and the fraction 1/n
first_folder = "/home/setupishe/bel_conf/embeds_reduced_0.2/embeds_0.2"
second_folder = (
    "/home/setupishe/bel_conf/remainder_embeds_reduced_0.2/remainder_embeds_0.2"
)
n = 5  # Adjust as needed

# Select embeddings
selected_embeddings = select_embeddings(
    first_folder, second_folder, k=118287 * 0.1 * 1.5
)

Embedding dimension: 512
Building Annoy index from the first folder...


100%|██████████| 507685/507685 [00:11<00:00, 46064.09it/s]


Annoy index built.
Total embeddings in the second folder: 579873
Selecting top 17743.050000000003 embeddings with the largest distances.


100%|██████████| 1924875/1924875 [00:53<00:00, 35915.17it/s]

Selection completed.





In [14]:
def force_mkdir(directory):
    if os.path.isdir(directory):
        shutil.rmtree(directory)
    os.mkdir(directory)

In [15]:
# Optional: Copy selected embeddings to a new folder
output_folder = "test_folder"
force_mkdir(output_folder)

for emb_path in tqdm(selected_embeddings):
    filename = os.path.basename(emb_path)
    shutil.copy(emb_path, os.path.join(output_folder, filename))

print("Selected embeddings have been copied to the output folder.")

100%|██████████| 17744/17744 [00:00<00:00, 44448.24it/s]

Selected embeddings have been copied to the output folder.



