<a href="https://colab.research.google.com/github/stellaevat/ontology-mapping/blob/main/colabs/faiss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the `faiss-gpu` package, which supplies the `faiss` module imported further down.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release — consider pinning.
!pip install faiss-gpu

In [None]:
# Mount Google Drive inside the Colab VM; the embedding files loaded later
# in this notebook are read from a folder beneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import os

# CUDA-related environment switches, set here before the cell that imports
# torch and faiss.
# NOTE(review): presumably meant for easier debugging (blocking kernel launches)
# and deterministic cuBLAS behaviour — confirm they are still needed.
os.environ.update({
  'CUDA_LAUNCH_BLOCKING': "1",
  'CUBLAS_WORKSPACE_CONFIG': ":4096:8",
})

In [None]:
import gc
import torch
import faiss
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Read/Write to file

In [None]:
def read_bi_embeddings_from_file(filepath):
  """Load paired source/target embeddings from a text file.

  Every non-empty line is expected to have the form
  ``<index>,[<s1>,<s2>,...],[<t1>,<t2>,...]``: an integer index followed by
  two bracketed, comma-separated float vectors.

  Args:
    filepath: Path of the file to read.

  Returns:
    A dict with the keys ``"indices"`` (list of int, one per line read),
    ``"sources"`` and ``"targets"`` (float32 NumPy arrays holding one row per
    line read).

  Raises:
    ValueError: If a line does not contain an index plus exactly two
      bracketed vectors, or if a value cannot be parsed as a number.
  """
  indices, sources, targets = [], [], []

  with open(filepath) as f:
    # A file object has no len(), so the progress bar runs without a total
    # and simply counts the lines as they are streamed.
    for line_number, line in enumerate(tqdm(f, position=0, leave=True), start=1):
      line = line.strip()
      if not line:
        continue  # tolerate blank lines, e.g. a trailing newline at EOF

      strings = line.split(",[")
      if len(strings) != 3:
        raise ValueError(
            f"{filepath}, line {line_number}: expected an index and two "
            f"bracketed vectors, found {len(strings) - 1} vector(s)")

      # strip("[]") drops the closing bracket left over after the split.
      source, target = (list(map(float, s.strip("[]").split(",")))
                        for s in strings[1:])

      indices.append(int(strings[0]))
      sources.append(source)
      targets.append(target)

  bi_embeddings = {"indices" : indices,
                   "sources" : np.array(sources, dtype=np.single),
                   "targets" : np.array(targets, dtype=np.single)}
  return bi_embeddings

def read_onto_embeddings_from_file(filepath):
  """Load one embedding vector per line from a text file.

  Every non-empty line is expected to have the form
  ``<index>,[<e1>,<e2>,...]``: an integer index followed by a bracketed,
  comma-separated float vector. Only the first bracketed vector on a line is
  read.

  Args:
    filepath: Path of the file to read.

  Returns:
    A dict with the keys ``"indices"`` (list of int, one per line read) and
    ``"embeddings"`` (float32 NumPy array holding one row per line read).

  Raises:
    ValueError: If a line has no bracketed vector after the index, or if a
      value cannot be parsed as a number.
  """
  indices, embeddings = [], []

  with open(filepath) as f:
    # A file object has no len(), so the progress bar runs without a total
    # and simply counts the lines as they are streamed.
    for line_number, line in enumerate(tqdm(f, position=0, leave=True), start=1):
      line = line.strip()
      if not line:
        continue  # tolerate blank lines, e.g. a trailing newline at EOF

      strings = line.split(",[")
      if len(strings) < 2:
        raise ValueError(
            f"{filepath}, line {line_number}: expected an index followed by "
            f"a bracketed vector")

      # strip("[]") drops the closing bracket left over after the split.
      embedding = list(map(float, strings[1].strip("[]").split(",")))

      indices.append(int(strings[0]))
      # Store the parsed vector for this line (not the accumulator list itself).
      embeddings.append(embedding)

  onto_embeddings = {"indices"    : indices,
                     "embeddings" : np.array(embeddings, dtype=np.single)}
  return onto_embeddings

In [None]:
# Available embedding variants; the chosen pair determines which files are loaded.
features = ['term', 'int', 'ext']
negative_sampling = ['random', 'multi', 'neighbour']

feature = features[2]
negatives = negative_sampling[2]

# Folder containing the embedding files on the mounted Drive. Set this to ""
# to read them from the current working directory instead.
# Named `embeddings_dir` rather than `dir` so the `dir()` builtin is not
# shadowed for the rest of the notebook.
embeddings_dir = "/content/drive/MyDrive/embeddings/"

bi_embeddings = read_bi_embeddings_from_file(f"{embeddings_dir}bi_embeddings_{feature}_{negatives}.csv")
onto_embeddings = read_onto_embeddings_from_file(f"{embeddings_dir}doid_embeddings_{feature}_{negatives}.csv")

# Faiss

In [None]:
# Queries are the "sources" vectors of the paired embeddings; the database is
# the ontology embedding matrix.
xq, xb = bi_embeddings["sources"], onto_embeddings["embeddings"]
d = xb.shape[1]  # length of a single database vector

print(d, xq.shape, xb.shape)

In [None]:
# Number of query vectors and database vectors. `offset` is only referenced by
# the commented-out ranking code further down.
offset = 0
q_size, db_size = len(xq), len(xb)

In [None]:
# Exact (brute-force) inner-product search of every query vector against the
# full ontology database.
# NOTE(review): `res` is created but never handed to an index in the visible
# cells, so the search below runs on the CPU index — confirm whether moving the
# index to the GPU was intended.
# NOTE(review): inner product only matches cosine similarity if the embeddings
# are L2-normalised — confirm against how they were produced.
res = faiss.StandardGpuResources()  # GPU resources handle (currently unused below)

index_flat = faiss.IndexFlatIP(d)   # build a flat (CPU) inner-product index

index_flat.add(xb)                  # add the database vectors to the index
print(index_flat.ntotal)

k = db_size                         # retrieve ALL database vectors per query (full ranking), not a fixed top-100
D, I = index_flat.search(xq, k)     # D: similarity scores, I: matching database positions, one row per query

In [None]:
# count = 0
# sim = torch.nn.CosineSimilarity(dim=-1)
# for i in range(q_size):
#   score = sim(tokenized_queries[i], tokenized_database[i])
#   if score >= 0.95:
#     count += 1
#     # print(score)

# print(count)

In [None]:
# ranks = []
# for i in range(q_size):
#   source, target = queries[offset + i], database[offset + i]
#   # print("\nSource:", source)
#   # print("Target:", target)

#   for n, index in enumerate(I[i]):
#     # print(n + 1, database[offset + index])
#     if database[offset + index] == target:
#       ranks.append(n + 1)
#       # print("Rank found:", n + 1)
#       break
#     # if n == k - 1:
#     #   print("Not found")

In [None]:
# plt.hist(ranks, bins=max(ranks) // 10)

In [None]:
# to_consider = 100
# in_first = ranks.count(1)
# in_ten = len([r for r in ranks if r <= 10])
# in_consider = len([r for r in ranks if r <= to_consider])
# in_all = len(ranks)
# mrr = np.mean(1 / np.array(ranks))

# print(f"In first result: {in_first} ({(100 * in_first / len(ranks)):.1f}%)")
# print(f"In first 10 results: {in_ten} ({(100 * in_ten / len(ranks)):.1f}%)")
# print(f"In first {to_consider} results: {in_consider} ({(100 * in_consider / len(ranks)):.1f}%)")
# print(f"In first {k} results: {in_all} ({(100 * in_all / len(ranks)):.1f}%)")
# print(f"Lowest rank: {max(ranks)}")
# print(f"MRR: {mrr:.3f}")

# ## (can you load-unload model to file?)

In [None]:
# del xq
# del xb
# del bi_embeddings
# del onto_embeddings
# gc.collect()