<a href="https://colab.research.google.com/github/stellaevat/ontology-mapping/blob/main/colabs/faiss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
# Mount Google Drive at /content/drive; the embedding files read later in this
# notebook are addressed through paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# These environment variables are set in their own cell, before the cell that imports torch and faiss.
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 presumably makes CUDA kernel launches synchronous
# (clearer error locations, slower execution) — confirm it is still wanted outside debugging.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# NOTE(review): this cuBLAS workspace setting is presumably here for deterministic GPU results — confirm.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"

In [None]:
import gc
import itertools
import torch
import faiss
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Read embeddings from file

In [None]:
def read_bi_embeddings_from_file(filepath):
  """Load (index, source embedding, target embedding) rows from a text file.

  Every line is expected to look like ``<int>,[v1,v2,...],[w1,w2,...]``.

  Returns:
    dict with "indices" (list of int), "sources" and "targets" (float32 arrays, one row per line).
  """
  def parse_vector(text):
    # "v1,v2,...]" (the opening bracket is consumed by the split) -> list of floats
    return [float(token) for token in text.strip("[]").split(",")]

  row_ids, source_rows, target_rows = [], [], []

  with open(filepath) as handle:
    for raw_line in tqdm(handle):
      head, *vector_texts = raw_line.strip().split(",[")
      parsed = [parse_vector(text) for text in vector_texts]
      # Exactly two vectors per line: source first, target second.
      source_vec, target_vec = parsed
      row_id = int(head)

      row_ids.append(row_id)
      source_rows.append(source_vec)
      target_rows.append(target_vec)

  return {
      "indices" : row_ids,
      "sources" : np.array(source_rows, dtype=np.single),
      "targets" : np.array(target_rows, dtype=np.single),
  }

def read_onto_embeddings_from_file(filepath):
  """Load (index, embedding) rows from a text file.

  Every non-blank line is expected to look like ``<int>,[v1,v2,...]``.

  Args:
    filepath: path of the file to read.

  Returns:
    dict with "indices" (list of int) and "embeddings" (float32 array with one
    row per line, or None when the file holds no rows).
  """
  indices, rows = [], []

  with open(filepath) as f:
    for line in tqdm(f):
      line = line.strip()
      if not line:
        # Tolerate blank lines, e.g. an extra newline at the end of the file.
        continue
      strings = line.split(",[")
      row = list(map(float, strings[1].strip("[]").split(",")))
      index = int(strings[0])

      indices.append(index)
      rows.append(row)

  # Build the matrix once: concatenating onto a growing array for every line
  # copies all previous rows again and is quadratic in the number of lines.
  embeddings = np.array(rows, dtype=np.single) if rows else None

  onto_embeddings = {"indices"    : indices,
                     "embeddings" : embeddings}
  return onto_embeddings

# Faiss

In [None]:
def get_real_ranks(I, bi_embeddings, db_embeddings):
  """Find, for every query, the 1-based position of its target in the ranked results.

  Row i of I holds database row numbers in ranked order for query i. The rank is
  the first position whose database embedding equals the query's target
  embedding element-for-element. Queries whose target is not found in their
  row contribute nothing to the returned list.
  """
  found_ranks = []
  for query_no, gold in tqdm(enumerate(bi_embeddings["targets"])):
    first_hit = next(
        (position
         for position, db_row in enumerate(I[query_no], start=1)
         if np.array_equal(gold, db_embeddings[db_row])),
        None,
    )
    if first_hit is not None:
      found_ranks.append(first_hit)
  return found_ranks

def plot_ranks(ranks, feature, negatives, database):
  """Plot a histogram of the ranks and save it as a PNG in the working directory.

  Args:
    ranks: non-empty sequence of 1-based ranks, one per query.
    feature, negatives, database: labels used in the plot title and in the
      output file name ``bi_ranking_histogram_{feature}_{negatives}_{database}.png``.
  """
  # Roughly one bin per 100 ranks, but never fewer than one: when every rank is
  # below 100 the integer division yields 0, which is not a valid bin count.
  n_bins = max(1, max(ranks) // 100)

  fig, ax = plt.subplots()
  ax.hist(ranks, bins=n_bins)
  ax.set_xlabel("Subsumer rank")
  ax.set_ylabel("No. of queries")
  ax.set_title(f"Correct subsumer rank distribution across queries ({feature}, {negatives}, {database})")
  fig.savefig(f"bi_ranking_histogram_{feature}_{negatives}_{database}.png")

def compute_result(ranks):
  """Summarise a list of ranks into retrieval metrics.

  Args:
    ranks: non-empty sequence of 1-based ranks, one per query.

  Returns:
    dict with, in this order: "H@k" for k in (1, 5, 10, 50, 100, 1000), i.e. the
    fraction of ranks <= k; "MRR" (mean reciprocal rank); "Highest" (smallest
    rank); "Median"; "Mean"; "Lowest" (largest rank).

  Raises:
    ValueError: if ranks is empty.
  """
  rank_array = np.asarray(ranks)
  if rank_array.size == 0:
    raise ValueError("compute_result needs at least one rank")

  result = {}

  # k=50 is included because the results table in this notebook has an "H@50" column.
  for k in (1, 5, 10, 50, 100, 1000):
    hits = int(np.count_nonzero(rank_array <= k))
    result[f"H@{k}"] = hits / rank_array.size

  result["MRR"]     = np.mean(1 / rank_array)
  result["Highest"] = np.min(rank_array)
  result["Median"]  = np.median(rank_array)
  result["Mean"]    = np.mean(rank_array)
  result["Lowest"]  = np.max(rank_array)

  return result

def pretty_print_result(result, feature, negatives, database):
  """Print the metrics in `result` as an aligned text report.

  "H@..." entries and "MRR" are shown with three decimals, every other
  non-string entry with one decimal; string-valued entries are left out.
  """
  pieces = [f"\nPerformance on ({feature}, {negatives}, {database}):\n\n"]

  for name, value in result.items():
    if name.startswith("H@"):
      pieces.append(f"{name:<6} = {value:.3f}\n")
      continue
    if name == "MRR":
      # Blank line after MRR separates the scores from the rank statistics.
      pieces.append(f"MRR    = {value:.3f}\n\n")
      continue
    if isinstance(value, str):
      continue
    label = f"{name}:"
    pieces.append(label.ljust(14) + f"{value:.1f}\n")

  pieces.append("\n")
  print("".join(pieces))

In [None]:
# Folder (with trailing slash) that holds the embedding CSV files.
EMBEDDINGS_DIR = "/content/drive/MyDrive/embeddings/"
# Number of leading (feature, negatives) combinations to leave out of this run.
# NOTE(review): 3 skips every 'term' combination — set to 0 to evaluate all nine.
N_COMBINATIONS_TO_SKIP = 3

features = ['term', 'int', 'ext']
negative_sampling = ['random', 'multi', 'neighbour']
columns = ["database", "feature", "negatives", "H@1", "H@5", "H@10", "H@50", "H@100", "H@1000", "MRR", "Highest", "Median", "Mean", "Lowest"]
results_df = pd.DataFrame(columns=columns)
all_ranks = {}

combinations = list(itertools.product(features, negative_sampling))[N_COMBINATIONS_TO_SKIP:]

for (feature, negatives) in combinations:
  suffix = f"{feature}_{negatives}.csv"
  bi_embeddings = read_bi_embeddings_from_file(f"{EMBEDDINGS_DIR}bi_test_embeddings_{suffix}")
  random_embeddings = read_onto_embeddings_from_file(f"{EMBEDDINGS_DIR}random_embeddings_{suffix}")
  hard_embeddings = read_onto_embeddings_from_file(f"{EMBEDDINGS_DIR}hard_embeddings_{suffix}")
  onto_embeddings = read_onto_embeddings_from_file(f"{EMBEDDINGS_DIR}doid_embeddings_{suffix}")
  databases = {"hard" : hard_embeddings, "onto" : onto_embeddings, "random" : random_embeddings}

  for (db_type, db_embeddings) in databases.items():
    # The loader always returns a dict; an empty file shows up as a missing matrix.
    if db_embeddings["embeddings"] is None:
      continue

    print()
    print(feature, negatives, db_type)

    xq = bi_embeddings["sources"]
    # The query targets are appended to the database so that every query's
    # correct answer is guaranteed to be among the searchable rows.
    xb = np.concatenate((db_embeddings["embeddings"], bi_embeddings["targets"]), axis=0)
    print(f"Database ({db_type}) contains {len(xb)} terms")

    db_size, d = xb.shape

    # Exact inner-product search on a CPU flat index, asking for the complete
    # ranking (k = db_size) of every query.
    index_flat = faiss.IndexFlatIP(d)
    index_flat.add(xb)
    _, I = index_flat.search(xq, db_size)

    ranks = get_real_ranks(I, bi_embeddings, xb)
    if not ranks:
      print("No ranks!\n")
      continue
    print(len(ranks))

    plot_ranks(ranks, feature, negatives, db_type)
    result = {"database" : db_type, "feature" : feature, "negatives" : negatives}
    result = result | compute_result(ranks)

    # The metrics file is rewritten after every database so partial results
    # survive an interrupted run.
    results_df.loc[len(results_df)] = result
    results_df.sort_values(by=["database", "feature", "negatives"]).to_csv("bi_ranking_metrics.csv", index=False)

    all_ranks[(db_type, feature, negatives)] = ranks
    # Ranks are integers and must be converted before joining; the joined string is
    # built outside the f-string because nested same-type quotes need Python 3.12+.
    ranks_csv = ",".join(map(str, ranks))
    with open("bi_ranks.csv", "a") as f:
      f.write(f"{db_type},{feature},{negatives},{ranks_csv}\n")

    print()