<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/ai-powered-search/13_3_semantic_search_with__quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In this notebook, we"re going to install a transformer model, analyze the embedding output, and compare some vectors

In [1]:
#outdoors
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

Cloning into 'outdoors'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 25 (delta 0), reused 22 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 491.39 MiB | 23.32 MiB/s, done.
Updating files: 100% (23/23), done.
Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-sq

In [2]:
%%capture

!pip install faiss-cpu --no-cache
!pip install nmslib
# !pip install faiss-gpu

In [None]:
import sys
import os
import time
sys.path.append("../..")
# from aips import *
import pandas as pd
import numpy as np
import pickle
import json
import tqdm

import faiss
import sentence_transformers
from sentence_transformers import SentenceTransformer, SimilarityFunction
from sentence_transformers.quantization import quantize_embeddings
from sentence_transformers import CrossEncoder

import nmslib

from IPython.display import display, HTML

In [None]:
model = SentenceTransformer(
    "mixedbread-ai/mxbai-embed-large-v1",
    similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
    truncate_dim=1024
)

## Get embeddings

In [5]:
def get_embeddings(texts, model, cache_name, ignore_cache=False):
  cache_file_name = f"data/outdoors/{cache_name}.pickle"
  if ignore_cache or not os.path.isfile(cache_file_name):
    embeddings = model.encode(texts, normalize_embeddings=True)
    os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
    with open(cache_file_name, "wb") as cache_file:
      pickle.dump(embeddings, cache_file)
  else:
    with open(cache_file_name, "rb") as cache_file:
      embeddings = pickle.load(cache_file)
  return embeddings

In [6]:
def display_results(scores, ids, data):
    results = generate_search_results(scores, ids, data)
    display(results)
    return results

def get_outdoors_data():
    outdoors_dataframe = pd.read_csv("data/outdoors/posts.csv")
    # outdoors_data = list(outdoors_dataframe.to_dict())
    outdoors_data = list(outdoors_dataframe.to_dict(orient='records'))
    return outdoors_data

def display_statistics(search_results, baseline_search_results=None, start_message="Recall"):
    index_name = search_results["index_name"]
    time_taken = search_results["time_taken"]
    index_size = search_results["size"]
    improvement_ms = ""
    improvement_size = ""
    recall = 1.0
    if baseline_search_results:
        full_search_time = baseline_search_results["time_taken"]
        time_imp = round((full_search_time - time_taken) * 100 / full_search_time, 2)
        improvement_ms = f" ({time_imp}% improvement)"
        improvement_size = f" ({round((baseline_search_results['size'] - index_size) * 100 / baseline_search_results['size'], 2)}% improvement)"
        recall = calculate_recall(baseline_search_results["results"], search_results["results"])

    print(f"{index_name} search took: {time_taken:.3f} ms{improvement_ms}")
    print(f"{index_name} index size: {round(index_size / 1000000, 2)} MB{improvement_size}")
    print(f"{start_message}: {round(recall, 4)}")

def calculate_recall(scored_full_results, scored_quantized_results):
    recalls = []
    for i in range(len(scored_full_results)):
        full_ids = [r["id"] for r in scored_full_results[i]]
        quantized_ids = [r["id"] for r in scored_quantized_results[i]]
        recalls.append((len(set(full_ids).intersection(set(quantized_ids))) /
                       len(set(quantized_ids))))
    return sum(recalls) / len(recalls)

def generate_search_results(faiss_scores, faiss_ids):
    outdoors_data = get_outdoors_data()
    faiss_results = []
    for i in range(len(faiss_scores)):
        results = []
        for j, id in enumerate(faiss_ids[i]):
            id = int(id)
            result = {"score": faiss_scores[i][j],
                      "title": outdoors_data[id]["title"],
                      "body": outdoors_data[id]["body"],
                      "id": id}
            results.append(result)
        faiss_results.append(results)
    return faiss_results

def time_and_execute_search(index, index_name, query_embeddings, k=25, num_runs=100):
    search_times = []
    faiss_scores = None
    faiss_ids = None

    for i in range(num_runs):
        start_time = time.time()
        faiss_scores, faiss_ids = index.search(query_embeddings, k=k)
        time_taken = ((time.time() - start_time) * 1000)
        search_times.append(time_taken)

    results = {"results": generate_search_results(faiss_scores, faiss_ids),
               "time_taken": np.average(search_times),
               "faiss_scores": faiss_scores, "faiss_ids": faiss_ids}
    index_stats = {}
    if index_name:
        index_stats ={
            "index_name": index_name,
            "size": os.path.getsize(index_name)
        }
    return results | index_stats

##Scalar quantization

In [7]:
outdoors_dataframe = pd.read_csv("data/outdoors/posts.csv")
post_texts = [f"{post['title']} {post['body']}" for _, post in outdoors_dataframe.iterrows()]
# post_texts

In [8]:
# let's index full-precision embeddings using FAISS
def index_full_precision_embeddings(doc_embeddings, name):
  # IndexFlatIP is a simple, unoptimized index supporting different embedding formats
  index = faiss.IndexFlatIP(doc_embeddings.shape[1])
  index.add(doc_embeddings)      # Adds documents to the index
  faiss.write_index(index, name) # Writes the index to disk
  return index

def get_outdoors_embeddings(model):
  outdoors_dataframe = pd.read_csv("data/outdoors/posts.csv")
  post_texts = [
      f"{post['title']} {post['body']}"
      for _, post in outdoors_dataframe.iterrows()
  ]
  return np.array(get_embeddings(post_texts, model, "outdoors_mrl_normed"))

# Generates embeddings for the outdoors dataset
outdoors_embeddings = get_outdoors_embeddings(model)
# Creates a full-precision(Float32) FAISS index
full_index = index_full_precision_embeddings(outdoors_embeddings, "full_embeddings")

In [9]:
# let's generate full-precision query embeddings
def get_test_queries():
  return[
      "tent poles", "hiking trails", "mountain forests",
      "white water", "best waterfalls", "mountain biking",
      "snowboarding slopes", "bungee jumping", "public parks"
  ]

# Gets test queries for benchmarking
queries = get_test_queries()
# Generates embeddings for each query
query_embeddings = model.encode(queries, normalize_embeddings=True, convert_to_numpy=True)

# Generates search time, index size, and recall statistics for the full-precision (Float32) index
full_results = time_and_execute_search(full_index, "full_embeddings", query_embeddings, k=25)
# Displays the benchmarking stats for the full-precision index
display_statistics(full_results)

full_embeddings search took: 41.694 ms
full_embeddings index size: 80.22 MB
Recall: 1.0


In [None]:
# let's define functions for benchmark quantized search approaches
def evaluate_search(full_index, optimized_index, optimized_index_name,
                    query_embeddings, optimized_query_embeddings,
                    k=25, display=True, log=False):
    full_results = time_and_execute_search(full_index, "full_embeddings", query_embeddings, k=k)
    optimized_results = time_and_execute_search(optimized_index, optimized_index_name, optimized_query_embeddings, k=k)
    if display:
        display_statistics(optimized_results, full_results)
    return optimized_results, full_results

def evaluate_rerank_search(full_index, optimized_index,
                           query_embeddings,
                           optimized_embeddings,
                           k=50, limit=25):
    results, full_results = evaluate_search(full_index, optimized_index, None, query_embeddings,
                                            optimized_embeddings, display=False, k=k)

    doc_embeddings = get_outdoors_embeddings(model) #This can point to a cheap on-disk data source containing the original full-precision embeddings
    rescore_scores, rescore_ids = [], []
    for i in range(len(results["results"])):
        embedding_ids = results["faiss_ids"][i]
        top_k_embeddings = [doc_embeddings[id] for id in embedding_ids]
        query_embedding = query_embeddings[i]
        scores = query_embedding @ np.array(top_k_embeddings).T
        indices = scores.argsort()[::-1][:limit]
        top_k_indices = embedding_ids[indices]
        top_k_scores = scores[indices]
        rescore_scores.append(top_k_scores)
        rescore_ids.append(top_k_indices)

    results = generate_search_results(rescore_scores, rescore_ids)
    recall = calculate_recall(full_results["results"], results)
    print(f"Reranked recall: {round(recall, 4)}")

In [None]:
# let’s implement Float16 scalar quantization
def index_float16_precision_embeddings(doc_embeddings, name):
  float16_embeddings = quantize_embeddings(doc_embeddings, precision="float16")
  print(f"Float16 embeddings shape: {float16_embeddings.shape}")
  # IndexFlatIP is a simple, unoptimized index supporting different embedding formats
  index = faiss.IndexFlatIP(float16_embeddings.shape[1])
  index.add(float16_embeddings)      # Adds documents to the index
  faiss.write_index(index, name)  # Writes the index to disk
  return index

float16_index_name = "int16_embeddings"
float16_index = index_float16_precision_embeddings(outdoors_embeddings, int16_index_name)
# Quantizes the query embeddings to Int16 precision
quantized_queries = quantize_embeddings(query_embeddings, calibration_embeddings=outdoors_embeddings, precision="float16")
# Performs benchmarks for search time, index size, and recall
evaluate_search(full_index, float16_index, float16_index_name, query_embeddings, quantized_queries)
# Performs benchmarks again allowing reranking of top results with full-precision embeddings
evaluate_rerank_search(full_index, float16_index, query_embeddings, quantized_queries)

NameError: name 'int16_index_name' is not defined

In [None]:
# let’s implement Int8 scalar quantization
def index_int8_precision_embeddings(doc_embeddings, name):
  int8_embeddings = quantize_embeddings(doc_embeddings, precision="int8")
  print(f"Int8 embeddings shape: {int8_embeddings.shape}")
  # IndexFlatIP is a simple, unoptimized index supporting different embedding formats
  index = faiss.IndexFlatIP(int8_embeddings.shape[1])
  index.add(int8_embeddings)      # Adds documents to the index
  faiss.write_index(index, name)  # Writes the index to disk
  return index

int8_index_name = "int8_embeddings"
int8_index = index_int8_precision_embeddings(outdoors_embeddings, int8_index_name)
# Quantizes the query embeddings to Int8 precision
quantized_queries = quantize_embeddings(query_embeddings, calibration_embeddings=outdoors_embeddings, precision="int8")
# Performs benchmarks for search time, index size, and recall
evaluate_search(full_index, int8_index, int8_index_name, query_embeddings, quantized_queries)
# Performs benchmarks again allowing reranking of top results with full-precision embeddings
evaluate_rerank_search(full_index, int8_index, query_embeddings, quantized_queries)

0.633 | The re-hydration time for deydrated foods


##Binary quantization

In [None]:
def index_binary_embeddings(doc_embeddings, binary_index_name):
  # Quantizes the doc embeddings to binary (1 bit per dimension)
  binary_embeddings = quantize_embeddings(doc_embeddings, precision="binary").astype(np.uint8)
  print(f"Binary embeddings shape: {binary_embeddings.shape}")
  # Creates the binary embeddings index
  index = faiss.IndexBinaryFlat(binary_embeddings.shape[1] * 8)
  index.add(binary_embeddings)      # Adds documents to the index
  faiss.write_index(index, binary_index_name)  # Writes the index to disk
  return index

In [None]:
binary_index_name = "binary_embeddings"
binary_index = index_binary_embeddings(outdoors_embeddings, binary_index_name)
# Quantizes the query embeddings to binary
quantized_queries = quantize_embeddings(
    query_embeddings,
    calibration_embeddings=outdoors_embeddings,
    precision="binary").astype(np.uint8) # Saves every 8 dimensions as 1 byte, encoded as unsigned Int8
# Performs benchmarks for search time, index size, and recall
evaluate_search(full_index, binary_index, binary_index_name, query_embeddings, quantized_queries)
# Performs benchmarks again allowing reranking of top results with full-precision embeddings
evaluate_rerank_search(full_index, binary_index, query_embeddings, quantized_queries)

##Product quantization

In [None]:
def index_pq_embeddings(doc_embeddings, index_name, num_subvectors=16):
  dimensions = doc_embeddings.shape[1]
  # Divides the embedding into M=16 subvectors (of 64 dimensions each)
  M = num_subvectors
  # 8 bits = 256 maximum cluster centroids per subvector
  num_bits = 8
  # Creates the PQ embeddings index
  pq_index = faiss.IndexPQ(dimensions, M, num_bits)
  # Generates the cluster centroids using k-means clustering
  pq_index.train(doc_embeddings)
  pq_index.add(doc_embeddings)
  faiss.write_index(pq_index, index_name)  # Writes the index to disk
  return pq_index

In [None]:
pq_index_name = "pq_embeddings"
pq_index = index_pq_embeddings(outdoors_embeddings, pq_index_name)

# Performs benchmarks for search time, index size, and recall
evaluate_search(full_index, pq_index, pq_index_name, query_embeddings, query_embeddings)
# Performs benchmarks again allowing reranking of top results with full-precision embeddings
evaluate_rerank_search(full_index, pq_index, query_embeddings, query_embeddings)

In [None]:
pq_index_name = "pq_embeddings"
pq_index = index_pq_embeddings(outdoors_embeddings, pq_index_name, num_subvectors=32)

# Performs benchmarks for search time, index size, and recall
evaluate_search(full_index, pq_index, pq_index_name, query_embeddings, query_embeddings)
# Performs benchmarks again allowing reranking of top results with full-precision embeddings
evaluate_rerank_search(full_index, pq_index, query_embeddings, query_embeddings)

In [None]:
pq_index_name = "pq_embeddings"
pq_index = index_pq_embeddings(outdoors_embeddings, pq_index_name, num_subvectors=64)

# Performs benchmarks for search time, index size, and recall
evaluate_search(full_index, pq_index, pq_index_name, query_embeddings, query_embeddings)
# Performs benchmarks again allowing reranking of top results with full-precision embeddings
evaluate_rerank_search(full_index, pq_index, query_embeddings, query_embeddings)

##Matryoshka Representation Learning

In [None]:
def get_mrl_embeddings(embeddings, num_dimensions):
  mrl_embeddings = np.array(list(map(lambda e: e[num_dimensions], embeddings)))
  return mrl_embeddings

def index_mrl_embeddings(doc_embeddings, num_dimensions, mrl_index_name):
  mrl_doc_embeddings = get_mrl_embeddings(doc_embeddings, num_dimensions)
  print(f"{mrl_index_name} embeddings shape: {mrl_doc_embeddings.shape}")
  # An MRL index is a standard index, just with a reduced number of dimensions
  mrl_index = index_full_precision_embeddings(mrl_doc_embeddings, mrl_index_name)
  return mrl_index

In [None]:
print(f"Original embeddings shape: {outdoors_embeddings.shape}")
# 1024 dimensions
original_dimensions = outdoors_embeddings.shape[1]

for num_dimensions in [
    original_dimensions // 2,   # 512 dimensions
    original_dimensions // 4,   # 256 dimensions
    original_dimensions // 8]:  # 128 dimensions
  mrl_index_name = f"mrl_embeddings_{num_dimensions}"
  mrl_index = index_mrl_embeddings(outdoors_embeddings, num_dimensions, mrl_index_name)
  mrl_queries = get_mrl_embeddings(query_embeddings, num_dimensions)
  # Benchmark MRL search
  evaluate_search(full_index, mrl_index, mrl_index_name, query_embeddings, mrl_queries)
  # Benchmark MRL search + Reranking
  evaluate_rerank_search(full_index, mrl_index, query_embeddings, mrl_queries)

##Combining techniques

In [None]:
def index_binary_ivf_mrl_embeddings(reduced_mrl_doc_embeddings, binary_index_name):
  # Binary quantization
  binary_embeddings = quantize_embeddings(
      reduced_mrl_doc_embeddings,
      calibration_embeddings=reduced_mrl_doc_embeddings,
      precision="binary"
  ).astype(np.unit8)

  # Configuration so the index knows how the doc embeddings have been quantized
  dimensions = reduced_mrl_doc_embeddings.shape[1]
  quantizer = faiss.IndexBinaryFlat(dimensions)

  # ANN-IVF Flat Algorithm: uses a binary-quantized IVF index for ANN search
  num_clusters = 256
  index = faiss.IndexBinaryIVF(quantizer, dimensions, num_clusters)
  index.nprobe = 4

  # Trains, adds documents, and saves the combined index to disk
  index.train(binary_embeddings)
  index.add(binary_embeddings)
  faiss.write_index_binary(index, binary_index_name)
  return index

In [None]:
# MRL: gets reduced-dimension doc embeddings
mrl_dimensions = outdoors_embeddings.shape[1] // 2
reduced_mrl_doc_embeddings = get_mrl_embeddings(outdoors_embeddings, mrl_dimensions)

In [None]:
binary_ivf_mrl_index_name = "binary_ivf_mrl_embeddings"
binary_ivf_mrl_index = index_binary_ivf_mrl_embeddings(
    reduced_mrl_doc_embeddings,
    mrl_dimensions,
    binary_ivf_mrl_index_name
)

In [None]:
# MRL: gets reduced-dimension query embeddings
mrl_queries = get_mrl_embeddings(query_embeddings, mrl_dimensions)
# Binary quantization: applies quantization to the query embeddings
quantized_queries = quantize_embeddings(
      mrl_queries,
      calibration_embeddings=reduced_mrl_doc_embeddings,
      precision="binary"
  ).astype(np.unit8)

In [None]:
# Benchmarks the binary ANN, binary quantization, and MRL embeddings
evaluate_search(full_index, binary_ivf_mrl_index, binary_ivf_mrl_index_name, query_embeddings, quantized_queries)
# Benchmarks again with reranking using full-precision embeddings
evaluate_rerank_search(full_index, binary_ivf_mrl_index, query_embeddings, quantized_queries)

## Cross-encoders v/s bi-encoders

In [47]:
cache_name = "outdoors_semantic_search_embeddings"

def get_embeddings(texts, model, cache_name, ignore_cache=False):
  cache_file_name = f"data/outdoors/{cache_name}.pickle"
  if ignore_cache or not os.path.isfile(cache_file_name):
    embeddings = model.encode(texts)
    os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
    with open(cache_file_name, "wb") as cache_file:
      pickle.dump(embeddings, cache_file)
  else:
    with open(cache_file_name, "rb") as cache_file:
      embeddings = pickle.load(cache_file)
  return embeddings

def normalize_embedding(embedding):
  normalized = np.divide(embedding, np.linalg.norm(embedding))
  return list(map(float, normalized))

def print_labels(query, matches):
  display(HTML(f"<h4>Results for: <em>{query}</em></h4>"))
  for (l, d) in matches:
    print(str(int(d * 1000) / 1000), "|", l)

def display_results(query, search_results):
    print_labels(query, [(d["title"], d["score"]) for d in search_results])

def get_outdoor_titles():
    outdoors_dataframe = pd.read_csv("data/outdoors/posts.csv")
    outdoors_dataframe = outdoors_dataframe[["id", "title"]]
    outdoors_dataframe = outdoors_dataframe.dropna()
    # print(f"Calculating embeddings for {outdoors_dataframe.count()} docs.")
    return outdoors_dataframe

def bi_encoder_embedding_search(model, index, query, phrases, k=20, min_similarity=0.75):
  matches = []
  # Gets the embeddings for query
  query_embedding = model.encode(query, convert_to_tensor=True)
  query_embedding = normalize_embedding(query_embedding)
  ids, distances = index.knnQuery(query_embedding, k=k)
  for i in range(len(ids)):
    # Converts negative dot product distance into a positive dot product
    distance = distances[i] * -1
    if distance > min_similarity:
      matches.append((phrases[ids[i]], distance))
  if not len(matches):
    # No neighbors found! Returns just the original term
    matches.append((phrases[ids[1]], distances[1] * -1))
  return matches

def cross_encoder_semantic_search(model, titles, query, limit=10):
    ranks = model.rank(query, titles)
    # Print the scores
    print("Results for:", query)
    for rank in ranks:
      if rank['score'] > .25:
        print(f"{rank['score']:.2f}\t{titles[rank['corpus_id']]}")

In [None]:
bi_encoder_model = SentenceTransformer("roberta-base-nli-stsb-mean-tokens")
cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [None]:
cache_name = "outdoors_semantic_search_embeddings"
outdoor_titles = list(get_outdoor_titles()["title"])
embeddings = get_embeddings(outdoor_titles, bi_encoder_model, cache_name, ignore_cache=True)

###Bi-Encoder

In [None]:
# let's do bi-encoder
# initialize a new index, using a HNSW index on Dot Product
titles_index = nmslib.init(method='hnsw', space='negdotprod')
normalized_embeddings = list(map(normalize_embedding, embeddings))

# All the embeddings can be added in a single batch
titles_index.addDataPointBatch(normalized_embeddings)
# Commits the index to memory. This must be done before you can query for nearest neighbors
titles_index.createIndex(print_progress=True)

In [None]:
def semantic_search(query, phrases, log=False):
  matches = bi_encoder_embedding_search(titles_index, query, phrases, k=5, min_similarity=0.6)
  if log:
    print_labels(query, matches)

In [None]:
semantic_search("mountain hike", outdoor_titles, log=True)

### Cross-Encoder

In [44]:
query = "mountain hike"

# Generates a pair of query + document title to score for each document
search_results = cross_encoder_semantic_search(cross_encoder_model, outdoor_titles, query)

Results for: mountain hike
1.53	An x-hour hike. Is that to the peak/summit or back and forth?
1.32	Timberland classic 6 inch boots for mountain hiking/trekking
0.43	What are the simplest 5000 m mountains to hike?
0.32	What constitutes mountain exposure when hiking or scrambling?
0.31	Rifugio (Mountain Hut) trek in Austria
