<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/ai-powered-search/13_3_semantic_search_with__quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In this notebook, we're going to install a transformer model, analyze the embedding output, and compare some vectors.

In [1]:
# Fetch the "outdoors" dataset used by the rest of this notebook.
# 1. Shallow-clone the dataset repository, but only if no 'outdoors' directory exists yet.
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
# 2. Pull any newer commits into the checkout.
! cd outdoors && git pull
# 3. Concatenate the split archive parts (outdoors.tgz.part*) into a single outdoors.tgz.
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
# 4. Create data/outdoors/ beside the checkout (if missing) and extract the archive into it.
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

Cloning into 'outdoors'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 25 (delta 0), reused 22 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 491.39 MiB | 24.78 MiB/s, done.
Updating files: 100% (23/23), done.
Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-sq

In [None]:
%%capture
# %%capture suppresses this cell's output so pip's install log is not shown.

# Install the CPU build of FAISS; the notebook imports it as `faiss`.
!pip install faiss-cpu --no-cache
# GPU build alternative, left disabled:
# !pip install faiss-gpu

In [8]:
import sys
import os
import time
sys.path.append("../..")
# from aips import *
import pandas as pd
import numpy as np
import pickle
import json
import tqdm

import faiss
import sentence_transformers
from sentence_transformers import SentenceTransformer, SimilarityFunction
from sentence_transformers.quantization import quantize_embeddings

from IPython.display import display, HTML

In [None]:
# Load the "mixedbread-ai/mxbai-embed-large-v1" sentence-embedding model.
# The model's similarity function is set to dot product, and truncate_dim=1024
# asks sentence-transformers to truncate output embeddings to 1024 dimensions.
model = SentenceTransformer(
    "mixedbread-ai/mxbai-embed-large-v1",
    similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
    truncate_dim=1024
)

## Get embeddings

In [9]:
def get_embeddings(texts, model, cache_name, ignore_cache=False):
  """Return embeddings for ``texts``, using an on-disk pickle cache.

  The cache lives at ``data/outdoors/<cache_name>.pickle`` relative to the
  current working directory. It is keyed by ``cache_name`` only: if the file
  exists and ``ignore_cache`` is false, its contents are returned as-is and
  neither ``texts`` nor ``model`` is consulted.

  Args:
    texts: Inputs handed to ``model.encode``.
    model: Object exposing ``encode(texts, normalize_embeddings=True)``.
    cache_name: Base name (without extension) of the cache file.
    ignore_cache: When true, always re-encode and overwrite the cache file.

  Returns:
    Whatever ``model.encode`` produced, either freshly computed or unpickled.
  """
  cache_file_name = f"data/outdoors/{cache_name}.pickle"

  # Fast path: reuse a previously stored result.
  # NOTE: pickle.load executes arbitrary code from the file; only use cache
  # files that were produced locally.
  if not ignore_cache and os.path.isfile(cache_file_name):
    with open(cache_file_name, "rb") as cached:
      return pickle.load(cached)

  # Slow path: encode with normalization, then persist for the next run.
  fresh_embeddings = model.encode(texts, normalize_embeddings=True)
  cache_dir = os.path.dirname(cache_file_name)
  os.makedirs(cache_dir, exist_ok=True)
  with open(cache_file_name, "wb") as sink:
    pickle.dump(fresh_embeddings, sink)
  return fresh_embeddings

In [10]:
def display_results(scores, ids, data):
    """Build search results from FAISS output, display them, and return them.

    Args:
        scores: Per-query score rows as returned by a FAISS search.
        ids: Per-query document-id rows as returned by a FAISS search.
        data: Accepted for interface compatibility but not used:
            ``generate_search_results`` takes only scores and ids and loads
            the post data itself.

    Returns:
        The list of per-query result lists produced by
        ``generate_search_results``.
    """
    # generate_search_results accepts exactly (scores, ids); forwarding `data`
    # as a third positional argument raised a TypeError.
    results = generate_search_results(scores, ids)
    display(results)
    return results

def get_outdoors_data():
    """Load the outdoors posts as a list of per-row dictionaries.

    Reads ``data/outdoors/posts.csv`` (relative to the current working
    directory).

    Returns:
        A list with one ``{column: value}`` dict per CSV row, in file order,
        so callers can look up a post by its row position.
    """
    outdoors_dataframe = pd.read_csv("data/outdoors/posts.csv")
    # orient="records" yields one dict per row. The default orient returns a
    # {column: {index: value}} mapping, and list() over that only produces the
    # column names, which cannot be indexed as posts.
    return outdoors_dataframe.to_dict(orient="records")

def display_statistics(search_results, baseline_search_results=None, start_message="Recall"):
    """Print latency, index size, and recall for one search run.

    Args:
        search_results: Dict with ``index_name``, ``time_taken`` (printed as
            milliseconds), ``size`` (printed as bytes / 1,000,000 MB) and,
            when a baseline is given, ``results``.
        baseline_search_results: Optional dict with ``time_taken``, ``size``
            and ``results``. When truthy, percentage improvements relative to
            it are appended and recall is computed against its results;
            otherwise recall is reported as 1.0.
        start_message: Label printed in front of the recall value.
    """
    index_name = search_results["index_name"]
    time_taken = search_results["time_taken"]
    index_size = search_results["size"]

    # Defaults used when there is nothing to compare against.
    time_note = ""
    size_note = ""
    recall = 1.0

    if baseline_search_results:
        baseline_time = baseline_search_results["time_taken"]
        time_gain = round((baseline_time - time_taken) * 100 / baseline_time, 2)
        time_note = f" ({time_gain}% improvement)"

        baseline_size = baseline_search_results["size"]
        size_gain = round((baseline_size - index_size) * 100 / baseline_size, 2)
        size_note = f" ({size_gain}% improvement)"

        recall = calculate_recall(baseline_search_results["results"],
                                  search_results["results"])

    size_in_mb = round(index_size / 1000000, 2)
    report_lines = (
        f"{index_name} search took: {time_taken:.3f} ms{time_note}",
        f"{index_name} index size: {size_in_mb} MB{size_note}",
        f"{start_message}: {round(recall, 4)}",
    )
    for line in report_lines:
        print(line)

def calculate_recall(scored_full_results, scored_quantized_results):
    """Average, over queries, the overlap between baseline and quantized hits.

    For each query position the score is
    ``|ids(full) & ids(quantized)| / |ids(quantized)|``; note that the
    denominator is the number of distinct ids in the *quantized* result list.

    Args:
        scored_full_results: Per-query lists of result dicts with an ``"id"``.
        scored_quantized_results: Same structure; must have an entry for every
            query position present in ``scored_full_results``.

    Returns:
        The arithmetic mean of the per-query scores.
    """
    per_query_scores = []
    for query_pos, full_hits in enumerate(scored_full_results):
        baseline_ids = {hit["id"] for hit in full_hits}
        candidate_ids = {hit["id"] for hit in scored_quantized_results[query_pos]}
        shared = baseline_ids & candidate_ids
        per_query_scores.append(len(shared) / len(candidate_ids))
    return sum(per_query_scores) / len(per_query_scores)

def generate_search_results(faiss_scores, faiss_ids):
    """Turn raw FAISS search output into per-query lists of result dicts.

    Args:
        faiss_scores: Per-query rows of scores.
        faiss_ids: Per-query rows of document ids, aligned with
            ``faiss_scores``; each id is used as a position into the data
            returned by ``get_outdoors_data()``.

    Returns:
        A list with one entry per query; each entry is a list of dicts with
        the keys ``score``, ``title``, ``body`` and ``id`` (as ``int``).
    """
    posts = get_outdoors_data()
    all_query_hits = []
    for query_pos in range(len(faiss_scores)):
        query_scores = faiss_scores[query_pos]
        query_hits = []
        for rank, raw_doc_id in enumerate(faiss_ids[query_pos]):
            doc_id = int(raw_doc_id)  # ids may arrive as numpy integers
            query_hits.append({
                "score": query_scores[rank],
                "title": posts[doc_id]["title"],
                "body": posts[doc_id]["body"],
                "id": doc_id,
            })
        all_query_hits.append(query_hits)
    return all_query_hits

def time_and_execute_search(index, index_name, query_embeddings, k=25, num_runs=100):
    """Run the same search repeatedly and report its results and mean latency.

    Args:
        index: Object exposing ``search(query_embeddings, k=k)`` that returns
            a ``(scores, ids)`` pair.
        index_name: Path of the index file on disk, used to report its size.
            When falsy, the ``index_name`` and ``size`` keys are omitted.
        query_embeddings: Query vectors passed straight to ``index.search``.
        k: Number of neighbours requested per query.
        num_runs: How many times the search is repeated for timing.

    Returns:
        Dict with ``results`` (from ``generate_search_results`` on the last
        run), ``time_taken`` (mean wall-clock milliseconds per run),
        ``faiss_scores`` and ``faiss_ids`` (raw output of the last run) and,
        when ``index_name`` is truthy, ``index_name`` and ``size`` (file size
        in bytes).

    Raises:
        ValueError: If ``num_runs`` is less than 1, since there would be no
            search output or timings to report.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    search_times = []
    faiss_scores = None
    faiss_ids = None

    for _ in range(num_runs):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        faiss_scores, faiss_ids = index.search(query_embeddings, k=k)
        search_times.append((time.perf_counter() - start_time) * 1000)

    results = {"results": generate_search_results(faiss_scores, faiss_ids),
               "time_taken": np.average(search_times),
               "faiss_scores": faiss_scores, "faiss_ids": faiss_ids}
    index_stats = {}
    if index_name:
        index_stats = {
            "index_name": index_name,
            "size": os.path.getsize(index_name)
        }
    return {**results, **index_stats}

## Scalar quantization

In [None]:
# Index the full-precision embeddings with FAISS
def index_full_precision_embeddings(doc_embeddings, name):
  """Build a flat inner-product FAISS index and persist it to disk.

  Args:
    doc_embeddings: 2-D array of document embeddings; its second dimension
      is used as the index dimensionality.
    name: File path the index is written to.

  Returns:
    The populated ``faiss.IndexFlatIP`` instance.
  """
  embedding_dim = doc_embeddings.shape[1]
  # IndexFlatIP is a simple, unoptimized index supporting different embedding formats
  flat_index = faiss.IndexFlatIP(embedding_dim)
  flat_index.add(doc_embeddings)       # store every document vector
  faiss.write_index(flat_index, name)  # persist the index to disk
  return flat_index

def get_outdoors_embeddings(model):
  """Embed every outdoors post (title + body) and return a NumPy array.

  Reads ``data/outdoors/posts.csv`` (relative to the current working
  directory), builds one text per row as ``"<title> <body>"`` and delegates
  to ``get_embeddings`` with the cache name ``"outdoors_mrl_normed"``.

  A missing title or body is treated as an empty string so that every row
  still yields exactly one text, keeping text ``i`` aligned with CSV row
  ``i``.

  Args:
    model: Embedding model forwarded to ``get_embeddings``.

  Returns:
    ``np.array`` of the embeddings returned by ``get_embeddings``.
  """
  def _as_text(value):
    # pandas represents empty CSV cells as NaN, which cannot be concatenated.
    return "" if pd.isna(value) else str(value)

  outdoors_dataframe = pd.read_csv("data/outdoors/posts.csv")
  # orient="records" iterates rows; iterating a bare to_dict() yields only the
  # column names, so post["title"] would index into a string and fail.
  post_texts = [
      _as_text(post["title"]) + " " + _as_text(post["body"])
      for post in outdoors_dataframe.to_dict(orient="records")
  ]
  return np.array(get_embeddings(post_texts, model, "outdoors_mrl_normed"))

# Generate (or load from the pickle cache) embeddings for the outdoors dataset.
outdoors_embeddings = get_outdoors_embeddings(model)
# Build a full-precision (Float32) FAISS index over those embeddings; the index is
# also written to a file named "full_embeddings" in the current working directory.
full_index = index_full_precision_embeddings(outdoors_embeddings, "full_embeddings")

In [None]:
# NOTE(review): `semantic_search` and `titles` are not defined in any cell visible in this
# notebook; confirm they are defined before this cell is run on a fresh kernel.
semantic_search("mountain hike", titles, log=True)

0.723 | How is elevation gain and change measured for hiking trails?
0.715 | How do I Plan a Hiking Trip to Rocky Mountain National Park, CO
0.698 | Hints for hiking the west highland way
0.694 | New Hampshire A.T. Section Hike in May? Logistics and Trail Conditions
0.678 | Long distance hiking trail markings in North America or parts thereof


In [None]:
# Query with a partial word ("dehyd"); the recorded output is a title about dehydrated foods.
semantic_search("dehyd", titles, log=True)

0.633 | The re-hydration time for deydrated foods


In [None]:
# Two-word query; the recorded output is a title that mentions polar bears explicitly.
semantic_search("polar bear", titles, log=True)

0.611 | Bear spray vs. rifles against polar bears?


In [None]:
# Broader single-word query, for comparison with the "polar bear" query above.
semantic_search("bear", titles, log=True)

0.63 | Running in bear country
