<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/ai-powered-search/13_2_semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In this notebook, we"re going to install a transformer model, analyze the embedding output, and compare some vectors

In [1]:
#outdoors
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

Cloning into 'outdoors'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 25 (delta 0), reused 22 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 491.39 MiB | 18.04 MiB/s, done.
Updating files: 100% (23/23), done.
Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-sq

In [2]:
%%capture

!pip install nmslib

In [3]:
import sys
import os
sys.path.append("../..")
# from aips import *
import pandas as pd
import numpy as np
import pickle
import json
import tqdm

import nmslib
import sentence_transformers
from IPython.display import display, HTML

In [None]:
from sentence_transformers import SentenceTransformer
transformer = SentenceTransformer("roberta-base-nli-stsb-mean-tokens")

## Get embeddings

In [54]:
def get_embeddings(texts, model, cache_name, ignore_cache=False):
  cache_file_name = f"data/outdoors/{cache_name}.pickle"
  if ignore_cache or not os.path.isfile(cache_file_name):
    embeddings = model.encode(texts)
    os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
    with open(cache_file_name, "wb") as cache_file:
      pickle.dump(embeddings, cache_file)
  else:
    with open(cache_file_name, "rb") as cache_file:
      embeddings = pickle.load(cache_file)
  return embeddings

In [6]:
def normalize_embedding(embedding):
  normalized = np.divide(embedding, np.linalg.norm(embedding))
  return list(map(float, normalized))

In [7]:
def rank_similarities(phrases, similarities):
  a_phrases = []
  b_phrases = []
  scores = []
  for a in range(len(similarities) - 1):
    for b in range(a + 1, len(similarities)):
      a_phrases.append(phrases[a])
      b_phrases.append(phrases[b])
      scores.append(float(similarities[a][b]))
  dataframe = pd.DataFrame({
      "score": scores,
      "phrase a": a_phrases,
      "phrase b": b_phrases
  })
  dataframe["idx"] = range(len(dataframe))
  dataframe = dataframe.reindex(columns=["idx", "score", "phrase a", "phrase b"])
  return dataframe.sort_values(by=["score"], ascending=False, ignore_index=True)

In [8]:
outdoors_dataframe = pd.read_csv("data/outdoors/posts.csv")
# filter NaN title column
titles = outdoors_dataframe[outdoors_dataframe['title'].notna()]["title"]
# titles = list(filter(None, titles))
titles.head(10)

Unnamed: 0,title
0,How do I treat hot spots and blisters when I h...
1,Where in the Alps is it safe to drink the wate...
2,Is it legal to camp on private property in Rus...
3,What are the critical dimensions to a safe bea...
4,Can I sail a raft on a European river with com...
6,What is the safest way to purify water?
8,How can you navigate without a compass or GPS
9,What is the fastest method to 'break in' full ...
10,How do I know what size ice axe I should get?
12,What can I do to prevent altitude sickness?


In [55]:
# Encoding the titles into embeddings
outdoors_dataframe = pd.read_csv("data/outdoors/posts.csv")
titles = outdoors_dataframe[outdoors_dataframe['title'].notna()]["title"]
titles = list(filter(None, titles))

cache_name = "outdoors_semantic_search_embeddings"
embeddings = get_embeddings(titles, transformer, cache_name, ignore_cache=True)

print(f"Number of titles: {len(titles)}")
print(f"Number of embeddings: {len(embeddings)}")
print(f"Dimensions per embedding: {len(embeddings[0])}")

Number of titles: 5331
Number of embeddings: 5331
Dimensions per embedding: 768


In [56]:
# Explore the top similarities for the titles
normalized_embeddings = list(map(normalize_embedding, embeddings))
# Find the pairs with the highest dot product scores
similarities = sentence_transformers.util.dot_score(
    normalized_embeddings[0:100],
    normalized_embeddings[0:100]
)
comparisons = rank_similarities(titles, similarities)
display(HTML(comparisons[:10].to_html(index=False)))

idx,score,phrase a,phrase b
4515,0.846395,How do I recognize if someone is suffering from hypothermia?,How should I treat hypothermia?
1237,0.811995,How should I treat poison ivy?,What can I do to prevent getting poison ivy?
4872,0.800817,What is the difference between the different types of snowboards? (all-mountain/freestyle/freeride/etc),What is the difference between camber and rocker shaped snowboards?
4204,0.794242,How do I tie a sleeping bag to my backpack?,What is the best way to store my sleeping bag for long periods of time?
3568,0.790016,What should I look for if I want to buy a winter-proofed tent?,What is the best way to store my tent?
4864,0.753913,How do I set a top rope anchor?,How do I inspect a climbing rope?
496,0.745218,What is the safest way to purify water?,What are the different methods to purify water?
2974,0.710362,"What do I need to look for in good, quality hiking boots?",What is the difference between men's and women's hiking boots?
3292,0.704151,"What to look for in a durable, 3-season sleeping bag?",What is the best way to store my sleeping bag for long periods of time?
3760,0.698881,How should I check that the anchor is secure when I anchor a small yacht off unfamiliar land?,How do I set a top rope anchor?


In [11]:
# Fix rendering of this image
from plotnine import *
{
    ggplot(comparisons, aes("idx", "score")) +
    geom_point(alpha=.05)
}

{<plotnine.ggplot.ggplot at 0x7812a34b8640>}

In [12]:
from plotnine import *
{
    ggplot(comparisons, aes("idx", "score")) +
    geom_violin(color="blue") +
    scale_y_continuous(limits=[-0.4, 1.0], breaks=[-0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0])
}

{<plotnine.ggplot.ggplot at 0x7812a363ad40>}

##Searching ANN Index

In [57]:
# initialize a new index, using a HNSW index on Dot Product
titles_index = nmslib.init(method='hnsw', space='negdotprod')
normalized_embeddings = list(map(normalize_embedding, embeddings))

# All the embeddings can be added in a single batch
titles_index.addDataPointBatch(normalized_embeddings)
# Commits the index to memory. This must be done before you can query for nearest neighbors
titles_index.createIndex(print_progress=True)

In [61]:
# let's do encoding a query and returning the k-nearest-neighbor concepts
def print_labels(query, matches):
  display(HTML(f"<h4>Results for: <em>{query}</em></h4>"))
  for (l, d) in matches:
    print(str(int(d * 1000) / 1000), "|", l)

def embedding_search(index, query, phrases, k=20, min_similarity=0.75):
  matches = []
  # Gets the embeddings for query
  query_embedding = transformer.encode(query, convert_to_tensor=True)
  query_embedding = normalize_embedding(query_embedding)
  ids, distances = index.knnQuery(query_embedding, k=k)
  for i in range(len(ids)):
    # Converts negative dot product distance into a positive dot product
    distance = distances[i] * -1
    if distance > min_similarity:
      matches.append((phrases[ids[i]], distance))
  if not len(matches):
    # No neighbors found! Returns just the original term
    matches.append((phrases[ids[1]], distances[1] * -1))
  return matches

In [59]:
def semantic_search(query, phrases, log=False):
  matches = embedding_search(titles_index, query, phrases, k=5, min_similarity=0.6)
  if log:
    print_labels(query, matches)

In [62]:
semantic_search("mountain hike", titles, log=True)

0.723 | How is elevation gain and change measured for hiking trails?
0.715 | How do I Plan a Hiking Trip to Rocky Mountain National Park, CO
0.698 | Hints for hiking the west highland way
0.694 | New Hampshire A.T. Section Hike in May? Logistics and Trail Conditions
0.678 | Long distance hiking trail markings in North America or parts thereof


In [63]:
semantic_search("dehyd", titles, log=True)

0.633 | The re-hydration time for deydrated foods


In [64]:
semantic_search("polar bear", titles, log=True)

0.611 | Bear spray vs. rifles against polar bears?


In [65]:
semantic_search("bear", titles, log=True)

0.63 | Running in bear country
