<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/ai-powered-search/13_natural_language_autocomplete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In this notebook, we"re going to install a transformer model, analyze the embedding output, and compare some vectors

In [1]:
#outdoors
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

Cloning into 'outdoors'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 25 (delta 0), reused 22 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 491.39 MiB | 25.64 MiB/s, done.
Updating files: 100% (23/23), done.
Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-sq

In [9]:
import sys
import os
sys.path.append("../..")
# from aips import *
import pandas
import numpy
import pickle
import json
import tqdm

import sentence_transformers
from IPython.display import display, HTML

In [None]:
from sentence_transformers import SentenceTransformer
transformer = SentenceTransformer("roberta-base-nli-stsb-mean-tokens")

## Introduction to Transformers

In [4]:
phrases = [
    "it's raining hard",
    "it is wet outside",
    "cars drive fast",
    "motorcycles are loud"
]

embeddings = transformer.encode(phrases, convert_to_tensor=True)
print("Number of embeddings:", len(embeddings))
print("Dimensions per embedding:", len(embeddings[0]))
print("The embedding feature values of \"it's raining hard\":")
print(embeddings[0][10])

Number of embeddings: 4
Dimensions per embedding: 768
The embedding feature values of "it's raining hard":
tensor(0.5095)


In [5]:
def normalize_embedding(embedding):
    normalized = numpy.divide(embedding, numpy.linalg.norm(embedding))
    return list(map(float, normalized))

# Unit-normalizes embeddings for speed
normalized_embeddings = list(map(normalize_embedding, embeddings))
similarities = sentence_transformers.util.dot_score(normalized_embeddings,
                                                    normalized_embeddings)
print("The shape of the resulting similarities:", similarities.shape)

The shape of the resulting similarities: torch.Size([4, 4])


In [6]:
def rank_similarities(phrases, similarities):
    a_phrases = []
    b_phrases = []
    scores = []
    for a in range(len(similarities) - 1):
        for b in range(a + 1, len(similarities)):
            a_phrases.append(phrases[a])
            b_phrases.append(phrases[b])
            scores.append(float(similarities[a][b]))
    dataframe = pandas.DataFrame({"score": scores,
                                  "phrase a": a_phrases, "phrase b": b_phrases})
    dataframe["idx"] = dataframe.index
    dataframe = dataframe.reindex(columns=["idx", "score", "phrase a", "phrase b"])
    return dataframe.sort_values(by=["score"], ascending=False, ignore_index=True)

dataframe = rank_similarities(phrases, similarities)
display(HTML(dataframe.to_html(index=False)))

idx,score,phrase a,phrase b
0,0.66906,it's raining hard,it is wet outside
5,0.590783,cars drive fast,motorcycles are loud
1,0.281166,it's raining hard,cars drive fast
2,0.2808,it's raining hard,motorcycles are loud
4,0.204867,it is wet outside,motorcycles are loud
3,0.138172,it is wet outside,cars drive fast


## Get embeddings

In [7]:
transformer = SentenceTransformer("roberta-base-nli-stsb-mean-tokens")

In [11]:
def get_embeddings(text, model, cache_name, ignore_cache=False):
  cache_file_name = f"data/outdoors/{cache_name}.pickle"
  if ignore_cache or not os.path.isfile(cache_file_name):
    return numpy.load(cache_file_name)
    embeddings = model.encode(texts)
    os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
    with open(cache_file_name, "wb") as cache_file:
      pickle.dump(embeddings, cache_file)
  else:
    with open(cache_file_name, "rb") as cache_file:
      embeddings = pickle.load(cache_file)
  return embeddings

In [12]:
with open("data/outdoors/outdoors_concepts.pickle", "rb") as concepts:
  concepts = pickle.load(concepts)

In [13]:
#Note!  This is a hyperparameter.
#We are ignoring terms that occur less than this numner in the entire corpus.
#Lowering this number may lower precision
#Raising this number may lower recall
minimum_frequency = 6
phrases = [key for (key, tf) in concepts.items() if tf >= minimum_frequency]

cache_name = "outdoors_embeddings"
# set ignore_cache=True to regenerate the embeddings rather than loading from the cache
embeddings = get_embeddings(phrases, transformer, cache_name, ignore_cache=False)

print(f"Number of embeddings: {len(embeddings)}")
print(f"Dimensions per embedding: {len(embeddings[0])}")

Number of embeddings: 12375
Dimensions per embedding: 768


## Calculate similarity score

In [14]:
def normalize_embedding(embedding):
  normalized = numpy.divide(embedding, numpy.linalg.norm(embedding))
  return list(map(float, normalized))

In [17]:
def rank_similarities(phrases, similarities):
  a_phrases = []
  b_phrases = []
  scores = []
  for a in range(len(similarities) - 1):
    for b in range(a + 1, len(similarities)):
      a_phrases.append(phrases[a])
      b_phrases.append(phrases[b])
      scores.append(float(similarities[a][b]))
  dataframe = pandas.DataFrame({
      "score": scores,
      "phrase a": a_phrases,
      "phrase b": b_phrases
  })
  dataframe["idx"] = range(len(dataframe))
  dataframe = dataframe.reindex(columns=["idx", "score", "phrase a", "phrase b"])
  return dataframe.sort_values(by=["score"], ascending=False, ignore_index=True)

In [18]:
# Find the pairs with the highest cosine similarity scores
normalized_embeddings = list(map(normalize_embedding, embeddings))
similarities = sentence_transformers.util.dot_score(
    normalized_embeddings[0:250],
    normalized_embeddings[0:250]
)
# Ranks similarities
comparisons = rank_similarities(phrases, similarities)
display(HTML(comparisons[:10].to_html(index=False)))

idx,score,phrase a,phrase b
31096,0.92815,protect,protection
13241,0.92357,climbing,climber
18096,0.878894,camp,camping
7282,0.833662,climb,climbing
10312,0.821081,something,someone
8813,0.815187,hike,hiking
4182,0.784663,people,person
7354,0.782962,climb,climber
1027,0.770643,go,leave
4422,0.768612,keep,stay


In [24]:
from plotnine import *
from plotnine.data import mpg
import matplotlib.pyplot as plt
candidate_synonyms = comparisons[comparisons["score"] > 0.0]
{
    ggplot(comparisons, aes("idx", "score")) +
    geom_violin(color="blue") +
    scale_y_continuous(limits=[-0.4, 1.0],
                       breaks=[-0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0])
}

{<plotnine.ggplot.ggplot at 0x7df99a8fae30>}

##Search embeddings