<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/ai-powered-search/13_natural_language_autocomplete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction to Transformers

In this notebook, we're going to install a transformer model, analyze the embedding output, and compare some vectors.

In [None]:
#outdoors
# Fetch the "outdoors" repository and unpack its split archive.
# Shallow-clone (depth 1) only when no local 'outdoors' directory exists yet.
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
# Bring the local checkout up to date.
! cd outdoors && git pull
# Concatenate the archive parts (outdoors.tgz.part*) into a single outdoors.tgz.
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
# Create ../data/outdoors/ (relative to the checkout) and extract the archive into it.
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

In [None]:
import sys
sys.path.append("../..")
# from aips import *
import pandas
import numpy
import sentence_transformers
from IPython.display import display, HTML

## Listing 13.5


In [None]:
# SentenceTransformer is the model class used below to encode phrases into embeddings.
from sentence_transformers import SentenceTransformer
# Build the model from the named pretrained checkpoint.
# NOTE(review): presumably the weights are fetched on first use — confirm network/cache requirements.
transformer = SentenceTransformer("roberta-base-nli-stsb-mean-tokens")

## Listing 13.6

In [None]:
# Four short phrases to embed and compare in the cells below.
phrases = [
    "it's raining hard",
    "it is wet outside",
    "cars drive fast",
    "motorcycles are loud"
]

# Encode all phrases in one call; convert_to_tensor=True asks for a tensor result
# rather than a list of arrays.
embeddings = transformer.encode(phrases, convert_to_tensor=True)
# One embedding per input phrase.
print("Number of embeddings:", len(embeddings))
# Length of the first embedding, i.e. the number of features per phrase.
print("Dimensions per embedding:", len(embeddings[0]))
print("The embedding feature values of \"it's raining hard\":")
# NOTE(review): the label above says "feature values" (plural), but this prints only
# the single feature at index 10 of the first embedding — confirm whether a slice
# such as [:10] was intended.
print(embeddings[0][10])

Number of embeddings: 4
Dimensions per embedding: 768
The embedding feature values of "it's raining hard":
tensor(0.5095)


## Listing 13.7


In [None]:
def normalize_embedding(embedding):
    """Scale an embedding to unit (L2) length and return it as a list of floats.

    Args:
        embedding: A one-dimensional sequence of numbers. This may be a plain
            list, a numpy array, or a tensor-like object exposing ``detach()``
            and ``cpu()`` (for example a torch tensor, on any device).

    Returns:
        list[float]: The components of ``embedding`` divided by its L2 norm.
        A vector whose norm is zero is returned unchanged (as floats) instead
        of producing NaN values from a division by zero.
    """
    # A tensor that lives on an accelerator cannot be read by numpy directly,
    # so detach it from any autograd graph and copy it to host memory first.
    if hasattr(embedding, "detach") and hasattr(embedding, "cpu"):
        embedding = embedding.detach().cpu()
    vector = numpy.asarray(embedding)
    norm = numpy.linalg.norm(vector)
    # Dividing by a zero norm would yield NaNs that propagate into every
    # similarity score computed from this vector.
    if norm == 0:
        return list(map(float, vector))
    return list(map(float, numpy.divide(vector, norm)))

# Scale every embedding to unit length up front, so the pairwise comparison
# below only needs a plain dot product.
normalized_embeddings = [normalize_embedding(embedding)
                         for embedding in embeddings]
similarities = sentence_transformers.util.dot_score(
    normalized_embeddings,
    normalized_embeddings,
)
print("The shape of the resulting similarities:", similarities.shape)

The shape of the resulting similarities: torch.Size([4, 4])


## Listing 13.8

In [None]:
def rank_similarities(phrases, similarities):
    """Tabulate every unordered phrase pair with its score, best match first.

    Args:
        phrases: Sequence of phrases, indexed in the same order as the rows
            and columns of ``similarities``.
        similarities: Square, index-addressable matrix in which
            ``similarities[a][b]`` is the score between phrase ``a`` and
            phrase ``b``.

    Returns:
        pandas.DataFrame: Columns ``idx``, ``score``, ``phrase a`` and
        ``phrase b``, one row per pair with ``a < b``. ``idx`` is the pair's
        position in enumeration order before sorting; rows are sorted by
        ``score`` in descending order and the index is renumbered from 0.
    """
    count = len(similarities)
    # Only the upper triangle is visited, so each pair appears once and no
    # phrase is compared with itself.
    pairs = [(first, second)
             for first in range(count - 1)
             for second in range(first + 1, count)]
    table = pandas.DataFrame({
        "score": [float(similarities[first][second]) for first, second in pairs],
        "phrase a": [phrases[first] for first, _ in pairs],
        "phrase b": [phrases[second] for _, second in pairs],
    })
    # Keep the pre-sort position so a row can still be traced back to its pair.
    table = table.assign(idx=table.index)[["idx", "score", "phrase a", "phrase b"]]
    return table.sort_values(by=["score"], ascending=False, ignore_index=True)

# Rank all phrase pairs by similarity, then render the table as HTML,
# omitting the DataFrame's own index column (index=False).
dataframe = rank_similarities(phrases, similarities)
display(HTML(dataframe.to_html(index=False)))

idx,score,phrase a,phrase b
0,0.66906,it's raining hard,it is wet outside
5,0.590783,cars drive fast,motorcycles are loud
1,0.281166,it's raining hard,cars drive fast
2,0.2808,it's raining hard,motorcycles are loud
4,0.204867,it is wet outside,motorcycles are loud
3,0.138172,it is wet outside,cars drive fast
