In [1]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5

In [2]:
# Sample passage used as the search corpus: three sentences, one per line,
# each line terminated by a newline.
example_text = (
    "In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of lexical tokens (strings with an assigned and thus identified meaning). \n"
    "A program that performs lexical analysis may be termed a lexer, tokenizer, or scanner, although scanner is also a term for the first stage of a lexer. \n"
    "A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\n"
)

In [3]:
# Break the passage into a list with one entry per line (each line of the
# passage holds one sentence); line terminators are dropped.
sentences = example_text.splitlines(keepends=False)
sentences

['In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of lexical tokens (strings with an assigned and thus identified meaning). ',
 'A program that performs lexical analysis may be termed a lexer, tokenizer, or scanner, although scanner is also a term for the first stage of a lexer. ',
 'A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.']

In [4]:
# Load the pretrained sentence-embedding model used throughout this notebook.
# The recorded output below shows the model files being downloaded when this cell ran.
from sentence_transformers import SentenceTransformer

MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:
# Encode every sentence into an embedding vector. This array is the "search index"
# that the query embedding is compared against later in the notebook.
# Per the recorded output, the result is a float32 NumPy array with one row per sentence.
search_index = model.encode(sentences)
search_index

array([[ 0.04982998, -0.01590517,  0.00104865, ...,  0.03247731,
         0.05229097, -0.06946301],
       [ 0.01099093, -0.00992691, -0.06491146, ...,  0.03713554,
         0.12090363, -0.02447495],
       [-0.05205596, -0.04969758, -0.00858654, ...,  0.0932892 ,
         0.10074473, -0.00279654]], dtype=float32)

In [6]:
# search_index is 2-D: one row per sentence, one column per embedding dimension.
num_sentences, embedding_dim = search_index.shape
# Number of sentence vectors (one per sentence).
print(num_sentences)
# Length of each sentence vector.
print(embedding_dim)

3
384


In [8]:
# Define the query and encode it with the same model as the sentences.
# The query is passed as a one-element list, so row 0 of the result is its embedding.
query = "Transformers use tokenization"
search_query = model.encode([query])
# Preview the first 50 components of the query embedding.
search_query[0][:50]

array([-0.10889484,  0.04733806, -0.00586653, -0.0086275 , -0.03373502,
        0.00658819,  0.04443799,  0.06396496,  0.02747097, -0.00242299,
        0.00679788,  0.00863447,  0.0675915 ,  0.0686243 ,  0.1120787 ,
        0.03739886, -0.01468724,  0.13086697, -0.14074002, -0.06395736,
        0.08881904,  0.03747569, -0.07724141,  0.0035688 ,  0.0546027 ,
        0.01195859,  0.00973718, -0.06396056,  0.05817872, -0.00197977,
       -0.03806328,  0.02242777, -0.1320797 ,  0.04958661, -0.037674  ,
        0.11181648,  0.03043763, -0.02283042,  0.08054754, -0.08275352,
        0.02735648, -0.04127552, -0.0260495 , -0.00325895, -0.01204655,
        0.03959356,  0.06496468, -0.02567808, -0.0623095 , -0.05053309],
      dtype=float32)

In [9]:
# The query embedding has the same number of dimensions as each sentence embedding.
search_query.shape[1]

384

In [10]:
# Import the util module from the sentence_transformers package;
# its cos_sim helper is used below to compute the similarity scores.
from sentence_transformers import util

In [11]:
# Score each sentence embedding against the query embedding with cosine similarity
# and report the query, the sentence, and its score, followed by a blank line.
for i, index_embedding in enumerate(search_index):
  cosine_similarity_score = util.cos_sim(index_embedding, search_query)
  print(
      f"Query: {query}",
      f"Sentence {i+1}: {sentences[i]}",
      f"Similarity score: {cosine_similarity_score}",
      sep="\n",
      end="\n\n",
  )

Query: Transformers use tokenization
Sentence 1: In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of lexical tokens (strings with an assigned and thus identified meaning). 
Similarity score: tensor([[0.4248]])

Query: Transformers use tokenization
Sentence 2: A program that performs lexical analysis may be termed a lexer, tokenizer, or scanner, although scanner is also a term for the first stage of a lexer. 
Similarity score: tensor([[0.2817]])

Query: Transformers use tokenization
Sentence 3: A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.
Similarity score: tensor([[0.1905]])

