## 14.3 Semantic searching with OpenAI Embeddings

In [1]:
# Inspired by:
#  https://github.com/openai/openai-cookbook/blob/502429c7c85fe78e0bc481e02d0ca44e2b9ad2c1/examples/Obtain_dataset.ipynb
#  https://github.com/openai/openai-cookbook/blob/main/examples/Semantic_text_search_using_embeddings.ipynb

import openai
from openai.embeddings_utils import get_embedding

import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import numpy as np
from sentence_transformers import util
from transformers import pipeline

In [2]:
PERSON = 'Sinan Ozdemir'

# NOTE: scraping the results page is NOT an efficient (or robust) way to search Google.
# It is done here purely for educational purposes.
#
# - `params` lets requests URL-encode the query (PERSON contains a space).
# - `timeout` keeps the cell from hanging indefinitely on a stalled connection.
# - raise_for_status() surfaces HTTP errors instead of silently handing an
#   error page to the question-answering model below.
google_response = requests.get(
    'https://www.google.com/search',
    params={'q': PERSON},
    timeout=10,
)
google_response.raise_for_status()

# Name the parser explicitly so BeautifulSoup does not have to guess one (guessing
# emits a warning and may select a different parser on different machines).
# Only the first 1024 characters of the page's visible text are kept as context.
google_html = BeautifulSoup(google_response.text, 'html.parser').get_text()[:1024]

# Question-answering pipeline backed by the deepset/roberta-base-squad2 checkpoint.
# NOTE(review): confirm that `max_length` is actually honoured by the
# question-answering pipeline - TODO verify against the transformers docs.
nlp = pipeline('question-answering',
               model='deepset/roberta-base-squad2',
               tokenizer='deepset/roberta-base-squad2',
               max_length=10)

# Ask the model who PERSON is, using the scraped text as the context
nlp(f'Who is {PERSON}?', google_html)

{'score': 0.09814409166574478,
 'start': 545,
 'end': 591,
 'answer': 'data scientist, start-up founder, and educator'}

In [3]:
# Our good old textbook about insects (Project Gutenberg ebook #10834).
# The context manager closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until garbage collection.
with urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt') as book_response:
    text = book_response.read().decode()

# Split on blank lines (CRLF-delimited) and only keep paragraphs that are
# longer than 100 characters, same as before
documents = [paragraph for paragraph in text.split('\r\n\r\n') if len(paragraph) > 100]

print(f'There are {len(documents)} documents/paragraphs')

There are 79 documents/paragraphs


In [6]:
# Name of the engine used for every embedding request in this notebook
ENGINE = 'text-embedding-ada-002'

# Show only the engines whose id contains "embed"
list(filter(lambda engine: 'embed' in engine.id, openai.Engine.list().data))

[<Engine engine id=text-embedding-ada-002 at 0x280ac10d0> JSON: {
   "created": null,
   "id": "text-embedding-ada-002",
   "object": "engine",
   "owner": "openai-internal",
   "permissions": null,
   "ready": true
 }]

In [7]:
# Your API key must be set in your environment, as described in the README:
#  https://github.com/openai/openai-python#usage

# get_embedding is called once per document, so this can take a while
# if you have hundreds or thousands of documents
embeddings = list(map(lambda document: get_embedding(document, engine=ENGINE), documents))

In [8]:
# Convert the list of per-document embedding lists into a single numpy array
# (one row per document)
document_embeddings = np.array(embeddings)

# Display the array's dimensions as the cell output
document_embeddings.shape

(79, 1536)

In [9]:
# This next part will look pretty familiar: QUESTION is embedded, matched against
# the documents, and then answered in the cells that follow
QUESTION = 'How many horns does a flea have?'  # a natural language query

In [10]:
# Embed the question with the same OpenAI engine that embedded the documents
question_embedding = np.array(get_embedding(QUESTION, engine=ENGINE))

# Reuse the ready-made semantic search from Sentence Transformers (we could
# write our own instead). Index 0 selects the results for our query.
hits = util.semantic_search(
    query_embeddings=question_embedding,
    corpus_embeddings=document_embeddings,
    top_k=3,
)[0]

hits

[{'corpus_id': 14, 'score': 0.8606351385894533},
 {'corpus_id': 16, 'score': 0.8235643926092285},
 {'corpus_id': 18, 'score': 0.7948247130014698}]

In [11]:
print(f'Question: {QUESTION}\n')

# Print every hit with its 1-based position, its score (labelled Cos_Sim)
# and the text of the matching document
for rank, hit in enumerate(hits, start=1):
    score = hit['score']
    passage = documents[hit['corpus_id']]
    print(f'Document {rank} Cos_Sim {score:.3f}:\n\n{passage}')
    print('\n')

Question: How many horns does a flea have?

Document 1 Cos_Sim 0.861:

When examined by a microscope, the flea is a pleasant object. The body
is curiously adorned with a suit of polished armour, neatly jointed, and
beset with a great number of sharp pins almost like the quills of a
porcupine: it has a small head, large eyes, two horns, or feelers, which
proceed from the head, and four long legs from the breast; they are very
hairy and long, and have several joints, which fold as it were one
within another.


Document 2 Cos_Sim 0.824:

In examining the louse with a microscope, its external deformity strikes
us with disgust. It has six feet, two eyes, and a sort of sting,
proboscis, or sucker, with which it pierces the skin, and sucks the
blood. The skin of the louse is hard and transparent, with here and
there several bristly hairs: at the end of each leg are two claws, by
which it is enabled to lay hold of the hairs, on which it climbs. There
is scarcely any animal known to

In [12]:
# Run the question-answering pipeline against the top-ranked document only
nlp(question=QUESTION, context=documents[hits[0]['corpus_id']])

{'score': 0.8524730801582336, 'start': 259, 'end': 262, 'answer': 'two'}

# Let's use GPT-3 to answer instead
![](../data/gptqa.png)

In [20]:
# The top-ranked document from the semantic search is the context for the prompt
context = documents[hits[0]['corpus_id']]

# The prompt is assembled from adjacent string literals; the resulting text is
# "instruction, blank line, Context, Query, Answer:".
response = openai.Completion.create(
    model='text-davinci-003',
    prompt=(
        'Given this context, answer the question.\n\n'
        f'Context: {context}\n'
        f'Query: {QUESTION}\n'
        'Answer:'
    ),
    max_tokens=25,
    temperature=0.7,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

In [21]:
# Display the full object returned by the completion call
response

<OpenAIObject text_completion id=cmpl-6XBeNcL9GBIvqlExVG2dztFBMFIrZ at 0x28288a930> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": " A flea has two horns."
    }
  ],
  "created": 1673367627,
  "id": "cmpl-6XBeNcL9GBIvqlExVG2dztFBMFIrZ",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 7,
    "prompt_tokens": 143,
    "total_tokens": 150
  }
}

In [22]:
# Get the completion: the generated text of the first choice in the response
response['choices'][0]['text']

' A flea has two horns.'

In [24]:
# Some more fun: ask GPT to phrase the answer for a 2nd grader

# The context is again the top-ranked document from the semantic search
context = documents[hits[0]['corpus_id']]

# Only the instruction sentence differs from the plain question-answering prompt
response = openai.Completion.create(
    model='text-davinci-003',
    prompt=(
        'Given this context, answer the question so a second grader can understand.\n\n'
        f'Context: {context}\n'
        f'Query: {QUESTION}\n'
        'Answer:'
    ),
    max_tokens=25,
    temperature=0.7,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

In [25]:
# Get the completion (with some more flavor this time): the generated text of
# the first choice in the response
response['choices'][0]['text']

' A flea has two horns, which look like feelers coming out of its head.'