## 14.3 Semantic searching with OpenAI Embeddings

In [1]:
# Inspired by
#  https://github.com/openai/openai-cookbook/blob/502429c7c85fe78e0bc481e02d0ca44e2b9ad2c1/examples/Obtain_dataset.ipynb
#  https://github.com/openai/openai-cookbook/blob/main/examples/Semantic_text_search_using_embeddings.ipynb

import os
from urllib.request import urlopen

import numpy as np
import openai
import requests
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding
from sentence_transformers import util
from transformers import pipeline

In [2]:
PERSON = 'Sinan Ozdemir'

# Note this is NOT an efficient way to search on google. This is done simply for educational purposes.
# The query goes through `params` so that requests URL-encodes it (names contain spaces), and a
# timeout keeps a stalled connection from hanging the notebook forever.
google_response = requests.get('https://www.google.com/search', params={'q': PERSON}, timeout=30)

# Name the parser explicitly: without it BeautifulSoup picks whichever parser happens to be
# installed (and warns about it), so the extracted text could differ between machines.
# Only the first 1024 characters of the page text are kept as context for the QA model.
google_html = BeautifulSoup(google_response.text, 'html.parser').get_text()[:1024]

# Extractive question-answering pipeline: it selects an answer span from the context it is given.
# NOTE(review): `max_length` is not one of the documented question-answering pipeline options;
# `max_answer_len` may be what was intended here - confirm before relying on it.
nlp = pipeline('question-answering', 
               model='deepset/roberta-base-squad2', 
               tokenizer='deepset/roberta-base-squad2', 
               max_length=10)

# Ask who the person is, using the scraped search-result text as the context
nlp(f'Who is {PERSON}?', google_html)

{'score': 0.09953926503658295,
 'start': 358,
 'end': 395,
 'answer': 'data scientist/lecturer/mathematician'}

In [3]:
# Our good old textbook about insects
# The context manager closes the HTTP response as soon as the body has been read
with urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt') as book_response:
    text = book_response.read().decode()

# Paragraphs are separated by a blank line (CRLF line endings).
# Only keep documents longer than 100 characters, same as before
documents = [paragraph for paragraph in text.split('\r\n\r\n') if len(paragraph) > 100]

print(f'There are {len(documents)} documents/paragraphs')

There are 79 documents/paragraphs


In [4]:
# Read the key from the OPENAI_API_KEY environment variable so that it never has to be pasted into
# the notebook. If the variable is not set, the placeholder below is used instead and must be
# replaced with your own key before any API call will succeed.
openai.api_key = os.environ.get('OPENAI_API_KEY', '<<API_KEY>>')

In [5]:
# Display the engines returned by the OpenAI API (the listing can be long)
openai.Engine.list().data

[<Engine engine id=babbage at 0x2883ad670> JSON: {
   "created": null,
   "id": "babbage",
   "object": "engine",
   "owner": "openai",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=ada at 0x288113a10> JSON: {
   "created": null,
   "id": "ada",
   "object": "engine",
   "owner": "openai",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=davinci at 0x2883ad710> JSON: {
   "created": null,
   "id": "davinci",
   "object": "engine",
   "owner": "openai",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=text-embedding-ada-002 at 0x2883ad7b0> JSON: {
   "created": null,
   "id": "text-embedding-ada-002",
   "object": "engine",
   "owner": "openai-internal",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=babbage-code-search-code at 0x2883adb20> JSON: {
   "created": null,
   "id": "babbage-code-search-code",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=text-si

In [6]:
# The engine used for every embedding request in the cells that follow
ENGINE = 'text-embedding-ada-002'

# Show only the engines whose id mentions embeddings or search
[
    engine
    for engine in openai.Engine.list().data
    if any(keyword in engine.id for keyword in ('embed', 'search'))
]

[<Engine engine id=text-embedding-ada-002 at 0x2883c54e0> JSON: {
   "created": null,
   "id": "text-embedding-ada-002",
   "object": "engine",
   "owner": "openai-internal",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=babbage-code-search-code at 0x2883c55d0> JSON: {
   "created": null,
   "id": "babbage-code-search-code",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=babbage-code-search-text at 0x2883c58a0> JSON: {
   "created": null,
   "id": "babbage-code-search-text",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=curie-search-query at 0x2883c5a30> JSON: {
   "created": null,
   "id": "curie-search-query",
   "object": "engine",
   "owner": "openai-dev",
   "permissions": null,
   "ready": true
 },
 <Engine engine id=code-search-babbage-text-001 at 0x2883c5b20> JSON: {
   "created": null,
   "id": "code-search-babbage-text-001",
  

In [None]:
# Your API key must already be configured, as described in the openai-python README:
#  https://github.com/openai/openai-python#usage

# Every document is embedded with its own get_embedding call, so expect this to be slow
# if you have hundreds or thousands of documents
embeddings = list(map(lambda doc: get_embedding(doc, engine=ENGINE), documents))

In [None]:
# Transform the list of per-document embedding lists into a single numpy array
document_embeddings: np.ndarray = np.array(embeddings)

# Display the dimensions of the resulting array
document_embeddings.shape

In [None]:
# This next part will look pretty familiar: a natural language query that is
# embedded and matched against the documents in the cells below
QUESTION = "How many horns does a flea have?"

In [None]:
# Embed the query with the same OpenAI engine that was used for the documents
question_embedding = np.array(get_embedding(QUESTION, engine=ENGINE))

# Sentence Transformers' semantic search works on these arrays as-is (we could also write our own).
# The result is indexed with [0] to take the hits for our one and only query.
hits = util.semantic_search(
    query_embeddings=question_embedding,
    corpus_embeddings=document_embeddings,
    top_k=3,
)[0]

hits

In [None]:
# Echo the query, then print every retrieved document together with its similarity score
print(f'Question: {QUESTION}\n')

for rank, match in enumerate(hits, start=1):
    similarity = match["score"]
    paragraph = documents[match["corpus_id"]]
    print(f'Document {rank} Cos_Sim {similarity:.3f}:\n\n{paragraph}')
    print('\n')

In [None]:
# Run the question-answering pipeline with the best-matching document as its context
nlp(question=QUESTION, context=str(documents[hits[0]['corpus_id']]))

# Let's use GPT-3 to answer instead
![](../data/gptqa.png)

In [None]:
# The best-matching document becomes the context that the model must answer from
context = documents[hits[0]['corpus_id']]

# Prompt layout: instruction, blank line, the context, the query, then an open "Answer:" slot
PROMPT = (
    "Given this context, answer the question.\n\n"
    f"Context: {context}\n"
    f"Query: {QUESTION}\n"
    "Answer:"
)
print(PROMPT)

In [None]:
# Send the prompt to the OpenAI completion endpoint so the model answers from our context
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=PROMPT,
    max_tokens=25,  # upper bound on the number of generated tokens
    temperature=0.7,
    top_p=1,
    presence_penalty=0,
    frequency_penalty=0,
)

In [None]:
# Display the full response object returned by the completion call
response

In [None]:
# Pull the generated text out of the first (and only requested) choice
response.choices[0].text

In [None]:
# Same context and query as the plain prompt, but the instruction asks for a playful
# answer aimed at a second grader
FUN_PROMPT = (
    "Given this context, answer the question in a fun way for a second grader.\n\n"
    f"Context: {context}\n"
    f"Query: {QUESTION}\n"
    "Answer:"
)
print(FUN_PROMPT)

In [None]:
# Some more fun asking GPT to respond to a 2nd grader

# FUN_PROMPT already embeds the context of the top hit, so there is nothing to rebuild here:
# the prompt is sent as-is with the same sampling settings as the previous completion call.
response = openai.Completion.create(
  model="text-davinci-003",
  prompt=FUN_PROMPT,
  temperature=0.7,
  max_tokens=25,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
)

In [None]:
# Pull the generated text out of the first choice - this time with some more flavor
response.choices[0].text