<h1>Semantic search using OpenAI and Pinecone</h1>

In [1]:
!pip install datasets
!pip install pinecone_client==2.2.1
!pip install openai==0.27.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collec

<h2>Creating embeddings using OpenAI text embeddings</h2>

In [None]:
import getpass
import openai
import os

# API keys are issued on the OpenAI website. Read the key from the environment
# (or prompt for it) so the secret is never saved inside the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
# Listing the engines requires a valid key, so this call doubles as a check
# that we are authenticated.
openai.Engine.list()

<h2>We will be using the text-embedding-ada-002 model for creating the embeddings</h2>

In [None]:
MODEL = "text-embedding-ada-002"

# Two throwaway phrases, sent together in a single request, to look at what
# the embeddings endpoint returns.
sample_inputs = [
    "Sample document text goes here",
    "there will be several phrases in each batch",
]
res = openai.Embedding.create(input=sample_inputs, engine=MODEL)
res

In [None]:
# Report the length of the embedding returned for each of the two inputs.
vector_lengths = [len(item['embedding']) for item in res['data'][:2]]
print(f"vector 0: {vector_lengths[0]}\nvector 1: {vector_lengths[1]}")

<h1>Creating the vector database in Pinecone to store the vector embeddings</h1>

In [None]:
import getpass
import os

import pinecone

# Pinecone index names may only contain lowercase letters, digits and hyphens;
# the previous mixed-case name is rejected when the index is created.
index_name = 'semantic-search-openai'

# initialize connection to pinecone; credentials come from the environment
# (or an interactive prompt) instead of being written into the notebook
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: "),
    # find the environment next to the api key in the console
    environment=os.environ.get("PINECONE_ENVIRONMENT") or input("Pinecone environment: "),
)
# check if the index already exists (only create index if not); the dimension
# must equal the length of the stored vectors, and text-embedding-ada-002
# returns 1536-dimensional embeddings
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536)
# connect to index
index = pinecone.Index(index_name)

In [17]:
from datasets import load_dataset

# Work with the first 15,000 rows of the wiki_qa training split only.
WIKI_QA_SPLIT = "train[:15000]"
dataset = load_dataset("wiki_qa", split=WIKI_QA_SPLIT)
dataset



Dataset({
    features: ['question_id', 'question', 'document_title', 'answer', 'label'],
    num_rows: 15000
})

In [18]:
# Peek at the first record to see every field the raw dataset provides.
dataset[0]

{'question_id': 'Q1',
 'question': 'how are glacier caves formed?',
 'document_title': 'Glacier cave',
 'answer': 'A partly submerged glacier cave on Perito Moreno Glacier .',
 'label': 0}

In [19]:
columns = dataset.column_names
columns_to_keep = ["document_title", "question", "answer"]
# Remove exactly the columns that are present but not wanted. A symmetric
# difference would also list any keep-column that is missing from the dataset
# and make remove_columns fail on a name that does not exist; a filtered list
# also keeps the original column order.
columns_to_remove = [name for name in columns if name not in columns_to_keep]
dataset = dataset.remove_columns(columns_to_remove)
dataset

Dataset({
    features: ['question', 'document_title', 'answer'],
    num_rows: 15000
})

In [20]:
# Look at the first record again now that the unwanted columns were removed;
# only question, document_title and answer should be left.
dataset[0]

{'question': 'how are glacier caves formed?',
 'document_title': 'Glacier cave',
 'answer': 'A partly submerged glacier cave on Perito Moreno Glacier .'}

In [None]:
# Embed just the first answer of the dataset and check how long the resulting
# vector is.
first_answer = dataset[0]['answer']
sample = openai.Embedding.create(input=[first_answer], engine=MODEL)
len(sample['data'][0]['embedding'])

In [None]:
# Show the first (index 0) record of the embedding response in full.
sample['data'][0]

In [None]:
# `sample` was requested for a single input text, so the response carries one
# record per input and sample['data'][1] raised IndexError. Show how many
# records came back instead of indexing past the end.
len(sample['data'])

<h3>Then we create a vector embedding for each phrase using OpenAI, and upsert the ID, vector embedding, and original text for each phrase to Pinecone.</h3>

In [None]:
from tqdm.auto import tqdm
from openai.embeddings_utils import get_embedding
import pandas as pd

batch_size = 32  # process everything in batches of 32

# Pull each column out of the dataset once: indexing dataset['answer'] inside
# the loop materialises the whole column again on every iteration.
answers = dataset['answer']
questions = dataset['question']
document_titles = dataset['document_title']

# one row per answer, collected across ALL batches for the CSV export below
embedding_rows = []
for i in tqdm(range(0, len(answers), batch_size)):
    # set end position of batch (the final batch may be shorter)
    i_end = min(i + batch_size, len(answers))
    # get batch of lines and IDs; the row position doubles as the unique ID
    answer_batch = answers[i:i_end]
    question_batch = questions[i:i_end]
    document_title_batch = document_titles[i:i_end]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    res = openai.Embedding.create(input=answer_batch, engine=MODEL)
    embeds = [record['embedding'] for record in res['data']]
    # prep metadata and upsert batch
    meta = [
        {'document_title': document_title, 'question': question, 'answer': answer}
        for document_title, question, answer
        in zip(document_title_batch, question_batch, answer_batch)
    ]
    to_upsert = zip(ids_batch, embeds, meta)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))
    # keep this batch so the export covers every answer, not just the last batch
    embedding_rows.extend(
        {'id': vector_id, 'answer': answer, 'embedding': embed}
        for vector_id, answer, embed in zip(ids_batch, answer_batch, embeds)
    )

# pd.read_json expects JSON text or a path, not the list held in res['data'],
# and `res` only ever contains the final batch; build the frame from the rows
# gathered during the loop instead.
df = pd.DataFrame(embedding_rows)
df.to_csv("answer_embeddings.csv")
df

<h1>Querying</h1>

In [None]:
query = "how are glacier caves formed?"

# Embed the question with the same model that produced the indexed vectors
# and keep only the embedding itself.
query_response = openai.Embedding.create(input=query, engine=MODEL)
xq = query_response['data'][0]['embedding']

In [None]:
# Fetch the five best matches for the query embedding, together with the
# metadata that was stored alongside each vector.
res = index.query(
    [xq],
    top_k=5,
    include_metadata=True,
)
res

<h3>The response from Pinecone includes our original answer in the metadata field. Let's print out the top_k most similar answers and their respective similarity scores.</h3>

In [None]:
# One line per match: the score rounded to two decimals, then the stored answer.
for hit in res['matches']:
    score = hit['score']
    answer_text = hit['metadata']['answer']
    print(f"{score:.2f}: {answer_text}")

<h1>Translation</h1>

In [None]:
# Sentence that the next cell appends to its "Translate this to English: " prompt.
# NOTE(review): this text is already in English — confirm whether a
# non-English sentence was intended for the translation demo.
query1 = "The ice facade is approximately 60 m high"

In [None]:
# text-davinci-003 has been retired by OpenAI, so requests for it now fail.
# gpt-3.5-turbo-instruct is served by the same Completions endpoint, so the
# response keeps the choices[0]['text'] shape that the following cells read.
response = openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt="Translate this to English: " + query1,
    temperature=0.3,  # low randomness
    max_tokens=100,  # upper bound on the length of the completion
    top_p=1.0,
    frequency_penalty=0.0,
    presence_penalty=0.0,
)

In [None]:
# Display the text of the first choice returned by the completion request.
response['choices'][0]['text']

In [None]:
# Embed the text of the first completion choice with the same model that was
# used for the indexed answers, keeping only the embedding itself.
completion_text = response['choices'][0]['text']
completion_embedding = openai.Embedding.create(input=completion_text, engine=MODEL)
xq = completion_embedding['data'][0]['embedding']

In [None]:
# Run the same top-5 lookup, this time with the embedding of the completion
# text, and include the stored metadata in the response.
res = index.query(
    [xq],
    top_k=5,
    include_metadata=True,
)
res

In [None]:
# One line per match: the score rounded to two decimals, then the stored answer.
for hit in res['matches']:
    score = hit['score']
    answer_text = hit['metadata']['answer']
    print(f"{score:.2f}: {answer_text}")