In [1]:
# This notebook has been updated from my book to use the latest openai package version! At the time, 1.6.1

In [2]:
from openai import OpenAI
from datetime import datetime
import hashlib
import re
from random import sample
import os
from tqdm import tqdm
import numpy as np

import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


In [6]:
pinecone_key = os.environ.get('PINECONE_API_KEY')
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

INDEX_NAME = 'semantic-search'
NAMESPACE = 'default'
ENGINE = 'text-embedding-ada-002'

In [7]:
import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
    api_key=pinecone_key
)


In [8]:
pc.list_indexes().names()

['semantic-search-serverless', 'semantic-search']

In [9]:
# Now do stuff
if INDEX_NAME not in pc.list_indexes().names():
    print(f'Creating index {INDEX_NAME}')
    pc.create_index(
        name=INDEX_NAME,  # The name of the index
        dimension=1536,  # The dimensionality of the vectors
        metric='cosine',  # The similarity metric to use when searching the index
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-2'
        )
    )

In [10]:
# helper functions to get lists of embeddings from the OpenAI API
def get_embeddings(texts, engine=ENGINE):
    response = client.embeddings.create(
        input=texts,
        model=engine
    )
    
    return [d.embedding for d in list(response.data)]

def get_embedding(text, engine=ENGINE):
    return get_embeddings([text], engine)[0]
    
len(get_embedding('hi')), len(get_embeddings(['hi', 'hello']))

(1536, 2)

In [11]:
# Store the index as a variable
index = pc.Index(name=INDEX_NAME)
index

<pinecone.data.index.Index at 0x11ebc7fd0>

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 3097}},
 'total_vector_count': 3097}

In [13]:
def my_hash(s):
    # Return the MD5 hash of the input string as a hexadecimal string
    return hashlib.md5(s.encode()).hexdigest()

my_hash('I love to hash it hash it')

'c8c5b4f30923402163322b36c733a613'

In [14]:
def prepare_for_pinecone(texts, engine=ENGINE):
    # Get the current UTC date and time
    now = datetime.utcnow()
    
    # Generate vector embeddings for each string in the input list, using the specified engine
    embeddings = get_embeddings(texts, engine=engine)
    
    # Create tuples of (hash, embedding, metadata) for each input string and its corresponding vector embedding
    # The my_hash() function is used to generate a unique hash for each string, and the datetime.utcnow() function is used to generate the current UTC date and time
    return [
        (
            my_hash(text),  # A unique ID for each string, generated using the my_hash() function
            embedding,  # The vector embedding of the string
            dict(text=text, date_uploaded=now)  # A dictionary of metadata, including the original text and the current UTC date and time
        ) 
        for text, embedding in zip(texts, embeddings)  # Iterate over each input string and its corresponding vector embedding
    ]


In [15]:
texts = ['hi']

_id, embedding, metadata = prepare_for_pinecone(texts)[0]

print('ID:  ',_id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

ID:   49f68a5c8493ec2c0bf489821c21fc3b 
LEN:  1536 
META: {'text': 'hi', 'date_uploaded': datetime.datetime(2024, 1, 22, 18, 35, 7, 573042)}


In [16]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False):
    # Call the prepare_for_pinecone function to prepare the input texts for indexing
    total_upserted = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        batch = texts[i: i + batch_size]
        prepared_texts = prepare_for_pinecone(batch)

        # Use the upsert() method of the index object to upload the prepared texts to Pinecone
        total_upserted = index.upsert(
            vectors=prepared_texts,
            namespace=namespace
        )['upserted_count']
       

    return total_upserted

# Call the upload_texts_to_pinecone() function with the input texts
upload_texts_to_pinecone(texts)


1

In [17]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 3097}},
 'total_vector_count': 3097}

In [18]:
def query_from_pinecone(query, top_k=3):
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding(query, engine=ENGINE)

    return index.query(
      vector=query_embedding,
      top_k=top_k,
      namespace=NAMESPACE,
      include_metadata=True   # gets the metadata (dates, text, etc)
    ).get('matches')

query_from_pinecone('hello')

[{'id': '093601540a641d12a6f734a9fa624ce5',
  'metadata': {'date_uploaded': '2024-01-22T17:40:42.771556',
               'text': "Alexander Graham Bell originally suggested 'ahoy-hoy' "
                       'be adopted as the standard greeting when answering a '
                       "telephone, before 'hello' (suggested by Thomas Edison) "
                       'became common.'},
  'score': 0.781582236,
  'values': []},
 {'id': '9588c26cecaaf486eae14858827a6699',
  'metadata': {'date_uploaded': '2024-01-22T17:39:17.866368',
               'text': 'The Abbott family -- wife Evelyn, husband Lee, '
                       'congenitally deaf daughter Regan, and sons Marcus and '
                       'Beau -- silently scavenge for supplies in a deserted '
                       'town. While out in the open, the family communicates '
                       'with American Sign Language (ASL). Four-year-old Beau '
                       'is drawn to a battery-powered space shuttle toy, b

In [19]:
import hashlib

def delete_texts_from_pinecone(texts, namespace=NAMESPACE):
    # Compute the hash (id) for each text
    hashes = [hashlib.md5(text.encode()).hexdigest() for text in texts]
    
    # The ids parameter is used to specify the list of IDs (hashes) to delete
    return index.delete(ids=hashes, namespace=namespace)

# delete our text
delete_texts_from_pinecone(texts)

# test that the index is empty
query_from_pinecone('hello')

[{'id': '093601540a641d12a6f734a9fa624ce5',
  'metadata': {'date_uploaded': '2024-01-22T17:40:42.771556',
               'text': "Alexander Graham Bell originally suggested 'ahoy-hoy' "
                       'be adopted as the standard greeting when answering a '
                       "telephone, before 'hello' (suggested by Thomas Edison) "
                       'became common.'},
  'score': 0.781582236,
  'values': []},
 {'id': '9588c26cecaaf486eae14858827a6699',
  'metadata': {'date_uploaded': '2024-01-22T17:39:17.866368',
               'text': 'The Abbott family -- wife Evelyn, husband Lee, '
                       'congenitally deaf daughter Regan, and sons Marcus and '
                       'Beau -- silently scavenge for supplies in a deserted '
                       'town. While out in the open, the family communicates '
                       'with American Sign Language (ASL). Four-year-old Beau '
                       'is drawn to a battery-powered space shuttle toy, b

In [20]:
# Importing the tiktoken library
import tiktoken

# Initializing a tokenizer for the 'cl100k_base' model
# This tokenizer is designed to work with the 'ada-002' embedding model
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Using the tokenizer to encode the text 'hey there'
# The resulting output is a list of integers representing the encoded text
# This is the input format required for embedding using the 'ada-002' model
tokenizer.encode('hey there')


[36661, 1070]

In [21]:
# Function to split the text into chunks of a maximum number of tokens. Inspired by OpenAI
def overlapping_chunks(text, max_tokens = 500, overlapping_factor = 5):
    '''
    max_tokens: tokens we want per chunk
    overlapping_factor: number of sentences to start each chunk with that overlaps with the previous chunk
    '''

    # Split the text using punctuation
    sentences = re.split(r'[.?!]', text)

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    
    chunks, tokens_so_far, chunk = [], 0, []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater 
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of 
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks

In [22]:
from urllib.request import urlopen

# A textbook about insects
text = urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt').read().decode()


In [23]:
split = overlapping_chunks(text, overlapping_factor=0)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'non-overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

non-overlapping chunking approach has 17 documents with average length 476.7 tokens


In [24]:
overlapping_split = overlapping_chunks(text)
avg_length = sum([len(tokenizer.encode(t)) for t in overlapping_split]) / len(overlapping_split)
print(f'overlapping chunking approach has {len(overlapping_split)} documents with average length {avg_length:.1f} tokens')

overlapping chunking approach has 24 documents with average length 477.4 tokens


In [25]:
overlapping_split = overlapping_chunks(text, max_tokens=128, overlapping_factor=2)
avg_length = sum([len(tokenizer.encode(t)) for t in overlapping_split]) / len(overlapping_split)
print(f'overlapping chunking approach has {len(overlapping_split)} documents with average length {avg_length:.1f} tokens')

overlapping chunking approach has 127 documents with average length 121.1 tokens


In [26]:
upload_texts_to_pinecone(overlapping_split, batch_size=32)

31

In [27]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 3192}},
 'total_vector_count': 3192}

In [28]:
query = 'How many horns does a flea have?'

results_from_pinecone = query_from_pinecone(query, top_k=5)

for result_from_pinecone in results_from_pinecone:
    print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")
    

5d253b260c308e67cb3ebcff1282c352	0.86	

When examined by a microscope, the flea is a p
3012bd43cb3d30722c203a3cb10cb88d	0.83	 15. 




FLEA. 

[Illustration]


This 
39eb0968545f5a7094621fb1a940dfe9	0.83	 They abound in warm countries, particularly in th
ff09a697b4f85c5a1f5a3288d0f1298b	0.83	

[Illustration]


This very troublesome litt
d51731e901109705d5629ff6ff461214	0.82	




FLEA. 

[Illustration]


This very 


In [29]:
def get_results_from_pinecone(query, top_k=3, verbose=True):

    results_from_pinecone = query_from_pinecone(query, top_k=top_k)
    if not results_from_pinecone:
        return []

    if verbose:
        print("Query:", query)
    
    
    final_results = []

    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tText')
    for result_from_pinecone in results_from_pinecone:
        final_results.append(result_from_pinecone)
        if verbose:
            print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")

    return final_results

In [30]:
final_results = get_results_from_pinecone(query, top_k=3)

Query: How many horns does a flea have?
Document ID (Hash)		Retrieval Score	Text
5d253b260c308e67cb3ebcff1282c352	0.86	

When examined by a microscope, the flea is a p
3012bd43cb3d30722c203a3cb10cb88d	0.83	 15. 




FLEA. 

[Illustration]


This 
39eb0968545f5a7094621fb1a940dfe9	0.83	 They abound in warm countries, particularly in th


In [31]:
delete_texts_from_pinecone(overlapping_split)

{}

In [32]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 3223}},
 'total_vector_count': 3223}

# BoolQ

In [33]:
from datasets import load_dataset
from evaluate import load


dataset = load_dataset("boolq")

Found cached dataset parquet (/Users/sinanozdemir/.cache/huggingface/datasets/parquet/boolq-2485a98eacedc33a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/2 [00:00<?, ?it/s]

In [34]:
dataset['validation'][0]

{'question': 'does ethanol take more energy make that produces',
 'answer': False,
 'passage': "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a sep

In [35]:
upload_texts_to_pinecone(dataset['validation']['passage'], batch_size=32, show_progress_bar=True)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 103/103 [04:26<00:00,  2.59s/it]


6

In [36]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 5742}},
 'total_vector_count': 5742}

In [37]:
len(dataset['validation'])

3270

In [38]:
query = sample(dataset['validation']['question'], 1)[0]
print(query)
final_results = get_results_from_pinecone(query, top_k=3)


do supreme court justices have to be confirmed
Query: do supreme court justices have to be confirmed
Document ID (Hash)		Retrieval Score	Text
a79dddf6ba6fa59ec7f3736e7b06800e	0.86	The appointment and confirmation of Justices to th
0df8bb7f1d03ce29966ffd7c2d7140b4	0.84	In modern times, the confirmation process has attr
f412128f4187b611b6d1e2219e589f19	0.82	The Constitution provides that justices ``shall ho


In [39]:
q_to_hash = {data['question']: my_hash(data['passage']) for data in dataset['validation']}

q_to_hash[query]

'0df8bb7f1d03ce29966ffd7c2d7140b4'

In [40]:
val_sample = dataset['validation']

In [41]:
logger.setLevel(logging.CRITICAL)

predictions = []

# Note we will keep top_k the same so latency from Pinecone is consistent
#  and the only major time difference will be in the re-ranking

for question in tqdm(val_sample['question'][:10]):
    retrieved_hash = get_results_from_pinecone(question, top_k=1, verbose=False)[0]['id']
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)

openai_accuracy = sum(predictions)/len(predictions)

print(f'Accuracy of retrieval: {openai_accuracy}')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.24it/s]

Accuracy of retrieval: 0.9





In [42]:
document_embeddings = []
documents = val_sample['passage']

for i in tqdm(list(range(0, len(documents), 32))):
    batch = documents[i: i + 32]
    document_embeddings += get_embeddings(batch)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 103/103 [00:47<00:00,  2.15it/s]


In [43]:
get_results_from_pinecone('when was halo 2 realsed?', 1)

Query: when was halo 2 realsed?
Document ID (Hash)		Retrieval Score	Text
10e8faab3d30566f4541b297a8d9e206	0.84	On February 9, 2006, Nick Baron announced that a v


[{'id': '10e8faab3d30566f4541b297a8d9e206',
  'metadata': {'date_uploaded': '2024-01-22T18:39:14.298962',
               'text': 'On February 9, 2006, Nick Baron announced that a '
                       'version of Halo 2 would be released on PC, exclusively '
                       'for the Windows Vista operating system. While this was '
                       'a deliberate decision by Microsoft to push sales of '
                       'Vista, the game could be enabled to play on Windows XP '
                       'through an unauthorized third-party patch. The game was '
                       'ported by a small team at Microsoft Game Studios '
                       '(codenamed Hired Gun) who worked closely with Bungie. '
                       'As one of the launch titles of Games for Windows -- '
                       'Live, the game offered Live features not available in '
                       'the Xbox version, such as guide support and '
                       'achieveme

In [47]:
from sklearn.metrics.pairwise import cosine_similarity

def simulate_pinecone_query(query):
    query_embedding = get_embedding(query)
    
    # Concatenate the query vector with document vectors
    all_embeddings = np.vstack([query_embedding] + document_embeddings)
    
    # Compute the cosine similarity matrix
    similarity_matrix = cosine_similarity(all_embeddings)
    
    # Retrieve similarities for the query (first row, skipping the first element which is the query self-comparison)
    query_similarities = similarity_matrix[0, 1:]
    
    # Find the index of the top document
    top_document_score , top_document_index = np.max(query_similarities), np.argmax(query_similarities)
    top_document = documents[top_document_index]
    
    return top_document_score, top_document_index, my_hash(top_document), top_document

In [48]:
simulate_pinecone_query('when was halo 2 realsed?')

(0.8436302508956575,
 3034,
 '10e8faab3d30566f4541b297a8d9e206',
 'On February 9, 2006, Nick Baron announced that a version of Halo 2 would be released on PC, exclusively for the Windows Vista operating system. While this was a deliberate decision by Microsoft to push sales of Vista, the game could be enabled to play on Windows XP through an unauthorized third-party patch. The game was ported by a small team at Microsoft Game Studios (codenamed Hired Gun) who worked closely with Bungie. As one of the launch titles of Games for Windows -- Live, the game offered Live features not available in the Xbox version, such as guide support and achievements. The Windows port also added two exclusive multiplayer maps and a map editor.')

In [49]:
from sklearn.metrics.pairwise import cosine_similarity
openai_predictions = []
for query in tqdm(val_sample['question']):
    top_document_score, top_document_index, retrieved_hash, top_document = simulate_pinecone_query(query)
    correct_hash = q_to_hash[query]
    
    openai_predictions.append(retrieved_hash == correct_hash)
    

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3270/3270 [35:13<00:00,  1.55it/s]


In [50]:
openai_accuracy = sum(openai_predictions)/len(openai_predictions)

print(f'Accuracy of retrieval: {openai_accuracy}')

Accuracy of retrieval: 0.8532110091743119


# OPEN SOURCE ALTERNATIVE TO EMBEDDING

In [51]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

docs = ["Around 9 Million people live in London", "London is known for its financial district"]

doc_emb = model.encode(docs, batch_size=32, show_progress_bar=True)

doc_emb.shape  #  == ('2, 768')

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(2, 768)

In [52]:
#Encode query and documents
docs = dataset['validation']['passage']
doc_emb = model.encode(docs, batch_size=32, show_progress_bar=True)

Batches:   0%|          | 0/103 [00:00<?, ?it/s]

In [53]:
query = sample(dataset['validation']['question'], 1)[0]
print(query)
final_results = get_results_from_pinecone(query, top_k=3)


can carbon 14 dating be used to detect the age of a live animal
Query: can carbon 14 dating be used to detect the age of a live animal
Document ID (Hash)		Retrieval Score	Text
04c16b27e72a49ff264c782230b3199f	0.84	The method was developed in the late 1940s by Will
e11909982c098d4b058616623548eadf	0.73	A fossil fuel is a fuel formed by natural processe
7659a2874b2c633dee6b8227e95e9ac3	0.72	Chimpanzees and other apes -- species which have b


In [54]:
query_emb = model.encode(query)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores
doc_score_pairs = list(zip(docs, scores))

#Sort by decreasing score
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

#Output passages & scores
for doc, score in doc_score_pairs[:3]:
    print(score, doc)


0.5774650573730469 The method was developed in the late 1940s by Willard Libby, who received the Nobel Prize in Chemistry for his work in 1960. It is based on the fact that radiocarbon ( C) is constantly being created in the atmosphere by the interaction of cosmic rays with atmospheric nitrogen. The resulting C combines with atmospheric oxygen to form radioactive carbon dioxide, which is incorporated into plants by photosynthesis; animals then acquire C by eating the plants. When the animal or plant dies, it stops exchanging carbon with its environment, and from that point onwards the amount of C it contains begins to decrease as the C undergoes radioactive decay. Measuring the amount of C in a sample from a dead plant or animal such as a piece of wood or a fragment of bone provides information that can be used to calculate when the animal or plant died. The older a sample is, the less C there is to be detected, and because the half-life of C (the period of time after which half of a g

In [55]:
logger.setLevel(logging.CRITICAL)  # just to suppress some logs


def eval_ranking_open_source(query, top_k=3):
    query_emb = np.array(model.encode(query))

    #Compute dot score between query and all document embeddings
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

    #Combine docs & scores
    doc_score_pairs = list(zip(docs, scores))

    #Sort by decreasing score
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)[:top_k]

    retrieved_hash = my_hash(doc_score_pairs[0][0])
    return retrieved_hash


In [56]:
eval_ranking_open_source(query)

'04c16b27e72a49ff264c782230b3199f'

In [57]:
logger.setLevel(logging.CRITICAL)

predictions = []
for question in tqdm(val_sample['question']):
    retrieved_hash = eval_ranking_open_source(question, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3270/3270 [02:46<00:00, 19.70it/s]


In [58]:
open_source_accuracy = sum(predictions)/len(predictions)

print(f'Open-source accuracy: {open_source_accuracy}')

Open-source accuracy: 0.8235474006116208
