In [1]:
import openai
from openai.embeddings_utils import get_embeddings, get_embedding
from datetime import datetime
import hashlib
import re
import os
from tqdm import tqdm

import logging

# getLogger() with no name returns the root logger, so setting DEBUG here applies to
# every library that logs through the standard logging module.
# Later cells raise the level to CRITICAL to quiet the output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


In [3]:
# Read API credentials from the environment rather than hard-coding them.
# os.environ.get returns None when a variable is unset, so a missing key is not reported here.
openai.api_key = os.environ.get('OPENAI_API_KEY')
pinecone_key = os.environ.get('PINECONE_API_KEY')

# Shared settings used by the cells below
INDEX_NAME = 'semantic-search'  # name of the Pinecone index to create / connect to
NAMESPACE = 'default'  # default namespace for upserts, queries and deletes
ENGINE = 'text-embedding-ada-002'  # OpenAI embedding engine used for documents and queries

In [4]:
import pinecone

# Initialise the Pinecone client with the key read from the environment; the environment name is hard-coded
pinecone.init(api_key=pinecone_key, environment="us-west1-gcp")

In [5]:
# Create the index on the first run only; when it already exists we simply connect to it.
if INDEX_NAME not in pinecone.list_indexes():
    # dimension: length of each stored vector (the sample embedding printed below has 1536 values)
    # metric:    similarity function used when searching the index
    # pod_type:  kind of Pinecone pod backing the index
    pinecone.create_index(INDEX_NAME, dimension=1536, metric='cosine', pod_type="p1")

# Handle used by the later cells to upsert, query and delete vectors
index = pinecone.Index(INDEX_NAME)

In [6]:
def my_hash(s):
    """Return the hexadecimal MD5 digest of the string ``s``.

    The string is converted to bytes with ``str.encode()`` (default encoding) before hashing.
    """
    digest = hashlib.md5()
    digest.update(s.encode())
    return digest.hexdigest()

my_hash('I like to hash it')

'4ed8caa969fba0d4b9500f0866996b76'

In [13]:
def prepare_for_pinecone(texts, engine=ENGINE):
    """Embed ``texts`` and package them as tuples ready for a Pinecone upsert.

    Args:
        texts: Iterable of strings to embed. It is materialised into a list first
            because it is traversed twice (once to embed, once to build the tuples),
            so one-shot iterables such as generators are supported as well.
        engine: Name of the embedding engine passed to ``get_embeddings``.

    Returns:
        A list of ``(id, embedding, metadata)`` tuples, one per input text, where
        ``id`` is ``my_hash(text)`` and ``metadata`` is a dict holding the original
        text and the ``datetime.utcnow()`` value taken once for the whole call.
        An empty input yields ``[]`` without calling the embedding helper.

    Raises:
        ValueError: If the number of embeddings returned differs from the number
            of texts (``zip`` would otherwise silently drop the unmatched items).
    """
    texts = list(texts)
    if not texts:
        # Nothing to embed: skip the (billable) embedding request entirely
        return []

    # One timestamp shared by every item of this call
    now = datetime.utcnow()

    # Generate vector embeddings for each string, using the specified engine
    embeddings = get_embeddings(texts, engine=engine)
    if len(embeddings) != len(texts):
        raise ValueError(
            f'Expected {len(texts)} embeddings but received {len(embeddings)}'
        )

    return [
        (
            my_hash(text),  # ID derived from the text itself, so identical texts share an ID
            embedding,  # The vector embedding of the string
            dict(text=text, date_uploaded=now)  # Metadata: original text and upload timestamp
        )
        for text, embedding in zip(texts, embeddings)
    ]


In [14]:
texts = ['hi']

In [20]:
# Unpack the single (id, embedding, metadata) tuple produced for the one sample text
_id, embedding, metadata = prepare_for_pinecone(texts)[0]

# Show the ID, the embedding length and the metadata
print('ID:  ',_id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

ID:   49f68a5c8493ec2c0bf489821c21fc3b 
LEN:  1536 
META: {'text': 'hi', 'date_uploaded': datetime.datetime(2023, 8, 14, 14, 57, 11, 817858)}


In [12]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False):
    """Embed ``texts`` and upsert them into the Pinecone index in batches.

    Args:
        texts: Sequence of strings to upload (must support ``len`` and slicing).
        namespace: Pinecone namespace to upsert into.
        batch_size: Number of texts embedded and upserted per request. ``None`` or
            ``0`` means "everything in a single batch".
        show_progress_bar: When true, wrap the batch loop in a ``tqdm`` progress bar.

    Returns:
        The sum of the ``upserted_count`` values reported by each upsert call;
        ``0`` when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is negative.
    """
    total_texts = len(texts)
    if total_texts == 0:
        # With the default batch_size this would otherwise become range(0, 0, 0),
        # which raises "range() arg 3 must not be zero"
        return 0

    if batch_size is not None and batch_size < 0:
        # A negative step yields an empty range, i.e. a silent no-op upload
        raise ValueError(f'batch_size must be positive, got {batch_size}')
    if not batch_size:
        batch_size = total_texts

    batch_starts = range(0, total_texts, batch_size)
    if show_progress_bar:
        batch_starts = tqdm(batch_starts)

    total_upserted = 0
    for start in batch_starts:
        batch = texts[start: start + batch_size]

        # Embed the batch and build the (id, embedding, metadata) tuples
        prepared_texts = prepare_for_pinecone(batch)

        # Use the upsert() method of the index object to upload the prepared texts to Pinecone
        response = index.upsert(prepared_texts, namespace=namespace)
        total_upserted += response['upserted_count']

    return total_upserted


# Call the upload_texts_to_pinecone() function with the input texts
upload_texts_to_pinecone(texts)


In [26]:
def query_from_pinecone(query, top_k=3, namespace=NAMESPACE):
    """Embed ``query`` and return the closest matches stored in the Pinecone index.

    Args:
        query: Text to search for.
        top_k: Maximum number of matches to request.
        namespace: Pinecone namespace to search. Defaults to ``NAMESPACE``, matching
            the default used by the upload and delete helpers.

    Returns:
        The value stored under ``'matches'`` in the query response; metadata is
        included with each match.
    """
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding(query, engine=ENGINE)

    response = index.query(
        vector=query_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True  # gets the metadata (dates, text, etc)
    )
    return response.get('matches')

query_from_pinecone('hello')

[{'id': '49f68a5c8493ec2c0bf489821c21fc3b',
  'metadata': {'date_uploaded': datetime.datetime(2023, 8, 14, 14, 57, 43, 111756),
               'text': 'hi'},
  'score': 0.924787819,
  'values': []}]

In [28]:
query_from_pinecone('What are fixed costs?')  # something will ALWAYS be returned

[{'id': '49f68a5c8493ec2c0bf489821c21fc3b',
  'metadata': {'date_uploaded': datetime.datetime(2023, 8, 14, 14, 57, 43, 111756),
               'text': 'hi'},
  'score': 0.749424815,
  'values': []}]

In [29]:
import hashlib

def delete_texts_from_pinecone(texts, namespace=NAMESPACE):
    """Delete the vectors that were uploaded for ``texts``.

    Args:
        texts: Iterable of the original strings whose vectors should be removed.
        namespace: Pinecone namespace to delete from.

    Returns:
        Whatever ``index.delete`` returns.
    """
    # Derive the IDs with the same helper used at upload time so the two can never drift apart
    ids_to_delete = [my_hash(text) for text in texts]

    # The ids parameter is used to specify the list of IDs (hashes) to delete
    return index.delete(ids=ids_to_delete, namespace=namespace)

# delete our text
delete_texts_from_pinecone(texts)

# test that the index is empty
query_from_pinecone('hello')

[]

In [31]:
# Importing the tiktoken library
import tiktoken

# Load the 'cl100k_base' encoding (an encoding name, not a model name)
# NOTE(review): presumably this is the encoding that matches 'text-embedding-ada-002' — confirm against the OpenAI docs
tokenizer = tiktoken.get_encoding("cl100k_base")

# Encode the text 'hey there'
# The result is a list of integer token IDs
# Useful for counting tokens; the embedding helpers used above are given the raw strings, not these IDs
tokenizer.encode('hey there')


[36661, 1070]

In [9]:
from datasets import load_dataset
from evaluate import load


# Load the BoolQ dataset; its 'validation' split supplies the questions and passages used below
dataset = load_dataset("boolq")

Found cached dataset boolq (/Users/sinanozdemir/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
dataset['validation'][0]

{'question': 'does ethanol take more energy make that produces',
 'answer': False,
 'passage': "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a sep

In [36]:
# Upload every validation passage, 256 rows at a time
for idx in tqdm(range(0, len(dataset['validation']), 256)):
    # Take the next slice of rows; the slice is indexed by column name below
    data_sample = dataset['validation'][idx:idx + 256]

    passages = data_sample['passage']

    # No batch_size is passed, so each slice is embedded and upserted as a single batch
    upload_texts_to_pinecone(passages)

100%|███████████████████████████████████████████| 13/13 [06:29<00:00, 29.99s/it]


In [7]:
def get_results_from_pinecone(query, top_k=3, verbose=True):
    """Query Pinecone for ``query`` and return the matches, optionally printing them.

    Args:
        query: Text to search for.
        top_k: Number of matches to request from ``query_from_pinecone``.
        verbose: When true, print the query, a header line, and one tab-separated
            row per match: document ID, score to two decimals, and the first
            50 characters of the stored text.

    Returns:
        A new list holding the matches in the order they were returned, or ``[]``
        when the lookup produced nothing (in which case nothing is printed either).
    """
    matches = query_from_pinecone(query, top_k=top_k)
    if not matches:
        return []

    final_results = list(matches)
    if not verbose:
        return final_results

    print("Query:", query)
    print('Document ID (Hash)\t\tRetrieval Score\tText')
    for match in final_results:
        print(f"{match['id']}\t{match['score']:.2f}\t{match['metadata']['text'][:50]}")

    return final_results

In [14]:
from random import sample

# Pick one validation question at random (no seed is set, so the choice differs between runs)
query = sample(dataset['validation']['question'], 1)[0]
print(query)
# Retrieve and print the three closest passages from Pinecone
final_results = get_results_from_pinecone(query, top_k=3)


is it possible to thunder and lightning while snowing
Query: is it possible to thunder and lightning while snowing
Document ID (Hash)		Retrieval Score	Text
cfa9150bac58fea67fec83e20d7ac29f	0.85	Thundersnow, also known as a winter thunderstorm o
e64ea582a771606ccfcde248c189a0a6	0.81	On somewhat rare occasions, a thunderstorm can bec
9b2a2c201b62136f19d6f4e8409cef14	0.81	A dry thunderstorm or heat storm is a thunderstorm


In [15]:
# Ground truth for evaluation: map each validation question to the ID (hash) of its own passage.
# If two rows share the same question text, the later row's passage wins.
q_to_hash = {
    row['question']: my_hash(row['passage'])
    for row in dataset['validation']
}

q_to_hash[query]

'cfa9150bac58fea67fec83e20d7ac29f'

In [47]:
# Silence everything below CRITICAL while the long evaluation loop runs
logger.setLevel(logging.CRITICAL)

predictions = []

# Top-1 retrieval accuracy: for every question, check whether the single best match
# returned by Pinecone is the passage that question belongs to.
for question in tqdm(dataset['validation']['question']):
    results = get_results_from_pinecone(question, top_k=1, verbose=False)
    # An empty result list counts as a miss instead of raising IndexError mid-run
    retrieved_hash = results[0]['id'] if results else None
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)

# Guard the division so an empty question list does not raise ZeroDivisionError
accuracy = sum(predictions) / len(predictions) if predictions else 0.0

print(f'Accuracy with OpenAI embeddings: {accuracy}')

100%|███████████████████████████████████████| 3270/3270 [26:09<00:00,  2.08it/s]

Accuracy with OpenAI embeddings: 0.8532110091743119





# OPEN-SOURCE ALTERNATIVE FOR EMBEDDING

In [36]:
from sentence_transformers import SentenceTransformer

# Load a pretrained sentence-transformers model by name
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

# Two toy documents to sanity-check the encoder
docs = ["Around 9 Million people live in London", "London is known for its financial district"]

# Encode the documents; the result exposes a .shape (inspected below)
doc_emb = model.encode(docs, batch_size=32, show_progress_bar=True)

doc_emb.shape  # expected: (2, 768), i.e. one 768-value embedding per document


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(2, 768)

In [17]:
# Encode every validation passage (documents only; the query is encoded in a later cell)
# Note: this rebinds the toy `docs` / `doc_emb` from the previous cell
docs = dataset['validation']['passage']
doc_emb = model.encode(docs, batch_size=32, show_progress_bar=True)

Batches:   0%|          | 0/103 [00:00<?, ?it/s]

In [18]:
from random import sample

# Draw a new random validation question (unseeded) to compare the two retrieval approaches
query = sample(dataset['validation']['question'], 1)[0]
print(query)
# Using OpenAI embeddings via Pinecone
final_results = get_results_from_pinecone(query, top_k=3)


can you really hear the ocean in a seashell
Query: can you really hear the ocean in a seashell
Document ID (Hash)		Retrieval Score	Text
0612760f30f0cb57c712eeea99a3b0e9	0.84	The rushing sound that one hears is in fact the no
a0b84b6b6637a89fdde0cfb285371469	0.76	Marlin and Nemo attempt to rescue Dory. With the h
8691b3d6b92277de6cb7bd9cff973c00	0.76	The most common marine fish in the Sound include p


In [19]:
from sentence_transformers import util

In [20]:
# Embed the query with the same model that encoded the documents
query_emb = model.encode(query)

# Dot-product score of the query against every document embedding, as a plain Python list
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

# Pair each passage with its score and order the pairs from highest to lowest score
doc_score_pairs = sorted(
    zip(docs, scores),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the three best-scoring passages with their scores
for doc, score in doc_score_pairs[:3]:
    print(score, doc)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.6290125846862793 The rushing sound that one hears is in fact the noise of the surrounding environment, resonating within the cavity of the shell. The same effect can be produced with any resonant cavity, such as an empty cup or even by simply cupping one's hand over one's ear. The similarity of the noise produced by the resonator to that of the oceans is due to the resemblance between ocean movements and airflow.
0.4079173803329468 The most common marine fish in the Sound include porgy, butterfish, winter flounder, summer flounder, windowpane flounder, fourspot flounder, northern and striped sea robin, little skate, menhaden, Atlantic silversides, black seabass, blackfish (tautog), cunner, bluefish, and smooth dogfish. Frequently Atlantic bonito and false albacore, both members of the tuna family, enter the sound and can be caught by anglers from small boats and shore. Many species have declined rapidly since 1975 due to over fishing. Winter flounder may not be currently present exce

In [21]:
import numpy as np
logger.setLevel(logging.CRITICAL)  # just to suppress some logs


def eval_ranking_open_source(query, top_k=3):
    """Return the hash of the passage ranked first for ``query`` by the local model.

    Encodes ``query`` with the module-level ``model``, scores it against ``doc_emb``
    with a dot product, keeps the ``top_k`` best ``(passage, score)`` pairs and
    returns ``my_hash`` of the passage in the first pair.

    Args:
        query: Question text to rank the passages against.
        top_k: Number of best-scoring pairs kept before the first one is taken.

    Returns:
        The hash (ID) of the highest-scoring passage.
    """
    encoded_query = np.array(model.encode(query))

    # One dot-product score per document embedding, as a plain Python list
    similarities = util.dot_score(encoded_query, doc_emb)[0].cpu().tolist()

    # Best-scoring passages first, truncated to the requested number
    ranked = sorted(
        zip(docs, similarities),
        key=lambda pair: pair[1],
        reverse=True,
    )
    shortlist = ranked[:top_k]

    best_passage = shortlist[0][0]
    return my_hash(best_passage)


In [22]:
eval_ranking_open_source(query)

'0612760f30f0cb57c712eeea99a3b0e9'

In [31]:
logger.setLevel(logging.CRITICAL)

# NOTE(review): the recorded output of this cell shows 814 iterations, while the OpenAI
# evaluation cell's output shows 3270 — confirm both ran on the same question set
# before comparing the two accuracies.
print_every = 100
predictions = []
i = 0  # stays 0 when there are no questions; otherwise rebound by the loop below
for i, question in enumerate(tqdm(dataset['validation']['question']), start=1):
    retrieved_hash = eval_ranking_open_source(question, top_k=3)
    correct_hash = q_to_hash[question]
    predictions.append(retrieved_hash == correct_hash)

    # Only report the running accuracy on every `print_every`-th question
    if i % print_every != 0:
        continue
    print(f'Step {i}')
    raw_accuracy = sum(predictions)/len(predictions)

    print(f'Accuracy: {raw_accuracy}')


 13%|█████▏                                   | 103/814 [00:05<00:38, 18.50it/s]

Step 100
Accuracy: 0.78


 25%|██████████▏                              | 202/814 [00:10<00:31, 19.53it/s]

Step 200
Accuracy: 0.81


 37%|███████████████▏                         | 302/814 [00:16<00:26, 19.56it/s]

Step 300
Accuracy: 0.81


 50%|████████████████████▎                    | 403/814 [00:21<00:20, 19.99it/s]

Step 400
Accuracy: 0.8125


 62%|█████████████████████████▎               | 503/814 [00:26<00:16, 18.64it/s]

Step 500
Accuracy: 0.806


 74%|██████████████████████████████▎          | 603/814 [00:31<00:10, 19.63it/s]

Step 600
Accuracy: 0.8133333333333334


 86%|███████████████████████████████████▎     | 702/814 [00:36<00:05, 18.86it/s]

Step 700
Accuracy: 0.8228571428571428


 99%|████████████████████████████████████████▍| 802/814 [00:42<00:00, 17.97it/s]

Step 800
Accuracy: 0.82


100%|█████████████████████████████████████████| 814/814 [00:42<00:00, 19.02it/s]


In [34]:
# Final top-1 accuracy: fraction of True entries in `predictions`
raw_accuracy = sum(predictions)/len(predictions)
print(f'Accuracy with open source embedder: {raw_accuracy}')

Accuracy with open source embedder: 0.8206388206388207
