# How to Query your OPENAI embeddings

In [1]:
import pandas as pd
import tiktoken
import openai
import os
from uuid import uuid4

os.environ["PINECONE_ENVIRONMENT"]="XXXXX"
os.environ["PINECONE_API_KEY"]="XXXXX"
os.environ["OPENAI_API_KEY"]="XXXXX"

openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
contents = []


tiktoken_encoding = tiktoken.get_encoding("gpt2")
for file in os.listdir("./Store"):
    with open(f"./store/{file}",'r') as f:
        file_content = f.read()

        tokens = tiktoken_encoding.encode(file_content)
        total_tokens = len(tokens)
        contents.append((file,file_content,total_tokens))
        

In [3]:
df = pd.DataFrame(contents,columns=['filename','file_content','tokens'])

In [4]:
df.head()

Unnamed: 0,filename,file_content,tokens
0,context4.txt,"Finally, it is important to address the stigma...",186
1,context.txt,Homelessness is a pervasive problem that affec...,130
2,context1.txt,"In many areas, the cost of living has\nskyrock...",118
3,context2.txt,Poverty is another significant factor that\nco...,293
4,context3.txt,"Additionally, policies that address systemic ...",125


In [5]:
df['embeddings'] = df.file_content.apply(lambda x:openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
df['id']=[str(uuid4()) for _ in range(len(df))]

df.to_csv('myEmbeddings.csv')
df.head()

Unnamed: 0,filename,file_content,tokens,embeddings,id
0,context4.txt,"Finally, it is important to address the stigma...",186,"[0.006436343304812908, -0.007102828472852707, ...",7a9a085d-d26b-4885-a045-6587cc59b2fd
1,context.txt,Homelessness is a pervasive problem that affec...,130,"[0.025118481367826462, -0.008370761759579182, ...",f68289ba-ca08-478c-83c5-5cbd53ca49f4
2,context1.txt,"In many areas, the cost of living has\nskyrock...",118,"[0.020307492464780807, -0.01854877546429634, 0...",34aa72ab-b158-4e70-a147-54981843d95d
3,context2.txt,Poverty is another significant factor that\nco...,293,"[0.021396443247795105, -0.010628752410411835, ...",e7f33e6e-8f8f-4df0-955b-da5f85c2929b
4,context3.txt,"Additionally, policies that address systemic ...",125,"[0.008739287033677101, -0.0062975212931632996,...",dfa88351-0f58-4688-b7f4-1d1a28dff8ca


In [6]:
from langchain.vectorstores import  pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

EMBEDDING_MODEL="text-embedding-ada-002"
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

  from tqdm.autonotebook import tqdm


In [7]:
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name="homelessness"

if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536, metric='cosine')


In [8]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 15}},
 'total_vector_count': 15}

In [9]:
from tqdm.auto import tqdm

batch_size = 100 #how many embeddings we create and insert at once

#convert the Dataframe to a list of dictionaries
chunks = df.to_dict(orient='records')

#upsert embeddings into Pinecone in batches of 100
for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    meta_batch = chunks[i:i_end]
    ids_batch = [x['id'] for x in meta_batch]
    embeddings = [x['embeddings'] for x in meta_batch]

    data=[{
            'filename': x['filename'],
            'file_content': x['file_content']    
    } for x in meta_batch ]

    to_upsert = list(zip(ids_batch,embeddings,data))
    index.upsert(vectors=to_upsert)


100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


In [10]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20}},
 'total_vector_count': 20}

##### 1. take query and generate embedding
##### use (1) and query db -> list of docs/texts\n
##### use (2) and create completion prompt\n


In [11]:
limit = 3750


def contruct_prompt(query):
    embedding_model = "text-embedding-ada-002"
    embed_query = openai.Embedding.create(input=query, engine=embedding_model)
    

    # retrive from pinecone
    query_embeds = embed_query['data'][0]['embedding']

    print(query_embeds)

    # get relevant contexts (including the questions)
    response = index.query(query_embeds, top_k=3, include_metadata=True)

    print(response)
    
    contexts =[x['metadata']['file_content'] for x in response['matches']]

    
    prompt_start = (
        """   
       Answer the questions based on the context below in markdown format.\n\nContext:\n
       """
    )

    prompt_end = (f"\n\nQuestion:{query}.\nIf the AI does not know the answer to a question, it truthfully says it does not know.\nAnswer: ")

    # append contexts until hitting limits
    for i in range(1, len(contexts)):
        if len("-".join(contexts[:i])) >= limit:
            prompt= prompt_start + "-".join(contexts[:i-1]) + prompt_end
            break
        elif i == len(contexts)-1:
            prompt= prompt_start + "-".join(contexts) + prompt_end
    
    return prompt

In [12]:
query = "what is life?"
prompt_with_contexts = contruct_prompt(query)

prompt_with_contexts

[0.008120162412524223, -0.007696217857301235, -0.029793493449687958, -0.016631657257676125, -0.02100154384970665, -0.0014944033464416862, -0.0213667880743742, 0.011029071174561977, -0.0177534781396389, -0.008478883653879166, -0.008113639429211617, 0.020884143188595772, -0.017727389931678772, 0.007872317917644978, 0.018131768330931664, 0.005935218650847673, 0.04197699949145317, -0.018053501844406128, 0.012770503759384155, -0.013944502919912338, -0.010546427220106125, 0.012340037152171135, 0.003665486816316843, -0.005801513325423002, 0.0008747925749048591, 0.00930720567703247, 0.022645141929388046, -0.03026309423148632, 0.015927257016301155, -0.019057922065258026, 0.008641939610242844, -0.02027105540037155, -0.017427368089556694, -0.020623255521059036, -0.009033272974193096, -0.01273789256811142, 0.006492868531495333, -0.009235461242496967, 0.007167918141931295, 0.009528961032629013, 0.033341582864522934, 0.008622372522950172, -0.00048386710113845766, 0.01348794810473919, -0.040594287216

'   \n       Answer the questions based on the context below in markdown format.\n\nContext:\n\n       In many areas, the cost of living has\nskyrocketed, while wages have stagnated, leaving many people unable to afford rent or\nmortgages. This is particularly true in major cities like San Francisco and New York,\nwhere the cost of living is much higher than the national average. As a result, many\npeople are forced to choose between paying for housing and other basic necessities\nlike food and healthcare. Additionally, the affordable housing that is available is\noften in poor condition or located in unsafe neighborhoods, making it difficult for\npeople to secure stable housing. -In many areas, the cost of living has\nskyrocketed, while wages have stagnated, leaving many people unable to afford rent or\nmortgages. This is particularly true in major cities like San Francisco and New York,\nwhere the cost of living is much higher than the national average. As a result, many\npeople are 

In [13]:
response = openai.Completion.create(
    engine="text-davinci-003", 
    prompt=prompt_with_contexts,
    temperature=0,
    max_tokens=350,
    top_p=1
)

response['choices'][0]['text']

" I don't know."