In [7]:
import os
from dotenv import load_dotenv, find_dotenv
import pinecone
from langchain.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
import openai
import numpy as np

In [129]:
load_dotenv(find_dotenv(), override=True)

True

In [130]:
def load_document(file):
    nombre, extension = os.path.splitext(file) 
    if extension == '.html':
        from langchain.document_loaders import UnstructuredHTMLLoader
        print(f'load {file}...')
        loader = UnstructuredHTMLLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader  
        print(f'load {file}...')
        loader = TextLoader(file)
    elif extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'load {file}...')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'load {file}...')
        loader = Docx2txtLoader(file)
    else:
        print('The document format is not supported!')
        return None

    data = loader.load()
    return data

In [131]:
document = "../test.pdf"
content = load_document(document)
print(content)

load ../test.pdf...
[Document(page_content="Title: Whiskers' Midnight Adventure  \nIn the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat \nnamed Whiskers. With fur as black as the night and eyes that shimmered like stars, \nWhiskers was known for his adventurous spirit.  \nOne night, while his human family slept soundly, Whiskers heard a peculiar sound coming \nfrom the kitchen. His ears perked up, and his paws silently carried him towards the source. \nThe moonlight streamed through the window, casting shadows that danced on th e walls as \nWhiskers crept closer.  \nAs he peered around the corner, he saw a small mouse, its fur glistening under the moon’s \nlight. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece \nof cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e. \nBut just as he leaped, the mouse scurried away with astonishing speed, disappearing \nunder the refrigerator. Whiskers, p

In [132]:
def split (data, chunk_size=150):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
    fragments = text_splitter.split_documents(data)
    return fragments

In [133]:
fragments = split(content)
print(len(fragments))

17


In [8]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key='')

In [135]:
# Assume `fragments` is a list of Document objects
text_fragments = [doc.page_content for doc in fragments]
print(len(text_fragments))

17


In [136]:
def batch_embed(text):
    batch_result = embeddings.embed_query(text)
    return batch_result

In [137]:
vectors = []

for item in text_fragments:
    vectors.append(batch_embed(item))
    print(item)

Title: Whiskers' Midnight Adventure  
In the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat
named Whiskers. With fur as black as the night and eyes that shimmered like stars, 
Whiskers was known for his adventurous spirit.
One night, while his human family slept soundly, Whiskers heard a peculiar sound coming
from the kitchen. His ears perked up, and his paws silently carried him towards the source.
The moonlight streamed through the window, casting shadows that danced on th e walls as 
Whiskers crept closer.
As he peered around the corner, he saw a small mouse, its fur glistening under the moon’s
light. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece
of cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e.
But just as he leaped, the mouse scurried away with astonishing speed, disappearing
under the refrigerator. Whiskers, puzzled but intrigued, decided to wait. Minutes turned into
hours, a

In [138]:
print(vectors)

[[0.0019333917485302883, -0.00017494611497669918, -0.009536127939149399, 0.002947735950253168, -0.020235198423478314, 0.030805052552052497, 0.0026343875681165384, -0.03558603815720267, 0.0018607078394565688, -0.02203129692090425, 0.025713949555736193, 0.017405370942659756, 0.0023517278958486675, -0.008547626775746813, 0.011816787072163618, -0.012210894861739669, 0.012107522708458497, -0.011597119733041884, -0.0017314920657784708, -0.01774133253629944, -0.005104025097553081, 0.015169937766990346, 0.01036956947564631, -0.016591311161034097, 0.011978306352703767, 0.00940691158339467, 0.01719862663609741, -0.022974573989607252, 0.021682414157350397, 0.009613656821279626, 0.004283504730970338, -0.02075206198385201, 0.002547166877228075, -0.024757751317119027, -0.010376030991926005, 0.001217051574009659, 0.003850631569006562, -0.01255331704033644, 0.01274714110830723, -0.016681763075723714, 0.007714184773588597, 0.005068490483305203, -0.011758639944904642, -0.012152747734480693, 0.0039669258

In [139]:
ids = [f"id_{i}" for i in range(len(vectors))]
print(ids)

['id_0', 'id_1', 'id_2', 'id_3', 'id_4', 'id_5', 'id_6', 'id_7', 'id_8', 'id_9', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16']


In [140]:
data = list(zip(ids, vectors))

In [141]:
data

[('id_0',
  [0.0019333917485302883,
   -0.00017494611497669918,
   -0.009536127939149399,
   0.002947735950253168,
   -0.020235198423478314,
   0.030805052552052497,
   0.0026343875681165384,
   -0.03558603815720267,
   0.0018607078394565688,
   -0.02203129692090425,
   0.025713949555736193,
   0.017405370942659756,
   0.0023517278958486675,
   -0.008547626775746813,
   0.011816787072163618,
   -0.012210894861739669,
   0.012107522708458497,
   -0.011597119733041884,
   -0.0017314920657784708,
   -0.01774133253629944,
   -0.005104025097553081,
   0.015169937766990346,
   0.01036956947564631,
   -0.016591311161034097,
   0.011978306352703767,
   0.00940691158339467,
   0.01719862663609741,
   -0.022974573989607252,
   0.021682414157350397,
   0.009613656821279626,
   0.004283504730970338,
   -0.02075206198385201,
   0.002547166877228075,
   -0.024757751317119027,
   -0.010376030991926005,
   0.001217051574009659,
   0.003850631569006562,
   -0.01255331704033644,
   0.01274714110830723,


In [142]:
from pinecone import Pinecone

pc = Pinecone(api_key="")
index = pc.Index("langchain-test")

index.upsert(
  vectors=data,
  namespace="ns1"
)

{'upserted_count': 17}

In [9]:
user_question = "What is the story about?"

In [10]:
user_vector = embeddings.embed_query(user_question)
print(user_vector)

[0.024610956908560227, -0.032308990426663145, 0.0014339011186947524, -0.008191012392028229, -0.020553372307666927, 0.017443821996858164, -0.013626405946192292, -0.00022950317435928123, -0.01965590138617366, -0.011610253556570823, 0.016040732290614403, 0.012368680802373926, -0.005078301172190534, -0.01404354069855334, 0.022664328685090433, 0.012533007093682363, 0.034988767005608316, -0.01629354168298965, 0.00044834098740233067, -0.008709270156281266, -0.015686799886347167, -0.015610956696105538, -0.02365028270765051, 0.009372894345604968, -0.006241222328206866, 0.008740871795989489, 0.023056182684478482, -0.011287922792011813, 0.03964045162967365, -0.022664328685090433, 0.01146488899414543, -0.00585884872759902, -0.01087078710832813, 0.010042837559018625, -0.009372894345604968, -0.0025201897885348597, 0.016066012112264764, -0.008557584707120645, -0.0030115872766230695, -0.005716643677218603, 0.010453652355967081, -0.002006671525179948, -0.002880442613876004, -0.018859551613927106, -0.02

In [11]:
from pinecone import Pinecone

pc = Pinecone(api_key="")
index = pc.Index("langchain-test")

index.query(
    namespace="ns1",
    vector=user_vector,
    top_k=3,
    include_values=True
)

{'matches': [{'id': 'id_0',
              'score': 0.768978596,
              'values': [0.00193339179,
                         -0.000174946108,
                         -0.00953612756,
                         0.00294773607,
                         -0.0202351976,
                         0.0308050532,
                         0.00263438746,
                         -0.0355860367,
                         0.00186070788,
                         -0.022031296,
                         0.0257139504,
                         0.0174053703,
                         0.0023517278,
                         -0.00854762644,
                         0.0118167875,
                         -0.0122108953,
                         0.0121075232,
                         -0.0115971193,
                         -0.00173149211,
                         -0.0177413318,
                         -0.00510402489,
                         0.0151699381,
                         0.0103695691,
                   

# WIP

In [None]:
def batch_embedding_request(strings, model="text-embedding-ada-002", max_tokens=4096):
    try:
        # Create batch request for embeddings
        response = openai.embeddings.create(
            model=model,
            input=strings,
            max_tokens=max_tokens
        )
        # Extract embeddings
        embeddings = [embedding['embedding'] for embedding in response['data']]
        return embeddings
    except Exception as e:
        print("An error occurred:", e)
        return None

In [None]:
print(text_fragments)
test_vector = []

In [None]:
# Get embeddings and store them in a list
embeddings_list = batch_embedding_request(text_fragments)

# Convert list of embeddings to a NumPy array for further manipulation
embeddings_array = np.array(embeddings_list)

# Print embeddings
print("Embeddings Array:")
print(embeddings_array)