In [22]:
import os
from dotenv import load_dotenv, find_dotenv
import pinecone
from langchain.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
import openai
import numpy as np

In [23]:
# Load environment variables from the .env file located by find_dotenv();
# override=True lets values from that file replace variables already set.
load_dotenv(find_dotenv(), override=True)

True

In [24]:
def load_document(file):
    """Load a document with the LangChain loader matching its file extension.

    Supported extensions, matched case-insensitively: ``.html``, ``.txt``,
    ``.pdf`` and ``.docx``.

    Args:
        file: Path of the document to load.

    Returns:
        The value returned by the selected loader's ``load()``, or ``None``
        (after printing a message) when the extension is not supported.
    """
    _, extension = os.path.splitext(file)
    # Normalise the case so that e.g. "REPORT.PDF" is handled like "report.pdf".
    extension = extension.lower()

    # Each loader is imported lazily so that only the branch actually taken
    # needs its import to succeed.
    if extension == '.html':
        from langchain.document_loaders import UnstructuredHTMLLoader as loader_cls
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader as loader_cls
    elif extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader as loader_cls
    else:
        print('The document format is not supported!')
        return None

    print(f'load {file}...')
    loader = loader_cls(file)
    return loader.load()

In [25]:
# Path of the sample document to index.
document = "../test.pdf"
# load_document picks the loader from the file extension.
content = load_document(document)
print(content)

load ../test.pdf...
[Document(page_content="Title: Whiskers' Midnight Adventure  \nIn the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat \nnamed Whiskers. With fur as black as the night and eyes that shimmered like stars, \nWhiskers was known for his adventurous spirit.  \nOne night, while his human family slept soundly, Whiskers heard a peculiar sound coming \nfrom the kitchen. His ears perked up, and his paws silently carried him towards the source. \nThe moonlight streamed through the window, casting shadows that danced on th e walls as \nWhiskers crept closer.  \nAs he peered around the corner, he saw a small mouse, its fur glistening under the moon’s \nlight. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece \nof cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e. \nBut just as he leaped, the mouse scurried away with astonishing speed, disappearing \nunder the refrigerator. Whiskers, p

In [26]:
def split(data, chunk_size=150, chunk_overlap=20):
    """Split loaded documents into smaller fragments.

    Args:
        data: Documents to split, passed to the splitter's
            ``split_documents``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        The fragments returned by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [27]:
# Split the loaded document using split()'s default settings.
fragments = split(content)
print(len(fragments))

17


In [28]:
# Shared embeddings client used for both the document fragments and the question.
# The key is read from the environment (populated from .env by load_dotenv)
# instead of being written into the notebook as a literal.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [29]:
# Assume `fragments` is a list of Document objects
# Keep only the text of each fragment; this is what gets embedded.
text_fragments = [doc.page_content for doc in fragments]
print(len(text_fragments))

17


In [30]:
def batch_embed(text):
    """Return the embedding of ``text`` from the module-level ``embeddings`` client.

    Despite the name, one text is embedded per call (``embed_query``).
    """
    return embeddings.embed_query(text)

In [31]:
# Embed all fragments through the client's batch method instead of issuing
# one embed_query request per fragment.
vectors = embeddings.embed_documents(text_fragments)

# Create metadata for each item; here, we just store the item text itself
# so that query matches can be turned back into readable text.
metadata = [{"content": item} for item in text_fragments]

for item in text_fragments:
    print(item)

Title: Whiskers' Midnight Adventure  
In the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat
named Whiskers. With fur as black as the night and eyes that shimmered like stars, 
Whiskers was known for his adventurous spirit.
One night, while his human family slept soundly, Whiskers heard a peculiar sound coming
from the kitchen. His ears perked up, and his paws silently carried him towards the source.
The moonlight streamed through the window, casting shadows that danced on th e walls as 
Whiskers crept closer.
As he peered around the corner, he saw a small mouse, its fur glistening under the moon’s
light. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece
of cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e.
But just as he leaped, the mouse scurried away with astonishing speed, disappearing
under the refrigerator. Whiskers, puzzled but intrigued, decided to wait. Minutes turned into
hours, a

In [32]:
# Inspect the embedding vectors (displays the full nested list).
vectors

[[0.0019333917485302883,
  -0.00017494611497669918,
  -0.009536127939149399,
  0.002947735950253168,
  -0.020235198423478314,
  0.030805052552052497,
  0.0026343875681165384,
  -0.03558603815720267,
  0.0018607078394565688,
  -0.02203129692090425,
  0.025713949555736193,
  0.017405370942659756,
  0.0023517278958486675,
  -0.008547626775746813,
  0.011816787072163618,
  -0.012210894861739669,
  0.012107522708458497,
  -0.011597119733041884,
  -0.0017314920657784708,
  -0.01774133253629944,
  -0.005104025097553081,
  0.015169937766990346,
  0.01036956947564631,
  -0.016591311161034097,
  0.011978306352703767,
  0.00940691158339467,
  0.01719862663609741,
  -0.022974573989607252,
  0.021682414157350397,
  0.009613656821279626,
  0.004283504730970338,
  -0.02075206198385201,
  0.002547166877228075,
  -0.024757751317119027,
  -0.010376030991926005,
  0.001217051574009659,
  0.003850631569006562,
  -0.01255331704033644,
  0.01274714110830723,
  -0.016681763075723714,
  0.007714184773588597,


In [33]:
# One string id per vector, numbered from zero: id_0, id_1, ...
ids = ["id_" + str(position) for position, _ in enumerate(vectors)]

In [34]:
# Inspect the generated ids.
ids

['id_0',
 'id_1',
 'id_2',
 'id_3',
 'id_4',
 'id_5',
 'id_6',
 'id_7',
 'id_8',
 'id_9',
 'id_10',
 'id_11',
 'id_12',
 'id_13',
 'id_14',
 'id_15',
 'id_16']

In [35]:
# Zip ids, vectors and metadata into one record per fragment, in the shape
# that is passed to index.upsert.
data = [
    {"id": record_id, "values": embedding, "metadata": meta}
    for record_id, embedding, meta in zip(ids, vectors, metadata)
]

In [36]:
# Inspect the assembled records (ids, vectors and metadata).
data

[{'id': 'id_0',
  'values': [0.0019333917485302883,
   -0.00017494611497669918,
   -0.009536127939149399,
   0.002947735950253168,
   -0.020235198423478314,
   0.030805052552052497,
   0.0026343875681165384,
   -0.03558603815720267,
   0.0018607078394565688,
   -0.02203129692090425,
   0.025713949555736193,
   0.017405370942659756,
   0.0023517278958486675,
   -0.008547626775746813,
   0.011816787072163618,
   -0.012210894861739669,
   0.012107522708458497,
   -0.011597119733041884,
   -0.0017314920657784708,
   -0.01774133253629944,
   -0.005104025097553081,
   0.015169937766990346,
   0.01036956947564631,
   -0.016591311161034097,
   0.011978306352703767,
   0.00940691158339467,
   0.01719862663609741,
   -0.022974573989607252,
   0.021682414157350397,
   0.009613656821279626,
   0.004283504730970338,
   -0.02075206198385201,
   0.002547166877228075,
   -0.024757751317119027,
   -0.010376030991926005,
   0.001217051574009659,
   0.003850631569006562,
   -0.01255331704033644,
   0.012

In [37]:
# NOTE: this rebinds the name `Pinecone`, which the first cell also imports
# from langchain.vectorstores; from here on it refers to the Pinecone client.
from pinecone import Pinecone

# Read the API key from the environment (populated from .env by load_dotenv)
# rather than passing a literal; a missing variable fails with a KeyError.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("langchain-test")

# Store every fragment record in the "ns1" namespace of the index.
index.upsert(
  vectors=data,
  namespace="ns1"
)

{'upserted_count': 17}

In [38]:
# Question to answer from the indexed document.
# NOTE(review): the text looks misspelled ("Whos", "Whisker"); it is left
# untouched because this exact string is what gets embedded and sent to the LLM.
user_question = "Whos is Whisker?"

In [39]:
# Embed the question with the same embeddings client used for the fragments.
user_vector = embeddings.embed_query(user_question)
print(user_vector)

[-0.003449627360314288, -0.02702716864937226, -0.007821868541185705, 0.011308807479559407, -0.005098121945864569, 0.008907297113207396, 0.004541839686288124, -0.016607056220609262, 0.005074377875709444, -0.031748784334650546, 0.020867361735979816, 0.00012889462837565994, -0.019415601137316132, 0.012468859207699926, -0.008893729605303108, -0.015277405987052035, 0.018275902533677283, 0.010365841465823226, 0.0041856832905743475, -0.018506555755985914, -0.009816343425860236, -0.004585935018299682, -0.028926668184748913, 0.002389638564105103, -0.009544986748516122, 0.013377905520352765, 0.019117108512840823, -0.004606287211478735, -0.006824631564509748, -0.00913116658156388, 0.011932928675641224, -0.0020504422517636516, -0.006794103740402479, -0.028302547919989714, -0.023703043531140507, 0.008439205983315379, 0.020989473032408893, -0.02366234100742764, 0.0004871708162322386, -0.012150014390045562, 0.027461340078180938, -0.0015492795064476373, -0.0012278908974765474, -0.015304541002860612, -

In [50]:
from pinecone import Pinecone

# SECURITY: an API key used to be written here as a string literal. Credentials
# must not live in the notebook; the key is now read from the environment
# (populated from .env by load_dotenv). The previously committed key should be
# treated as compromised and rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("langchain-test")

# Fetch the 3 closest stored fragments to the question vector, including
# their metadata (the fragment text) and their vector values.
response = index.query(
    namespace="ns1",
    vector=user_vector,
    top_k=3,
    include_values=True,
    include_metadata=True
)

In [51]:
# Show the first entry of the query response's matches (presumably the best hit).
response.matches[0]

{'id': 'id_1',
 'metadata': {'content': 'named Whiskers. With fur as black as the night and '
                         'eyes that shimmered like stars, \n'
                         'Whiskers was known for his adventurous spirit.'},
 'score': 0.872714,
 'values': [-0.00336470292,
            -0.0246689115,
            -0.0033630298,
            -0.0061538429,
            -0.00821850821,
            0.0156606752,
            0.00803111587,
            -0.0221391097,
            0.00739531917,
            -0.0357117,
            0.0036106559,
            0.00357384677,
            0.00166227377,
            -0.000419960445,
            -0.0175613742,
            0.00018352356,
            0.0288584772,
            0.00791064836,
            0.000641234452,
            -0.0213895403,
            -0.0168118011,
            0.0066758655,
            -0.00736185629,
            0.00871376134,
            -0.0142820012,
            0.00460785301,
            0.0239059553,
            -0.015138

In [55]:
# Pull the stored fragment text out of every match in the query response.
matches = response['matches']
documents = [hit['metadata']['content'] for hit in matches]  # depends on the metadata layout used at upsert time

# Label the chunks starting from 1, one per line, and concatenate them into
# the single string that is inserted into the prompt.
numbered_chunks = [
    f"Chunk Reference {position}: {chunk}\n"
    for position, chunk in enumerate(documents, 1)
]
formatted_documents = "".join(numbered_chunks)

In [56]:
# Inspect the context string that will be filled into the prompt.
formatted_documents

"Chunk Reference 1: named Whiskers. With fur as black as the night and eyes that shimmered like stars, \nWhiskers was known for his adventurous spirit.\nChunk Reference 2: Title: Whiskers' Midnight Adventure  \nIn the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat\nChunk Reference 3: of cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e.\n"

In [63]:
from langchain.prompts import PromptTemplate
# ChatOpenAI comes from langchain_openai, the package this notebook already
# uses for OpenAIEmbeddings, instead of the deprecated langchain.chat_models.
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain

# Prompt Generation
# The template has two placeholders: the retrieved chunks and the user's question.

prompt = PromptTemplate(
    input_variables = ["formatted_documents", "user_question"],
    template = '''Based on this reference below: 
    
{formatted_documents}

Answer the user question: {user_question}
    '''
)

# Render the prompt once so the exact text built from the template can be checked.
print(prompt.format(formatted_documents = formatted_documents, user_question = user_question))

Based on this reference below: 
    
Chunk Reference 1: named Whiskers. With fur as black as the night and eyes that shimmered like stars, 
Whiskers was known for his adventurous spirit.
Chunk Reference 2: Title: Whiskers' Midnight Adventure  
In the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat
Chunk Reference 3: of cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e.


Answer the user question: Whos is Whisker?
    


In [66]:
# OpenAI API
# Run the prompt through gpt-3.5-turbo with the retrieved chunks and the question.

chatopenai = ChatOpenAI(model_name = "gpt-3.5-turbo")
llmchain_chat = LLMChain(llm = chatopenai, prompt = prompt)
llmchain_chat.run({"formatted_documents": formatted_documents, "user_question": user_question})

'Whiskers is a curious and adventurous cat with black fur and shimmering eyes, known for his adventurous spirit. He lives in the quiet town of Meadowville and is always ready for a midnight adventure.'

# WIP

In [None]:
def batch_embedding_request(strings, model="text-embedding-ada-002", max_tokens=4096):
    """Request embeddings for several strings in one OpenAI API call.

    Args:
        strings: A single string or an iterable of strings to embed.
        model: Name of the embedding model.
        max_tokens: Ignored. Kept only so existing callers that pass it keep
            working; it is not forwarded because the embeddings endpoint does
            not take such an argument.

    Returns:
        A list with one embedding per input, in input order; ``[]`` when no
        strings are given (no request is made); ``None`` when the request
        fails, in which case the error is printed.
    """
    try:
        texts = [strings] if isinstance(strings, str) else list(strings)
        if not texts:
            # Nothing to embed, so skip the API call entirely.
            return []

        # Create batch request for embeddings
        response = openai.embeddings.create(model=model, input=texts)

        # The response is an object, not a dict: use attribute access. Sort by
        # each item's index so the result lines up with the input order.
        ordered = sorted(response.data, key=lambda item: item.index)
        return [item.embedding for item in ordered]
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print("An error occurred:", e)
        return None

In [None]:
# Show the fragment texts that the batch request cell will embed.
print(text_fragments)
# NOTE(review): test_vector is not used by any other cell visible in this notebook.
test_vector = []

In [None]:
# Get embeddings and store them in a list
embeddings_list = batch_embedding_request(text_fragments)

# Convert list of embeddings to a NumPy array for further manipulation
embeddings_array = np.array(embeddings_list)

# Print embeddings
print("Embeddings Array:")
print(embeddings_array)