In [1]:
import os
from dotenv import load_dotenv, find_dotenv
import pinecone
from langchain.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
import openai
import numpy as np

  from tqdm.autonotebook import tqdm


In [2]:
load_dotenv(find_dotenv(), override=True)

True

In [3]:
# Retrieve the API key from environment variables
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

In [4]:
def load_document(file):
    nombre, extension = os.path.splitext(file) 
    if extension == '.html':
        from langchain.document_loaders import UnstructuredHTMLLoader
        print(f'load {file}...')
        loader = UnstructuredHTMLLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader  
        print(f'load {file}...')
        loader = TextLoader(file)
    elif extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'load {file}...')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'load {file}...')
        loader = Docx2txtLoader(file)
    else:
        print('The document format is not supported!')
        return None

    data = loader.load()
    return data

In [5]:
document = "../test.pdf"
content = load_document(document)
print(content)

load ../test.pdf...
[Document(page_content="Title: Whiskers' Midnight Adventure  \nIn the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat \nnamed Whiskers. With fur as black as the night and eyes that shimmered like stars, \nWhiskers was known for his adventurous spirit.  \nOne night, while his human family slept soundly, Whiskers heard a peculiar sound coming \nfrom the kitchen. His ears perked up, and his paws silently carried him towards the source. \nThe moonlight streamed through the window, casting shadows that danced on th e walls as \nWhiskers crept closer.  \nAs he peered around the corner, he saw a small mouse, its fur glistening under the moon’s \nlight. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece \nof cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e. \nBut just as he leaped, the mouse scurried away with astonishing speed, disappearing \nunder the refrigerator. Whiskers, p

In [6]:
# def dynamic_chunk_size(document_length, complexity_rating=None):
#     # Define thresholds for document length
#     if document_length < 5000:  # example threshold for characters
#         return 2000  # Larger chunks for shorter documents
#     elif document_length < 20000:
#         return 1500  # Moderate chunk size
#     else:
#         return 1000  # Smaller chunks for very long documents

# def split_document(document, complexity_rating=None):
#     document_length = len(document)  # Measure document length
#     chunk_size = dynamic_chunk_size(document_length, complexity_rating)
    
#     # Your existing splitting logic, adjusted for dynamic chunk size
#     from langchain.text_splitter import RecursiveCharacterTextSplitter
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
#     fragments = text_splitter.split_documents(document)
#     return fragments

In [7]:
def dynamic_chunk_size(document_length):
    """Determine dynamic chunk size based on document length."""
    if document_length < 1000:  # Assuming the story is around 500-1000 words
        return max(500, document_length)  # Use one chunk for very short texts or divide into manageable pieces
    else:
        return 700  # Larger chunks for slightly longer narratives

def split_document(document):
    """Split a document into chunks."""
    document_length = len(document)
    chunk_size = dynamic_chunk_size(document_length)
    
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=50)  # Slight overlap to prevent breaking sentences
    fragments = text_splitter.split_documents(document)
    return fragments

In [8]:
fragments = split_document(content)

In [9]:
len(fragments)

4

In [10]:
fragments

[Document(page_content="Title: Whiskers' Midnight Adventure  \nIn the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat \nnamed Whiskers. With fur as black as the night and eyes that shimmered like stars, \nWhiskers was known for his adventurous spirit.  \nOne night, while his human family slept soundly, Whiskers heard a peculiar sound coming \nfrom the kitchen. His ears perked up, and his paws silently carried him towards the source.", metadata={'source': '../test.pdf', 'page': 0}),
 Document(page_content='The moonlight streamed through the window, casting shadows that danced on th e walls as \nWhiskers crept closer.  \nAs he peered around the corner, he saw a small mouse, its fur glistening under the moon’s \nlight. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece \nof cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e. \nBut just as he leaped, the mouse scurried away with astonishing speed

In [11]:
# def split (data, chunk_size=1500):
#     from langchain.text_splitter import RecursiveCharacterTextSplitter
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
#     fragments = text_splitter.split_documents(data)
#     return fragments

In [12]:
# fragments = split(content)
# print(len(fragments))

In [13]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=openai_api_key)

In [14]:
# Assume `fragments` is a list of Document objects
text_fragments = [doc.page_content for doc in fragments]
print(len(text_fragments))

4


In [15]:
def batch_embed(text):
    batch_result = embeddings.embed_query(text)
    return batch_result

In [16]:
vectors = []
metadata = []

for item in text_fragments:
    # Assuming `batch_embed` function returns a vector for the item
    vector = batch_embed(item)
    vectors.append(vector)
    # Create metadata for each item; here, we just store the item text itself
    metadata.append({"content": item})
    print(item)

Title: Whiskers' Midnight Adventure  
In the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat 
named Whiskers. With fur as black as the night and eyes that shimmered like stars, 
Whiskers was known for his adventurous spirit.  
One night, while his human family slept soundly, Whiskers heard a peculiar sound coming 
from the kitchen. His ears perked up, and his paws silently carried him towards the source.
The moonlight streamed through the window, casting shadows that danced on th e walls as 
Whiskers crept closer.  
As he peered around the corner, he saw a small mouse, its fur glistening under the moon’s 
light. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece 
of cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e. 
But just as he leaped, the mouse scurried away with astonishing speed, disappearing
under the refrigerator. Whiskers, puzzled but intrigued, decided to wait. Minutes turned into

In [17]:
vectors

[[0.01833194412390035,
  -0.015852131852333997,
  -0.011012498916548707,
  -0.0018265282145688525,
  -0.012732368687753674,
  0.025504733957074737,
  0.00729944691656795,
  -0.03725050974450974,
  -0.00540292386934887,
  -0.0183452769386379,
  0.025584727120209514,
  0.014532232553416045,
  -0.013645632969661094,
  -0.0032930837025713627,
  0.0002876865201099239,
  -0.0036397240800614713,
  0.021065070562253835,
  -0.0049562908737870075,
  -0.0013473978800681592,
  -0.028557836773257832,
  -0.00843936066711003,
  0.017038709523134868,
  0.003909703566778564,
  -0.000755359403503019,
  0.004719642259760499,
  0.007619422828736249,
  0.02595803289434406,
  -0.01934520079104887,
  0.007899401228014531,
  0.007832739948294675,
  0.004229679478946862,
  -0.011859100987413643,
  -0.001554048941503798,
  -0.013198999508437918,
  -0.013312324708416562,
  0.003926369119539185,
  0.009999242249400192,
  -0.027651238898719187,
  0.007192788123958079,
  -0.024571472315706253,
  0.01128581137411953

In [18]:
ids = [f"id_{i}" for i in range(len(vectors))]

In [19]:
ids

['id_0', 'id_1', 'id_2', 'id_3']

In [20]:
# Combine ids, vectors, and metadata into the format Pinecone expects
data = [{"id": id, "values": vector, "metadata": meta} for id, vector, meta in zip(ids, vectors, metadata)]

In [21]:
data

[{'id': 'id_0',
  'values': [0.01833194412390035,
   -0.015852131852333997,
   -0.011012498916548707,
   -0.0018265282145688525,
   -0.012732368687753674,
   0.025504733957074737,
   0.00729944691656795,
   -0.03725050974450974,
   -0.00540292386934887,
   -0.0183452769386379,
   0.025584727120209514,
   0.014532232553416045,
   -0.013645632969661094,
   -0.0032930837025713627,
   0.0002876865201099239,
   -0.0036397240800614713,
   0.021065070562253835,
   -0.0049562908737870075,
   -0.0013473978800681592,
   -0.028557836773257832,
   -0.00843936066711003,
   0.017038709523134868,
   0.003909703566778564,
   -0.000755359403503019,
   0.004719642259760499,
   0.007619422828736249,
   0.02595803289434406,
   -0.01934520079104887,
   0.007899401228014531,
   0.007832739948294675,
   0.004229679478946862,
   -0.011859100987413643,
   -0.001554048941503798,
   -0.013198999508437918,
   -0.013312324708416562,
   0.003926369119539185,
   0.009999242249400192,
   -0.027651238898719187,
   0.0

In [22]:
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("langchain-index")

index.upsert(
  vectors=data,
  namespace="ns1"
)

{'upserted_count': 4}

In [32]:
user_question = "What is the ending of Whisker's story?"

In [33]:
user_vector = embeddings.embed_query(user_question)
print(user_vector)

[0.0032966352481828796, -0.01797857513739633, -0.00283703249321907, 0.004819069039306482, -0.01261203702515786, 0.021290417484711322, 0.0006226602083481478, -0.029468641450354747, 0.005873451350748651, -0.009374542151506493, 0.04387853598925224, 0.02771133651140824, -0.005714618026892505, 0.006532440702853592, -0.0013737388437963579, -0.006488507893115432, 0.02738691032512836, 0.007914628038536953, 0.0039268988896817725, -0.032902144316016554, -0.008502649220600825, 0.0061201498284198744, -0.018640942861801346, -0.0035348850011364787, 0.013308200579755854, -0.005839657383200633, 0.009529995706078844, -0.021817607476279303, 0.008441818961427417, -0.002274357252477451, 0.010219399722109248, 0.02007382161446797, -0.006079596787965509, -0.014301753097685855, -0.01873556708852277, -0.0031242842441752783, 0.017316205550346356, -0.0251700033297099, 0.005045490763919903, -0.01274721475799489, 0.01838410647326246, -0.0124295481102826, -0.01431527124349855, -0.009212329989689032, -0.023520841694

In [34]:
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("langchain-index")

response = index.query(
    namespace="ns1",
    vector=user_vector,
    top_k=5,
    include_values=True,
    include_metadata=True
)

In [35]:
response.matches[0]

{'id': 'id_3',
 'metadata': {'content': 'And so, Whiskers’ midnight adventure ended not with '
                         'a feast, but with a new respect for the \n'
                         'clever creature that shared his home. From that '
                         'night on, whenever Whiskers went on his \n'
                         'nightly prowls, he always kept an eye out for his '
                         'little friend, hoping perh aps to meet again \n'
                         'under the moonlit sky.'},
 'score': 0.880823433,
 'values': [0.0222122595,
            -0.0208189208,
            0.00461628102,
            -0.0017315282,
            0.00170278212,
            0.0204401482,
            0.0126820914,
            -0.0188033134,
            -0.00561393891,
            -0.0158813596,
            0.0233215205,
            0.00740634138,
            0.00643235678,
            -0.00468391879,
            -0.00756867183,
            -0.00224219379,
            0.0254588742,
   

In [36]:
# Extract the matches and their documents from the query response
matches = response['matches']
documents = [match['metadata']['content'] for match in matches]  # Adjust according to your metadata structure

# Initialize an empty string to store the formatted document descriptions
formatted_documents = ""

for i, doc in enumerate(documents, 1):
    formatted_documents += f"Chunk Reference {i}: {doc}\n"  # Adding a newline for better readability

In [37]:
formatted_documents

"Chunk Reference 1: And so, Whiskers’ midnight adventure ended not with a feast, but with a new respect for the \nclever creature that shared his home. From that night on, whenever Whiskers went on his \nnightly prowls, he always kept an eye out for his little friend, hoping perh aps to meet again \nunder the moonlit sky.\nChunk Reference 2: Title: Whiskers' Midnight Adventure  \nIn the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat \nnamed Whiskers. With fur as black as the night and eyes that shimmered like stars, \nWhiskers was known for his adventurous spirit.  \nOne night, while his human family slept soundly, Whiskers heard a peculiar sound coming \nfrom the kitchen. His ears perked up, and his paws silently carried him towards the source.\nChunk Reference 3: under the refrigerator. Whiskers, puzzled but intrigued, decided to wait. Minutes turned into \nhours, and still, the mouse did not reappear. As the first light of dawn crept  into the 

In [38]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

# Prompt Generation

prompt = PromptTemplate(
    input_variables = ["formatted_documents", "user_question"],
    template = '''Based on this reference below: 
    
{formatted_documents}

Answer the user question: {user_question}
    '''
)

print(prompt.format(formatted_documents = formatted_documents, user_question = user_question))

Based on this reference below: 
    
Chunk Reference 1: And so, Whiskers’ midnight adventure ended not with a feast, but with a new respect for the 
clever creature that shared his home. From that night on, whenever Whiskers went on his 
nightly prowls, he always kept an eye out for his little friend, hoping perh aps to meet again 
under the moonlit sky.
Chunk Reference 2: Title: Whiskers' Midnight Adventure  
In the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat 
named Whiskers. With fur as black as the night and eyes that shimmered like stars, 
Whiskers was known for his adventurous spirit.  
One night, while his human family slept soundly, Whiskers heard a peculiar sound coming 
from the kitchen. His ears perked up, and his paws silently carried him towards the source.
Chunk Reference 3: under the refrigerator. Whiskers, puzzled but intrigued, decided to wait. Minutes turned into 
hours, and still, the mouse did not reappear. As the first light

In [39]:
# OpenAI API

chatopenai = ChatOpenAI(model_name = "gpt-3.5-turbo")
llmchain_chat = LLMChain(llm = chatopenai, prompt = prompt)
llmchain_chat.run({"formatted_documents": formatted_documents, "user_question": user_question})

"The ending of Whiskers' story is that his midnight adventure ended with a new respect for the clever mouse that shared his home. Whiskers always kept an eye out for his little friend whenever he went on his nightly prowls, hoping to meet again under the moonlit sky."

# WIP Batch Embeddings

In [None]:
def batch_embedding_request(strings, model="text-embedding-ada-002", max_tokens=4096):
    try:
        # Create batch request for embeddings
        response = openai.embeddings.create(
            model=model,
            input=strings,
            max_tokens=max_tokens
        )
        # Extract embeddings
        embeddings = [embedding['embedding'] for embedding in response['data']]
        return embeddings
    except Exception as e:
        print("An error occurred:", e)
        return None

In [None]:
print(text_fragments)
test_vector = []

In [None]:
# Get embeddings and store them in a list
embeddings_list = batch_embedding_request(text_fragments)

# Convert list of embeddings to a NumPy array for further manipulation
embeddings_array = np.array(embeddings_list)

# Print embeddings
print("Embeddings Array:")
print(embeddings_array)