In [1]:
import os
from dotenv import load_dotenv, find_dotenv
import pinecone
from langchain.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
import openai
import numpy as np

  from tqdm.autonotebook import tqdm


In [2]:
# Locate a .env file and load its variables into the process environment;
# override=True lets values from the file replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [3]:
# Read the service credentials from the process environment.
# Each name is None when its variable is not set.
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [4]:
def load_document(file):
    """Load a document with the LangChain loader that matches its file extension.

    Supported extensions, matched case-insensitively: ``.html``, ``.txt``,
    ``.pdf`` and ``.docx``.

    Args:
        file: Path of the document to load.

    Returns:
        The value returned by the selected loader's ``load()`` method, or
        ``None`` (after printing a message) when the extension is not supported.
    """
    # Lower-case the extension so that e.g. "REPORT.PDF" is treated like "report.pdf".
    extension = os.path.splitext(file)[1].lower()

    # Each loader is imported lazily so only the dependency needed for the
    # requested format has to be importable.
    if extension == '.html':
        from langchain.document_loaders import UnstructuredHTMLLoader as loader_cls
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader as loader_cls
    elif extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader as loader_cls
    else:
        print('The document format is not supported!')
        return None

    print(f'load {file}...')
    return loader_cls(file).load()

In [5]:
# Load the sample PDF (relative path) through load_document and print
# everything the loader returned.
document = "../test.pdf"
content = load_document(document)
print(content)

load ../test.pdf...
[Document(page_content="Title: Whiskers' Midnight Adventure  \nIn the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat \nnamed Whiskers. With fur as black as the night and eyes that shimmered like stars, \nWhiskers was known for his adventurous spirit.  \nOne night, while his human family slept soundly, Whiskers heard a peculiar sound coming \nfrom the kitchen. His ears perked up, and his paws silently carried him towards the source. \nThe moonlight streamed through the window, casting shadows that danced on th e walls as \nWhiskers crept closer.  \nAs he peered around the corner, he saw a small mouse, its fur glistening under the moon’s \nlight. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece \nof cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e. \nBut just as he leaped, the mouse scurried away with astonishing speed, disappearing \nunder the refrigerator. Whiskers, p

In [10]:
def dynamic_chunk_size(document_length, complexity_rating=None):
    """Choose a splitter chunk size from the length of the document.

    Args:
        document_length: Size of the document (intended as a character count).
        complexity_rating: Accepted for interface compatibility; the current
            logic does not use it.

    Returns:
        2000 when ``document_length`` is below 5000, 1500 when it is below
        20000, and 1000 otherwise.
    """
    # (exclusive upper bound, chunk size) pairs, checked in ascending order.
    tiers = ((5000, 2000), (20000, 1500))
    for upper_bound, size in tiers:
        if document_length < upper_bound:
            return size
    # Anything at or above the last bound gets the smallest chunks.
    return 1000

def split_document(document, complexity_rating=None):
    """Split loaded documents into chunks whose size depends on the total text length.

    Args:
        document: Iterable of Document-like objects exposing ``page_content``
            (what a LangChain loader's ``load()`` returns).
        complexity_rating: Forwarded unchanged to ``dynamic_chunk_size``.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # `document` is a list of Document objects, so len(document) would count
    # pages rather than characters and the length thresholds would never be
    # reached. Measure the actual amount of text instead.
    document_length = sum(len(doc.page_content) for doc in document)
    chunk_size = dynamic_chunk_size(document_length, complexity_rating)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
    return text_splitter.split_documents(document)

In [11]:
# Split the loaded document into chunks; the result is reused by the embedding cells below.
fragments = split_document(content)

In [12]:
# Number of chunks produced by the splitter.
len(fragments)

1

In [13]:
# Display the chunk list for inspection.
fragments

[Document(page_content="Title: Whiskers' Midnight Adventure  \nIn the quiet town of Meadowville, under the glow of a silver moon, there lived a curious cat \nnamed Whiskers. With fur as black as the night and eyes that shimmered like stars, \nWhiskers was known for his adventurous spirit.  \nOne night, while his human family slept soundly, Whiskers heard a peculiar sound coming \nfrom the kitchen. His ears perked up, and his paws silently carried him towards the source. \nThe moonlight streamed through the window, casting shadows that danced on th e walls as \nWhiskers crept closer.  \nAs he peered around the corner, he saw a small mouse, its fur glistening under the moon’s \nlight. The mouse, seemingly unaware of the cat’s presence, continued nibbling on a piece \nof cheese it had found. Whiskers, with a flick of his tail, prepared to pounc e. \nBut just as he leaped, the mouse scurried away with astonishing speed, disappearing \nunder the refrigerator. Whiskers, puzzled but intrigued

In [None]:
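# Earlier fixed-chunk-size splitter, left commented out for reference;
# split_document() defined above is used instead.
# def split (data, chunk_size=1500):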
#     from langchain.text_splitter import RecursiveCharacterTextSplitter
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
#     fragments = text_splitter.split_documents(data)
#     return fragments

In [None]:
# fragments = split(content)
# print(len(fragments))

In [None]:
# Embedding client shared by the cells below (used for both chunk and question embeddings).
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=openai_api_key)

In [None]:
# Keep only the raw text of each chunk; `fragments` is expected to hold Document objects.
text_fragments = [fragment.page_content for fragment in fragments]
print(len(text_fragments))

In [None]:
def batch_embed(text):
    """Embed a single piece of text with the notebook-level ``embeddings`` client.

    Despite the name, one text is embedded per call.

    Args:
        text: The text to embed.

    Returns:
        Whatever ``embeddings.embed_query(text)`` returns.
    """
    return embeddings.embed_query(text)

In [None]:
# Embed every chunk, keeping each vector and its metadata at matching positions.
vectors, metadata = [], []

for chunk_text in text_fragments:
    vectors.append(batch_embed(chunk_text))
    # The chunk text itself is stored as metadata so it can be read back from query results.
    metadata.append({"content": chunk_text})
    print(chunk_text)

In [None]:
# Display the embedding vectors collected above.
vectors

In [None]:
# One positional id ("id_0", "id_1", ...) per embedded chunk.
ids = [f"id_{position}" for position in range(len(vectors))]

In [None]:
# Display the generated ids.
ids

In [None]:
# Combine ids, vectors, and metadata into the format Pinecone expects
# Build one upsert record per chunk from the parallel ids / vectors / metadata lists.
data = [
    {"id": record_id, "values": values, "metadata": meta}
    for record_id, values, meta in zip(ids, vectors, metadata)
]

In [None]:
# Display the records that will be upserted.
data

In [None]:
# NOTE: this import rebinds the name `Pinecone`, replacing the LangChain
# vector-store class imported under the same name in the first cell.
from pinecone import Pinecone

# Create a client with the key read from the environment and get a handle on
# the "langchain-test" index.
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("langchain-test")

# Write the prepared records into namespace "ns1" of that index.
index.upsert(
  vectors=data,
  namespace="ns1"
)

In [None]:
# Question that is embedded, used for retrieval, and later inserted into the prompt.
user_question = "How is Whisker's described as?"

In [None]:
# Embed the question with the same client used for the chunks, then print the raw vector.
user_vector = embeddings.embed_query(user_question)
print(user_vector)

In [None]:
# Creates a fresh client and index handle, using the same index name as the upsert cell.
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("langchain-test")

# Query namespace "ns1" with the question vector, asking for up to 5 matches
# with both their stored vector values and their metadata included.
response = index.query(
    namespace="ns1",
    vector=user_vector,
    top_k=5,
    include_values=True,
    include_metadata=True
)

In [None]:
# Inspect the first match of the query response (raises IndexError if there are no matches).
response.matches[0]

In [None]:
# Pull the stored chunk text out of every match returned by the query.
# The 'content' key mirrors the metadata written at upsert time.
matches = response['matches']
documents = [hit['metadata']['content'] for hit in matches]

# Render the chunks as numbered "Chunk Reference N: ..." entries, one per line.
formatted_documents = "".join(
    f"Chunk Reference {rank}: {chunk}\n" for rank, chunk in enumerate(documents, 1)
)

In [None]:
# Display the reference text that will be inserted into the prompt.
formatted_documents

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

# Prompt Generation

# Template with two placeholders: the retrieved chunk text and the user's question.
prompt = PromptTemplate(
    input_variables = ["formatted_documents", "user_question"],
    template = '''Based on this reference below: 
    
{formatted_documents}

Answer the user question: {user_question}
    '''
)

# Preview the fully rendered prompt before it is sent to the model.
print(prompt.format(formatted_documents = formatted_documents, user_question = user_question))

In [None]:
# OpenAI API

# NOTE(review): no API key is passed to ChatOpenAI here — presumably it is picked up
# from the OPENAI_API_KEY environment variable loaded earlier; confirm.
chatopenai = ChatOpenAI(model_name = "gpt-3.5-turbo")
# Chain the prompt template with the chat model and run it on the retrieved
# references and the question; the result is the cell's displayed value.
llmchain_chat = LLMChain(llm = chatopenai, prompt = prompt)
llmchain_chat.run({"formatted_documents": formatted_documents, "user_question": user_question})

# WIP

In [None]:
def batch_embedding_request(strings, model="text-embedding-ada-002", max_tokens=4096):
    """Request embeddings for several strings in a single OpenAI API call.

    Args:
        strings: The texts to embed (a list of strings).
        model: Name of the OpenAI embedding model to use.
        max_tokens: Unused. Kept only so existing callers that pass it keep
            working; the embeddings endpoint takes no such argument, and
            forwarding it made every request fail.

    Returns:
        A list with one embedding per item in the response's ``data``,
        an empty list when ``strings`` is empty (no request is made), or
        ``None`` when the request fails (the error is printed).
    """
    # Nothing to embed: skip the API call entirely.
    if not strings:
        return []

    try:
        # Create batch request for embeddings
        response = openai.embeddings.create(model=model, input=strings)
        # The v1 client returns typed objects, not dicts, so the embeddings
        # are read through attributes rather than subscripts.
        return [item.embedding for item in response.data]
    except Exception as e:
        # Best-effort helper: report the failure and let the caller handle None.
        print("An error occurred:", e)
        return None

In [None]:
# Show the chunk texts that are about to be sent in the batch request.
print(text_fragments)
# NOTE(review): test_vector is not referenced by any later cell visible here.
test_vector = []

In [None]:
# Get embeddings and store them in a list
# NOTE(review): batch_embedding_request returns None when the request fails;
# np.array(None) would then produce a 0-d object array instead of raising.
embeddings_list = batch_embedding_request(text_fragments)

# Convert list of embeddings to a NumPy array for further manipulation
embeddings_array = np.array(embeddings_list)

# Print embeddings
print("Embeddings Array:")
print(embeddings_array)