In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get the OpenAI API key. Later cells pass this value to the OpenAI clients,
# so stop here with a clear message if it is missing or empty instead of
# letting None flow downstream and fail inside a client call.
openai_key = os.getenv('OPENAI_API_KEY')
if not openai_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Do not print the key: notebook outputs are saved with the file and would
# leak the secret.

In [9]:
import openai
from rich import print

def generate_embedding(text, model="text-embedding-ada-002", api_key=openai_key):
    """Request an embedding for ``text`` from OpenAI's embeddings endpoint.

    Args:
        text: Input forwarded unchanged as the ``input`` of the request.
        model: Name of the embedding model to use.
        api_key: Key used to build the client. The default is the value
            ``openai_key`` held when this function was defined.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data``; any further entries are ignored.
    """
    # A new client is built on every call.
    embeddings_api = openai.OpenAI(api_key=api_key).embeddings
    result = embeddings_api.create(model=model, input=text)
    first_entry = result.data[0]
    return first_entry.embedding

#print(generate_embedding("Hello, world!"))

In [10]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Parse only the elements carrying the post's title, header, and content
# classes; the rest of the page's HTML is not kept.
kept_classes = ("post-title", "post-header", "post-content")
bs4_strainer = bs4.SoupStrainer(class_=kept_classes)

loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(parse_only=bs4_strainer),
)
docs = loader.load()

# One URL was requested, so exactly one document is expected back.
assert len(docs) == 1
#print(f"Total characters: {len(docs[0].page_content)}")
#print(docs)

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings, gathered in one place so they are easy to tune.
splitter_options = dict(
    chunk_size=1000,       # chunk size (characters)
    chunk_overlap=200,     # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents into sub-documents.
all_splits = text_splitter.split_documents(docs)

#print(f"Split blog post into {len(all_splits)} sub-documents.")

In [12]:
from langchain_openai import OpenAIEmbeddings

# Embeddings object handed to the vector store in a later cell; it is
# configured with the API key read from the environment.
embeddings = OpenAIEmbeddings(
    api_key=openai_key,
    model="text-embedding-3-large",
)

In [13]:
from langchain_core.vectorstores import InMemoryVectorStore

# Vector store constructed with the `embeddings` object defined earlier.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [14]:
# Add every chunk to the vector store and keep what the call returns
# (presumably the ids of the stored documents; confirm against the store's API).
document_ids = vector_store.add_documents(all_splits)

In [None]:
from langchain import hub

# Fetch the "rlm/rag-prompt" prompt through the hub.
prompt = hub.pull("rlm/rag-prompt")

# Fill the prompt with placeholder values to inspect the messages it produces.
placeholder_inputs = {
    "context": "(context goes here)",
    "question": "(question goes here)",
}
example_messages = prompt.invoke(placeholder_inputs).to_messages()

# This prompt is expected to render to exactly one message.
assert len(example_messages) == 1
print(example_messages[0].content)

In [None]:
# Import the OpenAI completion LLM from `langchain_openai` (the package this
# notebook already uses for embeddings) instead of the deprecated
# `langchain.llms` path, which newer LangChain releases no longer provide.
from langchain_openai import OpenAI

llm = OpenAI(api_key=openai_key)

question = "what is this post about?"

# Look up the stored chunks most similar to the question and join their text
# into a single context string, separated by blank lines.
retrieved_docs = vector_store.similarity_search(question)
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

# Fill the pulled prompt with the question and retrieved context, then ask the LLM.
promptAnswer = prompt.invoke({"question": question, "context": docs_content})
answer = llm.invoke(promptAnswer)
print(answer)