# Vector testing

### Install needed packages

In [85]:
%pip install --upgrade --quiet langchain langchain-community langchain-openai python-dotenv requests

Note: you may need to restart the kernel to use updated packages.


In [1]:
llm_model_name = 'gpt-4o'
embeddings_model_name = 'text-embedding-ada-002'  
vectorstore_index_name = 'test1'
vectorstore_text_field = 'text'

### Load API keys into environment variables

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

### Split text into chunks

In [3]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

def split_text(text, max_chunk_size=1000, chunk_overlap=200):
    # print(f'Splitting text of length {len(text)} into chunks of size {max_chunk_size} with overlap {chunk_overlap}')
    r_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size, chunk_overlap=chunk_overlap, separators=["\n\n", "\n", " ", ""]
    )

    chunks = r_splitter.split_text(text)
    return chunks

### Generate embeddings

In [4]:
import os
from langchain.embeddings import OpenAIEmbeddings

# embeddings = OpenAIEmbeddings( model=embeddings_model_name, openai_api_key=os.getenv('OPENAPI_API_KEY') )
embeddings = OpenAIEmbeddings( openai_api_key=os.getenv('OPENAPI_API_KEY') )

  embeddings = OpenAIEmbeddings( openai_api_key=os.getenv('OPENAPI_API_KEY') )


#### Define Pinecone vectorstore index

In [5]:
import os
from pinecone import Pinecone

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index(vectorstore_index_name)


  from tqdm.autonotebook import tqdm


#### Load PDF content into vectorstore

In [56]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

loader = PyPDFLoader('https://rsnyder.github.io/schh-ai-chatbot/knowledge-base/pdfs/2025ScheduleofFees-websingle.pdf')
pages = loader.load()
print(f"Pages: {len(pages)}")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(pages)
print(f"Splits: {len(splits)}")
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

Pages: 48
Splits: 196


#### Populate vector store from PDFs

In [41]:
from langchain_community.document_loaders import PyPDFLoader
from pinecone import Pinecone
import hashlib, os
    
def generate_short_id(content: str) -> str:
    hash_obj = hashlib.sha256()
    hash_obj.update(content.encode('utf-8'))
    return hash_obj.hexdigest()

loader = PyPDFLoader('https://rsnyder.github.io/schh-ai-chatbot/knowledge-base/pdfs/2025ScheduleofFees-websingle.pdf')
pages = loader.load()
print (f'Pages {len(pages)}')

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(pages)
print(f'Splits: {len(docs)}')

doc_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])
    
data_with_metadata = []

for doc, embedding in zip(docs, doc_embeddings):
    # Create a data item dictionary
    data_item = {
        'id': generate_short_id(doc.page_content),
        'values': embedding,
        'metadata': {'text': doc.page_content},  # Include the text as metadata
    }
    data_with_metadata.append(data_item)  # Append the data item to the list

# print(data_with_metadata)
print('Adding data to Pinecone index...')
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index('from-pdfs')
index.upsert(vectors=data_with_metadata)
print('Data added to Pinecone index')
    

Pages 48
Splits: 196
Adding data to Pinecone index...
Data added to Pinecone index


#### Populate vector store from Markdown

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter
from pinecone import Pinecone
import hashlib, json
import requests
    
def generate_short_id(content: str) -> str:
    hash_obj = hashlib.sha256()
    hash_obj.update(content.encode('utf-8'))
    return hash_obj.hexdigest()

markdown = requests.get('https://rsnyder.github.io/schh-ai-chatbot/knowledge-base/2025_Schedule_of_Fees_and_Community-Rules.md').text

# MD splits
md_header_splits = MarkdownHeaderTextSplitter(
    headers_to_split_on = [ ('#', 'Header 1'), ('##', 'Header 2') ], 
    strip_headers=False
).split_text(markdown)

# Char-level splits
docs = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200
).split_documents(md_header_splits)

doc_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])
    
data_with_metadata = []

for doc, embedding in zip(docs, doc_embeddings):
    # Create a data item dictionary
    data_item = {
        'id': generate_short_id(doc.page_content),
        'values': embedding,
        'metadata': doc.metadata | {'text': doc.page_content},  # add text as metadata
    }
    data_with_metadata.append(data_item)  # Append the data item to the list

print('Adding data to Pinecone index...')
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index('from-markdown')
index.upsert(vectors=data_with_metadata)
print('Data added to Pinecone index')

docs=206
Adding data to Pinecone index...
Data added to Pinecone index


#### Get Vector Store

In [34]:
from langchain_pinecone import PineconeVectorStore

# load_vectorstore_from_pdf('https://rsnyder.github.io/schh-ai-chatbot/knowledge-base/pdfs/2025ScheduleofFees-websingle.pdf')

pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index('from-pdfs')

vectorstore = PineconeVectorStore( index, embeddings, vectorstore_text_field )
retriever = vectorstore.as_retriever(search_kwargs={'k': 5})

#### Define LLM

In [35]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model=llm_model_name)

#### Contextualize question

In [36]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever

contextualize_q_system_prompt = '''Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is.'''

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ('system', contextualize_q_system_prompt),
        MessagesPlaceholder('chat_history'),
        ('human', '{input}'),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

#### Answer question

In [37]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

system_prompt = '''You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \

{context}'''

qa_prompt = ChatPromptTemplate.from_messages(
  [
    ('system', system_prompt),
    MessagesPlaceholder(variable_name='chat_history'),
    ('human', '{input}'),
  ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


#### Statefully manage chat history

In [38]:
import json
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory

store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key='input',
    history_messages_key='chat_history',
    output_messages_key='answer',
).with_config(tags=['main_chain'])

def print_response(resp):
  as_dict = {
    'input': resp['input'],
    'chat_history': [doc.model_dump() for doc in resp['chat_history']],
    'context': [doc.model_dump() for doc in resp['context']],
    'answer': resp['answer']
  }
  print(json.dumps(as_dict, indent=2) + '\n')

#### Output streamer

In [107]:
from langchain_core.messages import AIMessageChunk

async def generate_chat_events(message, session_id):
  
  def serialize_aimessagechunk(chunk):
    if isinstance(chunk, AIMessageChunk):
      return chunk.content
    else:
      raise TypeError(f'Object of type {type(chunk).__name__} is not correctly formatted for serialization')
  
  try:
    async for event in conversational_rag_chain.astream_events(message, version='v1', config={'configurable': {'session_id': session_id}} ):
      # print(event['tags'], event['event'], event.get('data',{}).get('chunk'))
      # Only get the answer
      sources_tags = ['seq:step:3', 'main_chain']
      if all(value in event['tags'] for value in sources_tags) and event['event'] == 'on_chat_model_stream':
        chunk_content = serialize_aimessagechunk(event['data']['chunk'])
        if len(chunk_content) != 0:
          yield chunk_content
          
  except Exception as e:
    print('error'+ str(e))

#### Ask question

In [42]:
prompt_1 = 'tell me about golf in sun city'
sessionid = 'abc124'

resp = conversational_rag_chain.invoke(
    {'input': prompt_1},
    config={
        'configurable': {'session_id': sessionid}
    },
)

print_response(resp)

# print(resp['answer'])

{
  "input": "tell me about golf in sun city",
  "chat_history": [
    {
      "content": "tell me about golfing in sun city",
      "additional_kwargs": {},
      "response_metadata": {},
      "type": "human",
      "name": null,
      "id": null,
      "example": false
    },
    {
      "content": "Sun City, located in the North West Province of South Africa, is renowned for its world-class golfing facilities. The resort offers two prestigious courses designed by the famous golfer Gary Player. \n\n1. **Gary Player Country Club**: This course is one of the longest in South Africa and is known for hosting the annual Nedbank Golf Challenge, attracting top professional golfers from around the world. The course is challenging with its narrow fairways and strategically placed bunkers.\n\n2. **Lost City Golf Course**: This course is unique with its desert-themed design and features a variety of water hazards. It is famous for the par-3 13th hole, which has a water hazard inhabited by live