### References Langchain
https://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/

### References Chroma db  
https://docs.trychroma.com/guides  
https://github.com/neo-con/chromadb-tutorial  
https://python.langchain.com/v0.1/docs/integrations/vectorstores/chroma/  


In [None]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
import os
import sys
sys.path.append('../..')

from py3810.myUtils import pickle_dump, pickle_load

from dotenv import load_dotenv, find_dotenv
# Build the dotenv path with os.path.join instead of a literal containing a
# backslash: '\m' is an invalid escape sequence (it triggers a warning on recent
# Python versions) and a hard-coded backslash is only a separator on Windows.
_ = load_dotenv(find_dotenv(os.path.join('.env', 'my_api_key.env')))  # read local .env file

# Bare subscripts double as presence checks: each raises KeyError right here
# if the variable was not loaded.
os.environ["OPENAI_API_KEY"]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"]
os.environ["ToMarkDown_API_KEY"]
os.environ["SECRET_KEY"]
# Confirm the key is available without echoing its value into the notebook
# output, which is saved together with the notebook.
print("SECRET_KEY is set (value hidden)")

# Directories holding the pickled Lumen documents loaded later in this notebook
path_lumen_dump = "../langchain/docs/lumen/"
path_lumen_docs = path_lumen_dump + "docs/"


In [None]:
import textwrap

def print_wrapped(text, width=80):
    """Print ``text`` re-flowed so that no output line is longer than ``width``.

    Args:
        text: The string to wrap and print.
        width: Maximum length of each printed line (default: 80 columns).
    """
    wrapper = textwrap.TextWrapper(width=width)
    # wrap() yields no lines for empty or whitespace-only text, so nothing is printed then
    for segment in wrapper.wrap(text):
        print(segment)

# Example usage: the sentence below is longer than 80 characters, so it is
# printed on more than one line.
long_string = (
    "This is a very long string that needs to be wrapped to fit within 80 columns. "
    "It can contain spaces, punctuation, and even newlines."
)
print_wrapped(long_string)

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used as the generation step of the RAG chain assembled later in this notebook.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [None]:
# Settings passed to the Chroma constructor when the vector store is opened
collection_name = "lumen_docs_combined"
persist_directory = ".chatbot/chroma_openai_ef/"
embedding_function = OpenAIEmbeddings()

In [None]:
# Sample question used to sanity-check similarity search against the vector store.
my_query = "What is lumen optometric's address?"

#### Only run cell below to add new data


In [None]:
from langchain.docstore.document import Document

# Load the four pickled document lists (website, YouTube, videos, PDFs).
# NOTE(review): pickle_load is a project helper; presumably it unpickles the
# files, so only point it at trusted dumps.
pickle_names = (
    'lumen_docs_website',
    'lumen_docs_youtube',
    'lumen_docs_videos',
    'lumen_docs_pdfs',
)
doc_0, doc_1, doc_2, doc_3 = (
    pickle_load(filename_pickle=name, path_pickle_dump=path_lumen_docs)
    for name in pickle_names
)

# Combined corpus, in the same order as the names above
docs = doc_0 + doc_1 + doc_2 + doc_3

In [None]:
# Split every loaded document into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

In [None]:
# Report how many chunks the splitter produced versus the number of source documents
for label, items in (("splits", splits), ("docs", docs)):
    print(f'len({label}): {len(items)}')

In [None]:
# One summary line per document: index, content length and the metadata keys present
for i, doc in enumerate(docs):
    page_content, metadata = doc.page_content, doc.metadata
    print(f'{i},  len(page_content): {len(page_content)}, meta_keys: {metadata.keys()}')

In [None]:
# Preview the first four chunks: summary line followed by every metadata entry
for i, doc in enumerate(splits[:4]):
    page_content, metadata = doc.page_content, doc.metadata
    print(f'{i},  len(page_content): {len(page_content)}, meta_keys: {metadata.keys()}')
    for key, value in metadata.items():
        print(f'{key}: {value}')
    print('')


In [None]:
# Inspect a single document: its length, the first 200 characters and its metadata
doc = docs[2]
page_content = doc.page_content
metadata = doc.metadata

print(f'len(page_content): {len(page_content)}')
print_wrapped(page_content[:200])
print('')
for key, value in metadata.items():
    print(f'{key}: {value}')


In [None]:
# Collect one summary row per document for the first two documents.
my_data = []

for i, doc in enumerate(docs[0:2]):
    doc_index = i
    page_content = doc.page_content
    page_len = len(page_content)

    metadata_dic = doc.metadata

    # dict.get returns None for a missing key, which is what the previous
    # try/except KeyError blocks produced.
    source = metadata_dic.get('source')
    title = metadata_dic.get('title')
    description = metadata_dic.get('description')

    print(f'doc_index: {doc_index}')
    print(f'page_len: {page_len}')
    print(f'page_content[0:50]: {page_content[0:50]}')
    print(f'source: {source}')
    print(f'title: {title}')
    print(f'description: {description}\n')

    # NOTE(review): title is printed above but is not stored in the row;
    # confirm whether that omission is intended.
    my_row = [doc_index, page_len, page_content[0:10], source, description]
    print(f'my_row: {my_row}')
    my_data.append(my_row)

# Bare expression so the notebook displays the collected rows
my_data

In [None]:
# Print the content length and every metadata entry of the first two documents.
for i, doc in enumerate(docs[0:2]):
    # Measure the current document. The previous code read a global
    # `page_content` left over from an earlier cell, so it reported the same
    # stale length for every document.
    print(f'docs[{i}], len(page_content): {len(doc.page_content)}')
    for key, value in doc.metadata.items():
        print(f'{key}: {value}')
    print(f'{"="*10}')


In [None]:
# from langchain.docstore.document import Document

# lumen_address_phone_hours = \
#   "Lumen Optometric address is located at 14 West Sierra Madre Blvd, Sierra Madre, CA 91024. \
#    Lumen Optometric office is located at 14 West Sierra Madre Blvd, Sierra Madre, CA 91024. \
#    Lumen Optometric location is 14 West Sierra Madre Blvd, Sierra Madre, CA 91024. \
#    Lumen Optometric phone number is (626) 921-0199. \
#    Lumen Optometric office hours are Tuesday, Wednesday, Friday, and Saturday from 9:45 am to 5:30 pm, \
#    and Thursday from 9:45 am to 1:30 pm."

# doc_lumen_address_phone_hours =  [Document(
#   page_content=lumen_address_phone_hours,
#   metadata={"description": "address, office location, phone number, office hours"}
#   )]

# # load data
# doc_0 = pickle_load(filename_pickle='lumen_docs_website', path_pickle_dump=path_lumen_docs)
# doc_1 = pickle_load(filename_pickle='lumen_docs_pdfs', path_pickle_dump=path_lumen_docs)
# doc_2 = pickle_load(filename_pickle='lumen_docs_videos', path_pickle_dump=path_lumen_docs)
# doc_3 = pickle_load(filename_pickle='lumen_docs_youtube', path_pickle_dump=path_lumen_docs)
# docs = doc_lumen_address_phone_hours + doc_0 + doc_1 + doc_2 + doc_3

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# splits = text_splitter.split_documents(docs)

# # save to disk
# db0 = Chroma.from_documents(
#   documents=splits,
#   embedding=embedding_function,
#   collection_name=collection_name,
#   persist_directory=persist_directory
#   )

# db0_ans = db0.similarity_search(my_query)
# # print(docs[0].page_content)
# print(f'db0_ans:{db0_ans}')

In [None]:

# load from disk: open the persisted collection with the configured settings
vectorstore = Chroma(
    persist_directory=persist_directory,
    collection_name=collection_name,
    embedding_function=embedding_function,
)

# Smoke test: run the sample question directly against the vector store
ans = vectorstore.similarity_search(my_query)
print(f'ans:{ans}')

In [None]:
# Retriever over the vector store, configured with k=1
retriever = vectorstore.as_retriever(search_kwargs=dict(k=1))
# RAG prompt pulled from the hub by its identifier
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# First stage: retrieved documents are formatted into "context", while the
# incoming question is passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: inputs -> prompt -> chat model -> plain string
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [None]:
# Questions sent through the RAG chain in the loop that follows.
queries = [
  # "14 West Sierra Madre Boulevard",
  "What is lumen optometric's phone number?",
  "What is lumen optometric's address?",
  "What is lumen optometric's location?",
  "do you take vision insurance",
  "what type of insurance do you take",
  "what are the names of the insurance that you take",
  "What is Ortho-k",
  "What does research say about Ortho-k",
  "What equipment do they have?",
  # NOTE(review): "othro-k" below looks like a misspelling of "Ortho-k"; confirm whether it is deliberate.
  "What equipment do you use for othro-k?",
  ]

In [None]:
# Run every question through the RAG chain and show the answer along with the
# document(s) the retriever returns for the same question.
# for query in queries[0:2]:
for query in queries:
    print(f'query: {query}')
    print_wrapped(f'answr: {rag_chain.invoke(query)}')
    print('')
    # invoke() replaces the deprecated get_relevant_documents() and is the
    # same entry point the chain itself uses.
    print_wrapped(f'relevant docs: {retriever.invoke(query)}')
    print(f'{"="*5}\n')