In [1]:
import os
from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

In [None]:
# Load variables from a .env file into the process environment so the
# os.getenv("OPENAI_API_KEY") lookups in later cells can find the key.
load_dotenv()

In [3]:
## LOAD DOCS FROM DIRECTORY
def read_doc(directory_path):
    """Load the PDFs under ``directory_path`` via ``PyPDFDirectoryLoader``.

    Args:
        directory_path: Folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory_path).load()`` returns.

    Raises:
        FileNotFoundError: If ``directory_path`` is not an existing directory.
    """
    # Fail loudly on a bad path. The notebook passes a path relative to the
    # kernel's working directory, and a missing folder would otherwise appear
    # to come back as an empty result, letting the rest of the pipeline run
    # with nothing to index.
    if not os.path.isdir(directory_path):
        raise FileNotFoundError(f"PDF directory not found: {directory_path!r}")

    return PyPDFDirectoryLoader(directory_path).load()


In [4]:
# Relative path: it resolves against the kernel's current working directory,
# so this expects a `pdfs` folder one level above wherever the notebook runs.
pdf_dir = os.path.join("..", "pdfs")
# `docs` is the input that the chunking cell below splits.
docs = read_doc(pdf_dir)

In [5]:
# Convert into chunks
# https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split ``docs`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split; passed unchanged to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents(docs)`` call.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

In [None]:
# Split the loaded documents; 800 / 50 are the same values as chunk_data's defaults.
chunked_documents = chunk_data(docs,chunk_size=800,chunk_overlap=50)
# Last expression of the cell: displays how many chunks were produced.
len(chunked_documents)

In [7]:
## Embedding Technique of OpenAI
# The API key is read from the environment; os.getenv returns None when
# OPENAI_API_KEY is not set. The same `embeddings` object is reused below
# for the test query and for building the Pinecone index.
embeddings=OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Sanity check: embed one sample query and display the length of the result.
# NOTE(review): embed_query presumably returns a single embedding, so this is
# the embedding's dimensionality rather than a count of vectors — confirm.
# Not needed for the final code
vectors = embeddings.embed_query("How are you?")
len(vectors)

In [9]:
# Vector Insertion/Search DB In Pinecone
# Pinecone env variables are loaded behind the scenes
# NOTE(review): this hands every chunk to from_documents each time the cell
# runs, so re-running presumably uploads the same chunks to the index again
# — confirm whether the "docs" index should be cleared or reused instead.

index_name="docs"
# `pinecone` is the vector store object that retrieve_query searches.
pinecone = Pinecone.from_documents(
  chunked_documents, embeddings, index_name=index_name,
)


In [10]:
# Cosine Similarity Retrieval Results from VectorDB
def retrieve_query(query, k=2):
    """Look up the stored chunks most similar to ``query``.

    Args:
        query: Text to search the module-level ``pinecone`` vector store with.
        k: Forwarded to ``similarity_search`` as ``k`` (number of results).

    Returns:
        The result of ``pinecone.similarity_search(query, k=k)``.
    """
    return pinecone.similarity_search(query, k=k)

In [11]:
# load_qa_chain and ChatOpenAI come from the import cell at the top of the notebook.
# Chat model used to write the answers; the API key is read from the environment.
llm=ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY")) 
# QA chain built with chain_type="stuff"; retrieve_answers calls it with the
# retrieved documents and the question.
chain=load_qa_chain(llm, chain_type="stuff")

In [12]:
## Search answers from VectorDB
def retrieve_answers(query):
    """Answer ``query`` from the chunks retrieved out of the vector store.

    Args:
        query: The question to answer.

    Returns:
        The answer text: the ``"output_text"`` entry of the QA chain's result.
    """
    doc_search = retrieve_query(query)
    # `invoke` takes ONE positional input mapping; passing the chain inputs as
    # keyword arguments (the old `run(...)` calling style) raises TypeError
    # because the required `input` argument is missing.
    response = chain.invoke({"input_documents": doc_search, "question": query})
    # `invoke` returns a dict; hand back only the generated answer so that
    # callers printing the result get the plain answer string.
    return response["output_text"]

In [14]:
# Example questions; swap which line is commented out to try the other one.
# our_query = "What is the purpose of this runbook?"
our_query = "Which chart do I need to build?"
# Retrieve matching chunks, run the QA chain on them, and print the result.
answer = retrieve_answers(our_query)
print(answer)

You need to build the "oas-portal-api" chart.
