In [1]:
#import openai
from openai import OpenAI
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
#from langchain.vectorstores import pinecone
from langchain.llms import OpenAI

  from tqdm.autonotebook import tqdm


In [2]:
import os 
os.environ["OPENAI_API_KEY"] = "sk-***"

In [3]:
import json

def show_json(obj):
    display(json.loads(obj.model_dump_json()))

In [4]:
## read document
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [5]:
doc=read_doc('Documents/')
len(doc)

52

In [8]:
# divide the doc into chunks

def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return doc


In [24]:
documents= chunk_data(docs=doc)
len(documents)

242

In [10]:
#Embedding technique of openai
embeddings = OpenAIEmbeddings(api_key="sk-***")
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x00000160FDC51DB0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x00000160FDB78B80>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

In [11]:
vectors=embeddings.embed_query("what are the risks related to acquisitions?")
len(vectors)

1536

Query with Pinecone and load_qa_chain

In [12]:
import pinecone  # this is pinecone-client package
#vector search DB in pinecon
pinecone.init(
    api_key="***",
    environment="gcp-starter"
)
index_name="langchainvector"

In [13]:
from langchain.vectorstores import Pinecone
index = Pinecone.from_documents(doc,embeddings,index_name=index_name)


In [14]:
## Cosine Similarity Retreive Results from VectorDB
def retrieve_query(query,k=2):
    matching_results=index.similarity_search(query,k=k)
    return matching_results

In [15]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI
from langchain_community.chat_models import ChatOpenAI

In [23]:

llm=OpenAI(model_name="text-davinci-003",temperature=0.1)
#llm=OpenAI(model_name="gpt-3.5-turbo",temperature=0.1)

chain=load_qa_chain(llm,chain_type="stuff")

In [17]:
## Search answers from VectorDB
def retrieve_answers(query):
    doc_search=retrieve_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    return response

In [18]:
our_query = "what are the risks related to acquisitions"
answer = retrieve_answers(our_query)


[Document(page_content='•stock price impact, fines, fees or reputation harm if we are unable to obtain regulatory approval for an acquisition or are otherwise\nunable to close an acquisition;\n•potential issuances of debt to finance our acquisitions, resulting in increased debt, increased interest expense, and compliance with debt\ncovenants or other restrictions;\n•the potential for our acquisitions to result in dilutive issuances of our equity securities;\n•the potential variability of the amount and form of any performance-based consideration;\n•negative changes in general economic conditions in the regions or the industries in which we or our target operate;\n•exposure to additional cybersecurity risks and vulnerabilities; and\n•impairment of relationships with, or loss of our or our target’s employees, vendors and customers.\nFor example, when integrating acquisition target systems into our own, we have experienced and may continue to experience challenges\nincluding lengthy and c

In [19]:
print(answer)

 There are risks related to acquisitions such as stock price impact, fines, fees or reputation harm if regulatory approval is not obtained or the acquisition is unable to close, potential issuances of debt to finance the acquisition resulting in increased debt and interest expense, the potential for dilutive issuances of equity securities, the potential variability of the amount and form of any performance-based consideration, negative changes in general economic conditions in the regions or industries in which the company or target operate, exposure to additional cybersecurity risks and vulnerabilities, and impairment of relationships with or loss of the company's or target's employees, vendors and customers.


Query with ChromaDB and RetrievalQA

In [20]:
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [21]:
# create the vectorestore to use as the index
db = Chroma.from_documents(documents, embeddings)
# expose this index in a retriever interface
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":2})
# create a chain to answer questions 
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(), chain_type="stuff", retriever=retriever, return_source_documents=True)
result = qa({"query": our_query})

In [22]:
print(result)

{'query': 'what are the risks related to acquisitions', 'result': ' The risks related to acquisitions include the diversion of capital and other resources, difficulty in realizing a satisfactory return, difficulty or inability in obtaining governmental, regulatory approval or restrictions or other consents and approvals or financing, stock price impact, fines, fees or reputation harm if unable to obtain regulatory approval or close an acquisition, potential issuances of debt to finance the acquisition, potential for dilutive issuances of equity securities, potential variability of performance-based consideration, negative changes in general economic conditions, and exposure to additional cybersecurity risks and vulnerabilities.', 'source_documents': [Document(page_content='additional risks related to acquisitions and strategic investments, including the diversion of capital and other resources, including management’s\nattention; difficulty in realizing a satisfactory return and uncerta