In [17]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [2]:
import os
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ['OPENAI_API_KEY'] =  os.getenv("OPENAI_API_KEY")

# Loading Document

In [5]:
document = PyPDFLoader("D:\internship\Verisk\DocumentSummarizer\document.pdf")
pages = document.load_and_split()


  document = PyPDFLoader("D:\internship\Verisk\DocumentSummarizer\document.pdf")


In [6]:
pages[0]

Document(metadata={'source': 'D:\\internship\\Verisk\\DocumentSummarizer\\document.pdf', 'page': 0}, page_content='Computer Science I\nDr. Chris Bourke\ncbourke@cse.unl.edu\nDepartment of Computer Science & Engineering\nUniversity of Nebraska–Lincoln\nLincoln, NE 68588, USA\n2018/08/09 16:16:18\nVersion 1.3.6\nφεψεΥδγγΩΨΓφΓ κ\nξ\nγ\nχ\nψ\nΞ\nθ\nρ\nψ\nϑ\nΨ\nΥ\nϵ\n∆\nγ\nκ\nσ\nΓ\nπ\nψ\nχ\nΓ\nω\nχ\nβ ∆πΛΨυ∆ϖχΞϱϑϕ')

In [7]:
text = document.load()

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200,chunk_overlap=300)
splitted_doc = text_splitter.split_documents(text)

In [9]:
len(splitted_doc)

1231

# Embedding and Vector Store

In [10]:
vector_store = Chroma.from_documents(documents=splitted_doc,embedding=OpenAIEmbeddings())

In [11]:
question = "What is control flow?"
result = vector_store.similarity_search(question)
# result[0].page_content
result

[Document(metadata={'page': 50, 'source': 'D:\\internship\\Verisk\\DocumentSummarizer\\document.pdf'}, page_content='2. Basics\n2.1. Control Flow\nThe ﬂow of control (or simply control ﬂow) is how a program processes its instructions.\nTypically, programs operate in a linear or sequential ﬂow of control. Executable statements\nor instructions in a program are performed one after another. In source code, the order\nthat instructions are written deﬁnes their order. Just like English, a program is “read”\ntop to bottom. Each statement may modify the state of a program. The state of a\nprogram is the value of all its variables and other information/data stored in memory\nat a given moment during its execution. Further, an executable statement may instead\ninvoke (or call or execute) another procedure (also called subroutine, function, method,\netc.) which is another unit of code that has been encapsulated into one unit so that it\ncan be reused.\nThis type of control ﬂow is usually associa

## Chain and Retrievals

In [12]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0) # temperature-0 : applying softmax and random sampling

In [37]:
template = """
Answer the given question based only on the given context
context: {context},

If you find anything outside of the context reply that you are unable to do so, just request the user to 
ask proper question.

question: {question}
"""
prompt =  ChatPromptTemplate.from_template(template)

In [38]:
document_chain = create_stuff_documents_chain(llm,prompt)

In [39]:
retriever = vector_store.as_retriever()
retrieval_chain = create_retrieval_chain(retriever,document_chain)

In [40]:
retrieval_chain.invoke({"question":"Who are the authors of these document"})

KeyError: 'input'

In [48]:
question = "What are those"

In [49]:
chain = prompt | llm
response = chain.invoke({"context":retriever.get_relevant_documents(question),"question":question})
response.content

'I am unable to provide an answer as the question is not specific enough. Please provide more context or ask a more specific question.'