# Talk to a PDF Using LangChain

## Step 1: Install Libraries

In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

## Step 2: Import Libraries

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

## Step 3: Configure API Keys

In [None]:
import os

# API credentials for the services used by this notebook. Fill in a value to
# set it for this session, or leave it blank to keep whatever is already
# exported in the environment. Avoid committing real keys to version control.
# NOTE(review): SERPAPI_API_KEY is not referenced by any other cell shown
# here -- confirm whether it is still needed.
_api_keys = {
    "OPENAI_API_KEY": "",
    "SERPAPI_API_KEY": "",
}
for _name, _value in _api_keys.items():
    if _value:
        # An explicitly supplied key always wins.
        os.environ[_name] = _value
    else:
        # A blank placeholder must not clobber a key that is already set;
        # the variable is still created (empty) when nothing was configured.
        os.environ.setdefault(_name, "")

## Step 4: Read and Extract Text from PDF

In [None]:
# Path of the PDF file to load; point this at your own document if needed.
pdf_path = '1706.03762.pdf'
pdfreader = PdfReader(pdf_path)

In [None]:
# Read the text of every page and concatenate it into a single string.
# Pages for which extract_text() returns nothing (None or an empty string)
# contribute no text.
raw_text = ''.join(page.extract_text() or '' for page in pdfreader.pages)

In [None]:
# Show the extracted text as the cell output.
raw_text

## Step 5: Split the text into overlapping chunks

In [None]:
# Split the raw text on newlines into overlapping chunks so that each piece
# stays small. Sizes are measured with len, i.e. in characters, not tokens.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Number of chunks produced by the splitter.
len(texts)

## Step 6: Create embeddings from the overlapping chunks and load them into the vector store

In [None]:
# Create the OpenAI embeddings object used to embed the text chunks.
# NOTE(review): presumably no embeddings are computed at construction time and
# the API is only called when texts are embedded -- confirm against the
# LangChain documentation.
embeddings = OpenAIEmbeddings()

In [None]:
# Build a FAISS vector store from the text chunks using the embeddings object.
document_search = FAISS.from_texts(texts, embeddings)

In [None]:
# Show the vector store object as the cell output.
document_search


## Step 7: Create the chain using a QA chain and the OpenAI LLM

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# Build a question-answering chain of type "stuff" on top of the OpenAI LLM.
llm = OpenAI()
chain = load_qa_chain(llm, chain_type="stuff")

## Step 8: Run the chain on the most similar document chunks and the question

In [None]:
# Retrieve the chunks most similar to the question, then let the chain answer
# the question from those chunks.
# NOTE(review): confirm this question matches the PDF loaded earlier
# ('1706.03762.pdf'); it may have been written for a different document.
query = "Vision for Amrit Kaal"
docs = document_search.similarity_search(query=query)
chain.run(question=query, input_documents=docs)

In [None]:
# Same flow for a second question: similarity search first, then the QA chain.
# NOTE(review): confirm this question matches the PDF loaded earlier
# ('1706.03762.pdf'); it may have been written for a different document.
query = "How much the agriculture target will be increased to and what the focus will be"
docs = document_search.similarity_search(query=query)
chain.run(question=query, input_documents=docs)