# PDF Analyzer

In [None]:
# pip install pinecone-client


In [None]:
import openai
import os 
from langchain.llms import AzureOpenAI
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment.
load_dotenv()

# Configure the module-level openai client for Azure from environment variables.
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")

# Deployment names for the completion model and the embeddings model.
# NOTE(review): os.getenv returns None when a variable is unset; nothing here checks for that.
deployment: str = os.getenv("CHATGPT_MODEL")
embeddings_deployment: str = os.getenv("EMBEDDINGS_MODEL")

# The same deployment name is passed as both the deployment and the model name.
llm = AzureOpenAI(deployment_name=deployment, model_name=deployment)


In [None]:
# Pinecone connection settings, read from the environment.
PINECONE_ENV = os.getenv("PINECONE_ENV")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Echo only the environment name as a quick check; the API key is not printed.
print(PINECONE_ENV)


In [None]:
# Load the PDF; each loaded item exposes its text as `.page_content`.
loader = PyPDFLoader("gen_ai.pdf")
data = loader.load()
pages = len(data)

# Build the full text in a single pass instead of repeated `+` concatenation
# inside an index loop. No separator is inserted between pages, matching the
# previous output exactly.
pdf_content = "".join(page.page_content for page in data)

print(pdf_content)

import pinecone      




In [None]:
# Splitter configured for chunks of at most 2000 units, measured with len()
# (i.e. characters for a str), and no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0, length_function=len)


In [None]:
# The splitter takes a list of texts; the whole PDF is passed as a single entry.
split_pdf_content = text_splitter.create_documents(texts=[pdf_content])

# Number of chunks produced by the split.
print(len(split_pdf_content))


In [None]:
# Preview one chunk as a sanity check on the split. A short PDF can yield
# fewer than four chunks, so guard the lookup instead of raising IndexError.
preview_index = 3
if preview_index < len(split_pdf_content):
    print(split_pdf_content[preview_index])
else:
    print(f"Chunk {preview_index} not available: only {len(split_pdf_content)} chunk(s) were produced.")


In [None]:
# Name of the Pinecone index used by the cells below.
index_name = "langchainclass"

# Initialise the Pinecone client with the credentials read from the environment.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)


In [None]:
# Embeddings client. The one embeddings deployment name is supplied under all
# three of `model`, `deployment` and `engine`.
# NOTE(review): chunk_size=1 presumably works around an Azure limit on inputs
# per embeddings request — confirm whether it is still required.
embeddings = OpenAIEmbeddings(
    model=embeddings_deployment,
    deployment=embeddings_deployment,
    engine=embeddings_deployment,
    chunk_size=1,
)


In [None]:
# Report a missing index, but do not stop: execution continues to the upload below.
# NOTE(review): whether from_texts creates a missing index or raises appears to
# depend on the installed LangChain version — verify before relying on either.
if index_name not in pinecone.list_indexes():
    print("index does not exist", index_name)

# Embed the raw text of every chunk and store the vectors in the named index.
chunk_texts = [chunk.page_content for chunk in split_pdf_content]
pdf_docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)



In [None]:
question = "Who is Sarah Guo?"

# Fetch the stored chunks most similar to the question (library-default result count).
docs = pdf_docsearch.similarity_search(query=question)

# Build a "stuff" question-answering chain and run it over the retrieved chunks.
# The final expression is left bare so the notebook displays the answer.
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(question=question, input_documents=docs)


In [None]:
# NOTE(review): presumably a control question about a name that is not in the PDF — confirm.
question = "Who is John Smith?"

# Fetch the stored chunks most similar to the question (library-default result count).
docs = pdf_docsearch.similarity_search(query=question)

# Build a "stuff" question-answering chain and run it over the retrieved chunks.
# The final expression is left bare so the notebook displays the answer.
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(question=question, input_documents=docs)
