In [57]:
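# One-time environment setup: uncomment and run these once, then restart the kernel.
# NOTE: the cells below call pinecone.init(), which was removed in pinecone-client 3.0,
# so an older client release is needed (e.g. %pip install "pinecone-client<3" -q).
# %pip install pinecone-client -q
# %pip install langchain -q
# %pip install unstructured -q
# %pip install tiktoken -q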

Note: you may need to restart the kernel to use updated packages.


In [32]:
import langchain

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

from langchain.cache import InMemoryCache


import pinecone
import config_api

In [21]:
# Parse the source PDF with the Unstructured-based loader; `data` holds
# whatever list of documents the loader returns for this file.
loader = UnstructuredPDFLoader('docs/intro-ds.pdf')
data = loader.load()

# Break the loaded documents into chunks (chunk_size=1000, no overlap between
# neighbouring chunks). `texts` is the chunk list that is embedded and indexed
# further down in the notebook.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(data)

In [22]:
# Embedding client used both to index the chunks and to embed queries.
# The API key comes from the local `config_api` module, not from the notebook.
embeddings = OpenAIEmbeddings(
    openai_api_key=config_api.OPENAI_API_KEY,
)

In [25]:
# Name of the Pinecone index that stores the PDF chunks.
index_name = "langchain-pdf-qa"

# Authenticate the Pinecone client with the key and environment from config_api.
pinecone.init(
    environment=config_api.PINECONE_API_ENV,
    api_key=config_api.PINECONE_API_KEY,
)

# Embed the raw text of every chunk and upload it to the index. Only
# `page_content` is passed on, so the chunks' metadata is not sent.
# NOTE(review): re-running this cell presumably embeds and uploads every chunk
# again -- use the alternative below once the index is populated.
chunk_contents = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(
    chunk_contents,
    embedding=embeddings,
    index_name=index_name,
)

## If the Pinecone index already exists, connect to it instead of uploading
## (the keyword is `embedding`, singular):
# docsearch = Pinecone.from_existing_index(index_name, embedding=embeddings)

In [33]:
# Turn on LangChain's global in-memory LLM cache so that repeating an identical
# prompt in this kernel session does not trigger another API call.
langchain.llm_cache = InMemoryCache()

# Completion model used to answer questions; temperature=0 keeps the output
# as deterministic as the API allows.
llm = OpenAI(
    temperature=0,
    openai_api_key=config_api.OPENAI_API_KEY,
)

# "stuff" chain: all supplied documents are placed into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")

In [27]:
# Question to ask about the indexed PDF.
query = "What is data science?"

# Fetch the chunks the vector store considers most similar to the question.
docs = docsearch.similarity_search(query)

# Let the QA chain answer the question using only the retrieved chunks.
answer = chain.run(
    question=query,
    input_documents=docs,
)

print(answer)


 Data science is a rapidly evolving field that encompasses a variety of skills and disciplines, including engineering, analysis, and modeling/inference. It is used to extract useful information from data and is often associated with machine learning, deep learning, data mining, and pattern recognition.
