In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Local PDF that this notebook ingests.
PDF_PATH = "langchain_docs.pdf"
loader = UnstructuredPDFLoader(PDF_PATH)
# Alternative source: swap in the line below to pull a PDF from a URL instead.
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [3]:
# Run the configured loader and keep its result in `data`. The following cell takes
# len(data) and reads data[0].page_content, i.e. it treats the result as a sequence
# of document objects.
data = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [4]:
# Sanity check: how many documents were loaded, and how long is the first one?
doc_count = len(data)
print(f'You have {doc_count} document(s) in your data')
first_doc_chars = len(data[0].page_content)
print(f'There are {first_doc_chars} characters in your document')

You have 1 document(s) in your data
There are 6124 characters in your document


# CREATE DOCUMENTS

In [5]:
# Split the loaded document(s) into smaller pieces; the splitter is configured with
# chunk_size=1000 and no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)
chunk_count = len(texts)
print(f'Now you have {chunk_count} documents')

Now you have 7 documents


# GET EMBEDDINGS FOR DOCUMENTS

In [6]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [7]:
import os
from dotenv import load_dotenv

# Populate os.environ from a local .env file (python-dotenv), so secrets never have
# to be hardcoded in the notebook.
load_dotenv()

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV')

# Fail fast with an actionable message: an unset (or empty) variable would otherwise
# flow through as None and only surface later inside the OpenAI/Pinecone client calls.
_missing_vars = [
    name
    for name, value in (
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('PINECONE_API_KEY', PINECONE_API_KEY),
        ('PINECONE_API_ENV', PINECONE_API_ENV),
    )
    if not value
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_vars)
        + ". Set them in your shell or in a .env file next to this notebook."
    )

In [8]:
# Embedding client authenticated with the OpenAI key read from the environment above.
# It is handed to the Pinecone vector-store constructor in a later cell.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [9]:
# Configure the Pinecone client with the credentials read from the environment.
pinecone.init(
    environment=PINECONE_API_ENV,  # shown next to the API key in the console
    api_key=PINECONE_API_KEY,  # available at app.pinecone.io
)
# Name of the Pinecone index to write to and query; replace with your own index name.
index_name = "langchaintest"

In [10]:
# Embed every chunk and store it in the Pinecone index named by `index_name`.
# from_documents() is used instead of from_texts() on the bare page_content so that
# each chunk's metadata is stored alongside its text rather than being discarded;
# this lets search hits be traced back to where they came from.
# NOTE: re-running this cell embeds and uploads the chunks again.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [11]:
# Retrieval smoke test: fetch the chunks most similar to a sample question.
query = "What is Langchain?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

# AI RESPONSE

In [12]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [13]:
# Completion model for answering; temperature=0 keeps sampling randomness to a minimum.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)
# Question-answering chain built with the "stuff" chain type.
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [14]:
# Question to answer, plus the supporting chunks retrieved for it. This repeats the
# earlier retrieval cell so the question-answering section can be run on its own.
query = "What is Langchain?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [15]:
# Ask the chain to answer `query` using the retrieved chunks, then show the answer
# as the cell's output.
answer = chain.run(question=query, input_documents=docs)
answer

' LangChain is a framework for developing applications powered by language models. It provides a standard interface for chains, lots of integrations with other tools, and end-to-end chains for common applications. It also provides a standard interface for agents, a selection of agents to choose from, and examples of end to end agents. It can be used to create data-aware applications that allow a language model to interact with its environment.'