In [None]:
import numpy as np
import pandas as pd


In [23]:
#!pip install langchain --upgrade
# Version: 0.0.164

#!pip install pypdf



In [25]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [26]:
loader = PyPDFLoader("../data/Samenvatting filmpjes V5 compleet.pdf")

## Other options for loaders 
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [27]:
data = loader.load()

In [29]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 18 document(s) in your data
There are 3659 characters in your document


### Chunk your data up into smaller documents

In [30]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [31]:
print (f'Now you have {len(texts)} documents')

Now you have 24 documents


### Create embeddings of your documents to get ready for semantic search

In [34]:
from langchain.vectorstores import  Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone


AttributeError: module 'numpy' has no attribute 'double'

In [9]:
# Check to see if there is an environment variable with you API keys, if not, use what you put below
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YourAPIKey')

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'YourAPIKey')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env

In [10]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [11]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchaintest" # put in the name of your pinecone index here

In [12]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [14]:
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(query)

In [17]:
# Here's an example of the first document that was returned
print(docs[0].page_content[:450])

Intelligence and cloud infrastructure development  
work. We saw the need for a  
new approach to distill value 
from our clients’ data. We 
approached the problem 
with a multidisciplinary 
team of computer scientists, 
mathematicians and domain 
experts. They immediately 
produced new insights and 
analysis paths, solidifying the 
validity of the approach. Since 
that time, our Data Science  
team has grown to 250 staff 
supporting dozens of cl


### Query those docs to get your answer back

In [18]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [19]:
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [21]:
query = "What is the collect stage of data maturity?"
docs = docsearch.similarity_search(query)

In [22]:
chain.run(input_documents=docs, question=query)

' The collect stage of data maturity focuses on collecting internal or external datasets. Gathering sales records and corresponding weather data is an example of the collect stage.'