In [5]:
# !pip install langchain --upgrade
# Version: 0.0.164

# !pip install pypdf

In [41]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [42]:
# PDF to index (relative path).
PDF_PATH = "gym_data.pdf"
loader = PyPDFLoader(PDF_PATH)

# Alternative loaders, kept here for reference:
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [43]:
# Read the PDF through the loader configured above; `data` holds the loaded documents.
data = loader.load()

In [44]:
# Note: If you're using PyPDFLoader then it will split by page for you already,
# so every entry in `data` is a single page.
print(f'You have {len(data)} document(s) in your data')

# Report the size of one sample page. The sample index is clamped to the last
# available page so PDFs with fewer than 31 pages no longer raise IndexError.
SAMPLE_PAGE_INDEX = 30
if data:
    sample_idx = min(SAMPLE_PAGE_INDEX, len(data) - 1)
    print(f'There are {len(data[sample_idx].page_content)} characters in your document')
else:
    # Nothing was loaded, so there is no page to measure.
    print('No documents were loaded; check the PDF path.')

You have 50 document(s) in your data
There are 2132 characters in your document


### Chunk your data up into smaller documents

In [45]:
# With PyPDFLoader the data is already split by page, so this is a second split.
# It is optional -- try it out on your own data.
CHUNK_SIZE = 2000     # passed to the splitter as chunk_size
CHUNK_OVERLAP = 0     # passed to the splitter as chunk_overlap

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
texts = text_splitter.split_documents(data)

In [46]:
# Report how many documents the splitter produced.
chunk_count = len(texts)
print(f'Now you have {chunk_count} documents')

Now you have 79 documents


### Create embeddings of your documents to get ready for semantic search

In [47]:
# !pip install pinecone-client

In [48]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [49]:
from config import openai_key,pinecone_key

In [58]:
# Resolve API credentials. An environment variable wins when it is set;
# otherwise fall back to the keys imported from the local `config` module.
# (The fallback used to be the literal placeholder 'YourAPIKey', which left the
# imported config keys unused and could never authenticate.)
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', openai_key)

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', pinecone_key)
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env

In [51]:
# !pip install openai

In [60]:
# Embedding client, authenticated with the OpenAI key resolved above.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [61]:
# Name of the Pinecone index to use -- put your own index name here.
index_name = "pksamftbot"

# Initialize the Pinecone client. The API key is found at app.pinecone.io and
# the environment is shown next to the key in the console.
pinecone.init(
    environment=PINECONE_API_ENV,
    api_key=PINECONE_API_KEY,
)

In [62]:
# !pip install tiktoken

In [64]:
# Embed every chunk and store it in the Pinecone index named by `index_name`.
# from_documents is used instead of from_texts so each chunk's metadata is
# uploaded together with its text rather than being discarded.
# NOTE(review): re-running this cell presumably uploads the chunks again -- confirm
# whether the index should be cleared first.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [75]:
# Question to look up, followed by a similarity search against the vector store.
query = "What are examples of good diet?"
docs = docsearch.similarity_search(query=query)

A good diet typically consists of a balanced and varied selection of nutritious foods. Here are some examples of components that make up a healthy diet 

Fruits and vegetables: Include a variety of colorful fruits and vegetables in your meals. They provide essential vitamins, minerals, and fiber. 

Whole grains: Opt for whole grain products like brown rice, whole wheat bread, and oats. These are higher in fiber and nutrients compared to refined grains.

Lean proteins: Choose lean sources of protein such as poultry, fish, beans, lentils, tofu, and low-fat dairy products. They provide important amino acids for muscle repair and growth


In [76]:
# 

In [71]:
# Here's an example of the first document that was returned
# print(docs[0].page_content[:450])

### Query those docs to get your answer back

In [72]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [73]:
# Completion model used to answer questions; temperature is pinned to 0.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

# Question-answering chain built on that model with the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [77]:
# New question; rerun the similarity search so `docs` matches this query.
query = "best exercise for legs"
docs = docsearch.similarity_search(query=query)


Squats: Squats are a compound exercise that primarily target the quadriceps, hamstrings, and glutes. They also engage the calves and core. Start with your feet shoulder-width apart, lower your body as if sitting back into a chair, and then return to a standing position.



In [78]:
# 

In [79]:
# chain.run(input_documents=docs, question=query)