In [5]:
# !pip install langchain --upgrade
# Version: 0.0.164

# !pip install pypdf

In [41]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [42]:
# Build the loader for the local PDF (relative path, resolved from the working directory)
loader = PyPDFLoader("gym_data.pdf")

## Other options for loaders
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [43]:
# Run the loader. The result is used below with len() and indexing, and each
# item exposes a `.page_content` string.
data = loader.load()

In [115]:
# data

In [44]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print(f'You have {len(data)} document(s) in your data')

# Show the size of one sample page. A fixed index of 30 raises IndexError for
# PDFs with fewer than 31 pages, so clamp it to the last available page and
# skip the report entirely when nothing was loaded.
if data:
    sample_index = min(30, len(data) - 1)
    print(f'There are {len(data[sample_index].page_content)} characters in your document')

You have 50 document(s) in your data
There are 2132 characters in your document


### Chunk your data up into smaller documents

In [125]:
import PyPDF2

def extract_text_from_pdf(file_path):
    """Return the text of every page in a PDF as one string.

    The file at ``file_path`` is opened in binary mode and handed to
    ``PyPDF2.PdfReader``. Each page's ``extract_text()`` result is collected
    in page order and the pieces are concatenated with no separator.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages ("" when the reader has no pages).
    """
    with open(file_path, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extract while the stream is still open; the reader reads from it.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    return "".join(page_texts)

# Path to your PDF file (the same file name that is passed to PyPDFLoader earlier)
pdf_file_path = "gym_data.pdf"

# Extract text from the PDF as a single string.
# NOTE(review): `pdf_text` is not referenced by any later cell visible in this notebook.
pdf_text = extract_text_from_pdf(pdf_file_path)

# Print the extracted text
# print(pdf_text)


In [118]:
# !pip install PyPDF2

In [45]:
# PyPDFLoader has already split the PDF by page, so this is a second split.
# It is optional -- test it out on your own data.

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [139]:
# Report how many documents the splitter produced
print (f'Now you have {len(texts)} documents')

Now you have 79 documents


### Create embeddings of your documents to get ready for semantic search

In [140]:
# !pip install pinecone-client

In [141]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [142]:
from config import openai_key,pinecone_key

In [143]:
# Prefer API keys from environment variables. When a variable is not set, fall
# back to the value imported from the local `config` module (`openai_key`,
# `pinecone_key`) instead of a placeholder string that can never authenticate.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', openai_key)

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', pinecone_key)
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env

In [144]:
# !pip install openai

In [145]:
# OpenAI embeddings object, authenticated with OPENAI_API_KEY; it is handed to
# the Pinecone vector store when the chunks are indexed below.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [146]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "pksamftbot" # put in the name of your pinecone index here

In [147]:
# !pip install tiktoken

In [148]:
# Embed the raw text of every chunk and store it in the Pinecone index named by
# `index_name`. Only `page_content` is passed on; chunk metadata is not sent.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in texts],
    embeddings,
    index_name=index_name,
)

In [149]:
# Run a similarity search against the vector store for a sample question
query = "What are examples of good diet?"
docs = docsearch.similarity_search(query)

In [76]:
# 

In [152]:
# Here's an example of the first document that was returned
# (only the first 450 characters of its text are printed)
print(docs[0].page_content[:450])

for  physical  and  mental  development,  ensures  high  efficiency,  pro-­motes  disease  prevention  and  enhances  the  body's  ability  to  withstand  the  effects  of  ad-­verse  environmental  factors  (Morozov,  2014,  307).  A  healthy  balanced  diet  should  strive  to  include  a  wide  range  of  different  products  from  the  four  main  food  groups  (bread,  other  cereals  and  potatoes,  fruits  and  vegetables,  milk  and  dair


### Query those docs to get your answer back

In [153]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [154]:
# OpenAI LLM wrapper with temperature=0, using the same OPENAI_API_KEY as the embeddings
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
# Question-answering chain built on that LLM with the "stuff" chain type.
# NOTE(review): presumably "stuff" places all supplied documents into one prompt -- confirm in the LangChain docs.
# The only call that would use `chain` (chain.run) is commented out further down.
chain = load_qa_chain(llm, chain_type="stuff")

In [159]:
# Second sample question; `query` and `docs` from the earlier search are overwritten here
query = "What is the purpose of comparing the results before and after correct nutrition in the given idea?"
docs = docsearch.similarity_search(query)

In [160]:
# Show the full text of the first document returned for the query above
print(docs[0].page_content)

our  idea  is  to  compare  the  results  before  correct  nutrition  and  after,  so  that  people  can  see  how  it  works  and  motivate  each  other.    For  these  projects,  the  gym  might  need  to  find  the  specialists  or  researchers  who  can  give  people  valid  information  and  motivate  them.  We  strongly  believe  that  these  activities  can  bring  people  together,  so  that  while  interaction  they  can  learn  from  each  other.  As  well  as  practical  workshops  will  motivate  customers  to  follow  healthy  lifestyle  and  gather  new  in-­formation.  Besides,  it  can  help  to  expand  the  number  of  customers  as  those  activities  will  be  unique  in  the  area.


In [108]:
# !pip install scikit-learn


In [109]:
# chain.run(input_documents=docs, question=query)

In [130]:
import openai


In [134]:
from openai.embeddings_utils import get_embedding


In [136]:
# get_embedding("the fox crossed the road", engine='text-embedding-ada-002')