In [1]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 
from api_key import key, indexname
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

#import warnings
#warnings.filterwarnings('ignore')

import os

In [2]:
os.environ['OPENAI_API_KEY'] = key

In [3]:
doc_reader = PdfReader('Dataset/Publication_1.pdf')

In [4]:
raw_text = ''
for i, page in enumerate(doc_reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text

In [5]:
len(raw_text)

18900

In [6]:
raw_text[:100]

'NREL is a national laboratory of the U.S. Department of Energy  \nOffice of Energy Efficiency & Renew'

In [7]:
# Splitting up the text into smaller chunks for indexing
text_splitter = CharacterTextSplitter(        
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap  = 200, #striding over the text
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [8]:
len(texts)

24

In [9]:
# Download embeddings from OpenAI
embeddings = OpenAIEmbeddings()

In [10]:
docsearch = FAISS.from_texts(texts, embeddings)

In [11]:
docsearch.embedding_function

<bound method OpenAIEmbeddings.embed_query of OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None, tiktoken_model_name=None)>

In [12]:
chain = load_qa_chain(OpenAI(), 
                      chain_type="stuff") 

In [13]:
query = "who are the authors of this publication?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The authors of this publication are A. Habte, M. Sengupta, A. Andreas, S. Wilcox, and T. Stoffel.'

In [14]:
query = "Is Ranganath Narasappa one of the authors of this publication?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Yes, Ranganath Narasappa is listed as one of the authors of this publication.'

In [15]:
query = "What is the title of this publication?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-hcJ73RH54RI6O3nCQj1hlysF on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-hcJ73RH54RI6O3nCQj1hlysF on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

' The title of this publication is "Characterization of a Low-Cost Multi-Parameter Sensor for Resource Applications".'

In [16]:
query = "summarise this publication in few lines?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-hcJ73RH54RI6O3nCQj1hlysF on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-hcJ73RH54RI6O3nCQj1hlysF on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

' This publication is a conference paper presented at the 2018 World Conference on Photovoltaic Energy Conversion (WCPEC-7) in Waikoloa, Hawaii. It is written by Aron Habte, Manajit Sengupta, Afshin Andreas, Ranganath Narasappa, Taylor Thomas, Adam Wolf and Christian A. Gueymard and is about the characterisation of a low-cost multi-parameter sensor for resource applications. It is available, free of charge, from the National Renewable Energy Laboratory (NREL) website.'

In [17]:
query = "what are the contact information mentioned in this publication?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-hcJ73RH54RI6O3nCQj1hlysF on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-hcJ73RH54RI6O3nCQj1hlysF on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

' Phone: 800.553.6847 or 703.605.6000, Fax: 703.605.6900, Email: orders@ntis.gov'