In [3]:
# One-time environment setup: uncomment and run these lines to install the packages this notebook imports.
#!pip install langchain-openai unstructured==0.7.12 pinecone-client openai tiktoken
#!pip install langchain
#!pip install --upgrade langchain-pinecone

In [24]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.chains import RetrievalQA
from langchain import OpenAI
from langchain_openai import ChatOpenAI
import os
import nltk
import pinecone
import openai
import getpass
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

Set Pinecone Instance

In [5]:
# Read the Pinecone API key interactively so the secret is never written into the notebook.
# The prompt is labelled (as the OpenAI key prompt later in the notebook is) so the user
# can tell which credential is being requested.
pine_cone_api_key = getpass.getpass('Enter Pinecone API key')

··········


In [27]:
# Publish the key as PINECONE_API_KEY in this process's environment
# (presumably read by Pinecone integrations that are not handed a key explicitly -- TODO confirm).
os.environ.update({'PINECONE_API_KEY': pine_cone_api_key})

In [26]:
# Build the Pinecone client from the key entered earlier.
pc = Pinecone(
    api_key=pine_cone_api_key,
)
# Echo the client object as the cell output.
pc

<pinecone.control.pinecone.Pinecone at 0x7ae5cae79f60>

Enter the URLs from which the PDF content needs to be extracted

In [59]:
# URLs that each serve a PDF document.
urls = [
    'https://arxiv.org/pdf/1706.03762',
    'https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf',
]

# UnstructuredURLLoader is already imported in the notebook's import cell
# (from langchain_community.document_loaders), so it is not re-imported here.
loader = UnstructuredURLLoader(urls=urls)

# Fetch every URL and load its PDF content.
urls_pdf_content = loader.load()

In [60]:
# Merging all pdf content from the urls into a single list
# Gather everything loaded from the URLs into one flat list
# (inspect a single entry with documents[0] if needed).
documents = [*urls_pdf_content]

Split the text loaded from each URL into chunks

In [50]:
# Splitter configured with chunk_size=500 and chunk_overlap=20.
textsplitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
)
text = textsplitter.split_documents(documents)

# Preview the first ten chunks; each chunk keeps the URL it came from as its 'source' metadata.
text[:10]

[Document(page_content='Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nAttention Is All You Need\n\n3 2 0 2\n\nAshish Vaswani∗ Google Brain avaswani@google.com\n\nNoam Shazeer∗ Google Brain noam@google.com\n\nNiki Parmar∗ Google Research nikip@google.com\n\nJakob Uszkoreit∗ Google Research usz@google.com\n\ng u A 2\n\nLlion Jones∗ Google Research llion@google.com', metadata={'source': 'https://arxiv.org/pdf/1706.03762'}),
 Document(page_content='Aidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu\n\nŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com\n\n] L C . s c [\n\nIllia Polosukhin∗ ‡ illia.polosukhin@gmail.com\n\nAbstract\n\n7 v 2 6 7 3 0 . 6 0 7 1 : v i X r a', metadata={'source': 'https://arxiv.org/pdf/1706.03762'}),
 Document(page_content='The dominant sequence transduction models are based on complex recurrent or convolutional neural netwo

Instantiate OpenAI

In [11]:
# Read the OpenAI API key interactively so the secret is never written into the notebook.
openai_api_key = getpass.getpass(prompt='Enter Open AI API key')

Enter Open AI API key··········


In [12]:
# Publish the key as OPENAI_API_KEY in this process's environment
# (presumably where the OpenAI-backed objects created later look for it -- TODO confirm).
os.environ.update({"OPENAI_API_KEY": openai_api_key})

In [16]:
# Show the names of the indexes visible to this Pinecone client.
index_listing = pc.list_indexes()
index_listing.names()

['myindex']

In [51]:
# Embedding model used to vectorise the split text.
embeddings = OpenAIEmbeddings()
pinecone_index_name = 'myindex'

# Create the index only when it does not exist yet; afterwards 'myindex'
# should be visible in the Pinecone portal.
existing_index_names = pc.list_indexes().names()
index_is_missing = pinecone_index_name not in existing_index_names
if index_is_missing:
    pc.create_index(
        name=pinecone_index_name,
        # NOTE(review): presumably has to equal the vector size produced by `embeddings` -- confirm.
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [52]:
# Vector-store handle bound to the index named above, using the embedding model created earlier.
pc_vs = PineconeVectorStore(
    index_name=pinecone_index_name,
    embedding=embeddings,
)

In [53]:
# Embed the split text and store it in the Pinecone index.
# from_documents is a constructor-style classmethod that returns a vector store of its own,
# so it is called on the class instead of through the `pc_vs` instance, which would wrongly
# suggest that `pc_vs` is being populated in place.
docsearch = PineconeVectorStore.from_documents(
    text,                            # the split text (Document chunks)
    embeddings,                      # the embeddings object
    index_name=pinecone_index_name,  # name of the index to store the vectors in
)
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x7ae5c5218c70>

Use LLM

In [31]:
# Chat model that writes the answers: gpt-4 at temperature 0.1, with max_tokens set to 256.
llm = ChatOpenAI(
    model_name="gpt-4",
    temperature=0.1,
    max_tokens=256,
)

In [54]:
# RetrievalQA retrieves relevant passages from a document corpus in response to a user query.
# Despite its name, `retriever` holds the whole question-answering chain; the actual
# document retriever is the one derived from the vector store just below.
doc_retriever = docsearch.as_retriever()

retriever = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=doc_retriever,
    return_source_documents=True,  # include the retrieved chunks in the result
    verbose=True,
)

# Echo the assembled chain as the cell output.
retriever

RetrievalQA(verbose=True, combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7ae5c6926a70>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7ae5c6927070>, model_name='gpt-4', temperature=0.1, openai_api_key=SecretStr('**********'), openai_proxy='', max_tokens=256)), document_variable_name='context'), return_source_documents=True, retriever=VectorStoreRetriever(tags=['PineconeVectorStore', 'OpenAIEmbeddings'],

In [57]:
prompt = "You are a researcher who is going to search the web links, summarize them and share insights as asked"

query = 'Is transformer architecture useful?'

# Run the chain through `.invoke`, the supported entry point; calling the chain object
# directly is deprecated in LangChain.
# NOTE: the chain's prompt template only has the variables 'context' and 'question', so the
# 'prompt' entry does not reach the model -- it is only echoed back in the result dict.
# It is kept so the shape of `result` stays the same. To actually apply the instruction,
# a custom prompt would have to be supplied when the chain is built.
result = retriever.invoke({'query': query, 'prompt': prompt})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [58]:
# Final result: a dict holding the inputs that were passed in ('query', 'prompt'),
# the generated answer ('result') and the retrieved chunks ('source_documents').
result

{'query': 'Is transformer architecture useful?',
 'prompt': 'You are a researcher who is going to search the web links, summarize them and share insights as asked',
 'result': 'Yes, the Transformer architecture is useful. It has been shown to perform strongly on various tasks such as machine translation, document generation, and syntactic parsing. It provides a more structured memory for handling long-term dependencies in text, compared to alternatives like recurrent networks. This results in robust transfer performance across diverse tasks. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers and has achieved a new state of the art in translation quality.',
 'source_documents': [Document(page_content='For our model architecture, we use the Transformer [62], which has been shown to perform strongly on various tasks such as machine translation [62], document generation [34], and syntactic parsing [29]. T