In [None]:
"""
Retrieval Augmentation Method
: feeding external knowledge to LLM asssdfs
"""

In [1]:
import os
from dotenv import load_dotenv
load_dotenv(verbose=True)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENV')
print(f'OPENAI_API_KEY : {OPENAI_API_KEY}')
print(f'PINECONE_API_KEY : {PINECONE_API_KEY}')
print(f'PINECONE_ENV : {PINECONE_ENV}')


OPENAI_API_KEY : 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
PINECONE_API_KEY : dc56b06e-8fee-4750-b991-c8ae1ebd17e5
PINECONE_ENV : us-west1-gcp-free


In [2]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("us_constitution.pdf")
pages = loader.load()
print(pages[0].__dict__)
# print(pages[1].page_content)

{'page_content': 'The\nUnited\nStates\nConstitution\nW e\nthe\nPeople\nof\nthe\nUnited\nStates,\nin\nOrder\nto\nform\na\nmore\nperfect\nUnion,\nestablish\nJustice,\ninsure\ndomestic\nT ranquility ,\nprovide\nfor\nthe\ncommon\ndefence,\npromote\nthe\ngeneral\nW elfare,\nand\nsecure\nthe\nBlessings\nof\nLiberty\nto\nourselves\nand\nour\nPosterity ,\ndo\nordain\nand\nestablish\nthis\nConstitution\nfor\nthe\nUnited\nStates\nof\nAmerica.\nThe\nConstitutional\nCon v ention\nArticle\nI\nSection\n1:\nCongress\nAll\nlegislative\nPowers\nherein\ngranted\nshall\nbe\nvested\nin\na\nCongress\nof\nthe\nUnited\nStates,\nwhich\nshall\nconsist\nof\na\nSenate\nand\nHouse\nof\nRepresentatives.\nSection\n2:\nThe\nHouse\nof\nRepresentatives', 'metadata': {'source': 'us_constitution.pdf', 'page': 0}}


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
all_splits = text_splitter.split_documents(pages)
print(all_splits)#.__dict__)


[Document(page_content='The\nUnited\nStates\nConstitution\nW e\nthe\nPeople\nof\nthe\nUnited\nStates,\nin\nOrder\nto\nform\na\nmore\nperfect\nUnion,\nestablish\nJustice,\ninsure\ndomestic\nT ranquility ,\nprovide\nfor\nthe\ncommon\ndefence,\npromote\nthe\ngeneral\nW elfare,\nand\nsecure\nthe\nBlessings\nof\nLiberty\nto\nourselves\nand\nour\nPosterity ,\ndo\nordain\nand\nestablish\nthis\nConstitution\nfor\nthe\nUnited\nStates\nof\nAmerica.\nThe\nConstitutional\nCon v ention\nArticle\nI\nSection\n1:\nCongress\nAll\nlegislative\nPowers\nherein\ngranted\nshall\nbe\nvested\nin\na\nCongress\nof\nthe', metadata={'source': 'us_constitution.pdf', 'page': 0}), Document(page_content='United\nStates,\nwhich\nshall\nconsist\nof\na\nSenate\nand\nHouse\nof\nRepresentatives.\nSection\n2:\nThe\nHouse\nof\nRepresentatives', metadata={'source': 'us_constitution.pdf', 'page': 0}), Document(page_content='The\nHouse\nof\nRepresentatives\nshall\nbe\ncomposed\nof\nMembers\nchosen\nevery\nsecond\nY ear\nby\nth

In [12]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.0004:.6f}')

print_embedding_cost(all_splits)

Total Tokens: 16776
Embedding Cost in USD: 0.006710


In [4]:
import os
import pinecone

print('PINECONE_API_KEY', PINECONE_API_KEY)
print('PINECONE_ENV', PINECONE_ENV)
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV,  # next to api key in console
)


index_name = "langchain-demo"

# First, check if our index already exists. If it doesn't, we create it
if index_name not in pinecone.list_indexes():
    # we create a new index
    print('start create index')
    pinecone.create_index(
          name=index_name,
          metric='cosine',
          dimension=1536  
    )
    print('end create index')
else:
    print('index already exists')

  from tqdm.autonotebook import tqdm


PINECONE_API_KEY dc56b06e-8fee-4750-b991-c8ae1ebd17e5
PINECONE_ENV us-west1-gcp-free
index already exists


In [16]:
def insert_or_fetch_embeddings(index_name):
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings
    
    embeddings = OpenAIEmbeddings()
    
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))
    
    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(all_splits, embeddings, index_name=index_name)
        print('Ok')
        
    return vector_store
    

In [17]:
vector_store = insert_or_fetch_embeddings("langchain-demo")

Creating index langchain-demo and embeddings ...Ok


In [18]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

q = 'What is the whole document about?'
answer = chain.run(q)
print(answer)

The document in question is the United States Constitution. It is a legal document that outlines the fundamental principles and framework of the United States government. It establishes the structure of the government, defines the powers and limitations of the different branches, and guarantees certain rights and freedoms to the American people.
