In [18]:
import warnings

# Install a blanket "ignore" filter so that no warning of any category is
# shown in the cell outputs from this point on.
warnings.simplefilter("ignore")


In [19]:
import json
import os
import sys

import boto3

# The local `utils` package is imported from the parent directory, so that
# directory is added to the import search path first.
module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, printing


# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

# os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
# os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."

# Both settings are optional: when the environment variable is unset, None is
# passed through and the helper decides what to do with it.
boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.getenv("BEDROCK_ASSUME_ROLE"),
    region=os.getenv("AWS_DEFAULT_REGION"),
)


Create new client
  Using region: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)


In [20]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

# Text-generation model: Anthropic Claude v2 served through the Bedrock client,
# with `max_tokens_to_sample` set to 200.
llm = Bedrock(
    client=boto3_bedrock,
    model_id="anthropic.claude-v2",
    model_kwargs={"max_tokens_to_sample": 200},
)

# Embedding model: Amazon Titan text embeddings, served through the same client.
bedrock_embeddings = BedrockEmbeddings(
    client=boto3_bedrock,
    model_id="amazon.titan-embed-text-v1",
)


In [4]:
from urllib.request import urlretrieve

# Download the sample IRS publications into ./data.
# Files that are already present are skipped so that re-running the cell does
# not fetch them again; delete a file from ./data to force a fresh download.
os.makedirs("data", exist_ok=True)
files = [
    "https://www.irs.gov/pub/irs-pdf/p1544.pdf",
    "https://www.irs.gov/pub/irs-pdf/p15.pdf",
    "https://www.irs.gov/pub/irs-pdf/p1212.pdf",
]
for url in files:
    # The local file name is the last path segment of the URL.
    file_path = os.path.join("data", url.rpartition("/")[2])
    if os.path.exists(file_path):
        continue
    # Fetch under a temporary name and rename only on success, so an
    # interrupted download never leaves a truncated PDF at the final path
    # (which the existence check above would then wrongly skip).
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)


In [21]:
import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader

# Load every PDF found under ./data; `documents` is whatever the loader returns.
loader = PyPDFDirectoryLoader("./data/")
documents = loader.load()

# - in our testing Character split works better with this PDF data set
# Split the loaded documents into chunks of size 1000 with an overlap of 100
# (presumably measured in characters, the splitter's default — TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)


In [6]:
def avg_doc_length(documents):
    """Return the average `page_content` length of `documents`, floored to an int.

    Args:
        documents: Sized collection of objects exposing a `page_content` string.

    Returns:
        The integer (floor-divided) mean number of characters per document,
        or 0 when `documents` is empty instead of raising ZeroDivisionError.
    """
    if not documents:
        return 0
    return sum(len(doc.page_content) for doc in documents) // len(documents)
# Summarise document counts and average lengths before and after splitting.
avg_char_count_pre = avg_doc_length(documents)
avg_char_count_post = avg_doc_length(docs)
print(f'Average length among {len(documents)} documents loaded is {avg_char_count_pre} characters.')
# len(docs) is the total number of chunks after the split, not the increase
# over the original count, so the message states both totals side by side.
print(f'After the split we have {len(docs)} documents, compared with the original {len(documents)}.')
print(f'Average length among {len(docs)} documents (after split) is {avg_char_count_post} characters.')


Average length among 73 documents loaded is 5850 characters.
After the split we have 503 documents more than the original 73.
Average length among 503 documents (after split) is 910 characters.


In [7]:
try:
    # Smoke test: embed the first chunk and show the resulting vector and its shape.
    sample_embedding = np.array(bedrock_embeddings.embed_query(docs[0].page_content))
    print("Sample embedding of a document chunk: ", sample_embedding)
    print("Size of the embedding: ", sample_embedding.shape)

except ValueError as error:
    # Only access-denied failures get the friendly message; anything else is
    # re-raised unchanged with its original traceback.
    if "AccessDeniedException" not in str(error):
        raise

    # \x1b[41m ... \x1b[0m are ANSI escape codes (red background, then reset).
    # The message is built from adjacent literals so that source indentation
    # does not leak into the printed text.
    print(
        f"\x1b[41m{error}\n"
        "To troubleshoot this issue please refer to the following resources.\n"
        "https://docs.aws.amazon.com/IAM/latest/UserGuide/troubleshoot_access-denied.html\n"
        "https://docs.aws.amazon.com/bedrock/latest/userguide/security-iam.html\x1b[0m\n"
    )

    class StopExecution(ValueError):
        """Raised to halt the cell after the help text has been printed."""

        # NOTE(review): presumably IPython consults this hook when rendering
        # the exception, so returning None suppresses the traceback — confirm.
        def _render_traceback_(self):
            pass

    raise StopExecution


Sample embedding of a document chunk:  [ 0.27539062 -0.38085938 -0.29492188 ... -0.109375   -0.26757812
 -0.11230469]
Sample metadata of a document chunk:  {'source': 'data/p1544.pdf', 'page': 0}
Sample text of a document chunk:  clerks of federal or state courts are discussed 
later under Bail received by court clerks.
However, you do not have to file Form 8300 
if the transaction is not related to your trade or 
business. For example, if you own a jewelry 
store and sell your personal automobile for 
more than $10,000 in cash, you would not sub-
mit a Form 8300 for that transaction.
Transaction defined.  A “transaction” occurs 
when:
Goods, services, or property are sold;
Property is rented;
Cash is exchanged for other cash;
A contribution is made to a trust or escrow 
account;
A loan is made or repaid; or
Cash is converted to a negotiable instru-
ment, such as a check or a bond.
Person defined.  A “person” includes an indi-
vidual, a company, a corporation, a partnership, 
an associ

In [22]:
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

# Build a FAISS vector store from the document chunks, embedding each one with
# the Bedrock embeddings object.
vectorstore_faiss = FAISS.from_documents(documents=docs, embedding=bedrock_embeddings)

# Expose the same store through LangChain's index wrapper.
wrapper_store_faiss = VectorStoreIndexWrapper(vectorstore=vectorstore_faiss)


In [9]:
query = """Is it possible that I get sentenced to jail due to failure in filings?"""
# Embed the question with the same embeddings object that was handed to the
# FAISS store. Calling `vectorstore_faiss.embedding_function(query)` only works
# on LangChain releases where that attribute is a plain callable; on releases
# where it holds the embeddings object itself the call raises TypeError.
query_embedding = bedrock_embeddings.embed_query(query)
# Last expression of the cell: display the embedding as a NumPy array.
np.array(query_embedding)


array([-0.18066406,  0.25976562, -0.09082031, ...,  0.3984375 ,
       -0.58984375, -0.27148438])

In [10]:
# Look up the stored chunks closest to the query embedding and print each one
# under a 1-based heading.
relevant_documents = vectorstore_faiss.similarity_search_by_vector(query_embedding)
print(f'{len(relevant_documents)} documents are fetched which are relevant to the query.')
print('----')
for position, match in enumerate(relevant_documents, start=1):
    print(f'## Document {position}: {match.page_content}.......')
    print('---')


4 documents are fetched which are relevant to the query.
----
## Document 1: There are civil penalties for failure to:
File a correct Form 8300 by the date it is 
due, and
Provide the required statement to those 
named in the Form 8300.
If you intentionally disregard the requirement 
to file a correct Form 8300 by the date it is due, 
the penalty is the greater of:
1.$25,000, or
2.The amount of cash you received and 
were required to report (up to $100,000).
There are criminal penalties for:
Willful failure to file Form 8300,
Willfully filing a false or fraudulent Form 
8300,
Stopping or trying to stop Form 8300 from 
being filed, and
Setting up, helping to set up, or trying to 
set up a transaction in a way that would 
make it seem unnecessary to file Form 
8300.
If you willfully fail to file Form 8300, you can 
be fined up to $250,000 for individuals 
RECORDS($500,000 for corporations) or sentenced to up 
to 5 years in prison, or both. These dollar 
amounts are based on Section 3571 

In [11]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
prompt_template = """

Human: Use the following pieces of context to provide a concise answer to the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context

Question: {question}

Assistant:"""

# Wrap the template text in a PromptTemplate, declaring the two placeholders
# it expects to be filled in.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [12]:
# Retrieval QA chain: a similarity retriever over the FAISS store (k=3) feeds
# the "stuff" chain, which uses the custom prompt; source documents are
# returned together with the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore_faiss.as_retriever(
        search_type="similarity",
        search_kwargs=dict(k=3),
    ),
    chain_type_kwargs=dict(prompt=PROMPT),
    return_source_documents=True,
)

# Run the chain on the current question and print the raw result.
answer = qa(dict(query=query))
print(answer)


{'query': 'Is it possible that I get sentenced to jail due to failure in filings?', 'result': ' Based on the context provided, yes - it is possible to be sentenced to jail due to failure in filings of Form 8300. The context states that there are criminal penalties for willful failure to file Form 8300, which can result in being fined up to $250,000 for individuals ($500,000 for corporations) or sentenced to up to 5 years in prison, or both. So jail time is a possible criminal penalty for willful failure to properly file Form 8300.', 'source_documents': [Document(page_content='There are civil penalties for failure to:\nFile a correct Form 8300 by the date it is \ndue, and\nProvide the required statement to those \nnamed in the Form 8300.\nIf you intentionally disregard the requirement \nto file a correct Form 8300 by the date it is due, \nthe penalty is the greater of:\n1.$25,000, or\n2.The amount of cash you received and \nwere required to report (up to $100,000).\nThere are criminal p

In [13]:


# Ask a second question through the same chain and pretty-print the result.
query_2 = "What is the difference between market discount and qualified stated interest"
answer_2 = qa(dict(query=query_2))
# NOTE(review): `answer` is rebound to this second result as well, replacing
# any earlier value of that name — looks accidental; confirm before removing.
answer = answer_2
printing.print_ww(answer_2)


{'query': 'What is the difference between market discount and qualified stated interest', 'result':
' Based on the context provided:\n\nMarket discount is the difference between the issue price plus
accrued OID and the adjusted basis (usually purchase price) of an OID debt instrument when it is
purchased in the secondary market at a value below its issue price. \n\nQualified stated interest is
stated interest that is unconditionally payable in cash or property at least annually over the term
of the debt instrument at a single fixed rate.\n\nSo the key difference is that market discount
refers to the discount on the purchase price of a debt instrument purchased in the secondary market
compared to its issue price, while qualified stated interest refers to a specific type of interest
payment on a debt instrument. Market discount is a form of discount, while qualified stated interest
is a form of interest payment.', 'source_documents': [Document(page_content="Market discount.  An
OID debt 

In [23]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

prompt_template = """

Human: Use the following pieces of context to provide a concise answer to the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context

Question: {question}

Assistant:"""

# Prompt object built from the template text and its two placeholders.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Retrieval QA chain: similarity search over the FAISS store with k=3, the
# "stuff" chain type with the custom prompt, and source documents included in
# the result.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore_faiss.as_retriever(
        search_type="similarity",
        search_kwargs=dict(k=3),
    ),
    chain_type_kwargs=dict(prompt=PROMPT),
    return_source_documents=True,
)
# Ask a question unrelated to the indexed PDFs, then print the answer followed
# by the source documents that were retrieved for it.
# NOTE(review): this rebinds the global `query`.
query = "what is the average velocity of an unladen swallow?"
result = qa(dict(query=query))
for field in ("result", "source_documents"):
    printing.print_ww(result[field])


 I do not have enough context to determine the average velocity of an unladen swallow. The provided
context discusses calculating yield to maturity for stripped bonds and coupons, and does not mention
anything about swallows.
---------------------
[Document(page_content='of a stripped bond or coupon is the discount \nrate that, when used in
figuring the present \nvalue of all principal and interest payments, pro-\nduces an amount equal to
the acquisition price.\nFiguring YTM.  How you figure the YTM for \na stripped debt instrument or
coupon pur-\nchased after 1984 depends on whether you Page 13 of 18  Fileid: …
ns/p1212/202301/a/xml/cycle06/source 12:46 - 31-Mar-2023\nThe type and rule above prints on all
proofs including departmental reproduction proofs. MUST be removed before printing.\nPublication
1212 (January 2023)   Page 13', metadata={'source': 'data/p1212.pdf', 'page': 12}),
Document(page_content='reporting requirements 2\nS\nSection I 3\nSection II 3\nSection III 3\nShort-
te