In [1]:
import os 
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, OpenAIEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate

In [2]:
from urllib.request import urlretrieve

In [3]:
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
os.makedirs("us_census", exist_ok=True)

In [4]:
import urllib.request
import os

for url in files:
    req = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0"}
    )

    file_path = os.path.join("us_census", url.rpartition("/")[2])

    with urllib.request.urlopen(req) as response, open(file_path, "wb") as out_file:
        out_file.write(response.read())


In [5]:
loader=PyPDFDirectoryLoader("./us_census/")

In [6]:
docs_before_split = loader.load()
print(f"Number of documents before splitting: {len(docs_before_split)}")


Number of documents before splitting: 63


In [7]:
len(docs_before_split[2].page_content)

1718

In [8]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
docs_after_split=text_splitter.split_documents(docs_before_split)

In [9]:
len(docs_after_split[2].page_content)

685

In [10]:
avg_len_split =lambda docs:sum ([len(doc.page_content) for doc in docs])/len(docs)
 

In [12]:
avg_len_before_split=avg_len_split(docs_before_split)
avg_len_after_split=avg_len_split(docs_after_split)
print(f"Average length of documents before splitting: {avg_len_before_split}")
print(f"Average length of documents after splitting: {avg_len_after_split}")

Average length of documents before splitting: 3841.3333333333335
Average length of documents after splitting: 624.2437185929648


In [13]:
from langchain_community.embeddings import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)



  huggingface_embeddings = HuggingFaceEmbeddings(


In [None]:
huggingface_embeddings.embed_query("hello sally?")

[-0.05321172997355461,
 0.14960983395576477,
 0.08134892582893372,
 0.015747172757983208,
 -0.10374341160058975,
 0.02443990670144558,
 0.07608451694250107,
 -0.0038568542804569006,
 -0.07785522192716599,
 -0.008683320134878159,
 0.0132808368653059,
 -0.06875558197498322,
 0.034798018634319305,
 -0.06851699203252792,
 0.07301387935876846,
 0.024702247232198715,
 -0.0359608456492424,
 0.026374036446213722,
 -0.0411851592361927,
 0.03774288669228554,
 0.04970644414424896,
 0.040797315537929535,
 0.031543001532554626,
 0.023640330880880356,
 -0.022097475826740265,
 0.009641065262258053,
 0.04332054778933525,
 -0.054367851465940475,
 0.023850949481129646,
 -0.01434000302106142,
 -0.06951232254505157,
 0.020490234717726707,
 0.06323406100273132,
 0.0344301201403141,
 0.031272295862436295,
 0.051316916942596436,
 0.10787782818078995,
 0.026004305109381676,
 0.07277008891105652,
 -0.03325263783335686,
 -0.11101681739091873,
 -0.07697805762290955,
 -0.00824479665607214,
 -0.022002169862389565,

In [14]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [15]:
query="what are the trendiest in median household incomes  in the US between 2021 and 2023?"

In [16]:
relevant_docs=vectorstore.similarity_search(query, k=3) 

print(relevant_docs)
print("---")
print(relevant_docs[0].page_content)

[Document(id='2443132a-a687-478e-a269-1108313e3544', metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-19T11:35:38-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'household income in states and metropolitan areas 2022', 'moddate': '2023-11-30T12:35:09+00:00', 'title': 'Household Income in States and Metropolitan Areas: 2022', 'trapped': '/false', 'source': 'us_census\\acsbr-017.pdf', 'total_pages': 9, 'page': 1, 'page_label': '2'}, page_content='Comparisons\nThe U.S. median household income \nin 2022 was $74,755, according \nFigure 1.\nMedian Household Income in the Past 12 Months in the United States: 2005–2022\n \nNote: Estimates for 2020 experimental data not shown. For more information on the 2020 experimental data products, \nrefer to <www.census.gov/programs-surveys/acs/technical-documentation/user-notes/2021-02.html>. Information on \nconﬁdentiality protection, sampling error, nonsampling error, and deﬁnitio

In [17]:
retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
access_token = "**"
# Local LLM - no API token needed, runs on CPU
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, force_download=True)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)

hf = HuggingFacePipeline(pipeline=pipe)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
  hf = HuggingFacePipeline(pipeline=pipe)


In [24]:
query = "What are the trends in median household income across different states in the united states between 2021 and 2022?"

hf.invoke(query)

'median household income in the state of california'

In [25]:

prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)
     

In [26]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf, 
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

### USE THE RAG !!! 

In [30]:
result = retrievalQA.invoke({"query" : query})
print(result)
print(result.keys())

{'query': 'What are the trends in median household income across different states in the united states between 2021 and 2022?', 'result': '0.8 percent', 'source_documents': [Document(id='16e80d21-5bd7-4f58-9f3f-9f32dd54c7e0', metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-19T11:35:38-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'household income in states and metropolitan areas 2022', 'moddate': '2023-11-30T12:35:09+00:00', 'title': 'Household Income in States and Metropolitan Areas: 2022', 'trapped': '/false', 'source': 'us_census\\acsbr-017.pdf', 'total_pages': 9, 'page': 3, 'page_label': '4'}, page_content='hold income in 2022 was $24,112 \n(Table 1 and Figure 2). Median \nhousehold income was lower than \nthe U.S. median in 30 states and \nPuerto Rico. It was higher than the \nU.S. median in 17 states and the \nDistrict of Columbia. The medians \nfor Arizona, Oregon, and Vermont \nwere not statistically di

In [31]:

print(result['result'])

0.8 percent


In [32]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: us_census\acsbr-017.pdf, Page: 3
Content: hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont 
were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
---------------------------------------------------------------------------------