In [32]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA



In [None]:
## Load every PDF in the census folder and split the pages into chunks
# Loader reference:
# https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/pdf/#pypdf-directory

loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Split the loaded documents into chunks (size 1000, overlap 200 between neighbours).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)

# Display the first chunk as a quick sanity check.
final_documents[0]

In [33]:
# Number of chunks held in final_documents.
len(final_documents)

316

In [34]:
## Embedding model (Hugging Face BGE)
# Reference: https://python.langchain.com/v0.1/docs/integrations/text_embedding/bge_huggingface/
# Alternative model name: sentence-transformers/all-MiniLM-l6-v2

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},  # load the model on the CPU
    encode_kwargs={"normalize_embeddings": True},  # request normalized vectors from the encoder
)



In [35]:
# Sanity check: embed a short sample sentence and display the returned vector.
huggingface_embeddings.embed_query("hi this is harrison")

[-0.013697370886802673,
 0.01759220100939274,
 0.0047983331605792046,
 -0.03559265285730362,
 0.006723250728100538,
 -0.03481069952249527,
 0.08128121495246887,
 0.004473627544939518,
 0.00266089104115963,
 -0.00950930267572403,
 0.015451968647539616,
 -0.0557049997150898,
 -0.004588160198181868,
 0.01328799594193697,
 0.026125693693757057,
 -0.06999903172254562,
 0.06557069718837738,
 -0.014749940484762192,
 -0.0026622996665537357,
 -0.004501919727772474,
 0.02205096371471882,
 0.045958563685417175,
 -0.04775809869170189,
 -0.039398614317178726,
 0.05938587337732315,
 -0.0030875890515744686,
 0.03972360119223595,
 -0.016757991164922714,
 -0.03204094246029854,
 -0.11328400671482086,
 -0.038143616169691086,
 0.013412456028163433,
 0.03303747624158859,
 0.03761553019285202,
 0.0031985195819288492,
 0.01186575647443533,
 0.025771645829081535,
 0.03292486444115639,
 0.008472209796309471,
 0.01434817723929882,
 -0.016087118536233902,
 -0.045701153576374054,
 0.03868671879172325,
 0.05413995

In [36]:
import numpy as np

# Embed the first chunk once and reuse the vector for both prints, instead of
# running the embedding model a second time just to read the shape.
first_chunk_vector = np.array(
    huggingface_embeddings.embed_query(final_documents[0].page_content)
)
print(first_chunk_vector)
print(first_chunk_vector.shape)

[-8.46568272e-02 -1.19099189e-02 -3.37892659e-02  2.94559132e-02
  5.19160181e-02  5.73840030e-02 -4.10017967e-02  2.74267718e-02
 -1.05128221e-01 -1.58056132e-02  7.94858783e-02  5.64318150e-02
 -1.31765679e-02 -3.41544300e-02  5.81598235e-03  4.72548082e-02
 -1.30746616e-02  3.12984665e-03 -3.44225727e-02  3.08406539e-02
 -4.09086272e-02  3.52737606e-02 -2.43761316e-02 -4.35831062e-02
  2.41503324e-02  1.31986495e-02 -4.84455889e-03  1.92347132e-02
 -5.43913543e-02 -1.42735064e-01  5.15530631e-03  2.93116011e-02
 -5.60810603e-02 -8.53529852e-03  3.14141475e-02  2.76736375e-02
 -2.06188504e-02  8.24231282e-02  4.15425636e-02  5.79654947e-02
 -3.71587686e-02  6.26163371e-03 -2.41389740e-02 -5.61798457e-03
 -2.51715872e-02  5.04973531e-03 -2.52801329e-02 -2.91944970e-03
 -8.24040920e-03 -5.69604672e-02  2.30822787e-02 -5.54218795e-03
  5.11555783e-02  6.09937720e-02  6.49766028e-02 -5.38514033e-02
  2.19109766e-02 -2.54194904e-02 -4.49223556e-02  4.22459021e-02
  4.75252569e-02  7.23186

In [37]:
## Build the FAISS vector store
# Only the first 120 chunks are embedded and indexed here, so searches can
# only return text from that subset of final_documents.
vectorstore = FAISS.from_documents(
    documents=final_documents[:120],
    embedding=huggingface_embeddings,
)

In [38]:
## Similarity search against the vector store
query = "WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_docments = vectorstore.similarity_search(query)

# Print the text of the first chunk returned by the search.
first_hit = relevant_docments[0]
print(first_hit.page_content)

2 U.S. Census Bureau
WHAT IS HEALTH INSURANCE COVERAGE?
This brief presents state-level estimates of health insurance coverage 
using data from the American Community Survey (ACS). The  
U.S. Census Bureau conducts the ACS throughout the year; the 
survey asks respondents to report their coverage at the time of 
interview. The resulting measure of health insurance coverage, 
therefore, reflects an annual average of current comprehensive 
health insurance coverage status.* This uninsured rate measures a 
different concept than the measure based on the Current Population 
Survey Annual Social and Economic Supplement (CPS ASEC). 
For reporting purposes, the ACS broadly classifies health insurance 
coverage as private insurance or public insurance. The ACS defines 
private health insurance as a plan provided through an employer 
or a union, coverage purchased directly by an individual from an 
insurance company or through an exchange (such as healthcare.


In [39]:
# Expose the vector store as a retriever: similarity search, k=3 results per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001E7A8B182C0>, search_kwargs={'k': 3})

In [40]:
import os
from getpass import getpass

# Never store the API token in the notebook: anything saved here is shared with
# everyone who can read the file. Reuse a token that is already present in the
# environment; otherwise prompt for it without echoing the input.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

The Hugging Face Hub is an online platform with over 350k models, 75k datasets, and 150k demo apps (Spaces), all open source and publicly available, where people can easily collaborate and build ML together.

In [46]:
from langchain_community.llms import HuggingFaceHub

# LLM served through the Hugging Face Hub.
# Alternative repo_id: "mistralai/Mistral-7B-v0.1"
hf = HuggingFaceHub(
    repo_id="bigscience/bloom",
    model_kwargs={"temperature": 0.1, "max_length": 500},
)

# Send the current query straight to the model, without any retrieved context.
hf.invoke(query)

'WHAT IS HEALTH INSURANCE COVERAGE? Health insurance coverage is a contract between you and your insurance company. It is a legal document that spells'

In [49]:
# Prompt text with two placeholders, {context} and {question}, to be filled in
# when the prompt is formatted.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [50]:
# Wrap the template text, declaring the two placeholders it expects.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [51]:
# Question-answering chain that combines the retriever, the custom prompt and the LLM.
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    # Request the retrieved documents in the output alongside the answer.
    return_source_documents=True,
)

In [52]:
# Candidate query (a multi-line string; the line breaks are part of the value).
# NOTE(review): the next cell reassigns `query` before the QA chain is invoked,
# so this value is not the one the chain receives.
query="""DIFFERENCES IN THE
UNINSURED RATE BY STATE
IN 2022"""

In [54]:
# Question passed to the QA chain.
# NOTE(review): "remianed" looks like a misspelling of "remained"; the text is
# kept exactly as written so the query sent to the retriever is unchanged.
query = "which states remianed uninsured?"

In [55]:
# Run the retrieval QA chain on the current query and print the text stored
# under the 'result' key of its output.
result = retrievalQA.invoke({"query": query})
answer_text = result["result"]
print(answer_text)


Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

4 U.S. Census Bureau
Among that group, only Wisconsin 
had not expanded Medicaid eligi -
bility. As a group, the states that 
expanded Medicaid eligibility had 
a lower uninsured rate (6.3 per -
cent) compared with nonexpan -
sion states (11.8 percent).12
States in the South had some of 
the highest uninsured rates, while 
states in the Northeast had some 
of the lowest uninsured rates. Of 
the 15 states that had uninsured 
rates above the national average, 
nine were in the South, ranging 
12 Nonexpansion states are states that did 
not expand Medicaid eligibility.from 8.8 percent to 16.6 percent. 
All states in the Northeast—
Connecticut, Maine, Massachusetts, 
New Hampshire, New Jersey, New 
York, Pennsylvania, Rhode Island, 
and Vermont—had uninsured rates 
below the national average.13
PRIVATE HEALTH INSURANCE 
COVERAGE BY STATE IN 2022
Private coverage may