# Question-answering (QA) using LangChain and Hugging Face.

#### Step 1: Import Required Libraries

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

#### Step 2: Load PDFs from the Folder

In [None]:

# Load every PDF found under ./us_census_data into a list of documents.
loader = PyPDFDirectoryLoader("./us_census_data")
documents = loader.load()


#### Step 3: Split Documents into Chunks

In [None]:
# Split the loaded documents into chunks of size 1000 with an overlap of 200
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)


In [27]:
# Display the first chunk (metadata and page_content) to sanity-check the split.
final_documents[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.2 (Windows)', 'creationdate': '2023-09-09T07:52:17-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'acsbr-015', 'moddate': '2023-09-12T14:44:47+01:00', 'title': 'Health Insurance Coverage Status and Type by Geography: 2021 and 2022', 'trapped': '/false', 'source': 'us_census_data\\acsbr-015.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.1 Public policy changes included \nthe renewal of the Public Health Emergenc

In [28]:
# Number of chunks produced by the splitter.
len(final_documents)

316

#### Step 4: Convert Text into Embeddings Using Hugging Face

In [31]:

# Embedding model from Hugging Face, configured to run on the CPU.
# Alternative model that was noted here: sentence-transformers/all-MiniLM-l6-v2
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


#### Step 5: Test the Embeddings

In [32]:
import numpy as np

# Embed the first chunk once and reuse the vector for both prints; the
# previous version ran the embedding model twice on the same text.
sample_embedding = np.array(
    huggingface_embeddings.embed_query(final_documents[0].page_content)
)
print(sample_embedding)
print(sample_embedding.shape)

[-0.07903485 -0.01134113 -0.02312097  0.02844461  0.05053344  0.05317826
 -0.01907787  0.03456026 -0.10211368 -0.02915702  0.0852426   0.05650727
 -0.02545439 -0.0330849  -0.00635735  0.04090864 -0.00628108  0.00356744
 -0.03854129  0.03667685 -0.04289803  0.03425252 -0.03116899 -0.03793729
  0.01728391  0.01214924  0.00653119  0.01463565 -0.05529054 -0.15320712
  0.00730845  0.03202944 -0.04701132 -0.01595974  0.0187445   0.02642936
 -0.02306378  0.08438035  0.04182485  0.05278177 -0.03057602  0.01564262
 -0.01689074  0.00529409 -0.02417436  0.00412995 -0.01889937 -0.00150625
 -0.00836945 -0.03390065  0.03515961 -0.00553131  0.04910938  0.05971856
  0.05615963 -0.05105155  0.01475136 -0.01849959 -0.03284641  0.03576624
  0.04947704 -0.00938883 -0.26202118  0.0975033   0.01715692  0.0478139
 -0.00556317 -0.00298307 -0.02207355 -0.04463669 -0.05760482  0.04815878
 -0.05522206  0.01635333  0.03299246  0.02147079  0.01296219  0.01462309
  0.02174952 -0.00202999  0.02099538  0.03353847 -0.

#### Step 6: Store Documents in FAISS (Vector Database)

In [33]:
# Build the FAISS vector store.
# NOTE(review): only the first 120 chunks are indexed (the split above
# reported 316), so retrieval cannot see the rest — confirm this is intended.
vectorstore = FAISS.from_documents(
    final_documents[:120],
    huggingface_embeddings,
)

#### Step 7: Retrieve Similar Documents

In [34]:
# Run a similarity search against the vector store and show the text of the
# first returned chunk.
query = "WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_docments = vectorstore.similarity_search(query)
print(relevant_docments[0].page_content)

2 U.S. Census Bureau
WHAT IS HEALTH INSURANCE COVERAGE?
This brief presents state-level estimates of health insurance coverage 
using data from the American Community Survey (ACS). The  
U.S. Census Bureau conducts the ACS throughout the year; the 
survey asks respondents to report their coverage at the time of 
interview. The resulting measure of health insurance coverage, 
therefore, reflects an annual average of current comprehensive 
health insurance coverage status.* This uninsured rate measures a 
different concept than the measure based on the Current Population 
Survey Annual Social and Economic Supplement (CPS ASEC). 
For reporting purposes, the ACS broadly classifies health insurance 
coverage as private insurance or public insurance. The ACS defines 
private health insurance as a plan provided through an employer 
or a union, coverage purchased directly by an individual from an 
insurance company or through an exchange (such as healthcare.


#### Step 8: Convert FAISS into a Retriever

In [35]:
# Wrap the vector store as a retriever using similarity search with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000019B59940B60> search_kwargs={'k': 3}


#### Step 9: Set Up Hugging Face API for Mistral-7B

In [None]:

import os
from getpass import getpass

# Never hardcode the token in the notebook. Reuse HUGGINGFACEHUB_API_TOKEN if
# it is already set in the environment; otherwise prompt for it without
# echoing. The previous version always overwrote the variable with an empty
# string, discarding any token that was already configured.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

#### Step 10: Initialize Mistral-7B for Question Answering

In [68]:

from langchain_huggingface import HuggingFaceEndpoint

# LLM client pointed at the hosted Mistral-7B-Instruct-v0.2 inference endpoint.
hf = HuggingFaceEndpoint(
    endpoint_url=(
        "https://api-inference.huggingface.co/models/"
        "mistralai/Mistral-7B-Instruct-v0.2"
    ),
    temperature=0.1,
    model_kwargs={"max_length": 500},
    timeout=600,
)

# Smoke test: ask the model directly, without any retrieved context.
query = "What is the health insurance coverage?"
response = hf.invoke(query)
print(response)





Health insurance coverage refers to the benefits provided by an insurance policy to pay for medical and health-related expenses. These expenses can include doctor visits, hospital stays, prescription medications, and preventative care services. The specific coverage and costs will depend on the terms of the particular insurance policy.

Health insurance can be obtained through an employer, purchased individually, or through government programs such as Medicare or Medicaid. The Affordable Care Act (ACA) also provides options for individuals to purchase health insurance through state-based marketplaces.

It's important to note that health insurance coverage can vary widely, and it's essential to understand the specific terms and conditions of your policy to know what is covered and what is not. Regularly reviewing your policy and asking questions of your insurance provider can help ensure that you are getting the most out of your coverage.


#### Step 11: Create a Custom Prompt

In [70]:
# Prompt text for the QA chain. It contains two placeholders, {context} and
# {question}, which are declared as input variables when the PromptTemplate
# is built from this string.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """


In [71]:
# Build the prompt object from the template string and its two placeholders.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

#### Step 12: Create Retrieval-Augmented QA Chain


In [72]:
# Combine the LLM, the retriever and the custom prompt into one QA chain.
# return_source_documents=True asks the chain to also return the retrieved
# documents alongside the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

#### Step 13: Query the RAG System

In [73]:

# Three-line query text; the embedded newlines are part of the string.
query = (
    "DIFFERENCES IN THE\n"
    "UNINSURED RATE BY STATE\n"
    "IN 2022"
)

In [74]:
# Run the retrieval QA chain on the query and print only the answer text.
result = retrievalQA.invoke({"query": query})
print(result["result"])



1. The uninsured rate in Massachusetts was the lowest at 2.4% in 2022, while Texas had the highest rate at 16.6%. The national rate was 8.0%.
 2. Ten states had uninsured rates above the national average.
 3. Medicaid coverage accounted for a larger percentage in states that expanded Medicaid eligibility (22.7%) compared to nonexpansion states (18.0%).
 4. Uninsured rates decreased in 27 states from 2021 to 2022, while Maine had an increase. Maine's uninsured rate remained below the national average.
 5. The private coverage rates were not statistically different in North Dakota and Utah.
 6. The most populous metropolitan areas with the lowest uninsured rates were Boston-Cambridge-Newton, MA-NH, and San Francisco-Oakland-Berkeley, CA.
 7. Detroit-Warren-Dearborn, MI had a statistically significant increase in the uninsured rate from 2021 to 2022.
 8. For more information on confidentiality protection, sampling error, nonsampling error, and definitions in the American Community Survey,