In [5]:
# Document loading, vector store, embedding and text-splitting utilities used by the cells below.
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Prompt templates are imported from the `langchain_core` package.
from langchain_core.prompts import PromptTemplate

# RetrievalQA is imported from `langchain_classic` (the original note targets version 1.2.6).
from langchain_classic.chains import RetrievalQA

# Reached only when every import above succeeded.
print("‚úÖ Success: Everything is now properly imported!")

‚úÖ Success: Everything is now properly imported!


In [17]:
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Folder scanned for PDFs; "." means the directory the notebook is running in.
path = "."

print(f"Checking for PDFs in: {os.path.abspath(path)}")

# Load every PDF found under `path`.
loader = PyPDFDirectoryLoader(path)
documents = loader.load()

if documents:
    print(f"‚úÖ Loaded {len(documents)} pages.")

    # Break the loaded documents into overlapping chunks (1000 chars, 200 overlap).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    final_documents = text_splitter.split_documents(documents)

    print(f"‚úÖ Created {len(final_documents)} chunks.")

    # Preview the beginning of the first chunk, if there is one.
    if final_documents:
        print("\n--- FIRST CHUNK DATA ---")
        first_chunk = final_documents[0]
        print(first_chunk.page_content[:200])
else:
    # Nothing was loaded: list the directory so the user can see what is actually there.
    print("‚ùå Still 0 documents. Logic: Python is in the wrong folder.")
    print(f"Files actually present here: {os.listdir(path)}")

Checking for PDFs in: D:\Langchain\huggingface\us_census
‚úÖ Loaded 63 pages.
‚úÖ Created 316 chunks.

--- FIRST CHUNK DATA ---
Health Insurance Coverage Status and Type 
by Geography: 2021 and 2022
American Community Survey Briefs
ACSBR-015
Issued September 2023
Douglas Conway and Breauna Branch
INTRODUCTION
Demographic shift


In [18]:
# Number of chunks produced by the text splitter in the previous cell.
len(final_documents)

316

In [19]:
## Embedding Using Huggingface
# Alternative model that could be swapped in: sentence-transformers/all-MiniLM-l6-v2
embedding_model_name = "BAAI/bge-small-en-v1.5"

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={"device": "cpu"},                 # run the model on CPU
    encode_kwargs={"normalize_embeddings": True},   # request normalized vectors
)

  huggingface_embeddings=HuggingFaceBgeEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:

import  numpy as np

# Embed the first chunk once and reuse the vector for both prints; the original
# ran the embedding model twice on the same text just to show the values and the shape.
sample_embedding = np.array(huggingface_embeddings.embed_query(final_documents[0].page_content))
print(sample_embedding)
print(sample_embedding.shape)

[-0.07903485 -0.01134113 -0.02312097  0.02844461  0.05053344  0.05317826
 -0.01907787  0.03456026 -0.10211368 -0.02915702  0.0852426   0.05650727
 -0.02545439 -0.0330849  -0.00635735  0.04090864 -0.00628108  0.00356744
 -0.03854129  0.03667685 -0.04289803  0.03425252 -0.03116899 -0.03793729
  0.01728391  0.01214924  0.00653119  0.01463565 -0.05529054 -0.15320712
  0.00730845  0.03202944 -0.04701132 -0.01595974  0.0187445   0.02642936
 -0.02306378  0.08438035  0.04182485  0.05278177 -0.03057602  0.01564262
 -0.01689074  0.00529409 -0.02417436  0.00412995 -0.01889937 -0.00150625
 -0.00836945 -0.03390065  0.03515961 -0.00553131  0.04910938  0.05971856
  0.05615963 -0.05105155  0.01475136 -0.01849959 -0.03284641  0.03576624
  0.04947704 -0.00938883 -0.26202118  0.0975033   0.01715692  0.0478139
 -0.00556317 -0.00298307 -0.02207355 -0.04463669 -0.05760482  0.04815878
 -0.05522206  0.01635333  0.03299246  0.02147079  0.01296219  0.01462309
  0.02174952 -0.00202999  0.02099538  0.03353847 -0.

In [22]:
## VectorStore Creation
# Only the first 120 chunks are embedded and indexed; later chunks are not searchable.
chunks_to_index = final_documents[:120]
vectorstore = FAISS.from_documents(chunks_to_index, huggingface_embeddings)

In [23]:
## Query using Similarity Search
query = "WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_docments = vectorstore.similarity_search(query)

# Show the text of the first returned chunk.
best_match = relevant_docments[0]
print(best_match.page_content)

2 U.S. Census Bureau
WHAT IS HEALTH INSURANCE COVERAGE?
This brief presents state-level estimates of health insurance coverage 
using data from the American Community Survey (ACS). The  
U.S. Census Bureau conducts the ACS throughout the year; the 
survey asks respondents to report their coverage at the time of 
interview. The resulting measure of health insurance coverage, 
therefore, reflects an annual average of current comprehensive 
health insurance coverage status.* This uninsured rate measures a 
different concept than the measure based on the Current Population 
Survey Annual Social and Economic Supplement (CPS ASEC). 
For reporting purposes, the ACS broadly classifies health insurance 
coverage as private insurance or public insurance. The ACS defines 
private health insurance as a plan provided through an employer 
or a union, coverage purchased directly by an individual from an 
insurance company or through an exchange (such as healthcare.


In [24]:
# Expose the FAISS store as a retriever that performs a similarity search with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000187EE3A2A50> search_kwargs={'k': 3}


In [3]:
import os
from getpass import getpass

# Never hard-code the token, and never overwrite an existing one with "".
# An empty value is still an *existing* variable, so a later `load_dotenv()`
# (which does not override existing variables by default) would leave the
# token blank instead of reading it from .env.
_TOKEN_VAR = 'HUGGINGFACEHUB_API_TOKEN'

if not os.environ.get(_TOKEN_VAR):
    # Prompt without echoing; leaving it blank defers to the .env file.
    _entered = getpass(f"{_TOKEN_VAR} (leave blank to rely on .env): ").strip()
    if _entered:
        os.environ[_TOKEN_VAR] = _entered
    else:
        # Drop an empty placeholder so load_dotenv() can supply the real value.
        os.environ.pop(_TOKEN_VAR, None)

In [31]:
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.messages import HumanMessage

# Load variables from a .env file (if present) and read the Hugging Face token.
load_dotenv()
sec_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Instruct model queried through the Hugging Face endpoint.
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Endpoint configuration gathered in one place; the task is set explicitly.
endpoint_settings = dict(
    repo_id=repo_id,
    task="text-generation",
    huggingfacehub_api_token=sec_token,
    temperature=0.7,
    max_new_tokens=512,
)
llm = HuggingFaceEndpoint(**endpoint_settings)

# Chat wrapper around the endpoint so it can be invoked with message objects.
chat_model = ChatHuggingFace(llm=llm)

# Single-message test conversation.
query = [HumanMessage(content="What is health insurance coverage?")]

print("üöÄ Sending query to Hugging Face...")

try:
    response = chat_model.invoke(query)
    print("\n--- LLM RESPONSE ---")
    print(response.content)
except Exception as e:
    # Report the failure instead of stopping the notebook.
    print(f"‚ùå API Error: {e}")

üöÄ Sending query to Hugging Face...

--- LLM RESPONSE ---
 Health insurance is a type of insurance coverage that helps pay for medical and surgical expenses, as well as some related costs, such as deductibles, coinsurance, and copayments. It's designed to provide financial protection against the high cost of healthcare services. Health insurance can be provided through an employer, purchased individually, or obtained through government programs like Medicare or Medicaid. The specific coverage and benefits of health insurance plans can vary widely, so it's important to carefully review the details of any plan before enrolling. In general, health insurance can help protect individuals and families from financial hardship in the event of unexpected medical expenses.


In [None]:
# Hugging Face models can be run locally through the HuggingFacePipeline class.
# Import it from `langchain_huggingface` (already used and installed by this notebook):
# the `langchain_community.llms.huggingface_pipeline` copy is deprecated in favour of it.
from langchain_huggingface import HuggingFacePipeline

# Downloads the model weights on first use and builds a local text-generation pipeline.
hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300}
)

# Rebind `llm` to the local pipeline and run it on the current `query`.
llm = hf
llm.invoke(query)

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

In [5]:
# PromptTemplate comes from the `langchain_core` package.
from langchain_core.prompts import PromptTemplate

# Template text with two placeholders: the retrieved context and the user's question.
prompt_template = (
    "\n"
    "Use the following pieces of context to answer the question at the end.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer:"
)

# Build the prompt object from the template text.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

print("‚úÖ Success: PromptTemplate is defined using the modern Core path!")

‚úÖ Success: PromptTemplate is defined using the modern Core path!


In [11]:

# Build the prompt object from `prompt_template` with its two placeholders.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [29]:
!pip install langchain-huggingface




In [31]:
from langchain_classic.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

# Precondition: this cell needs `hf` (the local pipeline LLM) and `retriever`
# (the FAISS retriever) created by earlier cells. They disappear after a kernel
# restart, so fail up front with a message that says what to run.
missing_names = [name for name in ("hf", "retriever") if name not in globals()]
if missing_names:
    raise NameError(
        f"{', '.join(missing_names)} not defined: run the earlier cells that build the "
        "vector store retriever and the HuggingFacePipeline LLM before creating the chain."
    )

# 1. Define Template
template = """
Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know.

Context: {context}
Question: {question}

Helpful Answer:
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

# 2. Initialize the Chain: retrieved chunks are placed into the prompt ("stuff" chain type)
#    and the source documents are returned alongside the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

print("‚úÖ Success: RetrievalQA chain is ready!")

# 3. Run Query
query = "What is the main topic of these documents?"
response = retrievalQA.invoke({"query": query})

print("\n--- FINAL ANSWER ---")
print(response["result"])

NameError: name 'retriever' is not defined

In [None]:
# Multi-line query text; the line breaks are part of the string.
query = "\n".join(["DIFFERENCES IN THE", "UNINSURED RATE BY STATE", "IN 2022"])

In [None]:

# Run the RetrievalQA chain on the current `query` and print the answer text.
qa_inputs = {"query": query}
result = retrievalQA.invoke(qa_inputs)
print(result["result"])