In [1]:
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA

In [2]:
# Read the PDFs from the ./pdf folder.
loader = PyPDFDirectoryLoader('./pdf')
documents = loader.load()

# Split the loaded documents with a chunk size of 1000 and an overlap of 200
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)

# Display the first chunk as a quick sanity check.
final_documents[0]

Document(metadata={'source': 'pdf/acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American \nRescue Plan (ARP) enhanced Marketplace premium subsidies for those with incomes above 400 percent of the poverty level as well as for unemployed people.\n3')

In [3]:
# Number of chunks produced by the splitter.
len(final_documents)

316

In [4]:
# Build the BGE embedding model through Hugging Face: inference runs on the
# CPU and the encoder is asked to normalize the embeddings it returns.
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name='BAAI/bge-small-en-v1.5',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

# Embed the text of the first chunk and display the resulting vector.
huggingface_embeddings.embed_query(final_documents[0].page_content)

  from tqdm.autonotebook import tqdm, trange


[-0.08465683460235596,
 -0.011909885331988335,
 -0.03378922492265701,
 0.0294560007750988,
 0.0519159771502018,
 0.05738397687673569,
 -0.041001737117767334,
 0.02742674946784973,
 -0.10512824356555939,
 -0.01580560952425003,
 0.07948586344718933,
 0.056431885808706284,
 -0.013176502659916878,
 -0.034154441207647324,
 0.005816021002829075,
 0.0472547821700573,
 -0.013074672780930996,
 0.0031299644615501165,
 -0.03442257270216942,
 0.030840644612908363,
 -0.04090864583849907,
 0.03527376428246498,
 -0.02437613159418106,
 -0.04358314722776413,
 0.024150291457772255,
 0.013198663480579853,
 -0.00484446668997407,
 0.019234759733080864,
 -0.05439131706953049,
 -0.14273500442504883,
 0.0051552592776715755,
 0.029311588034033775,
 -0.056081127375364304,
 -0.008535345084965229,
 0.031414128839969635,
 0.02767363376915455,
 -0.02061879076063633,
 0.08242309093475342,
 0.04154250770807266,
 0.057965561747550964,
 -0.03715869411826134,
 0.006261622533202171,
 -0.024138960987329483,
 -0.0056179501

In [5]:
# Build the FAISS index from the first 120 chunks only (the notebook reports
# 316 chunks in total), so searches cannot return text from the remaining ones.
# NOTE(review): presumably truncated to keep embedding time short — confirm, or
# index all of final_documents.
vectorstore = FAISS.from_documents(final_documents[:120], huggingface_embeddings)

In [6]:
# Run a similarity search against the index for the question and show the
# text of the first result.
relevent_doc = vectorstore.similarity_search("health insurance coverage?")
relevent_doc[0].page_content

'2 U.S. Census Bureau\nWHAT IS HEALTH INSURANCE COVERAGE?\nThis brief presents state-level estimates of health insurance coverage \nusing data from the American Community Survey (ACS). The  \nU.S. Census Bureau conducts the ACS throughout the year; the \nsurvey asks respondents to report their coverage at the time of \ninterview. The resulting measure of health insurance coverage, \ntherefore, reflects an annual average of current comprehensive \nhealth insurance coverage status.* This uninsured rate measures a \ndifferent concept than the measure based on the Current Population \nSurvey Annual Social and Economic Supplement (CPS ASEC). \nFor reporting purposes, the ACS broadly classifies health insurance \ncoverage as private insurance or public insurance. The ACS defines \nprivate health insurance as a plan provided through an employer \nor a union, coverage purchased directly by an individual from an \ninsurance company or through an exchange (such as healthcare.'

In [7]:
# Expose the index as a retriever configured for similarity search with k=3.
retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={'k':3})

In [1]:
from dotenv import load_dotenv
# Load environment variables from a .env file.
# NOTE(review): the captured output warns that the statements starting at
# lines 1 and 10 of that file could not be parsed — check the file's syntax.
load_dotenv()

Python-dotenv could not parse statement starting at line 1
Python-dotenv could not parse statement starting at line 10


True

In [9]:
from langchain_community.llms import HuggingFaceHub

# Hosted Mistral-7B accessed through the Hugging Face Hub.
# NOTE(review): presumably authenticates with a token taken from the
# environment populated by load_dotenv() — confirm.
hf = HuggingFaceHub(
    repo_id='mistralai/Mistral-7B-v0.1',
    # The length key must be spelled 'max_length'; a misspelled key is not a
    # recognised generation parameter, so the intended limit would not apply.
    model_kwargs={'temperature': 0.1, 'max_length': 500},
)

  hf = HuggingFaceHub(


In [10]:
# Send the question straight to the hosted model; only the raw question string
# is passed, with no retrieved context.
hf.invoke('what is health insurance coverage?')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


"what is health insurance coverage?\n\nHealth insurance is a type of insurance coverage that covers the cost of an individual's medical and surgical expenses. Health insurance policies reimburse policyholders for medical and surgical expenses incurred due to sickness or injury. Health insurance can reimburse policyholders for costs incurred from illness, injury, maternity, and prenatal care, rehabilitation, respite care, home health care, medical equipment, and prescription drugs.\n\n"

In [2]:
# To run a Hugging Face model locally, use HuggingFacePipeline.
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline


In [None]:
# Load Mistral-7B into a local text-generation pipeline.
# NOTE: this rebinds `hf`, replacing the HuggingFaceHub client created earlier
# in the notebook.
hf = HuggingFacePipeline.from_model_id(
    model_id='mistralai/Mistral-7B-v0.1',
    task='text-generation',
    # Deterministic (greedy) decoding is requested with do_sample=False:
    # temperature is only consulted when sampling, and 0 is not a valid
    # sampling temperature.
    pipeline_kwargs={'do_sample': False, 'max_new_tokens': 300},
)


Loading checkpoint shards:   0%|                                                                                                                                            | 0/2 [00:00<?, ?it/s]

In [None]:

# Alias the current model under the generic name `llm` and send it a test
# prompt.
llm = hf
llm.invoke('what is machine learning?')