In [10]:
import os
from dotenv import load_dotenv

import numpy as np

from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [11]:
# Load every PDF in the source directory and split the pages into
# overlapping text chunks for embedding.
# NOTE(review): the directory is spelled "us_cencus" -- confirm this matches
# the folder name on disk.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

loader = PyPDFDirectoryLoader("./us_cencus")
docs = loader.load()

text_spliter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
final_docs = text_spliter.split_documents(docs)

In [12]:
# Pull variables from a local .env file (if present) into the environment.
load_dotenv()

# Fail early with an actionable message: assigning None into os.environ
# would otherwise raise an opaque "str expected, not NoneType" TypeError.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token is None:
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; define it in .env or the environment."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

# Keep this as the cell's last expression so the chunk count is displayed.
len(final_docs)

In [13]:
# Embedding model (Hugging Face) used when building the FAISS vector store.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
huggingface_embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [14]:
# Index only the first MAX_INDEXED_CHUNKS chunks in the FAISS vector store.
MAX_INDEXED_CHUNKS = 120
indexed_chunks = final_docs[:MAX_INDEXED_CHUNKS]
vectorstore = FAISS.from_documents(indexed_chunks, huggingface_embeddings)

In [15]:
# Sample question reused by the retrieval and chat cells.
query = "what is health insurance coverage"

# Direct similarity lookup on the vector store, using the library defaults.
relevant_docs = vectorstore.similarity_search(query)

# Retriever view over the same store, limited to the 3 closest matches.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002AD254D4CD0>, search_kwargs={'k': 3})

In [None]:
# Hosted text-generation endpoint for the TinyLlama chat model; responses
# are capped at 256 newly generated tokens.
llm = HuggingFaceEndpoint(
    repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_new_tokens=256,
)

# Chat wrapper around the endpoint.
chat = ChatHuggingFace(llm=llm, verbose=True)

# The question is the user's turn, so it must be sent with the "user" role.
# Sending it as a "system" message leaves the conversation with no user
# message for the model to answer.
chat.invoke([
    {
        "role": "user",
        "content": query,
    }
])