In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA



In [2]:
## Read the ppdfs from the folder
loader=PyPDFDirectoryLoader("./pdf_data")

documents=loader.load()

text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)

final_documents=text_splitter.split_documents(documents)
final_documents[0]

Document(page_content='SQL Interview\xa0Questions and Answers\xa0\xa0\xa0\xa0\xa0\xa0\xa0A collection of 60+ SQL interview questions and\xa0answers to help with your next SQL interview\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Ben Brumm\xa0www.databasestar.com', metadata={'source': 'pdf_data/SQL_interview_Question.pdf', 'page': 0})

In [3]:
len(final_documents)

96

In [4]:
## Embedding Using Huggingface
huggingface_embeddings=HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",      #sentence-transformers/all-MiniLM-l6-v2
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings':True}

)

  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [5]:
import  numpy as np
print(np.array(huggingface_embeddings.embed_query(final_documents[0].page_content)))
print(np.array(huggingface_embeddings.embed_query(final_documents[0].page_content)).shape)

[-7.19598383e-02 -2.62970198e-02 -4.66158316e-02 -2.21723900e-03
 -3.10089975e-03  1.59724616e-02 -3.67967710e-02 -4.18105610e-02
 -2.06521451e-02 -8.73553455e-02 -2.95485612e-02  3.24115679e-02
  2.22944524e-02 -2.29062606e-02 -5.84969204e-03 -1.00983139e-02
  6.54184669e-02 -6.14145771e-03 -3.40984985e-02 -1.08711896e-02
 -6.86124638e-02 -7.39559112e-03 -1.44385323e-02 -1.87891573e-02
  1.89230833e-02  1.08360201e-02  5.37839085e-02 -7.28066340e-02
 -6.69434816e-02 -1.68933377e-01  2.69605424e-02 -1.99552160e-02
  4.60217409e-02 -1.99544001e-02 -2.04936764e-03 -8.07916466e-03
  3.13335247e-02  5.03138453e-02 -6.76253019e-03 -1.64716747e-02
  2.51801200e-02 -1.56202475e-02 -3.06483526e-02  5.82259102e-03
  2.22472847e-02  1.08215781e-02  2.34164186e-02 -4.95338961e-02
  3.33432890e-02  1.46443471e-02 -7.93523639e-02 -5.30545004e-02
 -4.93064374e-02  6.66703423e-03 -8.27798992e-02  2.56809872e-05
  1.97104663e-02  6.63163662e-02 -4.57085855e-02 -1.01437047e-02
  2.88708154e-02 -6.91193

In [6]:
## VectorStore Creation
vectorstore=FAISS.from_documents(final_documents[:120],huggingface_embeddings)

In [7]:
## Query using Similarity Search
query="WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_docments=vectorstore.similarity_search(query)

print(relevant_docments[0].page_content)

©Topperworld                                                                                                                                                                  Topperworld.in  
                        
Q 1.  What is SQL?   
Ans:   SQL stands for Structured Query Language .  
• It is a language used to interact with the database, i.e to create a 
database, to create a table in the database, to retrieve data or update a 
table in the database, etc.  
• SQL is an ANSI( American National Standards Institute ) standard. Using 
SQL, we can do many things.  
For example  – we can execute queries, we can insert records into a table, can 
update records, can create a database, can create a table, can delete a table, 
etc.          
Q 2. What is a database?   
Ans:  A Database is defined as a structured form of data storage in a computer 
or a collection of data in an organized manner and can be accessed in various 
ways.


In [8]:
retriever=vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":3})
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7fb81197b940> search_kwargs={'k': 3}


In [9]:
import os
os.environ['HUGGINGFACEHUB_API_TOKEN']="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"

The Hugging Face Hub is an platform with over 350k models, 75k datasets, and 150k demo apps (Spaces), all open source and publicly available, in an online platform where people can easily collaborate and build ML together.

In [10]:
from langchain_community.llms import HuggingFaceHub

hf=HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature":0.1,"max_length":500}

)
query="What is SQL?"
hf.invoke(query)

  warn_deprecated(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'What is SQL?\n\nSQL is a programming language that is used to communicate with databases. It is a standard language that is used by many different types of databases. SQL is used to create, read, update, and delete data in a database.\n\nWhat is a database?\n\nA database is a collection of data that is organized in a way that makes it easy to find and use. Databases are used to store data for many different purposes, such as keeping track of customer information,'

In [11]:
#Hugging Face models can be run locally through the HuggingFacePipeline class.
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300}
)

llm = hf 
llm.invoke(query)

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-v0.1.
401 Client Error. (Request ID: Root=1-667f0e6d-6a59499a79caf76f6352a5b7;1cf2a6a5-e6d8-4a76-b879-4adade580104)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted. You must be authenticated to access it.

In [12]:
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [13]:
prompt=PromptTemplate(template=prompt_template,input_variables=["context","question"])

In [14]:
retrievalQA=RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt":prompt}
)

In [15]:
query="""DIFFERENCES IN THE
UNINSURED RATE BY STATE
IN 2022"""

In [20]:
# Call the QA chain with our query.
result = retrievalQA.invoke({"query": query})
print(result['result'])


Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

©Topperworld                                                                                                                                                                  Topperworld.in  
                        
Q 1.  What is SQL?   
Ans:   SQL stands for Structured Query Language .  
• It is a language used to interact with the database, i.e to create a 
database, to create a table in the database, to retrieve data or update a 
table in the database, etc.  
• SQL is an ANSI( American National Standards Institute ) standard. Using 
SQL, we can do many things.  
For example  – we can execute queries, we can insert records into a table, can 
update records, can create a database, can create a table, can delete a table, 
etc.          
Q 2. What is a database?   
Ans:  A Database is defined as a structured form of data storage in a computer 
or a collection of 