<a href="https://colab.research.google.com/github/sravan1320/RAG/blob/main/LLAMA2_RAG_FIASSDB_Chat_With_PDF_HuggingFace_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# !pip install transformers langchain torch constants chromadb accelerate sentence-transformers pypdf pdfminer.six faiss-gpu

In [5]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

#Load LLM Model

In [19]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings, HuggingFaceEmbeddings
import torch
import os
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import faiss

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

persist_directory = "db"

# Loading documents
documents = []
for root, dirs, files in os.walk("docs"):
    for file in files:
        if file.endswith(".pdf"):
            print(file)
            loader = PyPDFLoader(os.path.join(root, file))
            documents.extend(loader.load())

# Splitting documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

# Initializing tokenizer and model
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

class CustomEmbedding:
    def __init__(self):
        self.tokenizer = tokenizer
        self.model = model.to(device)

    def custom_embedding(self, text):
        inputs = self.tokenizer(text.page_content, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=input_ids)
        hidden_states = outputs.last_hidden_state

        embeddings = hidden_states[:, 0, :].cpu().numpy().tolist()

        return embeddings

    def embed_documents(self, documents):
        embeddings_list = [self.custom_embedding(doc) for doc in documents]
        return embeddings_list

# Create an instance of the CustomEmbedding class
embeddings = CustomEmbedding()

# Embed documents using the custom embedding method
embeddings_list = embeddings.embed_documents(texts)

# Flatten the embeddings and convert to numpy array
flattened_embeddings = np.array([np.array(emb).flatten() for emb in embeddings_list], dtype='float32')

# Number of dimensions in your embeddings
# dimension = flattened_embeddings.shape[1]

print(f"Shape of flattened embeddings: {flattened_embeddings.shape}")
# print(f"Dimension of embeddings: {dimension}")

# Build a Faiss flat index
index = faiss.IndexFlatL2(flattened_embeddings.shape[1])

# Add your embeddings to the index
index.add(flattened_embeddings)

# Save the index
faiss.write_index(index, 'faiss_flat_index1.index')

# Load the index
loaded_index_1 = faiss.read_index('faiss_flat_index1.index')

2018-Annual-Report.pdf


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Shape of flattened embeddings: (673, 768)


In [27]:
import faiss

# Define a query
query_text = "Provide glimse on QUARTERLY RESULTS?"
print(query_text)
# Example query string construction
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

query_string = Document(
    page_content=query_text,
    metadata={"source": "your_source", "page": 1}  # Adjust metadata accordingly
)

# Embed the query using the same CustomEmbedding class
query_embedding = embeddings.custom_embedding(query_string)
query_vector = np.array(query_embedding).flatten().astype('float32').reshape(1, -1)

# Perform similarity search using Faiss
k = 5  # Adjust 'k' as needed
distances, indices = loaded_index_1.search(query_vector, k)

# Display the similar sentences
valid_indices = [index for index in indices.flatten() if 0 <= index < len(documents)]
similar_sentences = [documents[index].page_content for index in valid_indices]
print(similar_sentences)
for score, sentence in zip(distances.flatten(), similar_sentences):
    print(f"Score: {score}, Sentence: {sentence}")


Provide glimse on QUARTERLY RESULTS?
['2018\nANNUAL REPORT']
Score: 7.50020694732666, Sentence: 2018
ANNUAL REPORT


In [8]:
# class CustomEmbedding:
#     def __init__(self):
#       self.tokenizer=tokenizer
#       self.model=model.to(device)
#     def custom_embedding(self, text):
#         inputs = self.tokenizer(text, return_tensors="pt")
#         input_ids = inputs.input_ids.to(device)  # Move input IDs to GPU if available
#         attention_mask = inputs.attention_mask.to(device)  # Move attention mask to GPU if available

#         # print("Input IDs:", input_ids)

#         # Specify decoder_input_ids
#         with torch.no_grad():
#             outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=input_ids)
#         hidden_states = outputs.last_hidden_state

#         # Extract the embedding from the desired layer
#         embedding = hidden_states[0][0].cpu().numpy().tolist()  # Move back to CPU for further processing
#         return embedding

#     def embed_documents(self, texts):
#       return [self.custom_embedding(text) for text in texts]

# embeddings = CustomEmbedding()  # Create an instance of the class


Use Chroma DB with custom embedding function

Chroma support 768 is max dimension so we can't use MBZUAI/LaMini-T5-738M (dimension is 1024), we should use other Vector store here

In [9]:
# # create vector store here
# print(f"Creating embeddings. May take some minutes...")
# db = Chroma.from_documents(texts, embeddings, persist_directory='db')

# db.persist()
# db = None

# print(f"Ingestion complete! You can now run privateGPT.py to query your documents")

FIASS with LLAMA Embedding

In [13]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings, HuggingFaceEmbeddings
import torch
import os
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import faiss

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

persist_directory = "db"

# Loading documents
documents = []
for root, dirs, files in os.walk("docs"):
    for file in files:
        if file.endswith(".pdf"):
            print(file)
            loader = PyPDFLoader(os.path.join(root, file))
            documents.extend(loader.load())

# Splitting documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

# Initializing tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('MBZUAI/LaMini-T5-738M')
model_name = "MBZUAI/LaMini-T5-738M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

class CustomEmbedding:
    def __init__(self):
        self.tokenizer = tokenizer
        self.model = model.to(device)

    def custom_embedding(self, text):
        inputs = self.tokenizer(text.page_content, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=input_ids)
        hidden_states = outputs.last_hidden_state

        embeddings = hidden_states[:, 0, :].cpu().numpy().tolist()

        return embeddings

    def embed_documents(self, documents):
        embeddings_list = [self.custom_embedding(doc) for doc in documents]
        return embeddings_list

# Create an instance of the CustomEmbedding class
embeddings = CustomEmbedding()

# Embed documents using the custom embedding method
embeddings_list = embeddings.embed_documents(texts)

# Flatten the embeddings and convert to numpy array
flattened_embeddings = np.array([np.array(emb).flatten() for emb in embeddings_list], dtype='float32')

# Number of dimensions in your embeddings
# dimension = flattened_embeddings.shape[1]

print(f"Shape of flattened embeddings: {flattened_embeddings.shape}")
# print(f"Dimension of embeddings: {dimension}")

# Build a Faiss flat index
index = faiss.IndexFlatL2(flattened_embeddings.shape[1])

# Add your embeddings to the index
index.add(flattened_embeddings)

# Save the index
faiss.write_index(index, 'faiss_flat_index.index')

# Load the index
loaded_index = faiss.read_index('faiss_flat_index.index')

2018-Annual-Report.pdf
Shape of flattened embeddings: (673, 1024)


In [18]:
import faiss

# Define a query
query_text = "what are seasonality?"

# Example query string construction
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

query_string = Document(
    page_content=query_text,
    metadata={"source": "your_source", "page": 1}  # Adjust metadata accordingly
)

# Embed the query using the same CustomEmbedding class
query_embedding = embeddings.custom_embedding(query_string)
query_vector = np.array(query_embedding).flatten().astype('float32').reshape(1, -1)

# Perform similarity search using Faiss
k = 5  # Adjust 'k' as needed
distances, indices = loaded_index.search(query_vector, k)

# Display the similar sentences
valid_indices = [index for index in indices.flatten() if 0 <= index < len(documents)]
similar_sentences = [documents[index].page_content for index in valid_indices]
for score, sentence in zip(distances.flatten(), similar_sentences):
    print(f"Score: {score}, Sentence: {sentence}")


Score: 16.202184677124023, Sentence: InApril 2018, weestablished acommercial paper program (the “Commercia lPaper Program”) under which wemay from
time totime issue unsecured commercia lpaper uptoatotal of$7.0 billion atanytime, with individual maturitie sthatmay vary
butwillnotexceed 397days from thedateofissue. There were noborrowings outstanding under theCommercial Paper
Program asofDecember 31,2018.
Wehadnoborrowings outstanding under ourunsecured revolving credit facility (the“Credit Agreement”) and$594
million ofborrowings outstanding under our$620 million secured revolving credit facility (the“Credit Facility”) asof
December 31,2018. SeeItem 8ofPartII,“Financial Statements andSupplementary Data —Note 5—Debt” foradditional
information.
In2016, 2017, and2018, werecorded nettaxprovisions of$1.4billion, $769 million, and$1.2billion. Certain foreign
subsidiary earnings aresubject toU.S. taxation under theU.S.TaxAct,which alsorepeals U.S. taxation onthesubsequent
repatriation ofthose 

In [None]:
query = "what is failure needs to scale too?"
process_answer(query)

'Failure needs to scale too as a company grows, including the size of failed experiments.'