### Step 1: Installing Libraries

In [1]:
# Install Pytorch & other libraries
! pip install "torch==2.1.2" tensorboard

# Install Hugging Face libraries
! pip install  --upgrade \
  "transformers==4.38.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  "trl==0.7.11" \
  "peft==0.8.2" \
    "langchain" \
"sentence-transformers" \
"faiss-cpu"
! pip install unstructured
! pip install pdfminer
! pip install pdfminer.six
! pip install -U langchain-community==0.2.4

[31mERROR: Could not find a version that satisfies the requirement torch==2.1.2 (from versions: 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.1.2[0m[31m
[0mCollecting transformers==4.38.2
  Using cached transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
Collecting datasets==2.16.1
  Using cached datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==0.26.1
  Using cached accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes==0.42.0
  Using cached bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting trl==0.7.11
  Using cached trl-0.7.11-py3-none-any.whl.metadata (10 kB)
Collecting peft==0.8.2
  Using cached peft-0.8.2-py3-none-any.whl.metadata (25 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.2)
  Using cached tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.38.2-py3-none-any.whl 

### Step 2: Loading the Model Pipeline

In [4]:
import torch
from IPython.display import display_markdown
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
import transformers
import time
from langchain.document_loaders import UnstructuredPDFLoader,PDFMinerLoader,TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
 

import os  # imported here so this cell works on its own

# Hugging Face model id
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Access token for the gated Llama 3 repository (request access on Hugging Face first).
# SECURITY: a real token used to be hardcoded on this line; it must be treated as
# leaked and revoked. The token is now read from the HF_TOKEN environment variable,
# with an interactive prompt as a fallback so it never ends up in the notebook.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    from getpass import getpass
    hf_token = getpass("Hugging Face access token: ")

# Device the model is placed on; override with LLM_DEVICE_MAP (default keeps GPU 2).
device_map = os.environ.get("LLM_DEVICE_MAP", "cuda:2")

# NOTE: this name shadows the `pipeline` function imported from transformers above;
# it is kept because later cells refer to the pipeline object by this name.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    token=hf_token,
    model_kwargs={
        "torch_dtype": torch.float16,
        "quantization_config": {"load_in_4bit": True},
        "low_cpu_mem_usage": True,
    },
    device_map=device_map,
)

# Token ids that end generation: the tokenizer's EOS token and Llama 3's end-of-turn marker.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]


Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
from langchain_community.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
import faiss
# Embedding model used to vectorise queries for the semantic cache.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Size an empty flat L2 FAISS index from the embedding model's output dimension.
dimension = embeddings.client.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)

# In-memory document storage paired with the index.
docstore = InMemoryDocstore()

### Semantic cache: starts empty, with no index-to-docstore id mappings yet.
vector_store = FAISS(
    index=index,
    docstore=docstore,
    index_to_docstore_id={},
    embedding_function=embeddings,
)



In [9]:
import os  # imported here so this cell works on its own

### PDF file path for RAG.
# Override with the RAG_PDF_PATH environment variable; the default is the
# location used by the original author and will not exist on other machines.
pdf_file_path = os.environ.get(
    "RAG_PDF_PATH",
    "/home/praveent/Gemma2-9B-Llama3-8B-Finetune-and-RAG-Project/DeepLearningBook.pdf",
)

In [12]:
### Loads a PDF, splits it into chunks, indexes them, and returns the chunks relevant to a query
class Langchain_RAG:
    """PDF-backed retriever built on PDFMinerLoader, a recursive splitter and FAISS.

    Args:
        pdf_file_path: Path of the PDF to index.
        embeddings: Optional embedding model to reuse. When omitted, a new
            ``HuggingFaceEmbeddings("BAAI/bge-small-en-v1.5")`` is created.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        k: Number of chunks returned per query.

    Calling the instance with a query string returns the page contents of the
    ``k`` retrieved chunks joined by blank lines.
    """

    def __init__(self, pdf_file_path, embeddings=None, chunk_size=1000, chunk_overlap=0, k=4):
        # Reuse a caller-supplied embedding model instead of loading a second copy.
        if embeddings is None:
            embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
        self.embeddings = embeddings
        self.pdf_file_path = pdf_file_path

        print("Loading PDF file, this may take time to process...")
        self.loader = PDFMinerLoader(self.pdf_file_path)
        self.data = self.loader.load()
        print("PDF file loaded.")

        print("Chunking...")
        # NOTE(review): the splitter appears to try separators in list order, so with
        # " " first the "," and "\n" entries are probably rarely used. Consider a
        # coarse-to-fine order such as ["\n\n", "\n", " ", ""] - confirm before changing,
        # since it alters the chunks that get indexed.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" ", ",", "\n"],
        )
        self.texts = text_splitter.split_documents(self.data)
        print("Chunking completed.")

        self.get_vec_value = FAISS.from_documents(self.texts, self.embeddings)
        print("Vector values saved.")
        self.retriever = self.get_vec_value.as_retriever(search_kwargs={"k": k})

    def __call__(self, query):
        """Return the text of the chunks retrieved for ``query`` as one string."""
        # Prefer `invoke`; fall back to the deprecated `get_relevant_documents`
        # for retrievers that do not provide it.
        fetch = getattr(self.retriever, "invoke", None)
        if fetch is None:
            fetch = self.retriever.get_relevant_documents
        relevant_docs = fetch(query)
        # Separate chunks with a blank line so the last word of one chunk is not
        # fused with the first word of the next.
        return "\n\n".join(doc.page_content for doc in relevant_docs)


### Step 3: Implementing the Similarity Search Algorithm

In [13]:
import time

# This class generates responses from an LLM, with a semantic cache in front of it
class Llama3_8B_gen:
    """Text generator that reuses earlier answers for sufficiently similar queries.

    Args:
        pipeline: Text-generation pipeline; must expose ``.tokenizer`` and be
            callable with a prompt string.
        embeddings: Embedding model providing ``embed_documents``.
        vector_store: Vector store used as the cache; needs
            ``similarity_search_with_score_by_vector`` and ``add_texts``.
        threshold: A cached answer is reused when the score of the nearest
            cached query is strictly below this value.
    """

    def __init__(self, pipeline, embeddings, vector_store, threshold):
        self.pipeline = pipeline
        self.embeddings = embeddings
        self.vector_store = vector_store
        self.threshold = threshold
        # Stop tokens come from the injected pipeline's tokenizer, so the class
        # does not depend on a notebook-level `terminators` variable.
        tokenizer = pipeline.tokenizer
        self.terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]

    @staticmethod
    def generate_prompt(query, retrieved_text, tokenizer=None):
        """Build the chat-formatted prompt for ``query`` with ``retrieved_text`` as context.

        ``tokenizer`` defaults to the notebook-level ``pipeline.tokenizer`` so that
        existing two-argument calls keep working; ``generate`` passes the tokenizer
        of the pipeline the instance was built with.
        """
        if tokenizer is None:
            tokenizer = pipeline.tokenizer  # legacy fallback to the notebook global
        messages = [
            {"role": "system", "content": "Answer the Question for the Given below context and information and not prior knowledge, only give the output result \n\ncontext:\n\n{}".format(retrieved_text)},
            {"role": "user", "content": query},
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    def semantic_cache(self, query, prompt):
        """Return a cached response for ``query`` if one is close enough, else call the LLM.

        On a miss, the freshly generated response is stored in the vector store under
        the query text so later similar queries can be served from the cache.
        """
        # embed_documents is used (not embed_query) so the lookup vector is produced
        # the same way as the vectors stored by add_texts below.
        query_embedding = self.embeddings.embed_documents([query])
        similar_docs = self.vector_store.similarity_search_with_score_by_vector(query_embedding[0], k=1)

        # Cache hit: the nearest stored query scores below the threshold.
        if similar_docs and similar_docs[0][1] < self.threshold:
            self.print_bold_underline("---->> From Cache")
            return similar_docs[0][0].metadata['response']

        # Cache miss: generate with the LLM and remember the answer.
        self.print_bold_underline("---->> From LLM")
        output = self.pipeline(
            prompt,
            max_new_tokens=512,
            eos_token_id=self.terminators,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        # The pipeline output starts with the prompt; keep only the newly generated part.
        response = output[0]["generated_text"][len(prompt):]
        self.vector_store.add_texts(texts=[query], metadatas=[{'response': response}])
        return response

    def generate(self, query, retrieved_context):
        """Answer ``query`` using ``retrieved_context`` and print the elapsed time."""
        start_time = time.perf_counter()  # monotonic clock, suited to measuring durations

        prompt = self.generate_prompt(query, retrieved_context, tokenizer=self.pipeline.tokenizer)
        res = self.semantic_cache(query, prompt)

        execution_time = time.perf_counter() - start_time
        self.print_bold_underline(f"LLM generated in {execution_time:.6f} seconds")

        return res

    @staticmethod
    def print_bold_underline(text):
        """Print ``text`` wrapped in ANSI escape codes for bold and underline."""
        print(f"\033[1m\033[4m{text}\033[0m")


In [14]:
# Score cut-off handed to the generator's semantic cache.
CACHE_DISTANCE_THRESHOLD = 0.1

# LLM generator with the semantic cache in front of it.
text_gen = Llama3_8B_gen(
    pipeline=pipeline,
    embeddings=embeddings,
    vector_store=vector_store,
    threshold=CACHE_DISTANCE_THRESHOLD,
)

# Retriever over the PDF; constructing it loads, chunks and indexes the file.
retriever = Langchain_RAG(pdf_file_path=pdf_file_path)

Loading PDF file, this may take time to process...
PDF file loaded.
Chunking...
Chunking completed.
Vector values saved.


### Step 4: Question Answering as a Chatbot

In [15]:
def Rag_qa(query):
    """Answer `query`: fetch context with `retriever`, then generate via `text_gen`."""
    context = retriever(query)
    return text_gen.generate(query, context)

In [16]:
# First time this question is asked: on a fresh cache it is answered by the LLM.
Rag_qa("What is Deep learning ?")

  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.




[1m[4m---->> From LLM[0m
[1m[4mLLM generated in 5.203011 seconds[0m


'According to the provided context, Deep learning is an approach to machine learning that has drawn heavily on our knowledge of the human brain, statistics, and applied math. It is a type of machine learning that achieves great power and flexibility by learning to represent the world as a nested hierarchy of concepts, with each concept defined in relation to simpler concepts, and more abstract representations computed in terms of less abstract ones.'

In [17]:
# Same question again: expected to be served from the semantic cache.
Rag_qa("What is Deep learning ?")

[1m[4m---->> From Cache[0m
[1m[4mLLM generated in 0.009573 seconds[0m


'According to the provided context, Deep learning is an approach to machine learning that has drawn heavily on our knowledge of the human brain, statistics, and applied math. It is a type of machine learning that achieves great power and flexibility by learning to represent the world as a nested hierarchy of concepts, with each concept defined in relation to simpler concepts, and more abstract representations computed in terms of less abstract ones.'

In [18]:
# A new question: nothing similar is cached yet, so the LLM generates the answer.
Rag_qa("Explain back propagation algorithm.")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[1m[4m---->> From LLM[0m
[1m[4mLLM generated in 15.481651 seconds[0m


"The back-propagation algorithm is a method for computing the gradient of the loss function with respect to the model's parameters. It's a key component of the training process for neural networks.\n\nHere's a step-by-step explanation:\n\n1. **Forward Propagation**: The algorithm starts by computing the output of the neural network for a given input. This is called forward propagation.\n2. **Error Calculation**: The algorithm calculates the error between the predicted output and the actual output.\n3. **Backward Propagation**: The algorithm then computes the gradient of the loss function with respect to the model's parameters. This is called backward propagation.\n4. **Gradient Computation**: The algorithm computes the gradient of the loss function with respect to the model's parameters using the chain rule. This involves computing the gradient of the output with respect to the input, and then propagating the error backwards through the network.\n5. **Gradient Update**: The algorithm u

In [19]:
# Reworded version of the previous question: in the recorded run it was not close
# enough to the cached query to count as a hit, so the LLM answered again.
Rag_qa("back propagation algorithm.")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[1m[4m---->> From LLM[0m
[1m[4mLLM generated in 2.130513 seconds[0m


'The back-propagation algorithm applied to the unrolled graph with O(τ) cost is called back-propagation computation.'

In [20]:
# Exact repeat of the previous query: expected to be served from the semantic cache.
Rag_qa("back propagation algorithm.")

[1m[4m---->> From Cache[0m
[1m[4mLLM generated in 0.009071 seconds[0m


'The back-propagation algorithm applied to the unrolled graph with O(τ) cost is called back-propagation computation.'

### Explanation
When an answer is generated directly by the Large Language Model (LLM), the request can take many seconds (roughly 2–15 seconds in the runs above, and longer for lengthy answers). Each generated answer is therefore stored in a semantic cache, keyed by the embedding of the query. When the same or a sufficiently similar query arrives again, the stored answer is returned without regenerating it, which took about 0.01 seconds in the runs above. Serving repeated requests from the cache minimizes wait times and improves the user experience.