## Retrieval-Augmented Generation (RAG) System Workflow

The process involves several key steps:

1. **Document Upload**: The user uploads a document (PDF or DOCX).
   
2. **Document Processing**: 
   - If a PDF is uploaded, PyPDF2 extracts the text.
   - If a DOCX is uploaded, python-docx extracts the text.

3. **Text Splitting**: The extracted text is divided into chunks of 500 whitespace-separated words (used here as an approximation of tokens), with a 50-word overlap between consecutive chunks.

4. **Embedding**: A Sentence Transformer (e.g., 'all-MiniLM-L6-v2') encodes each chunk into vector embeddings.

5. **FAISS Indexing**: The document embeddings are stored in a FAISS index for efficient retrieval.

6. **Query Input**: The user inputs a query.

7. **Query Embedding**: The query is embedded using the same Sentence Transformer model.

8. **Similarity Search**: The system compares the query embedding with document embeddings, retrieving the top 5 most relevant chunks.

9. **Answer Generation**: The retrieved chunks are used as context to generate an answer via a LLaMA model (e.g., "Llama-3.1-8B-Instruct").

10. **Chat Interface**: The user interacts with the chatbot, which retrieves information from the documents and generates responses based on the query.

<img src="RAG.png" alt="RAG System" width="800"/>


Install all of the following libraries before running the imports below.

In [None]:
# Necessary imports
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import PyPDF2
from docx import Document
import torch
import gradio as gr

# Initialize the tokenizer and model
# These module-level objects are created once and later passed to
# generate_answer by handle_conversation.
model_name = "meta-llama/Llama-3.1-8B-Instruct"  # Replace with actual model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Weights are requested in float16; device placement is delegated to
# device_map="auto".
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
model.eval()  # evaluation mode: the model is only used for generation here

# Set pad_token_id if not set
# generate_answer forwards tokenizer.pad_token_id to model.generate, so fall
# back to the EOS id when the tokenizer does not define a padding token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Initialize the Sentence Transformer model (embedding model)
# The same embedder encodes the document chunks (process_document) and the
# user queries (retrieve), so both share one vector space.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Function to process PDF files
def process_pdf_file(file_obj):
    """Extract the text of every page of a PDF.

    Args:
        file_obj: Whatever ``PyPDF2.PdfReader`` accepts as its source
            (the caller passes the uploaded file through unchanged).

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        Pages for which ``extract_text()`` returns an empty/falsy value are
        skipped, so the result is ``''`` when no page has extractable text.
    """
    reader = PyPDF2.PdfReader(file_obj)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page and the first word of
    # the next are not fused into a single word before chunking. A single
    # join also avoids building the result through repeated concatenation.
    return '\n'.join(page_text for page_text in page_texts if page_text)

# Function to process DOCX files
def process_docx_file(file_obj):
    """Return the text of a DOCX document, one paragraph per line.

    ``file_obj`` is handed to ``Document`` as-is; only the entries of
    ``doc.paragraphs`` are read, and their ``text`` values are joined with
    newline characters.
    """
    paragraphs = Document(file_obj).paragraphs
    lines = []
    for paragraph in paragraphs:
        lines.append(paragraph.text)
    return '\n'.join(lines)

# Function to split text into chunks
def split_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split. It is tokenized with ``str.split()``, so a
            "token" here is a whitespace-separated word.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The
        list is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would keep the
            window from ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + chunk_size
        chunks.append(' '.join(tokens[start:end]))
        # Once a chunk reaches the end of the text, any further window would
        # contain only words already covered by this chunk, so stop here.
        if end >= len(tokens):
            break
    return chunks

# Function to process documents
def process_document(file_obj):
    """Turn an uploaded PDF/DOCX into text chunks plus a FAISS index.

    Args:
        file_obj: The uploaded file, or ``None`` when nothing was uploaded.
            Either an object exposing the file name as ``.name`` or a plain
            path string is accepted.
            NOTE(review): which of the two the UI passes depends on the
            installed Gradio version - confirm.

    Returns:
        A ``(data, error)`` pair. On success ``data`` is ``(chunks, index)``
        and ``error`` is ``None``; on failure ``data`` is ``None`` and
        ``error`` is a human-readable message.
    """
    if file_obj is None:
        return None, "No file uploaded."

    # Compare the extension case-insensitively so "REPORT.PDF" is accepted,
    # and fall back to the object itself when it has no ``name`` attribute.
    filename = str(getattr(file_obj, 'name', file_obj)).lower()

    if filename.endswith('.pdf'):
        text = process_pdf_file(file_obj)
    elif filename.endswith('.docx'):
        text = process_docx_file(file_obj)
    else:
        return None, "Unsupported file type. Please upload a PDF or DOCX file."

    # Split the text into chunks
    chunks = split_text(text)
    if not chunks:
        # Without chunks there is nothing to embed, and the embedding matrix
        # would have no second dimension to size the index with.
        return None, "No extractable text was found in the document."

    # Generate embeddings for each chunk
    corpus_embeddings = embedder.encode(chunks, convert_to_numpy=True).astype('float32')
    dimension = corpus_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(corpus_embeddings)

    return (chunks, index), None

# Retrieve relevant text based on query
def retrieve(query, index, documents, embedder, top_k=5):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: The user's question.
        index: Search index whose ``search(vectors, k)`` returns
            ``(distances, indices)``; positions refer to ``documents``.
        documents: The chunk strings, in the order they were indexed.
        embedder: Model whose ``encode`` turns a list of strings into
            vectors.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings in the order the index returned them.
        The list is empty when there are no documents or ``top_k`` is not
        positive.
    """
    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(documents))
    if k <= 0:
        return []

    query_embedding = embedder.encode([query], convert_to_numpy=True).astype('float32')
    _distances, indices = index.search(query_embedding, k)
    # FAISS reports missing neighbours as -1; used as a list position that
    # would silently select the last chunk, so drop such entries.
    retrieved_texts = [documents[i] for i in indices[0] if i >= 0]

    # Debugging: Print retrieved texts
    print("Retrieved Texts:", retrieved_texts)

    return retrieved_texts

# Generate an answer using Llama model
def generate_answer(conversation_history, query, retrieved_texts, tokenizer, model, max_input_length=4096, max_new_tokens=150):
    """Build a prompt from history, query and context, then sample a reply.

    Args:
        conversation_history: List of ``{"user": ..., "bot": ...}`` dicts;
            only the last five entries are put into the prompt.
        query: The current user message.
        retrieved_texts: Context strings, joined with newlines.
        tokenizer: Callable tokenizer that also provides ``decode``,
            ``eos_token_id`` and ``pad_token_id``.
        model: Object providing ``device`` and ``generate``.
        max_input_length: Total budget; the prompt is truncated to
            ``max_input_length - max_new_tokens``.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    # Limit conversation history to last 5 exchanges
    recent_turns = conversation_history[-5:]
    history_lines = [f"User: {turn['user']}\nBot: {turn['bot']}" for turn in recent_turns]
    history = "\n".join(history_lines)
    context = "\n".join(retrieved_texts)

    prompt_parts = (
        "Below is a conversation between a user and an assistant. The assistant should answer the user's questions based on the provided context.\n\n",
        f"Conversation History:\n{history}\n\n",
        f"User: {query}\n",
        f"Context:\n{context}\n",
        "Bot:",
    )
    prompt = "".join(prompt_parts)

    # NOTE(review): truncation is left to the tokenizer's defaults; if it
    # trims the end of the prompt, the trailing "Bot:" cue may be lost - confirm.
    prompt_budget = max_input_length - max_new_tokens
    encoded = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=prompt_budget)

    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    sampling_options = {
        "max_new_tokens": max_new_tokens,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
    }
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        **sampling_options,
    )

    # The output starts with the prompt tokens; keep only what follows them.
    prompt_length = encoded['input_ids'].shape[1]
    continuation = outputs[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()

# Gradio interface functions
def handle_upload(file):
    """Process an uploaded file for the "Process Document" button.

    Delegates to ``process_document`` and returns ``(None, error)`` when it
    reports an error, otherwise ``(document_data, None)``.
    """
    document_data, error = process_document(file)
    return (None, error) if error else (document_data, None)

def handle_conversation(conversation, file, user_input):
    """Answer one user message and return the updated conversation.

    Args:
        conversation: List of ``{"user": ..., "bot": ...}`` exchanges so far.
        file: The processed-document state, i.e. the ``(chunks, index)`` pair
            stored after a successful upload, or ``None`` if no document has
            been processed yet.
        user_input: The text the user typed.

    Returns:
        A ``(conversation, error)`` pair. On success the conversation is a
        new list with the latest exchange appended and ``error`` is ``None``.
        Otherwise the conversation is returned unchanged together with a
        message.
    """
    if file is None:
        return conversation, "Please upload a document first."

    # Skip retrieval and generation entirely for blank messages.
    if not user_input or not user_input.strip():
        return conversation, "Please enter a message."

    documents, index = file
    retrieved_texts = retrieve(user_input, index, documents, embedder)
    answer = generate_answer(conversation, user_input, retrieved_texts, tokenizer, model)

    # Update conversation history
    # Build a new list instead of appending in place, so the caller's list is
    # left untouched and the returned state is a distinct object.
    updated_conversation = conversation + [{"user": user_input, "bot": answer}]

    return updated_conversation, None

# Function to update the chatbot display
def update_chat(conversation):
    """Convert the stored exchanges into ``(user, bot)`` tuples for display."""
    pairs = []
    for exchange in conversation:
        pairs.append((exchange["user"], exchange["bot"]))
    return pairs

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Document-Based Conversational Chatbot")
    gr.Markdown("Upload a PDF or DOCX document and start chatting with the bot about its content.")

    with gr.Row():
        file_upload = gr.File(label="Upload a PDF or DOCX document", file_types=[".pdf", ".docx"])
        process_button = gr.Button("Process Document")

    # Holds the (chunks, index) pair produced by handle_upload.
    document_state = gr.State()
    # Both handle_upload and handle_conversation send their messages to this
    # box, so it has to be visible for the user to ever see them.
    process_error = gr.Textbox(label="Error", visible=True, lines=2)

    process_button.click(
        fn=handle_upload,
        inputs=file_upload,
        outputs=[document_state, process_error]
    )

    gr.Markdown("### Chat Interface")

    # Chat Interface Components
    with gr.Column():
        chatbot = gr.Chatbot()
        with gr.Row():
            user_input = gr.Textbox(label="You:", placeholder="Type your message here...", show_label=False)
            send_button = gr.Button("Send")

    conversation_state = gr.State([])  # Initialize empty conversation history

    # Answer the message; the second output clears or fills the error box.
    send_button.click(
        fn=handle_conversation,
        inputs=[conversation_state, document_state, user_input],
        outputs=[conversation_state, process_error]
    )

    # Update the chatbot display based on conversation_state
    conversation_state.change(
        fn=update_chat,
        inputs=conversation_state,
        outputs=chatbot
    )

    # Clear the input textbox after sending a message
    send_button.click(
        lambda: "",
        inputs=None,
        outputs=user_input
    )

# share=True asks Gradio for a shareable link in addition to the local server.
demo.launch(share=True)
