# In [36]:
import gradio as gr
import shutil
import os
import pandas as pd
import PyPDF2
import docx
import faiss
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests  # For making HTTP requests to Ollama API
from ollama import Client


# Initialize Ollama client pointed at a server on localhost, port 11434.
# NOTE(review): `client` is not referenced by any function visible in this
# file (query_ollama shells out to the `ollama` CLI instead) -- confirm whether
# it is still needed.
client = Client(
  host='http://localhost:11434',
  # NOTE(review): this header looks like a placeholder value -- confirm.
  headers={'x-some-header': 'some-value'}
)

# Function to handle file upload and return the file path
def upload_file(file):
    """Copy an uploaded file into ``./RAG_data`` and return the copy's path.

    Args:
        file: Either a plain path (``str`` / ``os.PathLike``) or an object
            exposing the source path as ``file.name``.

    Returns:
        The destination path inside the upload folder.
    """
    UPLOAD_FOLDER = "./RAG_data"
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)

    # Plain paths are used as-is; upload wrapper objects carry it in `.name`.
    if isinstance(file, (str, os.PathLike)):
        source = os.fspath(file)
    else:
        source = file.name

    # Save the uploaded file to the destination folder
    destination = os.path.join(UPLOAD_FOLDER, os.path.basename(source))
    shutil.copy(source, destination)

    # Return the file path for further processing
    return destination

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages for which extraction yields nothing are skipped. On any failure an
    ``"Error extracting text from PDF: ..."`` string is returned instead of
    raising.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_path)
        # Extract each page exactly once; the previous version called
        # extract_text() twice per page (filter + value).
        page_texts = (page.extract_text() for page in reader.pages)
        return "".join(text for text in page_texts if text)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

# Function to extract text from Word documents
def extract_text_from_docx(docx_path):
    """Return the paragraphs of the DOCX at *docx_path*, one per line.

    Any exception is converted into an
    ``"Error extracting text from DOCX: ..."`` string.
    """
    try:
        paragraphs = docx.Document(docx_path).paragraphs
        lines = []
        for paragraph in paragraphs:
            lines.append(paragraph.text)
        return "\n".join(lines)
    except Exception as err:
        return "Error extracting text from DOCX: " + str(err)

# Function to extract text from Excel and CSV files
def extract_text_from_spreadsheet(file_path):
    """Load a CSV or Excel file and return it rendered via ``DataFrame.to_string``.

    Paths ending in ``.csv`` (matched case-insensitively) are read with
    ``pd.read_csv``; everything else is handed to ``pd.read_excel``. Any
    failure is returned as an ``"Error extracting text from spreadsheet: ..."``
    string instead of being raised.
    """
    try:
        # Lower-case the path so "DATA.CSV" is not sent to the Excel reader.
        if file_path.lower().endswith(".csv"):
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        return df.to_string()
    except Exception as e:
        return f"Error extracting text from spreadsheet: {str(e)}"

# Function to extract text based on file type
def extract_text_from_file(file_path):
    """Dispatch to the matching extractor based on the file extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is handled
    like ``report.pdf``. Unknown extensions yield the "Unsupported file
    format" message rather than raising.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    if lowered.endswith(".docx"):
        return extract_text_from_docx(file_path)
    if lowered.endswith((".xlsx", ".xls", ".csv")):
        return extract_text_from_spreadsheet(file_path)
    return "Unsupported file format. Please upload a PDF, DOCX, XLSX, XLS, or CSV file."

# Function to split text into smaller chunks
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=100):
    """Split *text* with a RecursiveCharacterTextSplitter.

    Returns the result of ``create_documents`` called on a one-element list
    holding *text*, using the given ``chunk_size`` and ``chunk_overlap``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents([text])


# Loaded SentenceTransformer instances keyed by model name, so the model is
# not reloaded for every query.
_EMBEDDING_MODELS = {}


# Function to create embeddings using SentenceTransformer
def create_local_embeddings(model_name='all-MiniLM-L6-v2'):
    """Return a SentenceTransformer for *model_name*, loading it at most once.

    Args:
        model_name: Name passed to ``SentenceTransformer``; defaults to the
            previously hard-coded ``'all-MiniLM-L6-v2'``.

    Returns:
        The cached model instance (the same object on repeated calls).
    """
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)  # Local embedding model
    return _EMBEDDING_MODELS[model_name]


def embed_chunks_locally(chunks, model):
    """Encode the ``page_content`` of every chunk with *model*.

    Returns whatever ``model.encode`` produces when called with the list of
    chunk texts and ``convert_to_numpy=True``.
    """
    texts = []
    for chunk in chunks:
        texts.append(chunk.page_content)
    return model.encode(texts, convert_to_numpy=True)


# Function to index chunks with FAISS
def index_chunks_with_faiss(chunks, embeddings):
    """Build a ``faiss.IndexFlatL2`` holding *embeddings* and return it.

    The index dimension is taken from ``embeddings.shape[1]``. The *chunks*
    argument is accepted but not used here.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index


# Function to retrieve relevant chunks from FAISS
def retrieve_relevant_chunks(faiss_index, query, model, chunks, top_k=50):
    """Return up to *top_k* chunks for *query*, in the order the index ranks them.

    Args:
        faiss_index: Index object exposing ``search(query_embedding, k)``.
        query: The user's question.
        model: Embedding model exposing ``encode(texts, convert_to_numpy=True)``.
        chunks: Sequence of chunks; position ``i`` must correspond to label
            ``i`` in the index.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of chunks; empty when there are no chunks or ``top_k <= 0``.
    """
    if not chunks or top_k <= 0:
        return []

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently repeat the last chunk.
    k = min(top_k, len(chunks))
    query_embedding = model.encode([query], convert_to_numpy=True)
    _distances, indices = faiss_index.search(query_embedding, k)

    # Drop any remaining padding / out-of-range labels defensively.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]


# Function to interact with Ollama model (Qwen2.5:32b)
def query_ollama(model_name, input_text):
    """Run ``ollama run <model_name> <input_text>`` and return its output.

    Args:
        model_name: Name of the Ollama model to run.
        input_text: Prompt passed as a single command-line argument.

    Returns:
        The command's stdout on success, ``"Error: <stderr>"`` on a non-zero
        exit code, or ``"An error occurred: ..."`` if the command could not be
        run at all.
    """
    # The file never imports subprocess at module level, which previously made
    # every call fail with a NameError swallowed by the except below.
    import subprocess

    try:
        # Running the Ollama CLI command using subprocess
        result = subprocess.run(
            ['ollama', 'run', model_name, input_text],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
    except Exception as e:
        return f"An error occurred: {str(e)}"

    if result.returncode == 0:
        return result.stdout
    return f"Error: {result.stderr}"

# Function to create the RAG pipeline
def create_rag_pipeline_local(faiss_index, embedding_model, generator, chunks, model):
    """Build a ``generate_answer(query)`` closure over the given index and model.

    Args:
        faiss_index: Index passed through to ``retrieve_relevant_chunks``.
        embedding_model: Model used to embed the query.
        generator: Callable invoked as ``generator(model, prompt)``.
        chunks: Chunks the index was built from.
        model: Model identifier forwarded to *generator*.

    Returns:
        A function mapping a query string to whatever *generator* returns.
    """
    # The instruction is stated once, after the context. Previously it was
    # used as the str.join separator, so it appeared between every pair of
    # chunks and not at all when only one chunk was retrieved.
    instruction = (
        "Given the context information above, I want you to think step by step "
        "to answer the query in a highly precise manner focused on the final "
        "answer, in case you don't know the answer say 'I do not know!'"
    )

    def generate_answer(query):
        # Retrieve relevant chunks
        relevant_docs = retrieve_relevant_chunks(faiss_index, query, embedding_model, chunks)
        context = "\n\n".join(doc.page_content for doc in relevant_docs)

        # Generate the answer
        prompt = f"Context:\n{context}\n\n{instruction}\n\nQuestion: {query}\nAnswer:"
        return generator(model, prompt)

    return generate_answer

# RAG Pipeline
def rag_pipeline(file_path, query):
    """Answer *query* from the contents of the file at *file_path*.

    Extracts the file's text, splits it into chunks, embeds and indexes them,
    then asks the ``qwen2.5:32b`` model through ``query_ollama``.

    Returns:
        The generated answer, or a human-readable message when the file could
        not be read, is of an unsupported type, or contains no text.
    """
    # Extract text from file
    file_text = extract_text_from_file(file_path)

    # Match the exact prefixes the extractors emit. A bare `"Error" in text`
    # check rejects any document that merely contains the word "Error", and
    # the old "Unsupported file type." comparison never matched the message
    # that extract_text_from_file actually returns.
    if file_text.startswith(("Error extracting text", "Unsupported file format")):
        print(file_text)
        return file_text

    # Split text into chunks
    chunks = split_text_into_chunks(file_text)
    if not chunks:
        # Nothing to embed; indexing an empty embedding array would fail.
        return "No text could be extracted from the file."

    # Create embeddings
    embedding_model = create_local_embeddings()
    embeddings = embed_chunks_locally(chunks, embedding_model)

    # Index the embeddings with FAISS
    faiss_index = index_chunks_with_faiss(chunks, embeddings)

    # Create the RAG pipeline (named so it does not shadow this function)
    answer_query = create_rag_pipeline_local(faiss_index, embedding_model, query_ollama, chunks, model='qwen2.5:32b')

    # Get the answer to the query
    return answer_query(query)

# Function to handle file input and update content and path
def handle_file_input(file):
    """Store the uploaded file and return ``(file_path, file_content)``.

    When *file* is ``None`` (e.g. the upload was cleared) both values are
    ``None`` so the bound outputs are reset instead of raising.
    """
    if file is None:
        return None, None
    file_path = upload_file(file)
    file_content = extract_text_from_file(file_path)  # Extract file content
    return file_path, file_content

# Function to handle the query after file content has been loaded
def handle_query(file_path, query):
    """Run the RAG pipeline for *query*, or ask for an upload if no file is set."""
    if file_path:
        # A file has been loaded, so the query can be executed against it.
        return rag_pipeline(file_path, query)
    return "No file uploaded yet. Please upload a file first."

def clear_all():
    """Return three ``None`` values: file path, file content, and answer."""
    cleared_path = None
    cleared_content = None
    cleared_answer = None
    return cleared_path, cleared_content, cleared_answer
    
# Gradio UI components for file upload and query input.
# The CSS targets the title's elem_id; the previous ".center-text" class
# selector matched no component, so the title was never centered.
with gr.Blocks(css="#title {text-align: center;}") as demo:
    # Centered Title
    with gr.Row():
        gr.Markdown(
            """
            # RAG over Files
            Upload your documents and ask a question. This system processes the file and provides answers based on its content.
            """,
            elem_id="title",  # Custom ID for CSS styling
        )

    # File Upload
    file_input = gr.File(label="Add your documents! Provide a file path (PDF, DOCX, XLSX, XLS, or CSV)")
    # Hidden textbox used to carry the stored file path between events.
    file_path_output = gr.Textbox(label="File Path", interactive=False, visible=False)
    file_content_output = gr.Textbox(label="File Content", interactive=False)

    # Query Input and Answer Output
    query_input = gr.Textbox(label="Query:", placeholder="What's Up?")
    answer_output = gr.Textbox(label="Answer", interactive=False)

    # Clear All Button
    clear_button = gr.Button("Clear All ↺")

    # Handle file input: file upload triggers file content extraction
    file_input.change(
        handle_file_input,
        inputs=[file_input],
        outputs=[file_path_output, file_content_output]
    )

    # Handle query input: execute query after file is loaded
    query_input.submit(
        handle_query,
        inputs=[file_path_output, query_input],
        outputs=[answer_output]
    )

    # Clear all components
    clear_button.click(
        clear_all,
        inputs=[],
        outputs=[file_path_output, file_content_output, answer_output]
    )
    
# Launch Gradio UI.
# share=True also requests a public share link in addition to the local URL,
# so anyone with that link can upload files and run queries against this
# machine while the app is running.
demo.launch(share=True)


# * Running on local URL:  http://127.0.0.1:7954
# * Running on public URL: https://4562247a1cd99dd4f8.gradio.live
#
# This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


