In [2]:
import os
import pymupdf  # PyMuPDF for PDFs
from PyPDF2 import PdfReader
from docx import Document

def read_file(file_path):
    """Reads a file (PDF, DOCX, TXT) and returns its text content."""
    if not os.path.exists(file_path):
        return "File not found!"

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        return read_pdf(file_path)
    elif ext == ".docx":
        return read_docx(file_path)
    elif ext == ".txt":
        return read_txt(file_path)
    else:
        return "Unsupported file format!"

def read_pdf(file_path):
    """Reads text from a PDF file."""
    try:
        doc = pymupdf.open(file_path)
        text = "\n".join(page.get_text() for page in doc)
        return text.strip()
    except Exception as e:
        return f"Error reading PDF: {e}"

def read_docx(file_path):
    """Reads text from a DOCX file."""
    try:
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
        return text.strip()
    except Exception as e:
        return f"Error reading DOCX: {e}"

def read_txt(file_path):
    """Reads text from a TXT file."""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read().strip()
    except Exception as e:
        return f"Error reading TXT: {e}"

# Example usage


In [51]:
pip install pymupdf python-docx pypdf2


Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
[K     |████████████████████████████████| 18.6 MB 7.7 MB/s eta 0:00:011
[?25hCollecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[K     |████████████████████████████████| 244 kB 4.6 MB/s eta 0:00:01
Installing collected packages: python-docx, pymupdf
Successfully installed pymupdf-1.25.3 python-docx-1.1.2
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import requests

# url = "http://localhost:4000/upload"  # Replace with your actual backend URL
url = "https://rag-deploy.onrender.com/upload"  # Replace with your actual backend URL


file_path = "/Users/shubhamsharma/Downloads/history_of_git.pdf"  # Replace with the file you want to upload
extracted_text=read_file(file_path)
files = {"document": extracted_text}
response = requests.post(url, files=files)
data = response.json()  # Convert Response object to a dictionary
global_document_id = data.get("globaldocumentId")  # Extract ID safely


In [6]:

search_url = "https://rag-deploy.onrender.com/search"  # Backend URL

query_data = {
    "query": "History of git",
    "globaldocumentId": global_document_id  # Pass the document ID correctly
}

response = requests.post(search_url, json=query_data)

#print("Response:", response.json())  # Print the search results
# Handle response
if response.status_code == 200:
    try:
        data = response.json()  # Convert Response object to a dictionary
        
        # Check if response is a list
        if isinstance(data, list):
            # Merge all 'text' values into a single string
            merged_text = " ".join(chunk.get("text", "") for chunk in data)
            
            print("\nMerged Text:\n")
            print(merged_text)
        else:
            print("Unexpected response format:", data)
    except Exception as e:
        print("Error parsing JSON response:", e)
else:
    print("Error:", response.status_code, response.text)



Merged Text:


