In [13]:
import os
import pymupdf  # PyMuPDF for PDFs
from PyPDF2 import PdfReader
from docx import Document

def read_file(file_path):
    """Return the text of a PDF, DOCX, or TXT file, chosen by its extension.

    The extension is compared case-insensitively and the work is delegated to
    read_pdf, read_docx, or read_txt. This function does not raise for the two
    cases it checks itself; it returns a message string instead:

    * "File not found!" when the path does not exist.
    * "Unsupported file format!" for any other extension.
    """
    # Guard clause: nothing to read if the path is missing.
    if not os.path.exists(file_path):
        return "File not found!"

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Reject unknown extensions before dispatching to a reader.
    if suffix not in (".pdf", ".docx", ".txt"):
        return "Unsupported file format!"

    if suffix == ".pdf":
        return read_pdf(file_path)
    if suffix == ".docx":
        return read_docx(file_path)
    return read_txt(file_path)

def read_pdf(file_path):
    """Extract the text of every page of a PDF file using PyMuPDF.

    Args:
        file_path: Path of the PDF to open.

    Returns:
        The text of all pages joined with newlines, with leading and trailing
        whitespace stripped. If anything goes wrong, the string
        "Error reading PDF: <error>" is returned instead of raising.
    """
    try:
        # Open as a context manager so the document (and its underlying file
        # handle) is closed even if text extraction fails part-way through.
        with pymupdf.open(file_path) as doc:
            text = "\n".join(page.get_text() for page in doc)
        return text.strip()
    except Exception as e:
        return f"Error reading PDF: {e}"

def read_docx(file_path):
    """Return the paragraph text of a DOCX file as one newline-joined string.

    Only the entries of the document's ``paragraphs`` collection are read.
    The joined text is stripped of leading and trailing whitespace. Any
    failure is reported by returning "Error reading DOCX: <error>" rather
    than raising.
    """
    try:
        document = Document(file_path)
        lines = [paragraph.text for paragraph in document.paragraphs]
        return "\n".join(lines).strip()
    except Exception as err:
        return f"Error reading DOCX: {err}"

def read_txt(file_path):
    """Read a UTF-8 text file and return its stripped contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents with leading and trailing whitespace removed. If
        the file cannot be opened or decoded, the string
        "Error reading TXT: <error>" is returned instead of raising.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops
        # a leading byte-order mark, which str.strip() would otherwise leave
        # at the start of the returned text.
        with open(file_path, "r", encoding="utf-8-sig") as file:
            return file.read().strip()
    except Exception as e:
        return f"Error reading TXT: {e}"




In [51]:
pip install pymupdf python-docx pypdf2


Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
[K     |████████████████████████████████| 18.6 MB 7.7 MB/s eta 0:00:011
[?25hCollecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[K     |████████████████████████████████| 244 kB 4.6 MB/s eta 0:00:01
Installing collected packages: python-docx, pymupdf
Successfully installed pymupdf-1.25.3 python-docx-1.1.2
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# import requests

# # url = "http://localhost:4000/upload"  # Replace with your actual backend URL
# # url = "http://localhost:4000/api/files/upload"  # Correct format
# url="https://rag-bakend-assignment-deploy.onrender.com/api/files/upload"
# file_path = "/Users/shubhamsharma/Downloads/Uploader_test_doc.pdf"  # Replace with the file you want to upload
# extracted_text=read_file(file_path)
# files = {"document": extracted_text}
# response = requests.post(url, files=files)
# print(response)
# data = response.json()  # Convert Response object to a dictionary
# global_document_id = data.get("globaldocumentId")  # Extract ID safely


In [38]:
import os

import requests

# url = "https://rag-bakend-assignment-deploy.onrender.com/upload"  # Backend URL

url = "http://localhost:4000/api/files/upload"  # Backend URL
# url="https://rag-bakend-assignment-deploy.onrender.com/api/files/upload"


file_path = "/Users/shubhamsharma/Downloads/api_test/Speech_ACM (5).pdf"  # File path
file_name = os.path.basename(file_path)  # Name sent to the backend, without the directory

# Send the raw PDF bytes as the multipart form field "document".
with open(file_path, "rb") as file:
    files = {"document": (file_name, file, "application/pdf")}
    response = requests.post(url, files=files)

# Only parse the body as JSON when the upload succeeded; otherwise show the
# backend's message and clear the ID so later cells cannot reuse a stale one.
if response.status_code == 200:
    data = response.json()
    global_document_id = data.get("globaldocumentId")
    print(data)
else:
    global_document_id = None
    print(f"Failed to upload {file_name}: {response.text}")

{'message': 'File processed successfully', 'globaldocumentId': 'd4dbe934-f32b-4ef2-ac39-10441faa125e', 'fileName': 'Speech_ACM (5).pdf', 'extractedText': '\n\n1\nBridging Linguistic Divides: A Novel\nTransformer-Based Neural Machine Translation\nFramework for Gondi-Hindi Translation\nRahul Shukla, Bhavesh Ajwani, Santosh Kumar\n✦\nAbstract—In an era where cultural preservation is paramount, bridging\nlinguistic gaps between endangered and low-resource language groups\nis crucial. This paper introduces a groundbreaking approach for translat-\ning the endangered'}


In [43]:
import requests
import os

# Define the backend URL
# url = "https://rag-bakend-assignment-deploy.onrender.com/api/files/upload"
url = "http://localhost:4000/api/files/upload"  # Backend URL

# Folder containing the files
folder_path = "/Users/shubhamsharma/Downloads/api_test/"

# Maps each successfully uploaded file name to the document ID the backend
# returned, so no ID is lost when several files are uploaded.
uploaded_document_ids = {}

# Sort the listing so uploads (and the printed log) happen in a stable order.
for file_name in sorted(os.listdir(folder_path)):
    # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
    if not file_name.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(folder_path, file_name)
    # Skip anything that is not a regular file (e.g. a directory named "x.pdf").
    if not os.path.isfile(file_path):
        continue

    # Send the raw PDF bytes as the multipart form field "document".
    with open(file_path, "rb") as file:
        files = {"document": (file_name, file, "application/pdf")}
        response = requests.post(url, files=files)

    # Process response
    if response.status_code == 200:
        data = response.json()
        # Kept for later cells: holds the ID of the most recent successful upload.
        global_document_id = data.get("globaldocumentId")
        uploaded_document_ids[file_name] = global_document_id
        print(f"Uploaded {file_name}: {data}")
    else:
        print(f"Failed to upload {file_name}: {response.text}")

Uploaded Speech_ACM (5).pdf: {'message': 'File processed successfully', 'globaldocumentId': 'db40427d-f926-4004-ad49-d72c345a0edb', 'fileName': 'Speech_ACM (5).pdf', 'extractedText': '\n\n1\nBridging Linguistic Divides: A Novel\nTransformer-Based Neural Machine Translation\nFramework for Gondi-Hindi Translation\nRahul Shukla, Bhavesh Ajwani, Santosh Kumar\n✦\nAbstract—In an era where cultural preservation is paramount, bridging\nlinguistic gaps between endangered and low-resource language groups\nis crucial. This paper introduces a groundbreaking approach for translat-\ning the endangered'}
Uploaded history_of_git.pdf: {'message': 'File processed successfully', 'globaldocumentId': '30b03519-a18a-4757-930d-7fa1fa4412f7', 'fileName': 'history_of_git.pdf', 'extractedText': '\n\nThe History of Git\nIntroduction\nGit  is  a  distributed  version  control  system  that  has  become  an  essential  tool  for  software\ndevelopment. Created by Linus Torvalds in 2005, Git was designed to addres

In [44]:

# search_url = "https://rag-bakend-assignment-deploy.onrender.com/api/search/search"  # Backend URL
search_url = "http://localhost:4000/api/search/search"  # Backend URL

query_data = {
    "query": "Santosh Kumar",
    "globaldocumentId": "db40427d-f926-4004-ad49-d72c345a0edb" # Pass the document ID correctly
}

response = requests.post(search_url, json=query_data)

#print("Response:", response.json())  # Print the search results
# Handle response
if response.status_code != 200:
    print("Error:", response.status_code, response.text)
else:
    try:
        data = response.json()  # Convert Response object to Python data
    except ValueError as e:
        # Only a body that is not valid JSON is reported here; errors in the
        # merge step below are no longer mislabelled as parsing failures.
        print("Error parsing JSON response:", e)
    else:
        # Check if response is a list
        if isinstance(data, list):
            # Merge all 'text' values into a single string; a missing or null
            # 'text' contributes an empty string.
            merged_text = " ".join(chunk.get("text") or "" for chunk in data)

            print("\nMerged Text:\n")
            print(merged_text)
        else:
            print("Unexpected response format:", data)



Merged Text:

Santosh  Kumar  is  working  as  an  Assistant  Professor  with  the  Department
of Computer Science and Engineering at IIIT-Naya Raipur, Chhattisgarh (e-
mail:santosh@iiitnr.edu.in). L
ANGUAGEthe  essence  of  human  connection  and  cul-
tural   identity   serves   as   a   conduit   for   transmitting
knowledge, traditions, and heritage [1]Amidst the vast array
of languages spoken worldwide, a stark reality emerges: be-
tween 6,000 to 7,000 languages currently exist, yet between
50% and 90% of them face severe endangerment or extinc-
tion  by  the  dawn  of  the  22nd  century  [4]. [18]  Mehta,  D.,  Santy,  S.,  Mothilal,  R.,  Srivastava,  B.,  Sharma,  A.,
Shukla,  A.,  Prasad,  V.,  U,  V.,  Sharma,  A.,  &  Bali,  K. (2020).
