In [None]:
import os
import json
import requests
import pymupdf  # PyMuPDF for PDFs
from docx import Document

# Endpoints of the locally running backend under test.
UPLOAD_URL = "http://localhost:4000/upload"
SEARCH_URL = "http://localhost:4000/search"

# Directory holding the sample documents (a relative path).
TEST_FOLDER = "test_files"

# Maps each sample document's file name to the search queries run against it.
TEST_QUERIES = {
    "history_of_git.pdf": [
        "git usage by companies",
        "when was git created",
    ],
    "software_dev.docx": [
        "impact of CI/CD",
        "advantages of version control",
    ],
    "tech_trends.txt": [
        "latest AI advancements",
        "future of software development",
    ],
}

# Function to read different file types
def read_file(file_path):
    """Extract text from a file, choosing the reader by file extension.

    The extension is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.txt`` files are delegated to ``read_pdf``, ``read_docx`` and
    ``read_txt`` respectively, and that reader's return value is passed
    through unchanged.

    Returns ``None`` when the extension is none of the three above.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        return read_pdf(file_path)
    if suffix == ".docx":
        return read_docx(file_path)
    if suffix == ".txt":
        return read_txt(file_path)

    # Any other (or missing) extension is treated as unsupported.
    return None

def read_pdf(file_path):
    """Reads text from a PDF file.

    Returns the text of every page joined with newlines, with leading and
    trailing whitespace stripped. Failures are reported in-band rather than
    raised: on any exception the return value is a string starting with
    "Error reading PDF:" followed by the exception message.
    """
    try:
        # The context manager closes the document (and its underlying file
        # handle) even when extracting text from a page fails part-way.
        with pymupdf.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc).strip()
    except Exception as e:
        return f"Error reading PDF: {e}"

def read_docx(file_path):
    """Extract the paragraph text of a DOCX file.

    Paragraph texts are joined with newlines and the result is stripped of
    surrounding whitespace. On any exception a string beginning with
    "Error reading DOCX:" is returned instead of raising.
    """
    try:
        paragraphs = Document(file_path).paragraphs
        lines = [paragraph.text for paragraph in paragraphs]
        combined = "\n".join(lines)
        return combined.strip()
    except Exception as err:
        return f"Error reading DOCX: {err}"

def read_txt(file_path):
    """Return the UTF-8 decoded contents of a text file, stripped.

    On any exception (missing file, decode failure, ...) a string beginning
    with "Error reading TXT:" is returned instead of raising.
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as err:
        return f"Error reading TXT: {err}"
    return contents.strip()

# Function to upload document
def upload_document(file_name, text, timeout=120):
    """Upload extracted document text to the backend and return its ID.

    Args:
        file_name: Name of the source file; used only in diagnostic output.
        text: Extracted text, sent as the ``text`` field of the JSON body.
        timeout: Seconds to wait for the backend before giving up, so an
            unresponsive server cannot hang the whole run.

    Returns:
        The ``globaldocumentId`` value from the JSON response, or ``None``
        when the request fails, the status is not 200, or the response body
        is not a JSON object. Failures are printed, never raised.
    """
    try:
        response = requests.post(UPLOAD_URL, json={"text": text}, timeout=timeout)
    except requests.RequestException as e:
        # Connection refused, timeout, etc. -- report and let the caller skip.
        print(f"Upload failed for {file_name}: {e}")
        return None

    if response.status_code != 200:
        print(f"Upload failed for {file_name}: {response.status_code} - {response.text}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        # A 200 response whose body is not valid JSON.
        print(f"Upload failed for {file_name}: invalid JSON response - {e}")
        return None

    if not isinstance(data, dict):
        print(f"Upload failed for {file_name}: unexpected response payload")
        return None

    return data.get("globaldocumentId")

# Function to perform search queries
def search_queries(global_document_id, queries, timeout=120):
    """Run each query against the search endpoint for one uploaded document.

    Args:
        global_document_id: ID returned by the upload endpoint; sent as the
            ``globaldocumentId`` field of every request.
        queries: Iterable of query strings.
        timeout: Seconds to wait for each search request before giving up.

    Returns:
        A dict mapping each query to either the space-joined ``text`` fields
        of the returned chunks, or an error message describing why that query
        failed. A failure of one query never prevents the others from running.
    """
    results = {}

    for query in queries:
        query_data = {"query": query, "globaldocumentId": global_document_id}

        try:
            response = requests.post(SEARCH_URL, json=query_data, timeout=timeout)
        except requests.RequestException as e:
            # Record transport failures per query instead of aborting the run.
            results[query] = f"Request failed: {e}"
            continue

        if response.status_code != 200:
            results[query] = f"Error {response.status_code}: {response.text}"
            continue

        try:
            data = response.json()
            # The response is expected to be a list of chunk objects; any
            # other shape ends up in the except branch below.
            results[query] = " ".join(chunk.get("text", "") for chunk in data)
        except Exception as e:
            results[query] = f"Error parsing response: {e}"

    return results

# Function to automate testing
def run_tests():
    """Upload every configured test document and run its search queries.

    For each entry in ``TEST_QUERIES`` the file is read from ``TEST_FOLDER``,
    uploaded, and queried. Results are printed and finally written to
    ``test_results.json`` as ``{file_name: {query: result}}``. Files that
    cannot be read, yield no text, or fail to upload are skipped and do not
    appear in the output.
    """
    # The read_* helpers report failures in-band as strings with these
    # prefixes; such strings must not be uploaded as document content.
    read_error_prefixes = (
        "Error reading PDF:",
        "Error reading DOCX:",
        "Error reading TXT:",
    )
    all_results = {}

    for file_name, queries in TEST_QUERIES.items():
        file_path = os.path.join(TEST_FOLDER, file_name)

        # Read file content
        extracted_text = read_file(file_path)
        if extracted_text is None:
            print(f"Skipping {file_name}: Unsupported file format")
            continue
        if extracted_text.startswith(read_error_prefixes):
            print(f"Skipping {file_name}: {extracted_text}")
            continue
        if not extracted_text:
            print(f"Skipping {file_name}: No text could be extracted")
            continue

        # Upload document and get document ID
        global_document_id = upload_document(file_name, extracted_text)
        if not global_document_id:
            continue  # Skip if upload failed

        # Run queries for this document
        search_results = search_queries(global_document_id, queries)

        # Store results
        all_results[file_name] = search_results

        # Print results
        print(f"\nResults for {file_name}:")
        for query, result in search_results.items():
            print(f"\nQuery: {query}\nResult: {result}\n")

    # Save results to a JSON file for easy analysis
    with open("test_results.json", "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=4, ensure_ascii=False)

    print("\nAll test results saved to test_results.json")

# Kick off the full upload-and-search test run as soon as this code executes;
# results are printed and written to test_results.json by run_tests().
run_tests()
