In [1]:

import os
import requests
from dotenv import load_dotenv
from datetime import datetime
import time
from datetime import datetime, timedelta
load_dotenv() 

# Credentials and endpoint are read from the environment (load_dotenv() above
# also pulls them from a local .env file) so that no secret is stored in the
# notebook itself. The key that used to be hardcoded here should be rotated.
swparse_api_key = os.getenv("SWPARSE_API_KEY", "")
BASE_URL = os.getenv("SWPARSE_BASE_URL", "http://52.202.108.42:8000")

if not swparse_api_key:
    # Fail loudly but do not raise: the definitions below are still usable
    # once the variable has been set and this cell is re-run.
    print("Warning: SWPARSE_API_KEY is not set; API requests will be rejected.")


def get_file_content(filename: str)-> bytes:
    """Return the raw bytes of ``filename``, read from the local ``pdf/`` directory."""
    source_path = "pdf/{}".format(filename)
    with open(source_path, "rb") as pdf_stream:
        payload = pdf_stream.read()
    return payload


def write_file(filepath: str, content: str):
    """Write ``content`` to ``filepath`` as UTF-8 text, replacing any existing file.

    Missing parent directories are created first, so callers can target a
    directory such as ``output/`` without having to create it beforehand.

    Args:
        filepath: Destination path of the text file.
        content: Text to write.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit encoding: the platform default may not be able to encode
    # every character found in extracted document text.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)


def upload_file(files: dict[str, tuple[str, bytes, str]], force_ocr: bool = False, plain_text: bool = False, timeout: float = 60.0) -> "str | None":
    """Upload a file to the parsing API and return the id of the created job.

    Args:
        files: Multipart payload in the form ``requests`` expects, mapping the
            field name to ``(filename, content, content_type)``.
        force_ocr: Sent as the ``force_ocr`` form field.
        plain_text: Sent as the ``plain_text`` form field.
        timeout: Seconds passed to ``requests.post`` so that an unresponsive
            server cannot block the caller indefinitely.

    Returns:
        The ``id`` field of the JSON response when the server answers 201.
        ``None`` when the server answers with any other status or when the
        request/response handling raises; the reason is printed in both cases.
    """
    data = {
        "force_ocr": force_ocr,
        "plain_text": plain_text
    }

    headers = {
        'Authorization': f'Bearer {swparse_api_key}'
    }
    try:
        response = requests.post(
            f"{BASE_URL}/api/parsing/upload",
            files=files,
            headers=headers,
            data=data,
            timeout=timeout,
        )
        if response.status_code == 201:
            print("File uploaded successfully!")
            res = response.json()
            print(res)

            return res["id"]

        print(f"Failed to upload the file. Status code: {response.status_code}")
        print(response.text)
    except Exception as e:
        # Kept broad on purpose: network errors, timeouts, invalid JSON and a
        # missing "id" key are all reported the same way and yield None.
        print(f"Error occurred: {str(e)}")

    # Explicit failure value; callers must check for it before polling.
    return None
        
        


def get_result(job_id: str, result_type: str, poll_interval: float = 0.5, max_wait: "float | None" = None, request_timeout: float = 30.0) -> str:
    """Poll the parsing API until the job result is available and return it.

    The result endpoint is requested repeatedly until it answers with
    status 200; any other status is treated as "not ready yet".

    Args:
        job_id: Id returned by ``upload_file``.
        result_type: Result variant to fetch; also the key read from the
            JSON response.
        poll_interval: Seconds to sleep between two polls.
        max_wait: Optional upper bound, in seconds, on the total polling
            time. ``None`` (the default) polls without limit.
        request_timeout: Seconds passed to each ``requests.get`` call.

    Returns:
        The value stored under ``result_type`` in the JSON response.

    Raises:
        ValueError: If ``job_id`` is empty or ``None`` (e.g. the upload failed).
        TimeoutError: If ``max_wait`` elapses before the server answers 200.
    """
    # Without this guard a failed upload (job_id None) would poll
    # ".../job/None/result/..." forever.
    if not job_id:
        raise ValueError("job_id is required; the upload did not return a job id")

    headers = {
        'Authorization': f'Bearer {swparse_api_key}'
    }
    url = f"{BASE_URL}/api/parsing/job/{job_id}/result/{result_type}"
    deadline = None if max_wait is None else time.monotonic() + max_wait

    while True:
        response = requests.get(url, headers=headers, timeout=request_timeout)

        if response.status_code == 200:
            break

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Job {job_id} did not finish within {max_wait} seconds "
                f"(last status code: {response.status_code})"
            )

        time.sleep(poll_interval)

    result = response.json()
    return result[result_type]
    
def sequential_process_files(
    files: list[str],
    result_type: str,
    attempts: int = 3,
    force_ocr: bool = False,
    plain_text: bool = False
) -> list[dict[str, object]]:
    """Process each file one after another and time every attempt.

    For each file the content is uploaded and its result fetched ``attempts``
    times. The result of the last attempt is written to ``output/`` under the
    file's name with the ``.pdf`` suffix removed, an optional ``(force_ocr)``
    marker, and either ``(plain_text).txt`` or ``.md`` appended.

    Args:
        files: File names, resolved by ``get_file_content``.
        result_type: Result variant requested from ``get_result``.
        attempts: Number of upload-and-fetch rounds per file.
        force_ocr: Forwarded to ``upload_file``; also affects the output name.
        plain_text: Forwarded to ``upload_file``; also affects the output name.

    Returns:
        One dict per file with ``"file_name"`` (str) and ``"times"`` (a list
        with one ``timedelta`` per attempt).

    Raises:
        RuntimeError: If an upload does not return a job id.
    """
    results = []

    for filename in files:
        file_content = get_file_content(filename)
        file = {
            "file": (filename, file_content, "application/pdf")
        }

        print(f"Processing file sequentially: {filename}")

        times = []
        result = None  # stays None when attempts <= 0, so nothing is written
        for attempt in range(attempts):
            print(f"Attempt {attempt + 1} for file {filename}")
            start_time = datetime.now()

            job_id = upload_file(file, force_ocr=force_ocr, plain_text=plain_text)
            if not job_id:
                # Polling without a job id would never terminate.
                raise RuntimeError(f"Upload failed for {filename}; no job id to poll")
            result = get_result(job_id, result_type=result_type)

            end_time = datetime.now()
            times.append(end_time - start_time)

        if result is not None:
            # Save the last attempt's result to disk. Only a trailing ".pdf"
            # is stripped; other occurrences inside the name are kept.
            if filename.endswith(".pdf"):
                output_filename = filename[:-len(".pdf")]
            else:
                output_filename = filename
            if force_ocr:
                output_filename += "(force_ocr)"
            if plain_text:
                output_filename += "(plain_text).txt"
            else:
                output_filename += ".md"
            os.makedirs("output", exist_ok=True)
            write_file(f"output/{output_filename}", result)

        results.append({
            "file_name": filename,
            "times": times
        })

    return results


def process_file(files: dict[str, tuple[str, bytes, str]], result_type: str, attempts:int = 3, force_ocr:bool=False, plain_text:bool=False)->list[timedelta]:
    """Upload one file ``attempts`` times, time each round and save the last result.

    The output name is taken from the first entry of ``files``: the ``.pdf``
    suffix is removed, ``(force_ocr)`` is appended when requested, and the
    name ends in ``(plain_text).txt`` or ``.md``. The file is written to
    ``output/``.

    Args:
        files: Multipart payload mapping a field name to
            ``(filename, content, content_type)``.
        result_type: Result variant requested from ``get_result``.
        attempts: Number of upload-and-fetch rounds.
        force_ocr: Forwarded to ``upload_file``; also affects the output name.
        plain_text: Forwarded to ``upload_file``; also affects the output name.

    Returns:
        One ``timedelta`` per attempt, in execution order.

    Raises:
        RuntimeError: If an upload does not return a job id.
    """
    times = []
    result = None  # stays None when attempts <= 0, so nothing is written
    filename, _, _ = list(files.values())[0]

    for _attempt in range(attempts):
        start_time = datetime.now()

        job_id = upload_file(files, force_ocr=force_ocr, plain_text=plain_text)
        if not job_id:
            # Polling without a job id would never terminate.
            raise RuntimeError(f"Upload failed for {filename}; no job id to poll")
        result = get_result(job_id, result_type=result_type)

        end_time = datetime.now()
        times.append(end_time - start_time)

    if result is not None:
        # Only a trailing ".pdf" is stripped; other occurrences are kept.
        if filename.endswith(".pdf"):
            filename = filename[:-len(".pdf")]
        if force_ocr:
            filename = f"{filename}(force_ocr)"
        if plain_text:
            filename = f"{filename}(plain_text).txt"
        else:
            filename = f"{filename}.md"
        os.makedirs("output", exist_ok=True)
        write_file(f"output/{filename}", result)

    return times
 
 
def get_average_str(times:list[object]):
    """Format the mean of the given durations using ``get_time_str``.

    The durations are added up starting from a zero ``timedelta`` and the sum
    is divided by the number of entries.
    """
    accumulated = timedelta()
    for sample in times:
        accumulated = accumulated + sample
    mean_duration = accumulated / len(times)
    return get_time_str(mean_duration)


def get_time_str(time: timedelta) -> str:
    """Format a duration as ``"Time Taken: M min S sec MS ms"``.

    The whole duration is used, including its ``days`` component, so the
    minutes field is not capped at 59 (there is no hours field).

    Args:
        time: Duration to format. The parameter name shadows the ``time``
            module inside this function only.

    Returns:
        The formatted string with whole minutes, seconds and milliseconds.
    """
    # timedelta.seconds excludes whole days; convert the full duration instead.
    total_milliseconds = time // timedelta(milliseconds=1)
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    minutes, seconds = divmod(total_seconds, 60)

    return f"Time Taken: {minutes} min {seconds} sec {milliseconds} ms"
        

In [8]:
from concurrent.futures import ThreadPoolExecutor
import os

# Benchmark inputs: file names are resolved relative to the local pdf/ directory.
files = [
    '2024 Sales Presentation C6501-PPOs-1.pdf',
    'CMS_AI_Playbook_3_Final.pdf',
]

# Run configuration shared by the sequential and concurrent runs.
result_type="markdown"
force_ocr=False
plain_text=False
attempts=1

output_dir = "output(multi-worker)"

# exist_ok=True makes the call idempotent and avoids the check-then-create race.
# NOTE(review): the processing helpers write to "output/", not to output_dir;
# confirm which directory is intended.
os.makedirs(output_dir, exist_ok=True)

# Sequential processing
print("Starting sequential processing...")
sequential_results = sequential_process_files(
    files=files,
    result_type=result_type,
    force_ocr=force_ocr,
    plain_text=plain_text,
    attempts=attempts
)




Starting sequential processing...
Processing file sequentially: CMS_AI_Playbook_3_Final.pdf
Attempt 1 for file CMS_AI_Playbook_3_Final.pdf
File uploaded successfully!
{'id': 'saq:job:swparse:ab417396-d40c-11ef-ae16-0242ac120008', 'status': 'PENDING', 's3_url': 'swparse/5d6faf070bfa5d230b34da0bf9c0be79.pdf'}
Processing file sequentially: 2024 Sales Presentation C6501-PPOs-1.pdf
Attempt 1 for file 2024 Sales Presentation C6501-PPOs-1.pdf
File uploaded successfully!
{'id': 'saq:job:swparse:c92b550c-d40c-11ef-ae16-0242ac120008', 'status': 'PENDING', 's3_url': 'swparse/b7a3bd297b23fbfd1a3bc7042c237fec.pdf'}


In [10]:
# Parallel processing
print("Starting concurrent processing...")
def parallel_process_file(filename:str):
    """Load ``filename`` and run ``process_file`` on it with the notebook's settings.

    Returns the list of per-attempt durations produced by ``process_file``.
    """
    file_content = get_file_content(filename)
    file = {
        "file": (filename, file_content, "application/pdf")
    }
    return process_file(file, result_type=result_type, force_ocr=force_ocr, plain_text=plain_text, attempts=attempts)

start_time = datetime.now()
# max_workers must be at least 1, so guard against an empty file list.
with ThreadPoolExecutor(max_workers=max(1, len(files))) as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved; materialising the iterator surfaces failures here instead of
    # dropping them silently, and keeps the per-file timings.
    parallel_results = list(executor.map(parallel_process_file, files))
end_time = datetime.now()
parallel_time = end_time - start_time


print("Finished")


Starting concurrent processing...
File uploaded successfully!
{'id': 'saq:job:swparse:220b9bd2-d40d-11ef-ae16-0242ac120008', 'status': 'PENDING', 's3_url': 'swparse/b7a3bd297b23fbfd1a3bc7042c237fec.pdf'}
File uploaded successfully!
{'id': 'saq:job:swparse:2211192c-d40d-11ef-ae16-0242ac120008', 'status': 'PENDING', 's3_url': 'swparse/5d6faf070bfa5d230b34da0bf9c0be79.pdf'}
Finished


In [11]:
from datetime import timedelta

# Label for the report heading; plain_text takes precedence over force_ocr
# when both flags are set.
test = "Markdown"
if force_ocr:
    test = "Force OCR"
if plain_text:
    test = "Plain text"
test += " Extraction"

# Explicit encoding so file names with non-ASCII characters can be written
# regardless of the platform default.
with open("comparison_results.md", "w", encoding="utf-8") as f:
    f.write(f"# Test: {test} \n\n")
    f.write("| Case         | File Name                         | Total Time Taken          |\n")
    f.write("|--------------|----------------------------------|---------------------------|\n")
    total_timetaken = timedelta() 
    # One row per attempt of every sequentially processed file.
    for result in sequential_results:
        for time_taken in result["times"]:
            total_timetaken += time_taken
            f.write(f"| Sequential   | {result['file_name']}  | {get_time_str(time_taken)} |\n")
    f.write(f"| Sequential Total  |      | {get_time_str(total_timetaken)} |\n")
    
    # Wall-clock time of the whole concurrent run, measured around the thread pool.
    f.write(f"| Concurrent   | All Files Processed Concurrently | {get_time_str(parallel_time)} |\n")
