In [31]:
import os
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

from helper import save_json_file

In [32]:
def scan_document(path, model=None):
   """Run OCR on a single PDF and return the exported result.

   Args:
      path: Location of the PDF, as a string or ``os.PathLike`` object.
         The extension check is case-insensitive.
      model: Optional callable used as the OCR predictor. When omitted, a
         pretrained ``ocr_predictor`` is built on first use and reused by
         later calls, so batch processing does not reload it for every file.

   Returns:
      Whatever ``export()`` returns on the predictor's result for the document.

   Raises:
      FileNotFoundError: If ``path`` does not exist.
      ValueError: If ``path`` is not a regular file or does not end in
         ``.pdf``.
   """
   # Normalise pathlib.Path (and other PathLike) inputs so the string-based
   # extension check below works for them as well.
   path = os.fspath(path)

   if not os.path.exists(path):
      raise FileNotFoundError(f"The file '{path}' does not exist.")

   if not os.path.isfile(path):
      raise ValueError(f"The path '{path}' is not a regular file.")

   if not path.lower().endswith('.pdf'):
      raise ValueError("The file must be a PDF.")

   if model is None:
      # Build the default predictor lazily, only after validation has passed,
      # and keep it on the function object so repeated calls share it.
      model = getattr(scan_document, '_default_model', None)
      if model is None:
         model = ocr_predictor(pretrained=True)
         scan_document._default_model = model

   doc = DocumentFile.from_pdf(path)
   result = model(doc)

   return result.export()


In [33]:
def process_all_document(input_folder, output_folder):
   """Scan every entry of ``input_folder`` and save one JSON file per result.

   Each directory entry is passed to ``scan_document``; the result is written
   with ``save_json_file`` to ``output_folder`` under the entry's base name
   with a ``.json`` extension. A failure on one entry is printed and does not
   stop the remaining entries from being processed.

   Args:
      input_folder: Existing directory whose entries are scanned.
      output_folder: Directory receiving the JSON files; created (including
         parents) when it does not exist yet.

   Raises:
      FileNotFoundError: If ``input_folder`` does not exist.
      ValueError: If ``input_folder`` is not a directory.
   """
   if not os.path.exists(input_folder):
      raise FileNotFoundError(f"The folder '{input_folder}' does not exist.")

   if not os.path.isdir(input_folder):
      raise ValueError(f"The path '{input_folder}' is not a directory.")

   if not os.path.exists(output_folder):
      os.makedirs(output_folder)

   for filename in os.listdir(input_folder):
      input_file_path = os.path.join(input_folder, filename)
      # Swap only the final extension, whatever its letter case. A plain
      # str.replace('.pdf', '.json') leaves 'report.PDF' unchanged (so the
      # JSON would be saved under a .PDF name) and rewrites every '.pdf'
      # occurrence inside names such as 'my.pdf.v2.pdf'.
      stem, _extension = os.path.splitext(filename)
      output_file_path = os.path.join(output_folder, stem + '.json')

      try:
         result = scan_document(input_file_path)
         save_json_file(output_file_path, result)
      except Exception as e:
         # Best effort: report the failing entry and continue with the rest.
         print(f"Error processing '{input_file_path}': {e}")

In [34]:
# process_all_document('sample/document', 'sample/scanned')