# Description:
This notebook is a brief proof of concept (POC) of applying OCR to PDF and image files. We first convert all input files (pdf, png, ...) to png files, then use a pretrained OCR model to extract the text from each file. More on the OCR model at: https://github.com/sirfz/tesserocr.

# TODO:
- Look into a package that converts PDFs that already contain text directly to txt files

In [1]:
import os
import shutil
import re
from tesserocr import PyTessBaseAPI
from pdf2image import convert_from_path
import tempfile

from src import PROJECT_ROOT

In [2]:
# Locations used by the notebook: raw inputs are read from <PROJECT_ROOT>/data/raw,
# extracted text is written to <PROJECT_ROOT>/outputs/ocr.
inp_path = os.path.join(PROJECT_ROOT, "data", "raw")
out_path = os.path.join(PROJECT_ROOT, "outputs", "ocr")

# Full paths of the entries directly inside inp_path whose name ends in ".pdf"
pdf_files_inp = [
    os.path.join(inp_path, entry)
    for entry in os.listdir(inp_path)
    if entry.endswith(".pdf")
]

# Full paths of the entries directly inside inp_path whose name ends in ".png"
png_files_inp = [
    os.path.join(inp_path, entry)
    for entry in os.listdir(inp_path)
    if entry.endswith(".png")
]

In [3]:
# Convert every input pdf to per-page png images in a scratch directory, then run
# OCR over all png files (original inputs + converted pages) and write one txt
# file per image into out_path.

# open(..., "w") does not create missing parent folders, so make sure out_path exists
os.makedirs(out_path, exist_ok=True)

with tempfile.TemporaryDirectory() as path:
    # converts pdf files in inp_path to png files
    for file in pdf_files_inp:
        # prefix the generated page images with the pdf's own name; by default
        # pdf2image uses a random uuid, which makes the txt outputs untraceable
        pdf_stem = os.path.splitext(os.path.basename(file))[0]
        convert_from_path(file, output_folder=path, fmt='png', output_file=pdf_stem)

    # list png files
    png_files_inter = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".png")]

    # merge png files
    png_files = png_files_inp + png_files_inter

    # extracts text from png files to txt files; this must stay inside the
    # TemporaryDirectory block because the converted pages are deleted on exit
    with PyTessBaseAPI() as api:
        for file in png_files:
            api.SetImageFile(file)
            full_path = os.path.join(out_path, os.path.splitext(os.path.basename(file))[0] + '.txt')
            # context manager closes the file even if OCR or the write raises;
            # explicit utf-8 because the text comes from GetUTF8Text and the
            # platform default encoding may not be able to represent it
            with open(full_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(api.GetUTF8Text())