# Description:
In this notebook we build a brief Proof of Concept (POC) of applying OCR to PDF and image files. We first convert all input files (pdf, png, ...) to png files, then use a pretrained OCR model to extract the text of each file. More on the OCR model at: https://github.com/sirfz/tesserocr.

# TODO:
- Look into a package that converts PDFs that already contain text to txt files

In [None]:
import os
import shutil
import re
from tesserocr import PyTessBaseAPI
from pdf2image import convert_from_path

In [None]:
# Directory layout used by the notebook:
#   inp_path   - raw input documents (PDFs and PNGs)
#   inter_path - intermediate PNGs
#   out_path   - extracted text files
inp_path = os.path.join("..", "data", "raw", "ocr")
inter_path = os.path.join("..", "data", "interim", "ocr")
out_path = os.path.join("..", "models", "ocr")

# Read the input directory once and reuse the listing for both file types.
inp_entries = os.listdir(inp_path)

# Full paths of the PDF inputs.
pdf_files_inp = [
    os.path.join(inp_path, entry)
    for entry in inp_entries
    if entry.endswith(".pdf")
]

# Full paths of the PNG inputs.
png_files_inp = [
    os.path.join(inp_path, entry)
    for entry in inp_entries
    if entry.endswith(".png")
]

In [None]:
# Convert every PDF found in inp_path to PNG images written to inter_path.
# Create the intermediate folder up front so the conversion has somewhere
# to write when the notebook is run on a fresh checkout.
os.makedirs(inter_path, exist_ok=True)

for file in pdf_files_inp:
    convert_from_path(file, output_folder=inter_path, fmt='png')

# List the PNG files now present in inter_path.
# NOTE(review): this picks up every PNG in the folder, including any left
# over from earlier runs - confirm whether inter_path should be cleared first.
png_files_inter = [os.path.join(inter_path, file) for file in os.listdir(inter_path) if file.endswith(".png")]

In [None]:
# Run OCR on every PNG (original inputs plus the PNGs listed from
# inter_path) and write the recognised text to one .txt file per image
# inside out_path.
png_files = png_files_inp + png_files_inter

# Make sure the output directory exists before any file is written.
os.makedirs(out_path, exist_ok=True)

with PyTessBaseAPI() as api:
    for file in png_files:
        api.SetImageFile(file)
        # Build the output name from the image's base name with only its
        # final extension removed. The previous regex split on "\" and ".",
        # which returned a directory-prefixed fragment for "/"-separated
        # paths (so the file could land outside out_path) and the wrong
        # piece for names that contain extra dots.
        stem = os.path.splitext(os.path.basename(file))[0]
        full_path = os.path.join(out_path, stem + '.txt')
        # The context manager closes the file even if OCR or the write
        # raises; the explicit encoding keeps the output independent of the
        # platform's default encoding.
        with open(full_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(api.GetUTF8Text())