# Example of using Spark OCR to recognize text in a scanned multipage PDF and store the results as a PDF with a text layer

## Install the spark-ocr Python package
You need to specify the `license` and either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# Credentials: fill these in before running (left empty here on purpose).
license = ""
secret = ""
# The package version is the part of the secret before the first "-".
version = secret.partition("-")[0]
# Local directory handed to start() as `jar_path`.
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
# Google Colab only: install a headless OpenJDK 8 and print the Java version.
# The probe succeeds only when the `google.colab` module can be imported.
if python -c 'import google.colab' > /dev/null 2>&1; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    apt-get install -y -qq openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [3]:
import sys
import os

# On Google Colab, set JAVA_HOME to the OpenJDK 8 location and prepend its
# bin directory to PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = f'{os.environ["JAVA_HOME"]}/bin:{os.environ["PATH"]}'

In [4]:
# install from PyPI using secret
#%pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [5]:
# or install from local path
# %pip install ../../python/dist/spark-ocr-3.7.0+spark30.tar.gz

## Initialization of spark session

In [6]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the SPARK_OCR_LICENSE environment variable,
# but only when a non-empty license was provided.
if license:
    os.environ['SPARK_OCR_LICENSE'] = license

# Start the Spark session through sparkocr's start() helper, passing both the
# secret and the local jar directory.
spark = start(secret=secret, jar_path=spark_ocr_jar_path)
# Bare expression so the notebook displays the session object.
spark

Spark version: 3.3.0
Spark NLP version: 4.0.0
Spark NLP for Healthcare version: 4.0.0
Spark OCR version: 4.0.2



## Import OCR transformers

In [7]:
from sparkocr.transformers import *
from sparkocr.enums import *
from pyspark.ml import PipelineModel
from sparkocr.utils import display_pdf_file

## Define OCR transformers and pipeline

In [8]:
def pipeline():
    """Build the OCR pipeline that turns a scanned PDF into a PDF with a text layer.

    Stages, in order:
      1. ``PdfToImage``: reads the ``content`` column and writes page images
         to ``image`` (configured with ``setKeepInput(True)``).
      2. ``ImageToTextPdf``: reads ``image``, runs OCR and writes the rendered
         page to ``pdf_page``.
      3. ``PdfAssembler``: reads ``pdf_page`` and writes the assembled
         multipage document to ``pdf``.

    Returns:
        A ``PipelineModel`` wrapping the three stages above.
    """
    # Step 1: split the PDF document into one image per page.
    page_renderer = (
        PdfToImage()
        .setInputCol("content")
        .setOutputCol("image")
        .setKeepInput(True)
    )

    # Step 2: run OCR on each image and render the result as a PDF page.
    page_ocr = (
        ImageToTextPdf()
        .setInputCol("image")
        .setOutputCol("pdf_page")
    )

    # Step 3: stitch the per-page PDFs back into a single multipage PDF.
    page_merger = (
        PdfAssembler()
        .setInputCol("pdf_page")
        .setOutputCol("pdf")
    )

    return PipelineModel(stages=[page_renderer, page_ocr, page_merger])

## Read PDF document as binary file and display it
Please note that the document is scanned, so you can't select its text.

In [9]:
import pkg_resources
# Resolve the filesystem path of the sample PDF inside the `sparkocr` package resources.
pdf_example = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/test_document.pdf')

# Load the file through Spark's "binaryFile" source and cache the resulting DataFrame.
pdf_example_df = spark.read.format("binaryFile").load(pdf_example).cache()
# Show the original (scanned) PDF in the notebook.
display_pdf_file(pdf_example)

## Run OCR pipelines

In [10]:
# Build the pipeline, apply it to the loaded PDF DataFrame and cache the result.
result = pipeline().transform(pdf_example_df).cache()

## Store results to pdf file

In [11]:
# Take the assembled PDF bytes from the first row of the result.
pdf = result.select("pdf").head().pdf
# Use a context manager so the file handle is closed even if the write fails.
with open("searchable.pdf", "wb") as pdfFile:
    pdfFile.write(pdf)

## Display the output PDF with selectable text
The PDF document contains the original image, and you can now select its text.

In [12]:
# Show the PDF file written to 'searchable.pdf' by the pipeline output step.
display_pdf_file('searchable.pdf')

In [13]:
# Release both cached DataFrames: the input frame cached when the PDF was
# loaded, and the pipeline result. `result.unpersist()` stays last so the
# cell's displayed output is unchanged.
pdf_example_df.unpersist()
result.unpersist()

path,pdf,exception
file:/home/jose/....,[25 50 44 46 2D 3...,


In [14]:
%%bash
# Clean up: remove the generated PDF and the `tmp` path, ignoring missing files.
rm -rf searchable.pdf
rm -rf tmp