# Example of Using Spark OCR with Update Text Position

## Import Spark OCR transformers and Spark NLP annotators

## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# Path handed to `start()` as `jar_path`; either this or `secret` must be supplied.
spark_ocr_jar_path = "../../target/scala-2.11"

# Credentials: fill these in before running the notebook.
license = ""
secret = ""

# Everything before the first "-" in the secret is taken as the package version.
version = secret.partition("-")[0]

In [2]:
%%bash
# When running on Google Colab (detected by importing google.colab),
# install OpenJDK 8 and print the Java version that ends up on PATH.
if python -c 'import google.colab' > /dev/null 2>&1; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    # Quiet install; anything apt-get writes to stdout is discarded.
    apt-get install -y -qq openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [3]:
import os
import sys

if 'google.colab' in sys.modules:
    # On Google Colab, point JAVA_HOME at the OpenJDK 8 install and put its
    # bin directory at the front of PATH.
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

In [4]:
# Install from PyPI using the secret
#%pip install spark-nlp==2.5.5
#%pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [5]:
# %pip install ../dist/spark-ocr-[version].tar.gz

## Initialization of spark session
You need to specify either the path to `spark-ocr-assembly.jar` or the `secret`.

In [6]:
from sparkocr import start

# Export the license as the JSL_OCR_LICENSE environment variable, but only
# when one was provided. This happens before start() is called
# (presumably sparkocr reads it on startup -- confirm).
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Start the Spark session, passing both the secret and the local jar path.
spark = start(secret=secret, jar_path=spark_ocr_jar_path)
# Bare expression: shows the session object as the cell's output.
spark

Spark version: 3.2.1
Spark NLP version: 4.0.0
Spark NLP for Healthcare version: 4.0.0
Spark OCR version: 4.0.0



In [7]:
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from sparkocr.transformers import *
from sparknlp.annotator import *
from sparknlp.base import *
from sparkocr.enums import PageSegmentationMode

## Define OCR transformers and pipeline

In [8]:
def update_text_pipeline():
    """Build the pipeline that spell-corrects OCR text and updates text positions.

    Stages, in order (input column(s) -> output column):

    1. ``DocumentAssembler``: ``text`` -> ``document``
    2. ``SentenceDetector``: ``document`` -> ``sentence``
    3. ``Tokenizer``: ``sentence`` -> ``tokens``
    4. ``NorvigSweetingModel`` (pretrained ``spellcheck_norvig``, ``en``):
       ``tokens`` -> ``spell``
    5. ``TokenAssembler``: ``spell`` + ``document`` -> ``newDocs``
    6. ``UpdateTextPosition``: ``positions`` -> ``output_positions``, taking
       its text from ``newDocs.result``

    The DataFrame the pipeline is applied to must therefore provide ``text``
    and ``positions`` columns (presumably the output of the OCR stage --
    confirm against the caller).

    Returns:
        Pipeline: an unfitted ``pyspark.ml.Pipeline`` holding the six stages.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_detector = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("tokens")
    )

    # `pretrained` is a static factory, so it is called on the class itself;
    # there is no need to construct a throwaway annotator just to reach it.
    spell_checker = (
        NorvigSweetingModel.pretrained("spellcheck_norvig", "en")
        .setInputCols(["tokens"])
        .setOutputCol("spell")
    )

    # Re-assemble the spell-checked tokens into document-level text.
    token_assembler = (
        TokenAssembler()
        .setInputCols(["spell", "document"])
        .setOutputCol("newDocs")
    )

    # Map the original positions onto the corrected text.
    text_position_updater = (
        UpdateTextPosition()
        .setInputCol("positions")
        .setOutputCol("output_positions")
        .setInputText("newDocs.result")
    )

    return Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        spell_checker,
        token_assembler,
        text_position_updater,
    ])


def ocr_pipeline():
    """Assemble the OCR pipeline: PDF content -> page images -> binarized images -> text.

    Stages, in order (input column -> output column):

    1. ``PdfToImage``: ``content`` -> ``image_raw`` (input column is kept)
    2. ``ImageBinarizer``: ``image_raw`` -> ``image`` (threshold 130)
    3. ``ImageToText``: ``image`` -> ``text`` (sparse-text page segmentation,
       confidence threshold 60)

    Returns:
        Pipeline: an unfitted ``pyspark.ml.Pipeline`` holding the three stages.
    """
    # Transform the PDF document into one image per page.
    page_renderer = (
        PdfToImage()
        .setInputCol("content")
        .setOutputCol("image_raw")
        .setKeepInput(True)
    )

    # Binarize the rendered page before it is handed to the OCR stage.
    page_binarizer = (
        ImageBinarizer()
        .setInputCol("image_raw")
        .setOutputCol("image")
        .setThreshold(130)
    )

    # Recognize text on the binarized image.
    text_recognizer = (
        ImageToText()
        .setInputCol("image")
        .setOutputCol("text")
        .setIgnoreResolution(False)
        .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)
        .setConfidenceThreshold(60)
    )

    return Pipeline(stages=[page_renderer, page_binarizer, text_recognizer])

## Read PDF document as binary file

In [9]:
import pkg_resources
# Resolve the on-disk path of the sample PDF shipped inside the sparkocr package.
pdf_example = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/multiplepages/image_3_pages.pdf',
)

# Load the file through Spark's "binaryFile" source and cache the DataFrame.
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf_example)
    .cache()
)

## Run OCR pipelines

In [10]:
# Step 1: run the OCR pipeline on the loaded PDF.
ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)

# Step 2: run the spell-check / UpdateTextPosition pipeline on the OCR output.
updated_result = update_text_pipeline().fit(ocr_result).transform(ocr_result)

# Step 3: turn the updated positions plus the page image into a "pdf" column.
textToPdf = (
    TextToPdf()
    .setInputCol("output_positions")
    .setInputImage("image")
    .setOutputCol("pdf")
)
result = textToPdf.transform(updated_result).cache()

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]


## Store results to pdf file

In [12]:
# Take the first row of the result; its `pdf` field is what gets written out.
pdf_raw_bytes = result.head()

# A context manager guarantees the file handle is closed even if the write
# raises, which the previous manual open()/close() pair did not.
with open("pdf_raw_bytes.pdf", "wb") as pdfFile:
    pdfFile.write(pdf_raw_bytes.pdf)

In [None]:
%%bash
# Remove the generated pdf_raw_bytes.pdf; -f keeps this quiet if it is already gone.
rm -rf pdf_raw_bytes.pdf