# Example of processing data from S3 using Spark OCR
## Configure credentials and runtime environment

In [1]:
import os

# Spark OCR secret. Fill it in at runtime and never commit a real value.
secret = ""
# The license and the AWS key pair default to the same environment variables
# that the session-initialization cell exports, so real credentials do not
# have to be typed into (and saved with) the notebook. Unset variables fall
# back to "", which keeps the original "not provided" behavior.
# Note: `license` shadows the interactive `license` builtin; the name is kept
# because later cells refer to it.
license = os.environ.get("JSL_OCR_LICENSE", "")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
# The text before the first "-" of the secret is used as the package version.
version = secret.split("-")[0]
# Passed to start() as jar_path when the Spark session is created.
spark_ocr_jar_path = "../../target/scala-2.12"

In [2]:
%%bash
# A JDK is installed only on Google Colab, detected by whether the
# google.colab module can be imported (all output of the probe is discarded).
if python -c 'import google.colab' > /dev/null 2>&1
then
    printf '%s\n' "Run on Google Colab!" "Install Open JDK"
    # Quiet install; apt-get's standard output is thrown away.
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    # Print the Java version that is now on the PATH.
    java -version
fi

In [3]:
import sys
import os

# On Google Colab, point JAVA_HOME at the OpenJDK 8 install and put its bin/
# directory first on PATH.
if 'google.colab' in sys.modules:
  java_home = "/usr/lib/jvm/java-8-openjdk-amd64"
  java_bin = java_home + "/bin"
  os.environ["JAVA_HOME"] = java_home
  # Tolerate an unset PATH instead of raising KeyError.
  current_path = os.environ.get("PATH", "")
  # Prepend only once so that re-running this cell does not keep growing PATH.
  if java_bin not in current_path.split(":"):
    os.environ["PATH"] = java_bin + ":" + current_path if current_path else java_bin

In [4]:
# Install Spark OCR from PyPI using the secret (uncomment the line below to run it)
#%pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

## Initialization of spark session

In [5]:
# Pick the hadoop-aws version that matches the installed PySpark release.
import pyspark

# PySpark "major.minor" -> hadoop-aws version appended to the
# "org.apache.hadoop:hadoop-aws:" package coordinate.
spark_to_aws_hadoop = {"3.0": "2.7.4", "3.1": "3.2.0", "3.2": "3.3.1", "3.3": "3.3.2", "3.4": "3.3.4"}

# Take "major.minor" by splitting on dots: slicing the first three characters
# would turn a two-digit minor such as "3.10.x" into "3.1" and silently select
# the wrong dependency.
spark_version = ".".join(pyspark.__version__.split(".")[:2])

# Fail with an actionable message rather than a bare KeyError on the lookup.
if spark_version not in spark_to_aws_hadoop:
    raise KeyError(
        "No hadoop-aws version is mapped for PySpark " + pyspark.__version__
        + "; supported Spark versions: " + ", ".join(sorted(spark_to_aws_hadoop))
    )
aws_version = spark_to_aws_hadoop[spark_version]

In [6]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf
from sparkocr import start

# Export the AWS key pair through the environment, but only when a key was entered.
if AWS_ACCESS_KEY_ID != "":
    os.environ.update(
        AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID,
        AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY,
    )

# Likewise export the license when one was provided.
if license:
    os.environ["JSL_OCR_LICENSE"] = license

# Additional dependency needed to read data from S3: the hadoop-aws package
# in the version selected for the installed Spark release.
# Alternatively, the AWS API keys can be set on this conf:
#     .set('spark.hadoop.fs.s3a.access.key', "your key")
#     .set('spark.hadoop.fs.s3a.secret.key', "your secret")
conf = SparkConf().set(
    "spark.jars.packages",
    "org.apache.hadoop:hadoop-aws:" + aws_version,
)

# Start the session with the extra configuration, then enable path-style
# access for s3a on the running session's Hadoop configuration.
spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")

# Display the session as the cell output.
spark

Spark version: 3.3.0
Spark NLP version: 4.0.0
Spark NLP for Healthcare version: 4.0.0
Spark OCR version: 4.0.1



## Imports

In [7]:
from pyspark.ml import PipelineModel

from sparkocr.transformers import *
from sparkocr.utils import display_image

## Define paths to images on S3

In [8]:
# Glob pattern selecting the .tif files under ocr/datasets/news.2B/0/ in the
# dev.johnsnowlabs.com bucket, addressed through the s3a:// scheme.
images_path = "s3a://dev.johnsnowlabs.com/ocr/datasets/news.2B/0/*.tif"

## Read images

In [9]:
# Load every file matched by images_path with the "binaryFile" source and mark
# the resulting DataFrame for caching.
images = (
    spark.read
    .format("binaryFile")
    .load(images_path)
    .cache()
)

# Number of files that were read, shown as the cell output.
images.count()

18

## Define OCR pipeline 

In [10]:
# Stage 1: transform the binary file content to an image, written to the "image" column.
binary_to_image = BinaryToImage()
binary_to_image.setOutputCol("image")

# Stage 2: run OCR on the "image" column and write the result to the "text" column.
ocr = ImageToText()
ocr.setInputCol("image")
ocr.setOutputCol("text")
# NOTE(review): presumably makes OCR take the image's resolution metadata into account — confirm against the ImageToText documentation.
ocr.setIgnoreResolution(False)

# OCR pipeline: assembled directly as a PipelineModel from the two configured
# stages, in order (no fit step is run here).
pipeline = PipelineModel(stages=[
    binary_to_image,
    ocr
])

## Run OCR pipeline

In [11]:
# Apply the OCR pipeline to the loaded images; the result is queried below for
# its "path" and "text" columns.
results = pipeline.transform(images)

## Show results

In [12]:
# Print the first five rows of the "path" and "text" columns without truncating cell values.
results.select("path", "text").show(n=5, truncate=False)

+-----------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------