# Example of parsing FoundationOne report using Spark OCR

## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or a `secret`.

In [1]:
# Notebook configuration: fill in either `secret` or a local jar location
# before running the cells below. Both credentials are left blank on purpose.
secret = ""
license = ""

# Everything in `secret` before its first "-" (the whole string when there is
# no "-"); the optional PyPI install command further down refers to it.
version = secret.partition("-")[0]

# Location handed to `start(...)` as `jar_path`.
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
# Colab-only setup: the import probe succeeds only where the google.colab
# module is installed; its output (stdout and stderr) is discarded.
if python -c 'import google.colab' > /dev/null 2>&1; then
    printf '%s\n' "Run on Google Colab!" "Install Open JDK"
    # Quiet install of the headless OpenJDK 8 package, then report the version.
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    java -version
fi

In [3]:
import sys
import os

# Only when the google.colab module has been loaded: point JAVA_HOME at the
# OpenJDK 8 directory and put its bin/ folder at the front of PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

In [4]:
# Install from PyPI using the secret
#%pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [5]:
# or install from local path
#%pip install --user ../../python/dist/spark-ocr-3.7.0+spark30.tar.gz

## Initialization of spark session

In [6]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the JSL_OCR_LICENSE environment variable, but
# only when one was actually filled in (an empty string is skipped).
if license:
    os.environ.update(JSL_OCR_LICENSE=license)

# Start the session with the configured secret and jar location; the bare
# `spark` expression at the end makes the cell display the session object.
spark = start(jar_path=spark_ocr_jar_path, secret=secret)
spark

Spark version: 3.2.1
Spark NLP version: 4.0.0
Spark NLP for Healthcare version: 4.0.0
Spark OCR version: 4.0.0



## Import OCR transformers

In [7]:
from sparkocr.transformers import *
from sparkocr.enums import *
from pyspark.ml import PipelineModel

## Define OCR transformers and pipeline

In [8]:
def pipeline():
    """Assemble the two-stage model that turns a PDF into a parsed report.

    Stages, in order:
      1. ``PdfToText`` - writes the extracted document text to the ``text``
         column.
      2. ``FoundationOneReportParser`` - reads ``text`` and writes its result
         to the ``genomics`` column.

    Returns:
        PipelineModel: a model built directly from the two configured stages.
    """
    # Text extraction stage (PDF -> text, not PDF -> images).
    # NOTE(review): split-page is off, presumably so the whole document lands
    # in a single row for the parser - confirm against the PdfToText docs.
    text_extractor = PdfToText()
    text_extractor.setTextStripper(TextStripperType.PDF_LAYOUT_TEXT_STRIPPER)
    text_extractor.setSort(True)
    text_extractor.setSplitPage(False)
    text_extractor.setOutputCol("text")

    # Report parsing stage, fed by the column produced above.
    report_parser = FoundationOneReportParser()
    report_parser.setOutputCol("genomics")
    report_parser.setInputCol("text")

    return PipelineModel(stages=[text_extractor, report_parser])

## Read PDF document as binary file

In [9]:
import pkg_resources
# Resolve the sample report that ships inside the `sparkocr` package ...
pdf_example = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/genomics/3.pdf',
)

# ... and read it with Spark's "binaryFile" source, caching the DataFrame.
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf_example)
    .cache()
)

## Run OCR pipelines

In [10]:
# Build the pipeline, apply it to the loaded PDF, and cache the output so the
# cells below reuse the same DataFrame.
result = (
    pipeline()
    .transform(pdf_example_df)
    .cache()
)

## Display results

In [11]:
# Collect the `genomics` column to the driver and print the value held by the
# first row.
print(result.select("genomics").collect()[0]["genomics"])

{
  "Appendix" : {
    "dna_gene_list" : [ "ABL1", "ACVR1B", "AKT1", "AKT2", "AKT3", "ALK", "ALOX12B", "AMER1", "(FAM123B)", "APC", "AR", "ARAF", "ARFRP1", "ARID1A", "ASXL1", "ATM", "ATR", "ATRX", "AURKA", "AURKB", "AXIN1", "AXL", "BAP1", "BARD1", "BCL2", "BCL2L1", "BCL2L2", "BCL6", "BCOR", "BCORL1", "BRAF", "BRCA1", "BRCA2", "BRD4", "BRIP1", "BTG1", "BTG2", "BTK", "C11orf30", "(EMSY)", "C17orf39", "(GID4)", "CALR", "CARD11", "CASP8", "CBFB", "CBL", "CCND1", "CCND2", "CCND3", "CCNE1", "CD22", "CD274", "(PD-L1)", "CD70", "CD79A", "CD79B", "CDC73", "CDH1", "CDK12", "CDK4", "CDK6", "CDK8", "CDKN1A", "CDKN1B", "CDKN2A", "CDKN2B", "CDKN2C", "CEBPA", "CHEK1", "CHEK2", "CIC", "CREBBP", "CRKL", "CSF1R", "CSF3R", "CTCF", "CTNNA1", "CTNNB1", "CUL3", "CUL4A", "CXCR4", "CYP17A1", "DAXX", "DDR1", "DDR2", "DIS3", "DNMT3A", "DOT1L", "EED", "EGFR", "EP300", "EPHA3", "EPHB1", "EPHB4", "ERBB2", "ERBB3", "ERBB4", "ERCC4", "ERG", "ERRFI1", "ESR1", "EZH2", "FAM46C", "FANCA", "FANCC", "FANCG", "FANCL", "FAS

## Clear cache

In [12]:
# Release every DataFrame this notebook cached, not only `result`: the input
# DataFrame `pdf_example_df` was also cached when the PDF was loaded.
pdf_example_df.unpersist()

# Kept as the last expression so the cell still displays the `result` DataFrame.
result.unpersist()

path,modificationTime,length,text,positions,height_dimension,width_dimension,content,pagenum,genomics,exception
file:/home/jose/....,2022-07-13 23:25:...,1260852,...,,792.0,612.0,,0,"{\n ""Appendix"" :...",
