In [12]:
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.entities.document import Document
from pathlib import Path
from rich import print
import json

In [34]:
# Directory holding the PNG page scans that the rest of the notebook reads.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the data (ideally via a configurable data directory) before re-running.
png_path = Path(
    "/Users/williamthompson/Code/kellogg/ocr-projects/projects/kotra/data/1969/png")
assert png_path.is_dir(), f"PNG directory not found: {png_path}"

# Sort by path so that the positional indexing below is reproducible.
png_files = sorted(png_path.glob("*.png"))

# Position (0-based) of the sample page that the OCR cells further down send to
# Textract via `png_files[7]`. Fail early, with a readable message, if the
# directory does not contain enough pages for that index to exist.
sample_index = 7
assert len(png_files) > sample_index, (
    f"Need at least {sample_index + 1} PNG files in {png_path}, "
    f"found {len(png_files)}"
)

print(f"Found {len(png_files)} png files")
print(f"First file: {png_files[0].name}")
# NOTE(review): the label says "Page 13" while the index is 7 — confirm the
# label matches the file name that gets printed.
print(f"Page 13 file: {png_files[sample_index].name}")

In [3]:
# Shared Textractor client used by every OCR call in this notebook.
# NOTE(review): profile_name presumably selects a locally configured AWS
# credentials profile — confirm against the Textractor documentation.
profile = "default"
extractor = Textractor(profile_name=profile)

## Synchronous Example

In [35]:
# Synchronous example: run plain text detection on the sample page
# (the eighth PNG in sorted order) and keep the resulting document.
sample_png = str(png_files[7])
document = extractor.detect_document_text(sample_png)

In [36]:
# Display the text detected on the sample page; the output below is the raw
# string, with "\n" separating the detected lines.
document.text

'Dong Ryung Moolsan\n7\nChung Kum Products\n8\nDai Duck Ind. Co\n4\nKoramar Sang Sa Ltd. Seoul\n11\nTae Un Tdg. Co.\n30\nTotal\n1,384\nPlastic\nDong-A PharmerCeutical\n3\n1. Charles Tucker Corp. N.Y.\nArtificial\nFlower\nHan Dok Remedia\n4\nKorea Art. Flower Mfg.\n10\n1.\nNorman Ind. Inc. 2850 Leonis Blvd. Los Angels\nCalif. (3)\n2. Air Rofresber Co., , 2638 Nicholson St. San\nLeandro Calif\nSung Moon Ind. Co.\n3\nBaik Su Pharm Co.\n1\n1. Belichen Paper Co. , Inc N.Y.\nKosung Ind. Co.\n1\nKyong Su Ind. Co,\n2\nTotal\n24\nOther\nYoo Poong Luggage Mfg\n4\n1. Albert E Price Inc. 235. North Third Philadelphia\nPlastic\nInwha Co.\n3\n1. Flambro Imports Inc. U.S.A. (Georgia) (2)\n2. K. Yamada Distributors Ltd. 747. Unin St.\nHonolulu Hawaii 96819 (2)\nDae Young Nong San Co.\n1\n1. Band Wagon Inc. 401 Summer St. . Hoston\nMassachusetts 02210\n- 8 -'

## Asynchronous Example

In [37]:
# Asynchronous example: analyse the same sample page, requesting table
# detection only. The page is passed as a local path together with an S3
# location under the kellogg-ocr bucket.
analysis_source = str(png_files[7])
analysis_features = [TextractFeatures.TABLES]
upload_prefix = "s3://kellogg-ocr/temp/"
document2 = extractor.start_document_analysis(
    analysis_source,
    features=analysis_features,
    s3_upload_path=upload_prefix,
)

In [38]:
# Display the text of the table-analysis result. In the output below the table
# cells are separated by "\t" and the table rows by "\n".
document2.text

'\n\n\tDong Ryung Moolsan\t7\t\n\tChung Kum Products\t8\t\n\tDai Duck Ind. Co\t4\t\n\tKoramar Sang Sa Ltd. Seoul\t11\t\n\tTae Un Tdg. Co.\t30\t\n\tTotal\t1,384\t\nPlastic\tDong-A PharmerCeutical\t3\t1. Charles Tucker Corp. N.Y.\nArtificial Flower\tHan Dok Remedia\t4\t\n\tKorea Art. Flower Mfg.\t10\t1. Norman Ind. Inc. 2850 Leonis Blvd. Los Angels Calif. (3)\n\t\t\t2. Air Rofresber Co., , 2638 Nicholson St. San Leandro Calif\n\tSung Moon Ind. Co.\t3\t\n\tBaik Su Pharm Co.\t1\t1. Belichen Paper Co , Inc N.Y.\n\tKosung Ind. Co.\t1\t\n\tKyong Su Ind. Co,\t2\t\n\tTotal\t24\t\nOther Plastic\tYoo Poong Luggage Mfg\t4\t1. Albert E Price Inc. 235. North Third Philadelphia\n\tInwha Co.\t3\t1. Flambro Imports Inc. U.S.A. (Georgia) (2) 2. K. Yamada Distributors Ltd. 747. Unin St. Honolulu Hawaii 96819 (2)\n\tDae Young Nong San Co.\t1\t1. Band Wagon Inc. 401 Summer St. . Hoston Massachusetts 02210\n\n\n- 8 -'

In [18]:
# List the tables detected by the analysis; the saved output shows a single
# table with 20 rows and 4 columns.
document2.tables

[Table
 Rows - 20
 Columns - 4
 Cells - 80
 Merged Cells - 46]

## Restore from File

In [39]:
# Round-trip the analysis result through disk: dump the raw response as
# pretty-printed, key-sorted JSON, then rebuild a Document from that file.
doc_json_path = Path("doc.json")
response_json = json.dumps(document2.response, indent=4, sort_keys=True)
doc_json_path.write_text(response_json)
document3 = Document.open(str(doc_json_path))

In [40]:
# Show the summary of the document restored from doc.json (counts of pages,
# words, lines, tables, ...), to check that the reload picked up the content.
document3

This document holds the following data:
Pages - 1
Words - 153
Lines - 53
Key-values - 0
Checkboxes - 0
Tables - 1
Queries - 0
Signatures - 0
Identity Documents - 0
Expense Documents - 0

In [41]:
# Render the first detected table of the restored document via pandas.
first_table = document3.tables[0]
first_table.to_pandas()

Unnamed: 0,0,1,2,3
0,,Dong Ryung Moolsan,7.0,
1,,Chung Kum Products,8.0,
2,,Dai Duck Ind. Co,4.0,
3,,Koramar Sang Sa Ltd. Seoul,11.0,
4,,Tae Un Tdg. Co.,30.0,
5,,Total,1384.0,
6,Plastic,Dong-A PharmerCeutical,3.0,1. Charles Tucker Corp. N.Y.
7,Artificial Flower,Han Dok Remedia,4.0,
8,,Korea Art. Flower Mfg.,10.0,1. Norman Ind. Inc. 2850 Leonis Blvd. Los Ange...
9,,,,"2. Air Rofresber Co., , 2638 Nicholson St. San..."


In [30]:
# Print the restored document's text with real line breaks and tabs instead of
# the escaped string repr.
# NOTE(review): `print` is rich's print (see the imports cell), not the builtin;
# it may interpret [bracketed] spans in the OCR text as console markup — confirm
# this is intended, or escape the text before printing.
print(document3.text)