In [1]:
from shutil import copyfile
from pathlib import Path
from datetime import date
import pdb

import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from ipywidgets import interact
from PIL import Image
import pytesseract

In [2]:
# --- Run identity -----------------------------------------------------------
PROJECT = 'label_babel'
TODAY = date.today().isoformat()  # embedded in the output CSV names below

# --- Directory layout, rooted at ./data -------------------------------------
DATA_DIR = Path('.').joinpath('data')
TRN_DIR = DATA_DIR.joinpath('train')
VAL_DIR = DATA_DIR.joinpath('valid')
MODEL_DIR = DATA_DIR.joinpath('models')
HAND_DIR = DATA_DIR.joinpath('handwritten')
TYPE_DIR = DATA_DIR.joinpath('typewritten')

# --- Input CSVs and the per-day output CSVs ---------------------------------
TRN_CSV = DATA_DIR.joinpath('train.csv')
VAL_CSV = DATA_DIR.joinpath('valid.csv')
TRN_OUT_CSV = DATA_DIR.joinpath(f'{PROJECT}_train_{TODAY}.csv')
VAL_OUT_CSV = DATA_DIR.joinpath(f'{PROJECT}_valid_{TODAY}.csv')

# --- Column names used in the data frames -----------------------------------
BOX = 'box'
CAT = 'category'
ORI = 'original'
PATH = 'path'
CLASS = 'class'
SUB_ID = 'subject_id'

# Columns holding predictions.
PRED_BOX = 'predicted_box'
PRED_CAT = 'predicted_category'
PRED_CLASS = 'predicted_class'
PRED_TEXT = 'predicted_text'

# --- Label categories -------------------------------------------------------
HANDWRITTEN = 'handwritten'
TYPEWRITTEN = 'typewritten'
CATS = ['background', HANDWRITTEN, TYPEWRITTEN]
CLASSES = len(CATS)  # number of categories, background included

In [3]:
# Pick the split this run processes: the input CSV and its dated output CSV.
CSV, OUT_CSV = TRN_CSV, TRN_OUT_CSV

# To process the validation split instead, swap in this pair:
# CSV, OUT_CSV = VAL_CSV, VAL_OUT_CSV

In [4]:
# Load the selected split and add an empty column that will receive the OCR text.
df = pd.read_csv(CSV).assign(**{PRED_TEXT: None})

In [5]:
# Extra command-line options handed to Tesseract through pytesseract's
# ``config`` argument. The final empty entry makes the joined string end
# with a single trailing space.
_TESSERACT_OPTIONS = (
    '-l eng',
    "-c tessedit_char_blacklist='€«¢»£®'",
    '',
)
CONFIG = ' '.join(_TESSERACT_OPTIONS)

In [6]:
def show_results(idx):
    """Show one label image next to the text Tesseract reads from it.

    Looks up row ``idx`` of the module-level ``df`` and prints its subject
    id. If the row's predicted category is not ``TYPEWRITTEN``, prints
    'Not typewritten' and returns without plotting. Otherwise the image
    ``TYPE_DIR / '<subject_id>.jpg'`` is OCR'd with ``CONFIG`` and drawn in
    the left panel, with the recognised text in the right panel.

    Args:
        idx: Index label of the row in ``df`` to display.

    Returns:
        None. The function only prints and plots.

    Raises:
        FileNotFoundError: If the expected image file does not exist.
    """
    row = df.loc[idx]
    subject_id = int(row.at[SUB_ID])
    print(subject_id)
    if row.at[PRED_CAT] != TYPEWRITTEN:
        print('Not typewritten')
        return

    fig, ax = plt.subplots(1, 2, figsize=(20, 10))

    # The context manager releases the image file deterministically instead
    # of leaving it to garbage collection. Everything that needs the open
    # image (OCR and imshow) happens inside the block.
    with Image.open(TYPE_DIR / f'{subject_id}.jpg') as image:
        text = pytesseract.image_to_string(image, config=CONFIG)
        ax[0].set_axis_off()
        ax[0].imshow(image)

    # Hide the frame of the text panel explicitly rather than relying on
    # pyplot's implicit "current axes".
    ax[1].set_axis_off()
    ax[1].text(0, 0.5, text, fontsize=16, verticalalignment='center')

    plt.show()


# interact(show_results, idx=(0, df.shape[0] - 1));

In [7]:
def output_results(idx):
    """Return the OCR text for one row of ``df``, or ``None`` if it is skipped.

    Looks up row ``idx`` of the module-level ``df``. Rows whose predicted
    category is not ``TYPEWRITTEN`` are skipped. For the others, the image
    ``TYPE_DIR / '<subject_id>.jpg'`` is passed to Tesseract with ``CONFIG``.

    Args:
        idx: Index label of the row in ``df`` to process.

    Returns:
        The recognised text as returned by ``pytesseract.image_to_string``,
        or ``None`` when the row is not predicted to be typewritten.

    Raises:
        FileNotFoundError: If the expected image file does not exist.
    """
    row = df.loc[idx]
    subject_id = int(row.at[SUB_ID])

    if row.at[PRED_CAT] != TYPEWRITTEN:
        return None

    # The context manager releases the image file deterministically instead
    # of leaving it to garbage collection; this function is called once per
    # row over the whole data frame.
    with Image.open(TYPE_DIR / f'{subject_id}.jpg') as image:
        return pytesseract.image_to_string(image, config=CONFIG)

In [8]:
# Run OCR row by row and store each result in the prediction column.
# Rows that are skipped by output_results() receive None.
for row_label in tqdm(df.index):
    ocr_text = output_results(row_label)
    df.at[row_label, PRED_TEXT] = ocr_text

100%|██████████| 4865/4865 [30:24<00:00,  2.67it/s]


In [9]:
# Spot-check the first few OCR results.
df.loc[:, PRED_TEXT].head()

0    University of Arkansas Herbarium (UARK)\nFlora...
1    APPALACHIAN STATE UNIVERSITY HERBARION\nPLANTS...
2    a .\n\nAPPALACHIAN STATE UNV ERSTHY EMIS RIA\n...
3    AMARAA YHA CE AE\nHERBARIUM\nNORTHEAST LOUISIA...
4    FLORA OF ARKANSAS\nPoinsett County\n\nPOACEAE\...
Name: predicted_text, dtype: object

In [10]:
# Write the frame, including the OCR text column, without the index column.
df.to_csv(path_or_buf=OUT_CSV, index=False)