In [1]:
import sys
sys.path.append('..')

In [2]:
import math
import re
from collections import namedtuple
from pathlib import Path

import numpy as np
from PIL import Image, ImageDraw, ImageOps
from scipy import signal
import pytesseract
from pytesseract import Output
import pandas as pd

from allometry.const import DATA_DIR

In [3]:
# Scanned sheet to dissect. To try the mollusk sheets instead, swap the
# folder name below for 'Biomass_Mollusks'.
SHEET = (
    DATA_DIR
    / 'allometry_sheets'
    / 'Biomass_Fish_Families_FamilyProgram'
    / '00001.tif'
)

In [4]:
# Tunable parameters for the sheet dissection.
# MIN_PIXELS used to be assigned twice in this cell (20, then 40); only the
# final value ever took effect, so it is now defined once.

PAD = 2                # pixels added on each side of a detected row/column span
THETA = 90.0           # angle passed to Image.rotate() when loading the sheet
PIXEL_THRESHOLD = 230  # gray levels below this become ON when binarizing
ROW_THRESHOLD = 40     # a pixel row counts as blank if it has fewer than
                       # (image width // ROW_THRESHOLD) ON pixels
COL_THRESHOLD = 40     # NOTE(review): not referenced by the cells visible here
DIV = 2                # shrink factor for the optional (disabled) resize step
VERT_DIST = 35         # `distance` argument for find_peaks on the row projection
HORI_DIST = 30         # `distance` argument for find_peaks on the column projection
ON = 255               # pixel value for "on" pixels in the binary image
OFF = 0                # pixel value for "off" pixels in the binary image
MIN_PIXELS = 40        # a character box is kept only if it has more ON pixels than this

In [5]:
# Pair: a (low, high) pixel span along one axis. It is used both for text rows
# and for character columns, so the type name matches the variable name
# (it was 'Row', which made reprs of column spans misleading).
Pair = namedtuple('Pair', 'low high')

# BBox: a rectangle in image coordinates, in the field order that
# Image.crop() and ImageDraw.rectangle() are given below.
BBox = namedtuple('BBox', 'left top right bottom')

### Get the image

In [6]:
# Image.open() is lazy and can keep the underlying file open; use it as a
# context manager so the handle is released as soon as the grayscale copy
# has been made. convert() returns a new image that no longer needs the file.
with Image.open(SHEET) as sheet:
    image = sheet.convert('L')

# Rotate by THETA, growing the canvas so nothing is clipped and filling the
# newly exposed corners with white.
image = image.rotate(THETA, expand=True, fillcolor='white')

# Optional steps for quick experiments (disabled):
# image = image.resize((image.size[0] // DIV, image.size[1] // DIV))
# image = image.crop((2770, 584, 2770+570, 584+365))

# Save a copy of the prepared grayscale image for inspection.
path = DATA_DIR / 'image.jpg'
image.save(path, 'JPEG')

### Binarize the image

In [7]:
def _binarize(level):
    """Map one gray level to ON if it is below PIXEL_THRESHOLD, else OFF."""
    if level < PIXEL_THRESHOLD:
        return ON
    return OFF


# Apply the threshold to every pixel of the grayscale image.
binary = image.point(_binarize)
print(binary.size)

# Save the thresholded image for inspection.
path = DATA_DIR / 'binary.jpg'
binary.save(path, 'JPEG')

(5968, 4436)


### Find rows

In [8]:
# Turn the binary image into an array of 0/1 flags (ON pixels become 1).
data = np.array(binary) // ON

# Count ON pixels in every pixel row, then flag rows whose count is below
# (image width // ROW_THRESHOLD). Flagged rows get ON, all others 0.
ink_per_row = data.sum(axis=1)
blank_limit = binary.size[0] // ROW_THRESHOLD
proj = (ink_per_row < blank_limit).astype(int) * ON

In [9]:
# Zero the first and last entries of the row projection before peak
# detection (presumably so a blank run touching the image border still has
# both of its edges inside the array -- TODO confirm).
proj[[0, -1]] = 0

In [10]:
# Find the flat-topped runs in the row projection. Passing plateau_size makes
# find_peaks report each run's 'left_edges' and 'right_edges' in the
# properties dict (the second element of the returned tuple).
peaks = signal.find_peaks(
    x=proj,
    plateau_size=1,
    distance=VERT_DIST,
)

In [18]:
# Pair the right edge of each detected run with the left edge of the next
# run, then widen every span by PAD on both sides.
edges = peaks[1]
tops = edges['right_edges']
bots = edges['left_edges'][1:]
pairs = [Pair(low=top - PAD, high=bot + PAD) for top, bot in zip(tops, bots)]
pairs[-5:]

[Row(low=4119, high=4126),
 Row(low=4198, high=4204),
 Row(low=4398, high=4404),
 Row(low=4439, high=4446),
 Row(low=4479, high=4485)]

In [12]:
# Work on an RGB copy of the binary image so colored guides can be drawn on it.
rows = binary.convert(mode='RGB')
draw = ImageDraw.Draw(im=rows)

In [13]:
# Draw a full-width guide at both ends of every row span:
# cyan at `low`, yellow at `high`.
full_width = rows.size[0]
for pair in pairs:
    for y, color in ((pair.low, 'cyan'), (pair.high, 'yellow')):
        draw.line((0, y, full_width, y), fill=color)

# Save the annotated image for inspection.
path = DATA_DIR / 'rows.jpg'
rows.save(path, 'JPEG')

### Find characters

In [14]:
def find_char_boxes(binary_image, row_pairs):
    """Split every text row of a binary image into character boxes.

    All intermediate values are local to this function. The cell previously
    reassigned the notebook-level ``data``, ``proj`` and ``peaks`` that the
    row-finding cells produce, so re-running one of those cells afterwards
    silently worked on the last line's column data instead of the row data.

    Args:
        binary_image: PIL image whose pixels are ON or OFF.
        row_pairs: iterable of Pair(low, high) vertical spans, one per row.

    Returns:
        A list of BBox(left, top, right, bottom), one for each candidate box
        that contains more than MIN_PIXELS ON pixels.
    """
    sheet_width = binary_image.size[0]
    found = []

    for row in row_pairs:
        line = binary_image.crop((0, row.low, sheet_width, row.high))
        line_bits = np.array(line) // ON

        # A column is a gap when it holds no ON pixels at all.
        gaps = (line_bits.sum(axis=0) == 0).astype(int) * ON

        # Zero both ends before peak detection, as the row stage does.
        gaps[0] = 0
        gaps[-1] = 0

        _, gap_edges = signal.find_peaks(
            gaps, distance=HORI_DIST, plateau_size=1)

        # Each candidate runs from the right edge of one gap to the left
        # edge of the next gap, widened by PAD on both sides.
        lefts = gap_edges['right_edges']
        rights = gap_edges['left_edges'][1:]

        for left, right in zip(lefts, rights):
            box = BBox(left - PAD, row.low, right + PAD, row.high)
            ink = np.sum(np.array(binary_image.crop(box)) // ON)
            # Keep only boxes with more than MIN_PIXELS ON pixels.
            if ink > MIN_PIXELS:
                found.append(box)

    return found


width = binary.size[0]
boxes = find_char_boxes(binary, pairs)

### Show the dissection

In [15]:
# Fresh RGB copy of the binary image to outline the character boxes on.
marked = binary.convert(mode='RGB')
draw = ImageDraw.Draw(im=marked)

In [16]:
# Outline every kept character box in cyan. BBox fields are already in
# (left, top, right, bottom) order, so the tuple can be passed as-is.
for box in boxes:
    draw.rectangle(tuple(box), outline='cyan')

# Save the annotated image for inspection.
path = DATA_DIR / 'marked.jpg'
marked.save(path, 'JPEG')