/
Makefile
49 lines (36 loc) · 1.7 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# User-tunable settings. Each uses `?=`, so a value supplied from the
# environment or the make command line takes precedence over the default here.

# Docker image the tool commands run in when USE_DOCKER is "true".
DOCKER_IMAGE ?= ryanfb/kraken
# Value passed to the --device option of the ketos and kraken commands.
CUDA_DEVICE ?= cpu
# "true" wraps tool commands in `docker run`; any other value runs them directly.
USE_DOCKER ?= true
# Value passed to `docker run --shm-size=`.
DOCKER_SHM ?= 256M
# Extra arguments inserted into the `parallel` invocations of the ocr and hocr rules.
PARALLEL_ARGS ?= -j 4
# DOCKER_PREFIX is prepended to every tool invocation in the recipes.
# With USE_DOCKER=true it runs the command inside $(DOCKER_IMAGE), with the
# current directory bind-mounted at /data; otherwise it is empty and the
# command runs directly on the host.
ifeq ($(USE_DOCKER),true)
# $(CURDIR) is make's built-in absolute working directory: unlike $(shell pwd)
# it does not fork a shell each time this recursive variable is expanded.
# The quotes keep the -v argument intact when the path contains spaces.
DOCKER_PREFIX=docker run --shm-size=$(DOCKER_SHM) -it -v "$(CURDIR)":/data $(DOCKER_IMAGE)
else
DOCKER_PREFIX=
endif
# Command-style targets: always considered out of date, never matched
# against a file of the same name.
.PHONY: all
.PHONY: clean
.PHONY: test

# First rule in the file, hence the default goal: produce the trained model.
all: gaza_best.mlmodel
# Run `ketos extract` over the ground-truth HTML files, writing its output
# into the extract/ directory (the target of this rule) with NFD normalization.
# Re-runs whenever any groundtruth/*.html file is newer than extract/.
extract: groundtruth/*.html
	$(DOCKER_PREFIX) ketos extract --output $@ --normalization NFD groundtruth/*.html
# Train a model on the images in extract/ using `ketos train`, timing the run.
# The --output prefix is the target name with "_best.mlmodel" stripped ("gaza");
# presumably ketos then writes gaza_best.mlmodel itself -- confirm against the
# ketos documentation.
gaza_best.mlmodel: extract
	time $(DOCKER_PREFIX) ketos train --device $(CUDA_DEVICE) --output $(patsubst %_best.mlmodel,%,$@) $</*.png
# Run `ketos test` with the trained model ($< is gaza_best.mlmodel) against
# the images in extract/.
test: gaza_best.mlmodel
	$(DOCKER_PREFIX) ketos test --device $(CUDA_DEVICE) -m $< extract/*.png
# Download the page-image archive.
# The download goes to a temporary name and is renamed only after wget
# succeeds, so an interrupted or failed transfer never leaves a truncated
# gazapng.zip behind that make would treat as up to date.
gazapng.zip:
	wget -O $@.tmp 'http://rfbaumann.com/gazapng.zip'
	mv -f $@.tmp $@
# Unpack the downloaded archive ($< is gazapng.zip); -o overwrites existing
# files without prompting. The archive is expected to create gazapng/ --
# verify against the zip contents.
gazapng: gazapng.zip
	unzip -o $<
# gazalines.zip:
# wget 'http://rfbaumann.com/gazalines.zip'
# lines: gazapng gazalines.zip
# unzip -o gazalines.zip
# Use this instead if you *really* want to regenerate line segmentation:
# mkdir -p lines && parallel --will-cite --progress --bar --eta -u -j $(shell nproc) 'kraken -i {} lines/{/.}.json binarize segment > /dev/null' ::: gazapng/*.png
# Recognize every gazapng/*.png with the trained model, in parallel, writing
# ocr/<name>.txt for each image and reading its line segmentation from
# lines/<name>.json ({/.} is parallel's "basename without extension").
# kraken's stdout is discarded; the whole run is timed.
# Note: no active rule in this file creates lines/*.json (the rules for it
# are commented out), so that directory must already be populated.
ocr: lines/*.json gazapng gazapng/*.png gaza_best.mlmodel
	# -p: do not fail when ocr/ is left over from an earlier (or failed) run.
	mkdir -p ocr
	time $(DOCKER_PREFIX) parallel --will-cite --progress --bar --eta -u $(PARALLEL_ARGS) 'kraken --device $(CUDA_DEVICE) -i {} ocr/{/.}.txt binarize ocr --lines lines/{/.}.json -m gaza_best.mlmodel > /dev/null' ::: gazapng/*.png
# Same pipeline as the ocr rule, but passes -h to kraken's ocr subcommand
# (presumably selecting hOCR output -- confirm in the kraken docs) and writes
# hocr/<name>.html for each gazapng/<name>.png, using lines/<name>.json.
# kraken's stdout is discarded; the whole run is timed.
# Note: no active rule in this file creates lines/*.json (the rules for it
# are commented out), so that directory must already be populated.
hocr: lines/*.json gazapng gazapng/*.png gaza_best.mlmodel
	# -p: do not fail when hocr/ is left over from an earlier (or failed) run.
	mkdir -p hocr
	time $(DOCKER_PREFIX) parallel --will-cite --progress --bar --eta -u $(PARALLEL_ARGS) 'kraken --device $(CUDA_DEVICE) -i {} hocr/{/.}.html binarize ocr --lines lines/{/.}.json -m gaza_best.mlmodel -h > /dev/null' ::: gazapng/*.png
# Remove the extracted training data and every *.mlmodel file in the current
# directory, listing what is deleted. The ocr/ and hocr/ output directories
# and the downloaded archive are left in place.
clean:
	rm -rfv extract
	rm -rfv *.mlmodel