In [1]:
import os
import h5py
import cv2
import numpy as np
import pandas as pd
import sklearn as skl
from matplotlib import pyplot as plt

from pdf2image import convert_from_path, convert_from_bytes
import pytesseract

from OCRVision import OCRVision
from OCRPharse3 import OCRPharse3
from OCRData import OCRData

In [2]:
# Locate the project root: the part of the working directory that precedes
# the notebook's own folder (Code/python/obf_autax/ocr).
_cwd = os.getcwd()
_marker_pos = _cwd.find(os.path.join('Code', 'python', 'obf_autax', 'ocr'))
# str.find returns -1 when the marker is absent; slicing with that value would
# silently chop characters off the path, so fall back to the working directory.
# The "- 1" drops the path separator that sits directly in front of the marker.
home_path = _cwd[:_marker_pos - 1] if _marker_pos > 0 else _cwd
# PDF that the cells below read from
input_path = os.path.join(home_path, 'Data', 'obf_autman', 'pdf', 'Manuale_Faistenau_20201101.pdf')
# Other PDF base names noted for use in place of the one above:
# Operat Hinterriß
# Manuale_Pernitz_PS
# 171_11_GS_Mannersdorf
# 171_12_PS_Pernitz

# Manuale_Faistenau_20201101
# Manuale_Haselbach_Mannersdorf_20200430

In [3]:
# convert pdf to image (pdf2image)
def convert_to_image(input_path, page_nr, dpi=500):
    """Render a single page of a PDF to an image with pdf2image.

    Parameters
    ----------
    input_path
        Path of the PDF file; passed unchanged to ``convert_from_path``.
    page_nr
        Page to render. It is used as both ``first_page`` and ``last_page``,
        so exactly one page is requested (pdf2image counts pages from 1).
    dpi
        Rendering resolution handed to ``convert_from_path``. Defaults to
        500, the value that used to be hard-coded here.

    Returns
    -------
    The first (and only) element of the list returned by
    ``convert_from_path``.

    Raises
    ------
    IndexError
        If ``convert_from_path`` returns no page, e.g. because ``page_nr``
        lies beyond the end of the document.
    """
    pages = convert_from_path(input_path, dpi, first_page=page_nr, last_page=page_nr)

    # An empty list used to surface as a bare "list index out of range";
    # keep the exception type but say which page and file were at fault.
    if not pages:
        raise IndexError(
            'no page {} could be rendered from {!r}'.format(page_nr, input_path)
        )

    return pages[0]

In [4]:
# Render page 720 of the PDF and keep it as an RGB pixel array.
# (An earlier trial used page 3 instead; the note "113" presumably marks
# another page of interest.)
img = np.array(convert_to_image(input_path, 720).convert('RGB'))

In [5]:
# Create the page parser; the results it collects are read back further down
# through data.bz, data.text, data.wo_data and data.ma.
data = OCRPharse3()

In [6]:
# Parse the page image held in `img`. The line printed underneath looks like
# the identifier recognised on that page — TODO confirm in OCRPharse3.
data.pharse_page(img)

094 B1


In [9]:
# Run pages 721 through 739 through the same parser, one page at a time.
# The trailing "952" note presumably marks a later stopping page.
for page_nr in range(721, 740):  # 952
    img = np.array(convert_to_image(input_path, page_nr).convert('RGB'))
    data.pharse_page(img)

094 B2
094 D1
094 D2
094 D3
094 D4
094 D5
094 D6
094 GO
094 I 0
094 K 1
094 K2
094 K 3
094 K4
094 K5
094 K6

094 L 1
094 L2
094 M1


In [10]:
# Show data.bz: one list per parsed page, starting with the identifier and
# followed by five text fields (entries such as '5 FI' look like
# share/species pairs — TODO confirm).
data.bz

[['094B1', '5 FI', '1 LA', '3 BU', '1 TA', ''],
 ['094B2', '', '', '', '', ''],
 ['094D1', '6 FI', '3 TA', '1 BU', '', ''],
 ['094D2', '5 FI', '2 BU', '2 TA', 'T AH', ''],
 ['094D3', '6 Fl', '2 ZTA', '2 BU', '', ''],
 ['094D4', '6 Fl', '2 BU', '1 TA', '1 LA', ''],
 ['094D5', '', '', '', '', ''],
 ['094D6', '6 Fl', '2 ZTA', '2 BU', '', ''],
 ['094G0', '', '', '', '', ''],
 ['094I0', '7 /FI2TA', '1 BU', '', '', ''],
 ['094K1', '5 FI', '2 TA', '2 BU', 'T AH', ''],
 ['094K2', '5 FI', 'T TA', '2 BU', '2 AH', ''],
 ['094K3', '5 FI', '1 TA', '2 BU', '2 AH', ''],
 ['094K4', '5 FI', '1 LA', '2 AH', '1 BU', '1 TA'],
 ['094K5', 'A FIZAH', '2 BU', '1 TA', '1 LA', ''],
 ['094K6', '6 Fl', '1 TA', '1 BU', '2 AH', ''],
 ['094L1', '3 BU', '2 TA', '2 LA', '5 Fl', ''],
 ['094L2', '3 BU', '1 ITA', '2 LA', 'A FI', ''],
 ['094M1', '', '', '', '', '']]

In [11]:
# Show data.text: per parsed page, the identifier followed by free-text
# lines (the 'ST', 'BE', 'MA' prefixes presumably name the kind of remark).
data.text

[['094B1',
  'ST   unten STOE 23',
  'BE   oben vl BU, an Besitzgrenze Wiesen-BL, unten älter u besser, vl Lagerholz, Fl teils geringe Kronen unten stabiler, einz. TA',
  'MA   ZV, 1 GH, nächstes Dez. LI KH Verbot!',
  '',
  ''],
 ['094B2', '', '', '', '', ''],
 ['094D1',
  'ST   verkrautet und teils vergrast, vl Bäche',
  '',
  'BE   wenig VJ, TA verbissen; im SO an STR jger, unten vl TA, einz. BU; anerkannter Saatgut-BE, im W s lückig, geschlägerte Käferbäume, im O dichter',
  'und vi Wipfelbrüche',
  ''],
 ['094D2',
  'ST   Bacheinhang',
  'BE   unglalt, ein LH in jüngerer Schicht, etwas UL, vl TA',
  'MA   DF TA/AH fördern',
  '',
  ''],
 ['094D3',
  'ST   O-TL stk beweidet, 3 Teile',
  'BE   einige BU/ES, im Mittel-TL vi AH, im W von O-TL 5 J jger, Schneeschäden 2019, Gassen vorhanden, im W Teil Schneebruch aufgearbeitet',
  'MA   DF/DE Schadholz aufarbeiten undurchforstet Teile pflegen',
  '',
  ''],
 ['094D4',
  'BE   2 TLE, unglalt, 20-35 jg, Gruppenstruktur, vl BU, LA ober Str

In [12]:
# Show data.wo_data: per parsed page, the identifier followed by five
# fields (mostly 0 or '' in this run; field meaning is not visible here).
data.wo_data

[['094B1', 0, 0, 23, '', 0],
 ['094B2', 0, 0, 0, '', 0],
 ['094D1', 0, 0, 0, '', '2'],
 ['094D2', 0, 0, 0, 'HS', 0],
 ['094D3', 0, 0, 0, 'SS', 0],
 ['094D4', 0, 0, 0, 'SS', 0],
 ['094D5', 0, 0, 0, '', 0],
 ['094D6', 0, 0, 0, 'SS', 0],
 ['094G0', 0, 0, 0, '', 0],
 ['094I0', 0, 0, 0, '', 0],
 ['094K1', 0, 0, 0, '', 0],
 ['094K2', 0, 0, 0, '', 0],
 ['094K3', 0, 0, 0, 'WS', 0],
 ['094K4', 0, 0, 22, 'BH', 0],
 ['094K5', 0, 0, 22, 'BH', 0],
 ['094K6', 0, 0, 0, 'ws', 0],
 ['094L1', 0, 0, 0, '', 0],
 ['094L2', 0, 0, 0, '', 0],
 ['094M1', 0, 0, 0, '', 0]]

In [36]:
# Show data.ma: per parsed page, the identifier followed by a flat list of
# string fields, with '' in unused slots (field meaning is defined in
# OCRPharse3, not visible here).
data.ma

[['094B1',
  '1',
  'ZV',
  '0,0',
  '3',
  '1',
  '1',
  '2',
  '26',
  '',
  '1',
  'KH',
  '0,5',
  '2',
  '2',
  '1',
  '2',
  '26',
  'KH Verbot beachten! 0,5 ha dürfen nicht überschritten werden!',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['094B2',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['094D1',
  '1',
  'JF',
  '0,3',
  '2',
  '1',
  '1',
  '2',
  '31',
  'auf obere Straße zuschlägern',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['094D2',
  '1',
  'DE',
  '0,5',
  '1',
  '1',
  '1',
  '2',
  '31',
  'alternativ 26 mit JF in D1',
  '1',
  'DF',
  '1,0',
  '2',
  '1',
  '1',
  '2',
  '23',
  'gemeinsam mit DE in D3',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['094D3',
  '1',
  'DF',
  '',
  '2',
  '1',
  '1',
  '2',
  '23',
  'Schadholz Aufarbeiten',
  '',
  '',


In [13]:
# Tabulate data.bz: the identifier goes into 'WO', the five text fields
# into 'text1' .. 'text5'.
bz_columns = ['WO'] + ['text%d' % k for k in range(1, 6)]
pd.DataFrame(data.bz, columns=bz_columns)

Unnamed: 0,WO,text1,text2,text3,text4,text5
0,094B1,5 FI,1 LA,3 BU,1 TA,
1,094B2,,,,,
2,094D1,6 FI,3 TA,1 BU,,
3,094D2,5 FI,2 BU,2 TA,T AH,
4,094D3,6 Fl,2 ZTA,2 BU,,
5,094D4,6 Fl,2 BU,1 TA,1 LA,
6,094D5,,,,,
7,094D6,6 Fl,2 ZTA,2 BU,,
8,094G0,,,,,
9,094I0,7 /FI2TA,1 BU,,,


In [15]:
# Display data.bz again (identifier plus five text fields per parsed page).
data.bz

[['094B1', '5 FI', '1 LA', '3 BU', '1 TA', ''],
 ['094B2', '', '', '', '', ''],
 ['094D1', '6 FI', '3 TA', '1 BU', '', '']]