In [1]:
import os
import h5py
import cv2
import numpy as np
import pandas as pd
import sklearn as skl
from matplotlib import pyplot as plt

from pdf2image import convert_from_path, convert_from_bytes
import pytesseract

from OCRVision import OCRVision
from OCRPharse3 import OCRPharse3
from OCRData import OCRData

In [2]:
# set home directory: the part of the working directory in front of the
# 'Code/python/obf_autax/ocr' folder (the separator before that folder is dropped too)
_cwd = os.getcwd()
# search with forward slashes so the marker is also found on Windows; os.sep is a single
# character, so an index into the normalised string is valid for _cwd as well
_marker_pos = _cwd.replace(os.sep, '/').find('Code/python/obf_autax/ocr')
if _marker_pos == -1:
    # str.find() signals "not found" with -1; slicing with that value would silently
    # cut two characters off the working directory and produce a bogus home path
    raise RuntimeError("working directory %r does not contain 'Code/python/obf_autax/ocr'; "
                       "cannot derive home_path" % _cwd)
home_path = _cwd[:_marker_pos - 1]
# set input_path
input_path = os.path.join(home_path, 'Data', 'obf_autman', 'pdf', 'Manuale_Faistenau_20201101.pdf')

# Manuale_Faistenau_20201101
# Manuale_Haselbach_Mannersdorf_20200430

In [3]:
# convert pdf to image (pdf2image)
def convert_to_image(input_path, page_nr, dpi=500):
    """Render one page of a PDF file via pdf2image's ``convert_from_path``.

    Parameters
    ----------
    input_path
        Path of the PDF file; forwarded unchanged to ``convert_from_path``.
    page_nr
        Number of the page to render; passed as both ``first_page`` and
        ``last_page`` so that only this page is converted.
    dpi
        Rendering resolution forwarded to ``convert_from_path``. Defaults to
        500, the value that used to be hard-coded here.

    Returns
    -------
    The first (and only requested) element of the list returned by
    ``convert_from_path``.

    Raises
    ------
    IndexError
        If ``convert_from_path`` returned no page for ``page_nr``.
    """
    pages = convert_from_path(input_path, dpi, first_page=page_nr, last_page=page_nr)

    if not pages:
        # keep the exception type of the former bare ``pages[0]`` but say what went wrong
        raise IndexError('no page %r could be rendered from %r' % (page_nr, input_path))

    return pages[0]

In [4]:
#img = convert_to_image(input_path, 3).convert('RGB')
#img = np.array(img)
# 113

# render page 720 of the PDF, force RGB mode and turn it into a numpy array
img = np.array(convert_to_image(input_path, 720).convert('RGB'))

In [5]:
# create the OCRPharse3 instance that the following cells feed page images into
# (via pharse_page) and read results from (bz, wo_data, text, ma)
data = OCRPharse3()

In [6]:
# process the page image loaded above (page 720 of the PDF)
data.pharse_page(img)

094 B1


In [None]:
# feed the remaining pages to the parser: 721 up to and including 952
# (range() stops before 953)
for page_nr in range(721, 953):
    img = np.array(convert_to_image(input_path, page_nr).convert('RGB'))
    data.pharse_page(img)

094 B2
094 D1
094 D2
094 D3
094 D4
094 D5
094 D6


In [8]:
# show the collected BZ rows: one list per Waldort, the id followed by five text fields
data.bz

[['094B1', '5 FI', '1 LA', '3 BU', '1 TA', ''],
 ['094B2', '', '', '', '', ''],
 ['094D1', '6 FI', '3 TA', '1 BU', '', ''],
 ['094D2', '5 FI', '2 BU', '2 TA', 'T AH', ''],
 ['094D3', '6 Fl', '2 ZTA', '2 BU', '', ''],
 ['094D4', '6 Fl', '2 BU', '1 TA', '1 LA', ''],
 ['094D5', '', '', '', '', ''],
 ['094D6', '6 Fl', '2 ZTA', '2 BU', '', ''],
 ['094G0', '', '', '', '', ''],
 ['094I0', '7 /FI2TA', '1 BU', '', '', ''],
 ['094K1', '5 FI', '2 TA', '2 BU', 'T AH', ''],
 ['094K2', '5 FI', 'T TA', '2 BU', '2 AH', ''],
 ['094K3', '5 FI', '1 TA', '2 BU', '2 AH', ''],
 ['094K4', '5 FI', '1 LA', '2 AH', '1 BU', '1 TA'],
 ['094K5', 'A FIZAH', '2 BU', '1 TA', '1 LA', ''],
 ['094K6', '6 Fl', '1 TA', '1 BU', '2 AH', ''],
 ['094L1', '3 BU', '2 TA', '2 LA', '5 Fl', ''],
 ['094L2', '3 BU', '1 ITA', '2 LA', 'A FI', ''],
 ['094M1', '', '', '', '', ''],
 ['094M2', '7 /FI2TA', '1 BU', '', '', ''],
 ['094M3', '5 FI', '2 BU', '2 TA', 'T AH', ''],
 ['094MA', '', '', '', '', ''],
 ['094M5', '5 Fl', '2 BU', 'T AH',

In [None]:
# show the wo_data attribute of the parser (exported further down with the
# columns Waldort, bkl, uz, stoe, vtyp, vg)
data.wo_data

In [None]:
# get SAP raw data

In [None]:
# set raw_path
raw_path = os.path.join(home_path, 'Data', 'obf_autman', 'raw_data', 'TO_1364_178_02_20201111.XLS')
# read tsv: despite its .XLS extension the file is parsed as tab-separated text
# (ISO-8859-1 encoded, decimal comma).
# on_bad_lines='warn' replaces error_bad_lines=False, which was deprecated in pandas 1.3
# and removed in pandas 2.0; malformed lines are still skipped with a warning.
# Requires pandas >= 1.3.
wo_data = pd.read_csv(raw_path, sep='\t', encoding="ISO-8859-1", decimal=',', on_bad_lines='warn')
# filter relevant data
wo_data = wo_data[wo_data['Best.-Schicht'] == 0]

### pernitz
wo_data = wo_data[wo_data['Abteilung'] > 93]

# sort data; reassign the sorted result instead of sorting the filtered selection in
# place, which keeps the cell free of in-place mutation of a derived frame
wo_data = wo_data.sort_values(by=['Abteilung', 'WE-Typ', 'Unterabteil.', 'Teilfl.'],
                              ascending=[True, False, True, True])
# only the join key and the WE-Typ column are needed afterwards
wo_data = wo_data[['Waldort', 'WE-Typ']]

### write to disk

In [None]:
# loop over all data; every entry is (rows collected by the parser, name used in the
# output file name, column names for the dataframe)
for rows, name, columns in [
        (data.wo_data, 'wo_data', ['Waldort', 'bkl', 'uz', 'stoe', 'vtyp', 'vg']),
        (data.bz, 'bz', ['Waldort', 'BZ1', 'BZ2', 'BZ3', 'BZ4', 'BZ5']),
        (data.text, 'text', ['Waldort', 'text1', 'text2', 'text3', 'text4', 'text5']),
        (data.ma, 'ma', ['Waldort', 'S1', 'MA1', 'fl1', 'DR1', 'BH1', 'ZP1', 'SG1', 'RU1', 'Text1',
                         'S2', 'MA2', 'fl2', 'DR2', 'BH2', 'ZP2', 'SG2', 'RU2', 'Text2',
                         'S3', 'MA3', 'fl3', 'DR3', 'BH3', 'ZP3', 'SG3', 'RU3', 'Text3']),
]:

    print('creating ' + name)

    # create a dataframe out of the list
    df = pd.DataFrame(rows, columns=columns)

    # merge: left join onto the SAP frame, so every SAP row is kept (in its sort order)
    # and the OCR columns are attached where the 'Waldort' key matches
    merge_data = pd.merge(wo_data, df, how='left', on='Waldort')

    # specify the path for writing down the data
    save_path = os.path.join(home_path, 'Data', 'obf_autman', 'csv', 'feistenau', name + '_1364_02.csv')
    # make sure the target folder exists so to_csv does not fail on a fresh checkout
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    # write data to disk
    # NOTE(review): this now saves the merged frame; before, the un-merged `df` was
    # written and `merge_data` was never used. Confirm that the 'Waldort' values of
    # the OCR data and the SAP export share the same format.
    merge_data.to_csv(save_path, index=False)