### 1. Install the following prerequisites:
- ImageMagick: https://imagemagick.org/script/download.php
- ghostscript: https://www.ghostscript.com/download/gsdnld.html
- download & install tesseract: https://github.com/UB-Mannheim/tesseract/wiki

### 2. Note: install the prerequisites above before running `pip install wand`
### 3. Note: prefix the Tesseract path with r (raw string) so its backslashes are not treated as escape sequences
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

### 4. Clean up the temp folder, which accumulates several gigabytes of cached files after each run
- C:\Users\u279014\AppData\Local\Temp

In [21]:
import os
import io

import PyPDF2
import re
import pandas as pd

from PIL import Image
import pytesseract
from wand.image import Image as Img

In [2]:
# Tell pytesseract where the Tesseract executable lives on this machine.
# The raw string keeps the Windows backslashes literal.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [3]:
class OCR():
    """Batch OCR pipeline for PDF drawings.

    The three steps are meant to be chained:

    1. ``pdf2jpg``   - render the PDFs found in ``path`` to JPEG images (Wand).
    2. ``jpg2text``  - run Tesseract on each image and collect the raw text.
    3. ``text2item`` - pull weight / material call-outs out of the text with
       regular expressions and return them as a pandas DataFrame.
    """

    # Weight call-outs, matched against text whose whitespace has been removed.
    # NOTE(review): the '.' in these patterns is unescaped, so it matches any
    # character, not only a decimal point. Kept as-is to preserve the matches.
    _WEIGHT_RULES = re.compile('|'.join([
        r'EST\w+:\d+.\d+%', r'EST\w+:\d+.\d+#', r'\d+.\d+k',
        r'EST\w+:\d+.\w+%', r'EST\w+:\d+.\w+#', r'\d+.\w+k',
        r'SCALE:\d+.\w+%', r'SCALE:\d+.\w+#', r'SCALE:\d+.\w+k',
        r'SCALE:\d+.\d+%', r'SCALE:\d+.\d+#', r'SCALE:\d+.\d+k',
    ]))
    # Material call-outs, matched against the same whitespace-free text.
    _MATERIAL_RULES = re.compile('|'.join([
        r'MATERIAL:SEEPARTLIST', r'MATERIAL:SEEPARTSLIST', r'MATERIAL:\d+',
    ]))
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, path, fileNumbers=200):
        """
        parameter:
            path: directory where the batch of pdf files resides
            fileNumbers: maximum number of pdf files to parse at one time
        """
        self.img_dict = {}
        self.extracted_text = []
        self.path = path
        self.fileNumbers = fileNumbers

    def pdf2jpg(self):
        """Render up to ``fileNumbers`` PDFs from ``path`` as JPEG images.

        Changes the process working directory to ``path`` (as the original
        implementation did) and reads the files relative to it.

        return:
            ``self.img_dict``, mapping each pdf file name to its converted
            Wand image. Entries from earlier calls are kept.
        """
        os.chdir(self.path)
        files = [f for f in os.listdir('.') if f.endswith('pdf')]
        # Only the first `fileNumbers` entries of the listing are processed.
        batch = [f for i, f in enumerate(files) if i < self.fileNumbers]
        total = len(batch)
        for done, f in enumerate(batch, start=1):
            # Progress is relative to the files actually processed, so it
            # reaches 100% even when the folder holds fewer than fileNumbers.
            print('processing: {:.2%}'.format(done / total))
            with Img(filename=f, resolution=400) as img:
                # convert() returns a new image, which is what gets stored;
                # the source image is released when the block exits.
                self.img_dict[f] = img.convert('jpg')
        print('pdf_to_image:100% completed!')
        return self.img_dict

    def jpg2text(self, img_dict, rotate=0):
        """Run OCR on every image in ``img_dict``.

        If the word 'VENDOR' is not found in the recognised text, the image
        is rotated by 180 degrees and recognised again, and that second
        result is kept.

        parameter:
            img_dict: mapping of file name -> Wand image, as returned by
                ``pdf2jpg``
            rotate: currently unused; kept so existing callers keep working

        return:
            tuple ``(file names, images, texts)`` as three lists in the same
            order. ``self.extracted_text`` is replaced by the texts of this
            call so it always lines up with the returned file names.
        """
        texts = []
        total = len(img_dict)
        for done, img in enumerate(img_dict.values(), start=1):
            # Work on a clone so the caller's image is left untouched, and
            # release the clone as soon as its JPEG bytes are extracted.
            with Img(image=img) as page:
                blob = page.make_blob('jpg')

            with Image.open(io.BytesIO(blob)) as im:
                text = pytesseract.image_to_string(im).replace('\n', '')
                # check rotation for drawings, skipping this step could cause
                # mis-read pdf and un-recognizable text
                if 'VENDOR' not in text:
                    rotated = im.rotate(angle=180)
                    text = pytesseract.image_to_string(rotated).replace('\n', '')

            texts.append(text)
            print('processing: {:.2%}'.format(done / total))

        self.extracted_text = texts
        print('image_to_text: 100% completed')
        return list(img_dict.keys()), list(img_dict.values()), self.extracted_text

    def text2item(self, drawing_nums, texts):
        """Extract weight and material call-outs from OCR text.

        All whitespace is removed from each text before matching, so the
        patterns are written without spaces.

        parameter:
            drawing_nums: identifiers (file names) of the drawings
            texts: OCR text of each drawing, in the same order

        return:
            DataFrame with columns 'drawings_number', 'weight' and
            'material'; the last two hold the list of matches per drawing
            (empty list when nothing matched).
        """
        weights = []
        materials = []
        for text in texts:
            compact = self._WHITESPACE.sub('', text)
            weights.append(self._WEIGHT_RULES.findall(compact))
            materials.append(self._MATERIAL_RULES.findall(compact))
        return pd.DataFrame(list(zip(drawing_nums, weights, materials)),
                            columns=['drawings_number', 'weight', 'material'])

In [12]:
# Candidate folders holding the PDF drawings; raw strings keep the Windows
# backslashes literal. path2 is the one passed to OCR further down.
path1 = r'\\pmi.corp.truck\public\FAE-SHARE\Apps\CADView\344'
# path2 = r'C:\Users\U279014\Documents\H_Drive\7.AA Models\7.FabricationPriceAnalysis\Data\archaive\from_Andrew'
path2 = r'C:\Users\U279014\Documents\H_Drive\7.AA Models\7.FabricationPriceAnalysis\Data\archaive'
path3 = r'S:\OSK-Share\DEPT\PURCHASING\000-GPSC Business Analysts\13 - GPSC Steel Model\JLG Engineering Drawings'

In [5]:

# img_dict ={}
# os.chdir(path2)
# files = [f for f in os.listdir('.') if f.endswith('pdf')]
# for i,f in enumerate(files):
#     if i < 3: 
#         print(f)
#         print('processing: {:.2%}'.format((i+1)/3))
#         with Img(filename = f, resolution= 600) as img:
#             # img.compression_quality = 400
#             pdf2img = img.convert('png')
#             img_dict[f] = pdf2img

In [13]:
# Build the pipeline over path2, limited to at most 4 PDFs for this trial run.
ocr = OCR(path = path2,fileNumbers=4)

In [18]:
import time

In [19]:
# Time the PDF-to-image conversion; the elapsed seconds are end_time - start_time.
start_time = time.time()
img_dict = ocr.pdf2jpg()
end_time = time.time()

processing: 25.00%
processing: 50.00%
processing: 75.00%
processing: 100.00%
pdf_to_image:100% completed!


In [20]:
end_time - start_time

18.238457679748535

In [15]:
# Raise Pillow's pixel-count limit before opening the rendered pages.
# NOTE(review): presumably needed because the 400-dpi renders exceed Pillow's
# default decompression-bomb threshold - confirm.
Image.MAX_IMAGE_PIXELS = 933120000

In [9]:
# OCR every converted image: drawings = file names, pics = images, texts = recognised text.
drawings, pics, texts = ocr.jpg2text(img_dict = img_dict)

processing: 100.00%
image_to_text: 100% completed


In [10]:
# Pull the weight / material call-outs out of the OCR text into a DataFrame.
extracted = ocr.text2item(drawing_nums=drawings, texts=texts)

In [11]:
texts[0]

'V/VFULLY COMPRESSEDIRUBBER OPRING" PARI # AQIlO2o- /oG IQre~ VENDOR:TIMBREN IND. LID.S38] WESTNEY RD. SOUTHAJAX, ONTARIO LISoM6i5004.188_ 5.563 _REVISIONSDRAWINGS CREATED IN PRO/E PER LG TOROUE CHART. Oe REF. DG. NO.OUANT ITY MEL DIMENSIONS IN CD ARE UN. emp ALL WELDING To conform | TOLERANCESTO STRUCTURAL WELDING UNLESS OTHERWISE | SCALE: | / | STRESS APP‘D.: PROJECTDI. Y XYY E062 | WEIGHT: 3 LB. 8 OZ. | DATE: 2103THIS PRINT |S THE PROPERTY OF JLG INDUSTRIES, INC. AND CONTAINS (yp EM cE 080 018) Freer aT)PROPRIETARY AND CONFIDENTIAL INFORMATION OF JLG. IT MAY_NOT BE -———~p>——— “anetes + 1 meee RRCOPIED OR DISCLOSED TQ ANY THIRD PARTY WITHOUT JLG’S WRITTENCONSENT AND JS PROVIDED FOR THE LIMITED PURPOSE OF REVIEW AND MACH. SURFFVALUATION. Tow FE ABSORBER, RUBBER SPRINGil'

In [37]:
extracted

Unnamed: 0,drawings_number,weight,material
0,0020005_drawing.pdf,[],[]


In [12]:
# extracted.to_csv(r'S:\OSK-Share\DEPT\PURCHASING\000-GPSC Business Analysts\13 - GPSC Steel Model\JLG Engineering Drawings\extracted.csv')

In [15]:
drawings.index('1001158797_CHILD_EngineeringDrawing.pdf')

1

In [16]:
texts[1]

'| DRAWING NUMBERIT MAY NOT BE COPIED OR DISCLOSED TO ANY THIRD PARTY WITHOUT JLG’S WRITTEN CONSENT AND ISPROVIDED FOR THE LIMITED PURPOSE OF REVIEW AND EVALUATION.OF JLG.=©—<=oe©Lu=—_d<<—=Ludaa=©>co=<>co<<—LJocQW©coQa.Vv=<—=Ooqeco=<Cqo=Y>Ludoe—wv—fan=c>—~>LNOo=~—[a alLudQa.Ooa:Qa.Lu=—Y—=[a alQWwva—IN MILLIMETERSIN CREO MCAD.UNLESS OTHERWISE SPECIFIED.DRAWING CREATEDALL DIMENSIONSALL DIMENSIONS AREARE IN INCH[ ]INlOO1 158797cx(45°)VA(1730)2X (1473.6)I214(1193.6)R502X 1630ALL WELDING TO JLG INDWELD SPECIFICATION1001112769FOR ANGLES BETWEEN 80-100DEGREES. ALL OTHER ANGLESPER SECTION 2.3.2.6 OFAWS D.1 OR EQUIVALENT, [DRAWING NUMBER | REV. || ? | [1001158797] A |ISO VIEW(256.3) SCALE 1:102X 50AX 280\\ 560J R150R1452X R50[0.375 STK] ai254 220REV ZONE DATE BYDESCRIPTION po ER NO CHRD:REVISIONSDIMENSIONING AND TOLERANC INGPER ASME Y¥14.5M-1994JLG GLOBAL DESIGN SOLUTIONSOL#!L) SOL#2C) SOL#3Q—)JLG TORQUE CHART 5000059APPLIES TO ITEMS WITHIN OJUNLESS OTHERWISE | DRAWN BY: E.WELSHTOLERANCES ARE+/