### 1. The following installers are required:
- ImageMagick: https://imagemagick.org/script/download.php
- ghostscript: https://www.ghostscript.com/download/gsdnld.html

### 2. Note: install the installers listed above before running `pip install wand`
### 3. Note: add the `r` (raw-string) prefix before the Tesseract path so the backslashes are taken literally
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

### 4. Clean the temp folder, which accumulates several gigabytes of cache files after running
- C:\Users\u279014\AppData\Local\Temp

In [1]:
import os
import io

import PyPDF2
import re
import pandas as pd

from PIL import Image
import pytesseract
from wand.image import Image as Img

In [2]:
# Tell pytesseract where the Tesseract executable lives. In a raw string a
# single backslash is already literal, so the separator must not be doubled.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [13]:
class OCR():
    '''
    Batch pipeline: PDF files -> in-memory images -> OCR text -> extracted items.

    Typical use is pdf2jpg(), then jpg2text() on its result, then text2item()
    on the file names and texts returned by jpg2text().
    '''

    def __init__(self, path, fileNumbers = 200):
        '''
        parameter:
            path: directory where the batch of pdf files resides
            fileNumbers: maximum number of pdf files to parse at one time
        '''
        self.img_dict = {}
        self.extracted_text = []
        self.path = path
        self.fileNumbers = fileNumbers

    def pdf2jpg(self):
        '''
        Convert up to `fileNumbers` pdf files found in `path` to jpg images.

        The directory is listed and the files are opened through their joined
        path, so the process working directory is left untouched (a chdir
        would break a second call whenever `path` is relative).

        return:
            self.img_dict, mapping pdf file name -> converted wand image
        '''
        pdf_files = [f for f in os.listdir(self.path) if f.endswith('.pdf')]
        # Same cut-off rule as before (index < fileNumbers), but applied up front
        # so the progress percentage is relative to the files actually processed.
        batch = [f for i, f in enumerate(pdf_files) if i < self.fileNumbers]
        total = len(batch)
        for i, f in enumerate(batch):
            print('processing: {:.2%}'.format((i + 1) / total))
            with Img(filename=os.path.join(self.path, f), resolution=800) as img:
                # convert() returns a new image, which is what gets stored;
                # the source image is released when the with-block exits.
                self.img_dict[f] = img.convert('jpg')
        print('pdf_to_image:100% completed!')
        return self.img_dict

    def jpg2text(self, img_dict, rotate = 0):
        '''
        Run OCR on every image of `img_dict`.

        parameter:
            img_dict: mapping file name -> wand image, as returned by pdf2jpg()
            rotate: not used by the current implementation; kept so existing
                    callers passing it keep working

        return:
            (list of file names, list of images, list of extracted texts);
            the texts are in the same order as the file names
        '''
        # Start from a clean list so the texts always line up with img_dict,
        # even when this method is called more than once on the same instance.
        self.extracted_text = []
        total = len(img_dict)
        for i, img in enumerate(img_dict.values()):
            # Work on a copy and release it as soon as the jpg bytes exist.
            with Img(image=img) as page:
                blob = page.make_blob('jpg')

            with Image.open(io.BytesIO(blob)) as im:
                text = pytesseract.image_to_string(im).replace('\n', '')
                # Check rotation for drawings: when the 'MATERIAL' label is not
                # found, the page is read again turned by 180 degrees.
                if 'MATERIAL' not in text:
                    text = pytesseract.image_to_string(im.rotate(angle=180)).replace('\n', '')

            self.extracted_text.append(text)
            print('processing: {:.2%}'.format((i + 1) / total))
        print('image_to_text: 100% completed')
        return list(img_dict.keys()), list(img_dict.values()), self.extracted_text

    def text2item(self, drawing_nums, texts):
        '''
        Extract weight and material callouts from OCR texts.

        parameter:
            drawing_nums: one identifier per text (file names in practice)
            texts: OCR strings, in the same order as drawing_nums

        return:
            DataFrame with columns 'drawings_number', 'weight', 'material';
            'weight' and 'material' hold the list of all regex matches found
            in the whitespace-stripped text (empty list when nothing matched)
        '''
        # NOTE(review): the '.' in these patterns is unescaped and therefore
        # matches any character, not only a decimal point - confirm whether
        # that tolerance is intended before tightening it.
        weight_rule_list = [r'EST\w+:\d+.\d+%', r'EST\w+:\d+.\d+#', r'\d+.\d+k',
                            r'EST\w+:\d+.\w+%', r'EST\w+:\d+.\w+#', r'\d+.\w+k',
                            r'SCALE:\d+.\w+%', r'SCALE:\d+.\w+#', r'SCALE:\d+.\w+k',
                            r'SCALE:\d+.\d+%', r'SCALE:\d+.\d+#', r'SCALE:\d+.\d+k']
        weight_rules = re.compile('|'.join(weight_rule_list))
        material_rule_list = ['MATERIAL:SEEPARTLIST', 'MATERIAL:SEEPARTSLIST', r'MATERIAL:\d+']
        material_rules = re.compile('|'.join(material_rule_list))
        whitespace = re.compile(r'\s+')

        # All whitespace is removed first, so the rules match 'EST WT: 1.2 %'
        # and 'ESTWT:1.2%' alike.
        compact_texts = [whitespace.sub('', text) for text in texts]
        weights = [weight_rules.findall(text) for text in compact_texts]
        materials = [material_rules.findall(text) for text in compact_texts]
        return pd.DataFrame(list(zip(drawing_nums, weights, materials)),
                            columns=['drawings_number', 'weight', 'material'])

In [14]:
# Candidate input directories (raw strings, so the Windows backslashes stay literal).
# path1: UNC network-share location.
path1 = r'\\pmi.corp.truck\public\FAE-SHARE\Apps\CADView\344'
# path2: folder under the local user profile; this is the one handed to OCR(...) further down.
path2 = r'C:\Users\u279014\Documents\H_Drive\7.AA Models\7.FabricationPriceAnalysis\Data'
# path3: folder on the S: drive.
path3 = r'S:\OSK-Share\DEPT\PURCHASING\000-GPSC Business Analysts\13 - GPSC Steel Model\JLG Engineering Drawings'

In [15]:
# Disabled scratch code, kept as a bare string literal so it is never executed:
# an inline PDF -> image conversion of the first 3 pdf files in path2
# (resolution 600, converted to 'png'). Because the string is the last
# expression of the cell, the notebook echoes it as the cell output.
'''
img_dict ={}
os.chdir(path2)
files = [f for f in os.listdir('.') if f.endswith('pdf')]
for i,f in enumerate(files):
    if i < 3: 
        print(f)
        print('processing: {:.2%}'.format((i+1)/3))
        with Img(filename = f, resolution= 600) as img:
            # img.compression_quality = 400
            pdf2img = img.convert('png')
            img_dict[f] = pdf2img
'''

"\nimg_dict ={}\nos.chdir(path2)\nfiles = [f for f in os.listdir('.') if f.endswith('pdf')]\nfor i,f in enumerate(files):\n    if i < 3: \n        print(f)\n        print('processing: {:.2%}'.format((i+1)/3))\n        with Img(filename = f, resolution= 600) as img:\n            # img.compression_quality = 400\n            pdf2img = img.convert('png')\n            img_dict[f] = pdf2img\n"

In [16]:
# Display the file name -> converted image mapping.
# NOTE(review): the only executed assignment to img_dict is `img_dict = ocr.pdf2jpg()`
# in a later cell, so on a fresh top-to-bottom run this line would raise NameError;
# presumably the output shown comes from earlier kernel state - confirm.
img_dict

{'1001116876.pdf': <wand.image.Image: 886bc9a 'JPG' (6800x8800)>,
 '1001158801_PARENT_EngineeringDrawing.pdf': <wand.image.Image: 4c22a42 'JPG' (6800x8800)>,
 '1001160873.pdf': <wand.image.Image: f10085c 'JPG' (6800x8800)>,
 '1001163773.pdf': <wand.image.Image: 7e4b966 'JPG' (6800x8800)>}

In [17]:
# Build the pipeline over path2, limiting the batch to 4 pdf files.
ocr = OCR(path = path2,fileNumbers=4)

In [18]:
# Step 1: convert the pdf files to images; the result maps file name -> image.
img_dict = ocr.pdf2jpg()

processing: 25.00%
processing: 50.00%
processing: 75.00%
processing: 100.00%
pdf_to_image:100% completed!


In [19]:
# Step 2: OCR every image; returns the file names, the images and one text string per image.
drawings, pics, texts = ocr.jpg2text(img_dict = img_dict)

processing: 25.00%
processing: 50.00%
processing: 75.00%
processing: 100.00%
image_to_text: 100% completed


In [20]:
# Step 3: pull the weight / material regex matches out of each text into a DataFrame.
extracted = ocr.text2item(drawing_nums=drawings, texts=texts)

In [21]:
# Display the extraction result (columns: drawings_number, weight, material).
extracted

Unnamed: 0,drawings_number,weight,material
0,1001116876.pdf,[07265.5k],[MATERIAL:SEEPARTSLIST]
1,1001158801_PARENT_EngineeringDrawing.pdf,[],[MATERIAL:SEEPARTSLIST]
2,1001160873.pdf,"[ESTWI:103.2%, 46.8k]",[MATERIAL:4]
3,1001163773.pdf,[10|198XR502XRI242k],[MATERIAL:4150421]


In [None]:
# extracted.to_csv(r'S:\OSK-Share\DEPT\PURCHASING\000-GPSC Business Analysts\13 - GPSC Steel Model\JLG Engineering Drawings\extracted.csv')

In [22]:
# Find the position of one drawing so its raw OCR text can be inspected below.
drawings.index('1001163773.pdf')

3

In [23]:
# Raw OCR text at index 3, the position reported for '1001163773.pdf' by the lookup above.
texts[3]

'THIS PRINT IS THE PROPERTY OF JLG INDUSTRIES AND CONTAINS PROPRIETARY AND CONFIDENTIALOR DISCLOSED TO ANY THIRD PARTY WITHOUT JLG’S WRITTEN CONSENT ANDINFORMATION OF JLG.      |T MAY NOT BE COPIEDIS PROVIDED FOR THE LIMITED PURPOSE OF REVIEW AND EVALUATION.DRAWINGS CREATED IN Creo MCAD. UNLESS OTHERWISE SPECIFIED,ALL DIMENSIONS ARE IN MILLIMETERS.                                         NEXTASSY|FOR ANGLES BETWEEN 80-100DEGREES. ALL OTHER ANGLESPER SECTION 2.3.2.6 OFAWS D.1 OR EQUIVALENT   1001163761 Oc REF. DWG.     a oo [DRAWING NUMBER | REV.8 T 6 S th 4 3 c | =fio011637731 LDRAWING NUMBER[1001163773]AX R20 34X 1734X 2802X 44 | | 2X 1024 SEE DETAIL C 42X 425 ON 2X 4702X 575 ia \\ |AN \\ e/er4X R20(+ +) . : <____ 6ER 2x 71.24X R28 2X 557ex 4 4X 11992X 1935 |2X 2132.6F170.9 |2X 12902X 628S+3 LEAD IN = 530 AX B12.7 2X 802 2X 105- 8X REI |200+3 LEAD IN || 2X (90°) :600 578 2X 628 838 E10| 19 8X R502X RI242k=}578 102 DETAIL C 6 STK2X B42 SCALE 3:2 (0.25 STK]bee | 046 dDfog 1270 }———— 4K 