In [11]:
#coding: utf-8
import os
import xlrd
import re
import shutil
import sys
import threading
import time
from datetime import datetime

import pdfplumber

import pyzbar.pyzbar as pyzbar
import numpy as np
import pandas as pd
import cv2
import fitz
from io import BytesIO
from PIL import Image
import qrcode

In [12]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar

def parse_pdf_with_PdfMiner3K(path):
    """Return the text of a PDF as extracted by pdfminer's layout analysis.

    Every page is run through a ``PDFPageAggregator``; the text of each
    top-level ``LTTextBox`` / ``LTTextLine`` layout object is appended in the
    order pdfminer yields it.

    Args:
        path: Path of the PDF file to read.

    Returns:
        One string holding the concatenated text of all pages.
    """
    # Margins are set to 1.0 instead of pdfminer's defaults.
    layout_params = LAParams()
    layout_params.char_margin = 1.0
    layout_params.word_margin = 1.0

    chunks = []
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=layout_params)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # Only top-level text containers are collected; figures and
            # images are skipped.
            chunks.extend(
                item.get_text()
                for item in device.get_result()
                if isinstance(item, (LTTextBox, LTTextLine))
            )
    return ''.join(chunks)

In [69]:

class QPDFInvoiceExtraction:
    """Extract structured fields from VAT e-invoice PDFs.

    Two sources are combined by :meth:`q_extract`:

    * the QR code rendered on the first page (invoice code, invoice number,
      amount, issue date, check code), and
    * the first page's text and first table as read by pdfplumber (issuing
      region, buyer/seller names and tax ids, totals).
    """

    # Candidate layouts of the comma-separated QR payload, tried in order.
    # 's0', 's1' and 's7' name positions that are dropped from the result.
    qrcode_keys = [
        ['s0', 's1', '发票代码', '发票号码', '金额', '开票日期', '校验码', 's7'],
        ['s0', 's1', '发票代码', '发票号码', '销售方税号', '金额', '开票日期', '校验码'],
    ]

    # Patterns searched row by row inside the first extracted table.
    local_regex_search_dict = {
        '购买方税号' : re.compile(r'购\s*买\s*方.*(?P<field>[0-9a-zA-Z]{18}).*地\s*址\s*、\s*电\s*话', re.S),
        '购买方名称' : re.compile(r'购\s*买\s*方.*名\s*称:(?P<field>.*)\n*', re.M),
        '价税合计(小写)': re.compile(r'价税合计.*[¥￥](?P<field>\d+\.\d+)', re.S),
        '价税合计(大写)': re.compile(r'(?P<field>[壹贰叁肆伍陆柒捌玖拾]\s?[零壹贰叁肆伍陆柒捌玖拾佰仟万亿整元圆角分\s]+[整元圆角分])'),
        '销售方税号': re.compile(r'销\s*售\s*方.*(?P<field>[0-9a-zA-Z]{18}).*地\s*址\s*、\s*电\s*话', re.S),
        '销售方名称' : re.compile(r'销\s*售\s*方.*名\s*称:(?P<field>.*)\n*', re.M),
    }

    # Patterns searched in the whole page text: the issuing-region title plus
    # the same patterns used for the table rows (previously duplicated by hand).
    global_regex_search_dict = {
        '开票地': re.compile(r'(?P<field>.*增值税电子((普通)|(专用))发票)'),
        **local_regex_search_dict,
    }

    # Patterns used with findall() on the page text. When exactly two hits are
    # found they are taken as (buyer, seller) in order of appearance.
    global_regex_findall_dict = {
        '税号列表' : re.compile(r'纳税人识别号(:|：)\s*(?P<field>[0-9a-zA-Z]{18})\s+', re.M),
        '名称列表' : re.compile(r'名.*称(:|：)\s*(?P<field>\S*)\s+', re.M),
    }

    def q_display(self, img):
        """Outline every barcode / QR code found in ``img`` and show the result.

        The outlines are drawn onto ``img`` itself. The call blocks until a
        key is pressed in the OpenCV window, which is then destroyed.

        Args:
            img: Image array accepted by ``pyzbar.decode`` and ``cv2.line``.
        """
        for decoded in pyzbar.decode(img):
            points = decoded.polygon

            if len(points) > 4:
                # More than four corners: fall back to the convex hull.
                hull = cv2.convexHull(np.array([point for point in points], dtype=np.float32))
                # cv2.line needs integer pixel coordinates; the hull is float32.
                outline = [tuple(int(round(float(v))) for v in pt) for pt in hull.reshape(-1, 2)]
            else:
                outline = [(int(pt[0]), int(pt[1])) for pt in points]

            n = len(outline)
            for j in range(n):
                cv2.line(img, outline[j], outline[(j + 1) % n], (255, 0, 0), 3)

        cv2.imshow("Results", img)
        cv2.waitKey(0)
        # Release the window so repeated calls do not pile up stale windows.
        cv2.destroyWindow("Results")

    def _render_first_page_png(self, doc):
        """Render page 0 of an open fitz document to PNG bytes at 2x zoom."""
        page = doc[0]
        # NOTE(review): newer PyMuPDF releases renamed getPixmap/getPNGdata to
        # get_pixmap/tobytes; prefer the new names and fall back to the ones
        # this notebook was written against.
        render = getattr(page, 'get_pixmap', None) or page.getPixmap
        pix = render(matrix=fitz.Matrix(2, 2), alpha=False)
        if hasattr(pix, 'tobytes'):
            return pix.tobytes('png')
        return pix.getPNGdata()

    def _parse_qrcode_values(self, values):
        """Turn the split QR payload into named, typed invoice fields.

        Each layout in ``qrcode_keys`` is tried in order; the first one whose
        amount parses as a number and whose date parses as ``%Y%m%d`` wins.
        If none fits, the raw string mapping of the last layout is returned
        (placeholders removed), as before.

        Args:
            values: List of strings obtained by splitting the payload on ','.

        Returns:
            Dict of field name to value; '金额' is a float and '开票日期' a
            ``datetime.date`` when a layout matched.
        """
        ret = {}
        for keys in self.qrcode_keys:
            ret = dict(zip(keys, values))
            try:
                # float() instead of eval(): the payload is read from the
                # scanned document and must never be executed as code.
                amount = float(ret['金额'])
                issue_date = datetime.strptime(ret['开票日期'], '%Y%m%d').date()
            except (KeyError, TypeError, ValueError):
                # Wrong layout for this payload (or payload too short).
                continue
            ret['金额'] = amount
            ret['开票日期'] = issue_date
            break
        for key in ('s0', 's1', 's7'):
            ret.pop(key, None)
        return ret

    def q_extract_qrcode_info(self, file_path, display=False):
        """Decode the QR code on the first page of a PDF invoice.

        Args:
            file_path: Path of the PDF file.
            display: When true, also show the page with the detected code
                outlined (see :meth:`q_display`).

        Returns:
            Dict with the fields carried by the QR code, or ``{}`` when the
            page has no decodable code or any step fails (the error is
            printed, not raised).
        """
        try:
            doc = fitz.open(file_path)
            try:
                png = self._render_first_page_png(doc)
            finally:
                # Close explicitly so the file handle is released right away
                # (an open handle blocks renaming the file on Windows).
                doc.close()
            img_np = cv2.imdecode(np.frombuffer(png, np.uint8), cv2.IMREAD_COLOR)
            decoded_objects = pyzbar.decode(img_np)
            if not decoded_objects:
                print('no QR code found in %s' % file_path)
                return {}
            values = decoded_objects[0].data.decode(encoding='utf-8').split(',')
            if display:
                self.q_display(img_np)
            print(values)
            return self._parse_qrcode_values(values)
        except Exception as e:
            print(e)
            return {}

    def _parse_invoice_text(self, text, tables):
        """Pull invoice fields out of extracted page text and tables.

        Args:
            text: Text of the page; ``None`` is treated as empty.
            tables: Tables of the page as nested lists (table -> row -> cell);
                only the first table is searched and cells may be ``None``.
                ``None`` or an empty list means "no table".

        Returns:
            Dict of the fields that could be found (possibly empty).
        """
        text = text or ''
        rows = tables[0] if tables else []
        row_texts = [''.join(cell for cell in row if cell is not None) for row in rows]

        ret = {}
        # 1) Row-by-row search in the first table; first matching row wins.
        for key, pattern in self.local_regex_search_dict.items():
            for row_text in row_texts:
                mt = pattern.search(row_text)
                if mt:
                    ret[key] = mt.group('field').replace(' ', '')
                    break
        # 2) Whole-text search for anything still missing or empty.
        for key, pattern in self.global_regex_search_dict.items():
            if ret.get(key, '') != '':
                continue
            mt = pattern.search(text)
            if mt:
                ret[key] = mt.group('field').replace(' ', '')
        # 3) Last resort: if the text holds exactly two tax ids / names, take
        #    them as buyer then seller. Both entries are overwritten as soon
        #    as either one is missing.
        for list_key, buyer_key, seller_key in (
            ('税号列表', '购买方税号', '销售方税号'),
            ('名称列表', '购买方名称', '销售方名称'),
        ):
            if ret.get(buyer_key, '') != '' and ret.get(seller_key, '') != '':
                continue
            found = self.global_regex_findall_dict[list_key].findall(text)
            if len(found) == 2:
                # findall yields (colon, field) tuples for these patterns.
                ret[buyer_key] = found[0][1].replace(' ', '')
                ret[seller_key] = found[1][1].replace(' ', '')
        return ret

    def q_extract_pdf_info(self, file_path):
        """Extract invoice fields from the text and tables of page 0.

        Args:
            file_path: Path of the PDF file.

        Returns:
            Dict of extracted fields, or ``{}`` when reading the PDF fails
            (the error is printed, not raised). A page without any table no
            longer discards the fields found in the plain text.
        """
        try:
            with pdfplumber.open(file_path) as pdf:
                page0 = pdf.pages[0]
                text = page0.extract_text()
                tables = page0.extract_tables()
            return self._parse_invoice_text(text, tables)
        except Exception as e:
            print(e)
            return {}

    def q_extract(self, file_path):
        """Extract all available fields for one invoice PDF.

        QR-code fields take precedence over text fields with the same key;
        the file's base name is added under '文件名'.
        """
        ret = {**self.q_extract_pdf_info(file_path), **self.q_extract_qrcode_info(file_path)}
        ret['文件名'] = os.path.basename(file_path)
        return ret

In [81]:
def q_extract_pdf_invoices(input_dir='./', output_file='output.xlsx', recursive=True, normalize=False):
    """Extract every PDF invoice under a directory into one Excel sheet.

    Args:
        input_dir: Directory to scan for ``.pdf`` files (extension matched
            case-insensitively).
        output_file: Path of the Excel file to write, one row per PDF.
        recursive: When false, only ``input_dir`` itself is scanned.
        normalize: When true, each PDF for which both an invoice number and a
            total were extracted is renamed to ``<number>_<total>.pdf`` in its
            own directory. An existing file is never overwritten, and a
            failed rename is reported and skipped so the sheet is still
            written.

    Returns:
        None. The result is written to ``output_file``.
    """
    ie = QPDFInvoiceExtraction()

    records = []
    for root, _folders, files in os.walk(input_dir):
        for file_name in files:
            if not file_name.lower().endswith('.pdf'):
                continue
            src_pdf = os.path.join(root, file_name)
            ret = ie.q_extract(src_pdf)
            records.append(ret)
            if normalize and '发票号码' in ret and '价税合计(小写)' in ret:
                new_name = '%s_%s.pdf' % (ret['发票号码'], ret['价税合计(小写)'])
                dst_pdf = os.path.join(root, new_name)
                if os.path.basename(new_name) != new_name:
                    # The number comes from the document itself; refuse names
                    # that would move the file to another directory.
                    print('skip rename of %s: unsafe target name %r' % (src_pdf, new_name))
                elif os.path.abspath(dst_pdf) == os.path.abspath(src_pdf):
                    pass  # already carries the normalized name
                elif os.path.exists(dst_pdf):
                    # os.rename would silently replace the file on POSIX.
                    print('skip rename of %s: %s already exists' % (src_pdf, dst_pdf))
                else:
                    try:
                        os.rename(src_pdf, dst_pdf)
                    except OSError as e:
                        print(e)
        if not recursive:
            # os.walk yields the top directory first; stop before descending.
            break

    pd.DataFrame(records).to_excel(output_file, index=False)

In [82]:
# Batch run over data/invoice with the defaults: recursive scan, no renaming,
# result written to output.xlsx. Each line printed below is one raw QR payload.
q_extract_pdf_invoices('data/invoice')

['01', '10', '011001900111', '89750186', '109.17', '20190306', '62121462210420364099', '53C8', '']
['01', '10', '021001900111', '23763823', '85.33', '20190322', '70446058541415909133', 'E4CD', '']
['01', '10', '011001900111', '91893712', '1238.00', '20190406', '80411517403942583178', 'BFC6', '']
['01', '10', '011001900111', '41971196', '29.03', '20190407', '58931841401388187442', '154C', '']
['01', '10', '012001800311', '61399056', '113.84', '20190511', '12169070313878525689', '384D', '']
['01', '10', '037021800211', '35124513', '188.80', '20190322', '58045030901365657124', '5BE7', '']
['01', '10', '012001800111', '77169995', '132.04', '20190511', '17894245662825932250', '568B', '']
['01', '10', '013001720011', '68461693', '77.07', '20190312', '49041258811560453196', '0DCA', '']
['01', '10', '037021800211', '37273696', '111.03', '20190328', '66581827951932721688', '758C', '']
['01', '10', '050001800111', '33395352', '26.42', '20190415', '54394378043250846173', '4CCC']


In [71]:
# Sample invoice used by the exploratory cells; uncomment another name to switch.
# pdf_file_name = '日用品20190322_98p99'
pdf_file_name = '滴滴电子发票20190404_117p26'
# pdf_file_name = '飞常准用车电子发票20190419_136'
# pdf_file_name = '日用品20190417_197p98'
# pdf_file_name = '食品20190408_91p8'
# Built with os.path.join rather than hard-coded backslashes so the path also
# resolves on non-Windows hosts (on Windows the resulting string is unchanged).
path = os.path.join('data', 'invoice', '{pdf_file_name}.pdf'.format(pdf_file_name=pdf_file_name))

In [72]:
# QR-code fields only for the sample invoice; display=False skips the OpenCV window.
ie = QPDFInvoiceExtraction()
ie.q_extract_qrcode_info(path, False)

['01', '10', '012001800311', '61399056', '113.84', '20190511', '12169070313878525689', '384D', '']


{'发票代码': '012001800311',
 '发票号码': '61399056',
 '金额': 113.84,
 '开票日期': datetime.date(2019, 5, 11),
 '校验码': '12169070313878525689'}

In [73]:
# Text/table-derived fields only for the sample invoice.
ie.q_extract_pdf_info(path)

{'开票地': '天津增值税电子普通发票',
 '价税合计(小写)': '117.26',
 '价税合计(大写)': '壹佰壹拾柒圆贰角陆分',
 '购买方税号': '91110105306602928Y',
 '销售方税号': '911201163409833307',
 '购买方名称': '北京慧远云程科技有限公司',
 '销售方名称': '滴滴出行科技有限公司'}

In [74]:
# Combined result: text/table fields, QR-code fields, and the file name.
ie.q_extract(path)

['01', '10', '012001800311', '61399056', '113.84', '20190511', '12169070313878525689', '384D', '']


{'开票地': '天津增值税电子普通发票',
 '价税合计(小写)': '117.26',
 '价税合计(大写)': '壹佰壹拾柒圆贰角陆分',
 '购买方税号': '91110105306602928Y',
 '销售方税号': '911201163409833307',
 '购买方名称': '北京慧远云程科技有限公司',
 '销售方名称': '滴滴出行科技有限公司',
 '发票代码': '012001800311',
 '发票号码': '61399056',
 '金额': 113.84,
 '开票日期': datetime.date(2019, 5, 11),
 '校验码': '12169070313878525689',
 '文件名': '滴滴电子发票20190404_117p26.pdf'}

In [75]:
# Raw text of the sample invoice as extracted by the pdfminer-based helper;
# the bare name on the last line displays it.
text_3k = parse_pdf_with_PdfMiner3K(path)
text_3k

'机器编号:\n499099801283\n购\n买\n方\n名\u3000\u3000\u3000\u3000称:\n纳税人识别号:\n北京慧远云程科技有限公司\n91110105306602928Y\n地 址、\n电 话:\n开户行及账号:\n天津增值税电子普通发票\n发票代码:\n发票号码:\n开票日期:\n校验码:\n012001800311\n61399056\n2019年05月11日\n12169 07031 38785 25689\n密\n码\n区\n0316962*6+4-668437811-4--04-\n00068797/13-<*7+/673<-/81/42\n//61-6*58>84*86876637681+5>-\n>*65//95+6010+2319*<>28+785>\n货物或应税劳务、服务名称\n规格型号\n*运输服务*客运服务费\n无\n单位\n次\n数\u3000量\n单\u3000价\n金\u3000额\n1\n113.84\n113.84\n税率\n3%\n税\u3000额\n3.42\n合\n计\n价税合计（大写）\n壹佰壹拾柒圆贰角陆分\n￥113.84\n￥3.42\n（小写）\n￥117.26\n销\n售\n方\n名\u3000\u3000\u3000\u3000称:\n纳税人识别号:\n地 址、\n电 话:\n滴滴出行科技有限公司\n911201163409833307\n天津经济技术开发区南港工业区综合服务区办公楼C座103室12单元022-59002850\n开户行及账号:\n招商银行股份有限公司天津自由贸易试验区分行122905939910401\n备\n注\n收 款 人:\n张力强\n复 核:\n蔡静\n开 票 人:\n王秀丽\n销 售 方:（章）\n'

In [76]:
# Scratch cell: try a candidate pattern against the pdfminer text. The active
# pattern captures the line that is immediately followed by a line made of
# exactly 18 alphanumeric characters.
# re_test = re.compile(r'购\s*买\s*方\n(.*\n)*?(?P<field>[0-9a-zA-Z]{18})', re.M)
re_test = re.compile(r'(.*\n)*\n(?P<field>.*)\n[0-9a-zA-Z]{18}\n', re.M)
# re_test = re.compile(r'销\s*售\s*方\n(.*\n)*?(?P<field>[0-9a-zA-Z]{18})', re.M)
# re_test = re.compile(r'价税合计.*[¥￥](?P<field>\d+\.\d+)', re.S)
# re_test = re.compile(r'(?P<field>[壹贰叁肆伍陆柒捌玖拾]\s?[零壹贰叁肆伍陆柒捌玖拾佰仟万亿整元圆角分\s]+[整元圆角分])')
mt_test = re_test.search(text_3k)
print(mt_test)
# Always bind ret_test: without a match the old code raised NameError on the
# print below (or silently showed a value left over from an earlier run).
ret_test = mt_test.group('field') if mt_test else None
print(ret_test)

<re.Match object; span=(40, 73), match='\n北京慧远云程科技有限公司\n91110105306602928Y\n'>
北京慧远云程科技有限公司


In [77]:
# Extract the first page's text with pdfplumber for comparison with text_3k.
with pdfplumber.open(path) as pdf:
    page0 = pdf.pages[0]
    text_plumber = page0.extract_text()

In [78]:
# Display the raw pdfplumber text.
text_plumber

'天津增值税电子普通发票 发票代码:012001800311\n发票号码:61399056\n开票日期:2019年05月11日\n机器编号: 499099801283 校 验 码:12169 07031 38785 25689\n名\u3000\u3000\u3000\u3000称: 北京慧远云程科技有限公司 0316962*6+4-668437811-4--04-\n购 密\n纳税人识别号: 91110105306602928Y 00068797/13-<*7+/673<-/81/42\n买 码\n地 址、电 话: //61-6*58>84*86876637681+5>-\n方 区\n开户行及账号: >*65//95+6010+2319*<>28+785>\n货物或应税劳务、服务名称 规格型号 单位 数\u3000量 单\u3000价 金\u3000额 税率 税\u3000额\n*运输服务*客运服务费 无 次 1 113.84 113.84 3% 3.42\n合 计 ￥113.84 ￥3.42\n价税合计（大写） 壹佰壹拾柒圆贰角陆分 （小写）￥117.26\n名\u3000\u3000\u3000\u3000称:滴滴出行科技有限公司\n销 备\n纳税人识别号:911201163409833307\n售\n地 址、电 话:天津经济技术开发区南港工业区综合服务区办公楼C座103室12单元022-59002850\n方 注\n开户行及账号:招商银行股份有限公司天津自由贸易试验区分行122905939910401\n收 款 人: 张力强 复 核: 蔡静 开 票 人: 王秀丽 销 售 方:（章）'

In [79]:
# Scratch cell: run the tax-id findall pattern against the pdfplumber text.
# findall returns one (colon, field) tuple per match because the pattern has
# two capturing groups.
# re_test2 = re.compile(r'价税合计.*[¥￥](?P<field>\d+\.\d+)', re.S)
# re_test2 = re.compile(r'(?P<field>[壹贰叁肆伍陆柒捌玖拾]\s?[零壹贰叁肆伍陆柒捌玖拾佰仟万亿整元圆角分\s]+[整元圆角分])')
# re_test2 = re.compile(r'名.*称(:|：)\s*(?P<field>\S*)\s+', re.M)
re_test2 = re.compile(r'纳税人识别号(:|：)\s*(?P<field>[0-9a-zA-Z]{18})\s+', re.M)
# mt_test2 = re_test2.search(text_plumber)
mt_test2 = re_test2.findall(text_plumber)
print(mt_test2)
# if mt_test2:
#     ret_test2 = mt_test2.groupdict()['field']
# print(ret_test2)

[(':', '91110105306602928Y'), (':', '911201163409833307')]


In [80]:
# Scratch cell: list every match of the name pattern in the pdfplumber text,
# printing the match object and then its captured 'field' group.
re_test3 = re.compile(r'名.*称(:|：)\s*(?P<field>\S*)\s+', re.M)
# re_test3 = re.compile(r'纳税人识别号(:|：)\s*(?P<field>[0-9a-zA-Z]{18})\s+', re.M)
mt_test3 = re_test3.finditer(text_plumber)
for m in mt_test3:
    print(m)
    print(m.groupdict()['field'])

<re.Match object; span=(110, 131), match='名\u3000\u3000\u3000\u3000称: 北京慧远云程科技有限公司 '>
北京慧远云程科技有限公司
<re.Match object; span=(433, 451), match='名\u3000\u3000\u3000\u3000称:滴滴出行科技有限公司\n'>
滴滴出行科技有限公司
