In [None]:
!pip install pymupdf



In [None]:
from operator import itemgetter
import fitz
import json

In [None]:
def fonts(doc, granularity=False):
    """Extract the font styles used in a PDF document and count their usage.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by usage count (most used first), and a
        dict mapping every identifier to its style information
    :raises ValueError: if no text span is found in the document
    """
    styles = {}
    font_counts = {}

    for page in doc:
        # PyMuPDF renamed ``getText`` to ``get_text``; prefer the current name
        # and fall back to the legacy alias so old installations keep working.
        get_text = getattr(page, "get_text", None) or page.getText
        for block in get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        identifier = "{0}_{1}_{2}_{3}".format(
                            span['size'], span['flags'], span['font'], span['color'])
                        styles[identifier] = {'size': span['size'], 'flags': span['flags'],
                                              'font': span['font'], 'color': span['color']}
                    else:
                        identifier = "{0}".format(span['size'])
                        styles[identifier] = {'size': span['size'], 'font': span['font']}

                    # count how often this style is used
                    font_counts[identifier] = font_counts.get(identifier, 0) + 1

    if not font_counts:
        raise ValueError("Zero discriminating fonts found!")

    return sorted(font_counts.items(), key=itemgetter(1), reverse=True), styles

In [None]:
def font_tags(font_counts, styles):
    """Return a dictionary with font sizes as keys and element tags as values.

    The most used style is treated as the paragraph style (``<p>``). Larger
    sizes become ``<h1>``, ``<h2>``, ... (largest first) and smaller sizes
    become ``<s1>``, ``<s2>``, ... (largest first).

    :param font_counts: (identifier, count) for all styles occurring in the
        document, most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font sizes
    """
    p_size = styles[font_counts[0][0]]['size']  # size of the most used style (paragraph)

    # Read the size from the style record instead of parsing the identifier:
    # granular identifiers look like "size_flags_font_color" and cannot be
    # passed to float(). A set keeps several styles that share one size from
    # inflating the heading/sub-text index.
    sizes = set()
    for identifier, _count in font_counts:
        style = styles.get(identifier)
        sizes.add(style['size'] if style is not None else float(identifier))

    # walk the sizes high to low so the right integer is appended to each tag
    size_tag = {}
    idx = 0
    for size in sorted(sizes, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below the paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        else:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [None]:
def _replace_in_content_streams(doc, search, replacement):
    """Replace ``search`` with ``replacement`` in every page content stream of ``doc``."""
    # PyMuPDF renamed its camelCase methods to snake_case; prefer the current
    # names and fall back to the legacy aliases on old installations.
    update_stream = getattr(doc, "update_stream", None) or doc.updateStream
    for page in doc:
        get_contents = getattr(page, "get_contents", None) or page.getContents
        for xref in get_contents():
            update_stream(xref, doc.xref_stream(xref).replace(search, replacement))
            print(doc.xref_stream(xref))


def headers_para(doc_name, doc, size_tag, search=b"Banking", replacement=b"*",
                 output_path="/content/BIAN-How-to-Guide-Applying-the-BIAN-Standard-V70-Final-V100.pdf"):
    """Scrape headers & paragraphs from a PDF and return the texts with element tags.

    After scraping, every occurrence of ``search`` in the page content streams
    is replaced by ``replacement`` and the modified document is saved to
    ``output_path``.

    :param doc_name: name of the document (currently unused)
    :type doc_name: str
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :param search: raw bytes to look for in the page content streams
    :type search: bytes
    :param replacement: raw bytes written in place of ``search``
    :type replacement: bytes
    :param output_path: where the modified document is saved; ``None`` skips saving
    :type output_path: str
    :rtype: list
    :return: ``[text, page_number]`` pairs; text has its element tag pre-pended
        and page numbers are 1-based
    """
    header_para = []  # list with headers and paragraphs
    previous_size = None  # size of the last non-blank span; None before the first one

    for page_number, page in enumerate(doc, start=1):
        get_text = getattr(page, "get_text", None) or page.getText
        for block in get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # text found in block
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span['text']
                    if not text.strip():  # skip whitespace-only spans
                        continue

                    size = span['size']
                    if previous_size is not None and size != previous_size:
                        # Size changed inside the block: flush what was
                        # collected so far. It belongs to the current page.
                        if block_string:
                            header_para.append([block_string, page_number])
                        block_string = size_tag[size] + text
                    elif not block_string or all(c == "|" for c in block_string):
                        # new block (or only pipe separators so far): start with the tag
                        block_string = size_tag[size] + text
                    else:  # in the same block, so concatenate strings
                        block_string += " " + text

                    previous_size = size

            if block_string:
                header_para.append([block_string, page_number])

    _replace_in_content_streams(doc, search, replacement)
    if output_path is not None:
        doc.save(output_path)

    return header_para

In [None]:
def main(document='/content/BIAN-How-to-Guide-Applying-the-BIAN-Standard-V70-Final-V10.pdf'):
    """Tag the headers and paragraphs of a PDF by font size.

    :param document: path of the PDF file to process
    :type document: str
    :rtype: list
    :return: the value returned by ``headers_para`` for the document
    """
    # The context manager closes the document even if a step below raises.
    with fitz.open(document) as doc:
        font_counts, styles = fonts(doc, granularity=False)
        size_tag = font_tags(font_counts, styles)
        return headers_para(document, doc, size_tag)


if __name__ == '__main__':
    main()

b'/Artifact <</Attached [/Top]/Type/Pagination/Subtype/Header>> BDC q\r\n0.000008871 0 595.32 841.92 re\r\nW* n\r\nBT\r\n/F1 12 Tf\r\n1 0 0 1 231.29 746.98 Tm\r\n/GS7 gs\r\n0 g\r\n/GS8 gs\r\n0 G\r\n[( )] TJ\r\nET\r\nQ\r\nq\r\n0.000008871 0 595.32 841.92 re\r\nW* n\r\nQ\r\nq\r\n0.000008871 0 595.32 841.92 re\r\nW* n\r\nBT\r\n/F1 12 Tf\r\n1 0 0 1 70.824 735.7 Tm\r\n0 g\r\n0 G\r\n[( )] TJ\r\nET\r\nQ\r\nq\r\n0.000008871 0 595.32 841.92 re\r\nW* n\r\nQ\r\nq\r\n0.000008871 747 595.32 59.52 re\r\nW* n\r\n158.5 0 0 59.5 72.35 747.02 cm\r\n/Image9 Do Q\r\n EMC  /P <</MCID 0>> BDC q\r\n0.000008871 0 595.32 841.92 re\r\nW* n\r\nBT\r\n/F1 24 Tf\r\n1 0 0 1 297.65 710.62 Tm\r\n0.753 g\r\n0.753 G\r\n[( )] TJ\r\nET\r\nQ\r\n EMC  /P <</MCID 1>> BDC q\r\n0.000008871 0 595.32 841.92 re\r\nW* n\r\nBT\r\n/F1 24 Tf\r\n1 0 0 1 297.65 678.1 Tm\r\n0.753 g\r\n0.753 G\r\n[( )] TJ\r\nET\r\nQ\r\n EMC  /P <</MCID 2>> BDC q\r\n0.000008871 0 595.32 841.92 re\r\nW* n\r\nBT\r\n/F1 24 Tf\r\n1 0 0 1 208.25 645.58 Tm\r\n0

In [None]:
# NOTE(review): busy-waits forever and never yields; presumably meant to keep
# the notebook session alive — confirm before running, the cell must be
# interrupted manually.
while True:pass

In [None]:
!jupyter notebook --NotebookApp.iopub_data_rate_limit=9000000.0
!jupyter notebook --NotebookApp.rate_limit_window=12.0

### To Get Pages from PDF containing images/tables as separate images with its image TAG 

In [None]:
def headers_para(doc_name, doc, size_tag, marker='<s6>F', max_length=100):
    """Collect short tagged texts that contain ``marker`` and the pages they occur on.

    Text is accumulated per text block exactly like a header/paragraph scrape,
    but only accumulated strings that contain ``marker`` and are shorter than
    ``max_length`` characters are kept.

    :param doc_name: name of the document (currently unused)
    :type doc_name: str
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :param marker: substring a tagged text must contain to be kept
    :type marker: str
    :param max_length: kept texts must be shorter than this many characters
    :type max_length: int
    :rtype: (list, list)
    :return: ``[text, page_number]`` pairs with pre-pended element tags, and the
        parallel list of 1-based page numbers
    """
    header_para = []  # kept texts with their page number
    img_pages = []  # page number of every kept text, parallel to header_para
    previous_size = None  # size of the last non-blank span; None before the first one

    for page_number, page in enumerate(doc, start=1):
        # PyMuPDF renamed ``getText`` to ``get_text``; prefer the current name
        # and fall back to the legacy alias so old installations keep working.
        get_text = getattr(page, "get_text", None) or page.getText

        def keep_if_match(text):
            # Both lists record the page the text was found on, so the two
            # results always agree with each other.
            if text and marker in text and len(text) < max_length:
                header_para.append([text, page_number])
                img_pages.append(page_number)

        for block in get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # text found in block
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span['text']
                    if not text.strip():  # skip whitespace-only spans
                        continue

                    size = span['size']
                    if previous_size is not None and size != previous_size:
                        # size changed inside the block: flush what was collected
                        keep_if_match(block_string)
                        block_string = size_tag[size] + text
                    elif not block_string or all(c == "|" for c in block_string):
                        # new block (or only pipe separators so far): start with the tag
                        block_string = size_tag[size] + text
                    else:  # in the same block, so concatenate strings
                        block_string += " " + text

                    previous_size = size

            keep_if_match(block_string)

    return header_para, img_pages

In [None]:
def main(document='/content/BIAN-How-to-Guide-Applying-the-BIAN-Standard-V70-Final-V10.pdf'):
    """Render every page that carries a tagged image/table caption to a PNG file.

    Each PNG is named after the caption text with the ``<s6>`` tag removed.

    :param document: path of the PDF file to process
    :type document: str
    """
    # The context manager closes the document even if a step below raises.
    with fitz.open(document) as doc:
        font_counts, styles = fonts(doc, granularity=False)
        size_tag = font_tags(font_counts, styles)
        elements, img_pages = headers_para(document, doc, size_tag)

        print(elements)
        print(len(elements))
        print(img_pages)
        print(len(img_pages))

        # PyMuPDF renamed its camelCase methods to snake_case; prefer the
        # current names and fall back to the legacy aliases on old installations.
        load_page = getattr(doc, "load_page", None) or doc.loadPage
        for element, page_number in zip(elements, img_pages):
            page = load_page(page_number - 1)  # img_pages is 1-based, pages are 0-based
            get_pixmap = getattr(page, "get_pixmap", None) or page.getPixmap
            pix = get_pixmap()
            # A "/" in the caption would be read as a directory separator.
            output = element[0].replace('<s6>', '').replace('/', '_') + '.png'
            save_png = getattr(pix, "save", None) or pix.writePNG
            save_png(output)


if __name__ == '__main__':
    main()

In [None]:
# Replace the bytes b'Banking' with b'Sample' in every page content stream.
# NOTE(review): `doc` is not defined at module level anywhere in this file
# (both main() functions keep it local) — this cell assumes an open
# fitz.Document named `doc` already exists in the session; confirm.
# The public snake_case names are preferred; the private legacy names the
# cell originally used remain as a fallback for old PyMuPDF installations.
read_stream = getattr(doc, "xref_stream", None) or doc._getXrefStream
write_stream = getattr(doc, "update_stream", None) or doc._updateStream
for page in doc:
    list_contents = getattr(page, "get_contents", None) or page._getContents
    for xref in list_contents():
        stream = read_stream(xref).replace(b'Banking', b'Sample')
        write_stream(xref, stream)

In [None]:
!pip install PyPDF2

In [None]:
import PyPDF2

# Extract the text of page index 8 and strip its newlines.
# The file is opened in a ``with`` block so the handle is always closed.
with open('/content/BIAN-How-to-Guide-Applying-the-BIAN-Standard-V70-Final-V10.pdf', 'rb') as pdf_file:
    if hasattr(PyPDF2, "PdfReader"):
        # Current API: PdfFileReader/getPage/extractText were deprecated and
        # raise in PyPDF2 3.x.
        pdfReader = PyPDF2.PdfReader(pdf_file)
        page_text = pdfReader.pages[8].extract_text()
    else:
        # Legacy API for old PyPDF2 installations.
        pdfReader = PyPDF2.PdfFileReader(pdf_file)
        page_text = pdfReader.getPage(8).extractText()

page_text = page_text.replace('\n', '')
page_text  # last expression of the cell, so the notebook displays it