In [3]:
from pathlib import Path
from typing import Iterable, Any
from copy import deepcopy
import json
import re

from pdfminer.high_level import extract_pages

class PDFExtractor:
    """Turn the pdfminer layout tree of a PDF into rows of text and Markdown.

    Every ``LTTextBoxHorizontal`` met during extraction becomes one row (a
    dict shaped like ``pdfDataObjEmpty``) appended to ``pdfDataArr``. While
    the characters of a box are visited, the per-box accumulator
    ``pdfDataObj`` counts rounded font sizes and records runs of bold
    characters. ``toMarkdown`` then maps the largest dominant font sizes to
    heading levels, wraps bold runs in ``**`` and writes the result to disk.
    """

    def __init__(self):
        # Template for the per-text-box accumulator. It is always deep-copied
        # so the nested dict/list values are never shared between rows.
        self.pdfDataObjEmpty = {
            "string": "",
            "fontSizeMedian": 0,
            "fontSizes": {},
            "headingLevel": 0,
            "decoratedTexts": [],
            "decoratedTextCollector": {
                "start": -1,
                "end": -1,
                "decoration": ""
            },
            "ctr": 0,
            "offset": 0,
        }

        self.pdfDataArr = []
        self.pdfDataObj = deepcopy(self.pdfDataObjEmpty)
        self.blacklist = {
            "strings": [],
            "regex": []
        }

        self.replacements = {}

        # Dominant font size of every row pushed so far.
        self.fontSizes: set = set()
        # Set by evalHeadings(); evalTextDecorations() is a no-op until then.
        self.isHeadingEvaluated = False

    def get_indented_name(self, o: Any, depth: int) -> str:
        """Return the class name of *o*, indented two spaces per *depth*."""
        return '  ' * depth + o.__class__.__name__

    def get_name(self, o: Any) -> str:
        """Return the class name of *o*."""
        return o.__class__.__name__

    def get_optional_fontinfo(self, o: Any) -> str:
        """Return ``"<fontname> <size>pt"`` if *o* has both attributes, else ``""``."""
        if hasattr(o, 'fontname') and hasattr(o, 'size'):
            return f'{o.fontname} {round(o.size)}pt'
        return ''

    def get_optional_text(self, o: Any) -> str:
        """Return the stripped ``get_text()`` of *o* if available, else ``""``."""
        if hasattr(o, 'get_text'):
            return o.get_text().strip()
        return ''

    def show_ltitem_hierarchy(self, o: Any, depth=0):
        """Print class name, font info and text of *o* and all its descendants."""
        if depth == 0:
            print('element                        fontname             text')
            print('------------------------------ -------------------- -----')

        print(
            f'{self.get_indented_name(o, depth):<30.30s} '
            f'{self.get_optional_fontinfo(o):<20.20s} '
            f'{self.get_optional_text(o)}'
        )

        if isinstance(o, Iterable):
            for i in o:
                self.show_ltitem_hierarchy(i, depth=depth + 1)

    def resetData(self):
        """Drop all rows and per-document state gathered by a previous run."""
        self.pdfDataArr = []
        self.pdfDataObj = deepcopy(self.pdfDataObjEmpty)
        # Font sizes and the heading flag describe the rows that were just
        # discarded; keeping them would skew heading levels of the next file.
        self.fontSizes = set()
        self.isHeadingEvaluated = False

    def setBlacklist(self, blacklist):
        """Set the blacklist: a dict with ``"strings"`` and ``"regex"`` lists."""
        self.blacklist = blacklist

    def setReplacements(self, replacements):
        """Set the ``{old: new}`` substring replacements applied to every row."""
        self.replacements = replacements

    def isObjValid(self):
        """Return True if the current accumulator holds text worth keeping.

        The stripped text must be non-empty, must not equal any blacklisted
        string and must not match (``re.search``) any blacklisted pattern.
        """
        testStr = self.pdfDataObj["string"].strip()

        if testStr == "":
            return False

        if testStr in self.blacklist["strings"]:
            return False

        return not any(re.search(r, testStr) for r in self.blacklist["regex"])

    def replace(self):
        """Normalise the accumulator's text in place.

        Applies the replacements, turns non-printable characters into spaces,
        strips leading whitespace (adding the number of removed characters to
        ``offset``) and applies the replacements once more.
        """
        obj = self.pdfDataObj
        text = obj["string"]

        for old, new in self.replacements.items():
            text = text.replace(old, new)

        text = ''.join(c if c.isprintable() else ' ' for c in text)

        stripped = text.lstrip()
        obj["offset"] += len(text) - len(stripped)

        # Second pass kept from the original implementation: the cleanup above
        # can produce new occurrences of a replacement key.
        for old, new in self.replacements.items():
            stripped = stripped.replace(old, new)

        obj["string"] = stripped

    def evalHeadings(self, threshold: int):
        """Assign heading levels from the *threshold* largest dominant font sizes.

        The largest size becomes level 1, the next level 2, and so on. Rows
        whose dominant size is not among them get level 0, so calling this
        again with another threshold does not leave stale levels behind.
        """
        headingFonts = sorted(self.fontSizes, reverse=True)[:threshold]
        levelBySize = {size: idx + 1 for idx, size in enumerate(headingFonts)}

        for row in self.pdfDataArr:
            row["headingLevel"] = levelBySize.get(row["fontSizeMedian"], 0)

        self.isHeadingEvaluated = True

    def evalTextDecorations(self):
        """Wrap recorded bold runs in ``**`` and merge continuation rows.

        Does nothing until ``evalHeadings`` has run. Heading rows are never
        decorated. NOTE: not idempotent; a second call wraps the already
        decorated text again.
        """
        if not self.isHeadingEvaluated:
            return

        for row in self.pdfDataArr:
            if row["headingLevel"] > 0:
                continue

            # Recorded positions predate the leading-whitespace strip done in
            # replace(); every inserted pair of "**" shifts later runs by 4.
            offset = -row["offset"]

            for decoration in row["decoratedTexts"]:
                left = max(decoration["start"] + offset, 0)
                right = decoration["end"] + 1 + offset
                offset += 4

                text = row["string"]
                row["string"] = text[:left] + "**" + text[left:right] + "**" + text[right:]

        # Merge a row into its predecessor when the predecessor is a heading
        # starting with "- " and the row itself does not start with "- ".
        # Starting from the second row avoids comparing the first row with
        # the last one (index -1).
        merged = []
        for row in self.pdfDataArr:
            prev = merged[-1] if merged else None
            if (
                prev is not None
                and prev["string"].startswith("- ")
                and not row["string"].startswith("- ")
                and prev["headingLevel"] > 0
            ):
                prev["string"] += " " + row["string"]
            else:
                merged.append(row)
        self.pdfDataArr[:] = merged

    def getMedianFont(self):
        """Store the accumulator's most frequent font size as ``fontSizeMedian``.

        Despite the name this is the mode, not the median; on ties the size
        seen first wins. If no font size was recorded the default is kept.
        """
        counts = self.pdfDataObj["fontSizes"]
        if not counts:
            return

        dominant = max(counts, key=counts.get)
        self.pdfDataObj["fontSizeMedian"] = dominant
        self.fontSizes.add(dominant)

    def pushObj2Arr(self):
        """Finalise the accumulator, append it to ``pdfDataArr`` and start anew.

        Invalid accumulators (see ``isObjValid``) are discarded instead.
        """
        if not self.isObjValid():
            self.pdfDataObj = deepcopy(self.pdfDataObjEmpty)
            return

        self.getMedianFont()
        self.replace()

        self.pdfDataObj["string"] = self.pdfDataObj["string"].strip()

        # A bold run that reaches the end of the box is still open here.
        collector = self.pdfDataObj["decoratedTextCollector"]
        if collector["start"] != -1:
            self.pdfDataObj["decoratedTexts"].append(deepcopy(collector))

        self.pdfDataArr.append(self.pdfDataObj)

        self.pdfDataObj = deepcopy(self.pdfDataObjEmpty)

    def setObjFontInfo(self, font):
        """Count one character of the size encoded in *font*.

        *font* has the form produced by ``get_optional_fontinfo``; the size is
        the integer before the trailing ``pt`` of the last word.
        """
        size = int(font.split()[-1][:-2])
        counts = self.pdfDataObj["fontSizes"]
        counts[size] = counts.get(size, 0) + 1

    def setStr2Obj(self, s: str):
        """Set the accumulator's text to *s*, stripped."""
        self.pdfDataObj["string"] = s.strip()

    def setObjInfo(self, char: str, font: str = ""):
        """Record font size and bold state for the character at ``ctr``.

        An empty *char* (e.g. whitespace stripped by the caller) only
        contributes its font size and neither opens nor closes a bold run.
        """
        if font != "":
            self.setObjFontInfo(font)

        if char == "":
            return

        collector = self.pdfDataObj["decoratedTextCollector"]
        position = self.pdfDataObj["ctr"]

        if "bold" in font.lower():
            if collector["start"] == -1:
                collector["start"] = position
                collector["decoration"] = "bold"
            # Also set on the first character so that a run of length one
            # does not keep end == -1.
            collector["end"] = position
        elif collector["start"] != -1:
            # First non-bold character after a bold run closes the run.
            self.pdfDataObj["decoratedTexts"].append(deepcopy(collector))
            collector["start"] = -1
            collector["end"] = -1

    def extractRecursive(self, o: Any):
        """Walk *o* and its descendants, filling the accumulator and rows.

        An ``LTTextBoxHorizontal`` finishes the previous accumulator and
        starts a new one with the box's text; ``LTChar`` contributes font and
        bold information and advances ``ctr``; ``LTAnno`` only advances
        ``ctr``.
        """
        name = self.get_name(o)

        if name == "LTTextBoxHorizontal":
            # pushObj2Arr discards empty accumulators itself; calling it
            # unconditionally also resets font counts and ctr collected from
            # a whitespace-only box so they cannot leak into this one.
            self.pushObj2Arr()
            self.setStr2Obj(self.get_optional_text(o))

        elif name == "LTChar":
            self.setObjInfo(self.get_optional_text(o), self.get_optional_fontinfo(o))
            self.pdfDataObj["ctr"] += 1
        elif name == "LTAnno":
            self.pdfDataObj["ctr"] += 1

        if isinstance(o, Iterable):
            for i in o:
                self.extractRecursive(i)

    def extract(self, filePath, showPrint: bool = False):
        """Extract all text rows of the PDF at *filePath* into ``pdfDataArr``.

        With *showPrint* the layout hierarchy is printed first; that needs a
        second ``extract_pages`` call, and the first result is consumed by
        printing.
        """
        self.resetData()

        self.filePath = filePath

        if showPrint:
            pages = extract_pages(self.filePath)
            self.show_ltitem_hierarchy(pages)

        pages = extract_pages(self.filePath)
        self.extractRecursive(pages)
        # Rows are only pushed when the next text box starts, so the last box
        # has to be flushed explicitly.
        self.pushObj2Arr()

    def toMarkdown(self, filePath: str, heading_threshold: int):
        """Write the extracted rows to *filePath* as Markdown (UTF-8).

        Heading rows become ``#``-prefixed lines, all other rows list items.
        """
        self.evalHeadings(heading_threshold)
        self.evalTextDecorations()

        lines = []
        for row in self.pdfDataArr:
            if row['headingLevel'] > 0:
                lines.append('#' * row['headingLevel'] + ' ' + row["string"] + "\n")
            else:
                lines.append('- ' + row["string"] + "\n")

        with open(filePath, 'w', encoding="utf-8") as f:
            f.write(''.join(lines))

    def printExtracted(self):
        """Print rows, row count and collected font sizes as indented JSON."""
        s = json.dumps({"data": self.pdfDataArr, "length": len(self.pdfDataArr), "fontSizes": list(self.fontSizes)}, indent=4)
        print(s)
        
        
        
# Text boxes that are dropped entirely: exact (stripped) matches in "strings",
# re.search patterns in "regex".
blacklist = {
    "strings": ['Operating System Concepts – 8th Edition', 'Silberschatz, Operating System Concepts – 8th Edition Galvin and Gagne ©2009', 'Silberschatz, Galvin and Gagne ©2009'],
    # Raw string: "\d" and "\." are invalid escape sequences in a normal
    # literal (SyntaxWarning on Python 3.12+). The pattern itself is unchanged.
    "regex": [r'^\d+\.\d+$']
}

# Substring replacements applied to every kept text box.
replacements = {
    '\n': ' ',
}

# Number of largest font sizes that toMarkdown() treats as heading levels.
heading_threshold = 2

# Raw string keeps the backslashes of the Windows path without relying on
# "\S" and "\c" being unknown escape sequences; the value is unchanged.
filePath = r"Lecture notes\Silbershatz\ch2.pdf"
extractor = PDFExtractor()
extractor.setBlacklist(blacklist)
extractor.setReplacements(replacements)
# Second argument enables printing of the layout hierarchy.
extractor.extract(filePath,True)
# extractor.toMarkdown("pdf_test.md", heading_threshold)
# extractor.toMarkdown("D:\\1. Acads\\4.2\\Special Problem\\SP test vault\\Silberschatz ch1 unformatted.md")

# extractor.printExtracted()

element                        fontname             text
------------------------------ -------------------- -----
generator                                           
  LTPage                                            
    LTTextBoxHorizontal                             Chapter 2:  Operating-System 
Structures
      LTTextLineHorizontal                          Chapter 2:  Operating-System
        LTChar                 Arial,Bold 43pt      C
        LTChar                 Arial,Bold 43pt      h
        LTChar                 Arial,Bold 43pt      a
        LTChar                 Arial,Bold 43pt      p
        LTChar                 Arial,Bold 43pt      t
        LTChar                 Arial,Bold 43pt      e
        LTChar                 Arial,Bold 43pt      r
        LTChar                 Arial,Bold 43pt      
        LTChar                 Arial,Bold 43pt      2
        LTChar                 Arial,Bold 43pt      :
        LTChar                 Arial,Bold 43pt      
        LTCha