In [31]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.11-cp38-abi3-win_amd64.whl (16.0 MB)
     ---------------------------------------- 16.0/16.0 MB 9.4 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.24.11


In [4]:
import fitz
import pprint

In [27]:
# Keyword vocabularies used to recognise section headings. Every entry is a
# single lower-case token; extract_headings() lower-cases the span text and
# splits it into tokens before comparing against these sets.

# Tokens of "miscellaneous" headings (skills, languages, IT, volunteering, ...).
additional_header_set = {
    "besondere",
    "diverses",
    "edv",
    "ehrenamt",
    "erfahrungen",
    "fach",
    "fachkompetenzen",
    "fähigkeiten",
    "ikt",
    "it",
    "kenntnisse",
    "kompetenzen",
    "persönliche",
    "professional",
    "qualifikationen",
    "skills",
    "sonstige",
    "sonstiges",
    "sprachkenntnisse",
    "themen",
    "themenkompetenzen",
    "tätigkeiten",
    "weitere",
    "zusatzqualifikationen",
}

# Tokens of education / training headings.
education_set = {
    "ausbildung",
    "ausbildungsweg",
    "berufliche",
    "beruflicher",
    "berufsausbildung",
    "bildung",
    "schul",
    "schulbildung",
    "schulische",
    "seminare",
    "studium",
    "weiterbildung",
    "weiterbildungen",
}

# Tokens of work-experience headings.
work_set = {
    "arbeitserfahrung",
    "arbeitserfahrungen",
    "berufliche",
    "beruflicher",
    "berufserfahrung",
    "berufserfahrungen",
    "erfahrung",
    "erfahrungen",
    "skills",
    "werdegang",
}

# Filler tokens that may appear inside a heading next to the keywords. The
# single-space and empty entries cover the tokens produced when text with
# repeated spaces is split on " ".
stopwords = {
    "und",
    "oder",
    "sowie",
    "auch",
    "&",
    "/",
    " ",
    "",
}

In [29]:
def extract_headings(blocks, headings_set, stopword_set):
    """Collect the text spans that look like section headings.

    The span text is normalised by replacing hyphens with spaces, lower-casing
    and splitting on whitespace. A span is reported when every token that is
    not a stopword is a heading keyword and at least one such keyword is
    present, so spans consisting only of stopwords or whitespace are ignored.

    Args:
        blocks: Iterable of block dicts. Blocks without a "lines" key are
            skipped; otherwise block["lines"] is a list of lines, each with a
            "spans" list whose items provide "text", "font", "size" and
            "color".
        headings_set: Set of lower-case heading keywords to look for.
        stopword_set: Set of lower-case filler tokens that are allowed to
            appear next to the keywords.

    Returns:
        A list with one dict per matching span, holding the keys "text"
        (unmodified span text), "font", "font_size", "bold", "italic" and
        "color".
    """
    results = []
    for block in blocks:
        # Blocks without a "lines" entry carry no spans to inspect.
        for line in block.get("lines", ()):
            for span in line["spans"]:
                text = span["text"]
                # split() without an argument collapses runs of whitespace, so
                # the result does not depend on "" being listed as a stopword.
                tokens = text.replace("-", " ").lower().split()
                keywords = [token for token in tokens if token not in stopword_set]
                if not keywords or not headings_set.issuperset(keywords):
                    continue

                font = span["font"]
                results.append({
                    "text": text,
                    "font": font,
                    "font_size": span["size"],
                    # Weight and slant are derived from the font name.
                    "bold": "Bold" in font,
                    "italic": "Italic" in font,
                    "color": span["color"],
                })
    return results

In [88]:
def has_same_styling(a, b=None, span=None):
    """Tell whether a style dict matches another style dict or a text span.

    Args:
        a: Dict with the keys "font", "font_size", "bold", "italic" and
            "color". An empty dict means "no style" and never matches.
        b: Optional dict with the same keys as `a`. When given, it is used
            and `span` is ignored. An empty dict never matches.
        span: Optional span dict with the keys "font", "size" and "color";
            bold and italic are derived from the font name. Only used when
            `b` is None.

    Returns:
        True when all five styling attributes are equal, otherwise False.

    Raises:
        TypeError: If neither `b` nor `span` is supplied.
        KeyError: If a non-empty dict lacks one of the required keys.
    """
    if b is None:
        if span is None:
            raise TypeError("has_same_styling() requires either 'b' or 'span'")
        # Bring the span into the same key layout as the style dicts.
        span_font = span["font"]
        b = {
            "font": span_font,
            "font_size": span["size"],
            "bold": "Bold" in span_font,
            "italic": "Italic" in span_font,
            "color": span["color"],
        }

    # An empty dict stands for "no style found"; treat it as a non-match
    # instead of failing on the first key lookup.
    if not a or not b:
        return False

    return all(a[key] == b[key] for key in ("font", "font_size", "bold", "italic", "color"))
    
def _style_key(heading):
    """Return the five styling attributes of a heading dict as a tuple."""
    return (
        heading["font"],
        heading["font_size"],
        heading["bold"],
        heading["italic"],
        heading["color"],
    )


def match_extracted_headings(work, education, diverse):
    """Keep only the headings that share one style across all three groups.

    A style (font, font size, bold, italic, color) is a candidate when at
    least one heading of every group uses it. If several styles qualify, the
    one of the last qualifying entry in `work` is chosen.

    Args:
        work: List of heading dicts with the keys "font", "font_size", "bold",
            "italic" and "color" (further keys such as "text" are preserved).
        education: List of heading dicts in the same layout.
        diverse: List of heading dicts in the same layout.

    Returns:
        A tuple (work_res, education_res, diverse_res, styles): the three
        input lists reduced to the headings with the chosen style, plus the
        chosen style as a dict. When no style is shared by all three groups
        the result is ([], [], [], {}).
    """
    shared_keys = {_style_key(e) for e in education} & {_style_key(d) for d in diverse}

    # Scan from the back so that the last work heading with a shared style
    # decides which style is used.
    chosen = next((w for w in reversed(work) if _style_key(w) in shared_keys), None)
    if chosen is None:
        return [], [], [], {}

    styles = {
        "font": chosen["font"],
        "font_size": chosen["font_size"],
        "bold": chosen["bold"],
        "italic": chosen["italic"],
        "color": chosen["color"],
    }
    target = _style_key(chosen)

    def _with_target_style(headings):
        return [h for h in headings if _style_key(h) == target]

    return (
        _with_target_style(work),
        _with_target_style(education),
        _with_target_style(diverse),
        styles,
    )

In [30]:
def extract(path):
    """Find the work, education and miscellaneous headings of a PDF.

    Every page is scanned with extract_headings() once per keyword set
    (work_set, education_set, additional_header_set), the hits of all pages
    are merged, and match_extracted_headings() reduces them to the headings
    that share a common style. The four parts of the result are
    pretty-printed, separated by a line of "=" characters.

    Args:
        path: Path of the PDF file to open with fitz.

    Returns:
        The tuple returned by match_extracted_headings():
        (work headings, education headings, miscellaneous headings, style).
    """
    results_work = []
    results_education = []
    results_diverse = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(path) as doc:
        for page_num in range(doc.page_count):
            blocks = doc.load_page(page_num).get_text("dict")["blocks"]

            results_work.extend(extract_headings(blocks, work_set, stopwords))
            results_education.extend(extract_headings(blocks, education_set, stopwords))
            results_diverse.extend(extract_headings(blocks, additional_header_set, stopwords))

    res = match_extracted_headings(results_work, results_education, results_diverse)

    for index, part in enumerate(res):
        if index:
            print("=======================")
        pprint.pp(part)

    return res

In [None]:
# "FILENAME.pdf" looks like a placeholder — replace it with the PDF to analyse.
work, education, diverse, styles = extract("FILENAME.pdf")

In [98]:
def is_relevant_heading(headings_list, span):
    """Report whether the span's text equals the "text" of any listed heading.

    Args:
        headings_list: Iterable of dicts that each provide a "text" entry.
        span: Dict providing a "text" entry.

    Returns:
        True on the first exact text match, False when there is none.
    """
    return any(heading["text"] == span["text"] for heading in headings_list)

def extract_text_segments(path, work_headings, education_headings, diverse_headings, headings_style):
    """Split the text of a PDF into work, education and miscellaneous parts.

    The spans of all pages are visited in order. A span whose styling matches
    `headings_style` switches the active section: to work, education or
    miscellaneous when its text equals one of the corresponding headings
    (checked in that order), or to "none" when it matches no known heading.
    Every span, including the heading span itself, is then added to the
    active section. Texts of at most one character are stored as "\n".

    Args:
        path: Path of the PDF file to open with fitz.
        work_headings: Heading dicts (with a "text" key) that start the work
            section.
        education_headings: Heading dicts that start the education section.
        diverse_headings: Heading dicts that start the miscellaneous section.
        headings_style: Style dict that is compared against each span via
            has_same_styling().

    Returns:
        A tuple (work_str, education_str, diverse_str); each string is the
        collected texts of that section joined with single spaces.
    """
    segments = {"work": [], "education": [], "diverse": []}
    # Order matters: the first group containing the span text wins.
    sections = (
        ("work", work_headings),
        ("education", education_headings),
        ("diverse", diverse_headings),
    )
    active = None  # name of the section currently being filled, if any

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(path) as doc:
        for page_num in range(doc.page_count):
            blocks = doc.load_page(page_num).get_text("dict")["blocks"]
            for block in blocks:
                # Blocks without a "lines" entry carry no spans.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        if has_same_styling(headings_style, span=span):
                            active = next(
                                (name for name, headings in sections
                                 if is_relevant_heading(headings, span)),
                                None,
                            )
                        if active is not None:
                            text = span["text"]
                            segments[active].append(text if len(text) > 1 else "\n")

    work_str = " ".join(segments["work"])
    education_str = " ".join(segments["education"])
    diverse_str = " ".join(segments["diverse"])
    return work_str, education_str, diverse_str

In [1]:
# Use the same PDF that was passed to extract(): the heading lists and the
# style dict handed in here were derived from that document.
work_str, education_str, diverse_str = extract_text_segments("FILENAME.pdf", work, education, diverse, styles)

NameError: name 'extract_text_segments' is not defined

In [None]:
# print() so the "\n" entries in the work text are rendered as line breaks.
print(work_str)

In [None]:
# print() so the "\n" entries in the education text are rendered as line breaks.
print(education_str)

In [None]:
# print() so the "\n" entries in the miscellaneous text are rendered as line breaks.
print(diverse_str)