In [1]:
import pdfplumber
import random
from os import listdir
from os.path import isfile, join

debug=False

class Vorlesung:
    def __init__(self):
        self.tag = ""
        self.name = ""
        self.longname = ""
        self.dozent = ""
        self.raum = ""
        self.startBlock = ""
        self.endBlock = ""
        self.notes = ""

    def __str__(self):
        return (
            " "
            + self.name
            + ". "
            + self.dozent
            + ". "
            + self.raum
            + ". "
            + str(self.tag)
            + ". "
            + str(self.startBlock)
            + "-"
            + str(self.endBlock)
            + ". "
            + str(self.notes)
        )
def numberToDay(num):
    if abs(num-110)<5:
        return 'mo'
    elif abs(num-248)<5:
        return 'di'
    elif abs(num-385)<5:
        return 'mi'
    elif abs(num-523)<5:
        return 'do'
    elif abs(num-661)<5:
        return 'fr'

def numberToBlockstart(num):
    if abs(num-111)<5:
        return 1
    elif abs(num-172)<5:
        return 2
    elif abs(num-232)<5:
        return 3
    elif abs(num-294)<5:
        return 4
    elif abs(num-353)<5:
        return 5
    elif abs(num-413)<5:
        return 6
    elif abs(num-473)<5:
        return 7
    elif abs(num-534)<5:
        return 8

def pageToTimetable(page):
    if debug:
        img=page.to_image()
        img.reset()
    titleCrop=page.crop((0,60,page.width,80))
    title=titleCrop.extract_text()
    rects =  [ x for x in page.rects
            if x["width"] > 100 and x["width"] < 200 and x["height"]>30]
    vls=[]
    for rect in rects:
        if debug:
            img.draw_rect(rect,stroke_width=4,stroke=(random.randint(0,255), random.randint(0,255), random.randint(0,255))) 
        box=(round(rect['x0']),round(rect['top']),round(rect['x1']),round(rect['bottom']))     
        cropt=page.crop(box, relative=True)
        text=cropt.extract_text().split()
        vl=Vorlesung()
        vl.startBlock=numberToBlockstart(box[1])
        vl.endBlock=numberToBlockstart(box[3])-1
        vl.tag=numberToDay(box[0])
        if text[0]==text[1]:
            vl.name=text[0]
            vl.dozent=text[2]+", "+text[3]
            
            if len(text)>5 and text[4]!=text[5]:
                vl.raum=text[4]+", "+text[5]
            else:
                vl.raum=text[4]
            if len(text)>6:
                vl.notes=text[6]
        else:
            vl.name=text[0]
            vl.dozent=text[1]
            vl.raum=text[2]
            if len(text)>3:
                vl.notes=text[3]
        if debug:
            print(vl)
        vls.append(vl)
    
    if debug:
        display(img)
    return {title: vls} 


In [2]:
def parsePdf(path):
    pdf=pdfplumber.open(path)
    timetables={}
    for page in pdf.pages:
        if "Langname" in page.extract_text():
            pass
        else:
            timetables=timetables|pageToTimetable(page)
        if debug:
            print(page.page_number)

    if debug:
        print(timetables)
    return {list(timetables.keys())[0].split()[0]:timetables}



In [8]:
def parseDir(dir):
    onlyfiles = [f for f in listdir(dir) if isfile(join(dir, f))]
    timetables={}
    for file in onlyfiles:
        if debug:
            print(file)
        if ".pdf" ==file[-4:]:
            timetables=timetables|parsePdf(dir+"/"+file)
    return timetables

In [9]:
parseDir("Stundenpläne")

{'MP': {'MP': [<__main__.Vorlesung at 0x7f2441d543b0>,
   <__main__.Vorlesung at 0x7f2442b86d20>,
   <__main__.Vorlesung at 0x7f2442b86120>,
   <__main__.Vorlesung at 0x7f2442b87170>,
   <__main__.Vorlesung at 0x7f2442b876e0>,
   <__main__.Vorlesung at 0x7f2442b86ea0>,
   <__main__.Vorlesung at 0x7f2442b874d0>,
   <__main__.Vorlesung at 0x7f2442b859d0>,
   <__main__.Vorlesung at 0x7f2442b87a70>,
   <__main__.Vorlesung at 0x7f2442b86000>,
   <__main__.Vorlesung at 0x7f2441b012e0>,
   <__main__.Vorlesung at 0x7f2442b87110>]},
 'MKI': {'MKI': [<__main__.Vorlesung at 0x7f244c2b6ed0>,
   <__main__.Vorlesung at 0x7f244c2b4560>,
   <__main__.Vorlesung at 0x7f244c2b51f0>,
   <__main__.Vorlesung at 0x7f244c2b7860>,
   <__main__.Vorlesung at 0x7f244c2b5430>,
   <__main__.Vorlesung at 0x7f244c2b4ad0>,
   <__main__.Vorlesung at 0x7f244c2b5280>,
   <__main__.Vorlesung at 0x7f244c2b7320>,
   <__main__.Vorlesung at 0x7f244c2b4380>,
   <__main__.Vorlesung at 0x7f244c2b5250>,
   <__main__.Vorlesung at 