# Extract PDF
This script tries to detect when someone is speaking and extracts the spoken text as raw as possible. It creates one JSON file for each PDF.  

Logic: Find italic text at the start of a line. This means that a new person is speaking. Copy everything until the next italic or bold text.

Tests:
* Empty italic Line before Name: ../export/Files/2021-09-27-17fa216f98fd4d38afd5afb53743ec65-332.pdf, p 18
* Multiline-Name: 2021-08-30-eabdc84b3d6b4153ae371fcdbdab6b68-332.pdf, p 17

In [1]:
import pandas as pd
import json
from pathlib import Path
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar
import glob
import xml.etree.ElementTree as ET

In [2]:
def _first_glyph(line):
    """Return the first layout object of *line*, or ``None`` if it has none."""
    return next(iter(line), None)


def _starts_in_font(line, style):
    """Return True if *line* starts with an ``LTChar`` whose font name contains *style*.

    The font name is lower-cased before the test, so *style* must be given
    in lowercase (``'italic'`` or ``'bold'``). Lines that are empty or that
    start with a non-``LTChar`` object never match.
    """
    glyph = _first_glyph(line)
    return isinstance(glyph, LTChar) and style in glyph.fontname.lower()


def read_pdf(f):
    """Extract the speech paragraphs of one protocol PDF.

    A line that starts in an italic font opens a new paragraph (a new
    speaker), unless it contains one of the known agenda headings. Every
    following line is collected until the next paragraph-opening italic
    line or until a line that starts in a bold font. Everything between a
    line containing 'Antworten auf Anfragen' and the next bold line is
    skipped.

    Args:
        f: The PDF, passed unchanged to ``pdfminer.high_level.extract_pages``.

    Returns:
        A list of dicts ``{'t': text, 'p': page}``. ``t`` is the raw text of
        the collected lines joined together, ``p`` is the 1-based number of
        the page on which the first collected line was found (or the page
        being processed when the paragraph was closed, if nothing was
        collected). Entries with an empty ``t`` are emitted whenever a
        paragraph is closed while nothing had been collected.
    """
    # Italic line starts that are agenda headings, not speakers.
    fakes = ['Detailberatung', 'Titel und Ingress', 'I. und II.', 'III.', 'Abstimmung', 'IV.',
    'Schlussabstimmung']

    paragraphs = []
    chunks = []          # raw text of the lines collected for the open paragraph
    first_page = None    # page of the first collected line, None while empty
    reading = False
    skip_until_bold = False
    page_no = 0          # stays 0 if the document has no pages

    def close_paragraph():
        """Append the open paragraph to the result and reset the buffers."""
        nonlocal chunks, first_page, reading
        paragraphs.append({
            't': ''.join(chunks),
            'p': first_page if first_page is not None else page_no,
        })
        chunks = []
        first_page = None
        reading = False

    for page_no, page_layout in enumerate(extract_pages(f), start=1):
        for i_paragraph, element in enumerate(page_layout):
            if not isinstance(element, LTTextContainer):
                continue

            lines = list(element)

            # Every page starts with a page number, sometimes followed by
            # blank lines. Drop both so that the first remaining line is the
            # first real text line of the page.
            if i_paragraph == 0:
                if lines:
                    lines.pop(0)
                for i_line, line in enumerate(lines):
                    if line.get_text().strip():
                        lines = lines[i_line:]
                        break

            for i_line, text_line in enumerate(lines):
                text = text_line.get_text()
                starts_bold = _starts_in_font(text_line, 'bold')
                starts_italic = _starts_in_font(text_line, 'italic')

                # Skip everything from "Antworten auf Anfragen" up to the
                # next line that starts in bold.
                if 'Antworten auf Anfragen' in text:
                    skip_until_bold = True
                elif starts_bold:
                    skip_until_bold = False

                if skip_until_bold:
                    continue

                if starts_italic:
                    # An italic line start closes the open paragraph only
                    # when it really begins a new block: either it is the
                    # first line of its text box while we are reading, or the
                    # line directly above it is blank. A non-blank line above
                    # means a multi-line name or inline italic text.
                    if i_line == 0:
                        begins_block = reading
                    else:
                        begins_block = lines[i_line - 1].get_text().strip() == ''
                    if begins_block:
                        close_paragraph()

                    # Agenda headings are italic too but are not speakers.
                    reading = not any(fake in text for fake in fakes)

                # A bold line start always ends the current paragraph.
                if starts_bold:
                    close_paragraph()

                if reading:
                    if first_page is None:
                        first_page = page_no
                    chunks.append(text)

    # Flush the paragraph that is still open at the end of the document.
    if reading:
        close_paragraph()

    return paragraphs

#p = read_pdf('../export/Files/2021-09-27-17fa216f98fd4d38afd5afb53743ec65-332.pdf')

#for x in p:
#    print("----")
#    print(x)

In [3]:
# Load the document index and keep only the protocols ("Protokoll").
df = pd.read_csv(Path('../export/dokumente.csv'))
df = df[df.dokument_kategorie == 'Protokoll']

# Make sure the output directory exists; otherwise the first write fails on
# a fresh checkout.
Path('../export/extracts').mkdir(parents=True, exist_ok=True)

# Parse every protocol PDF that has no JSON extract yet.
for filename in df['_filename']:

    # Name of the extract: same file name as the PDF, with a .json suffix.
    fname = Path('../export/extracts/%s' % filename).with_suffix('.json')

    # Existing extracts are kept, so the cell can be re-run cheaply.
    if fname.is_file():
        continue

    # Parse PDF
    paragraphs = read_pdf(Path('../export/Files/') / filename)

    # Serialize first, then write, so that a failing dump never leaves a
    # partial file behind that later runs would skip.
    payload = json.dumps(paragraphs, ensure_ascii=False)
    with open(fname, 'w', encoding='UTF-8') as f:
        f.write(payload)

print("finito")

finito


## Extract Traktanden
We need this for further analysis.

In [None]:
# Parse all meeting-detail XML files and build one row per agenda item
# (Traktandum), then write the rows to ../export/traktanden.csv.
ns = {
    's': 'http://www.cmiag.ch/cdws/searchDetailResponse',
    'd': 'http://www.cmiag.ch/cdws/SitzungenDetail'
    }

records = []

for f in glob.glob(str(Path('../export/SITZUNGENDETAIL/*.xml'))):
    root = ET.parse(f).getroot()

    for sitzung in root.findall('.//s:Hit/d:Sitzung', ns):
        # These lookups are the same for every agenda item of the meeting.
        # `find` returns None when nothing matches and never raises.
        edok = sitzung.find('d:Sitzungsdokumente/d:Dokument/d:eDokument', ns)

        # Prefer Sitzungsbeginn/Start; fall back to Datum/Start.
        start = sitzung.find('d:Sitzungsbeginn/d:Start', ns)
        if start is None:
            start = sitzung.find('d:Datum/d:Start', ns)

        for traktandum in sitzung.findall('d:Traktanden/d:Traktandum', ns):
            records.append({
                'sitzung': start.text,
                # Compare against None explicitly: an Element without child
                # elements is falsy, so `if edok` would drop existing
                # documents.
                'dokument': edok.attrib['ID'] if edok is not None else None,
                'titel': traktandum.find('d:Titel', ns).text,
                '_filename': Path(f).stem
            })

df = pd.DataFrame(records)

df.to_csv(Path('../export/traktanden.csv'), index=False)

In [None]:
#for x in p:
#    print("----")
#    print(x)