A quick code to create the posts from old agendas. Content seperately extracted with LLM.

In [31]:
import pdfplumber
import os
from datetime import datetime
import re

# function to extract seminar content by date
def extract_seminars_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            full_text += page.extract_text()

    # regex pattern to split by dates (dd.mm.yyyy format)
    seminar_pattern = re.split(r'(\d{2}\.\d{2}\.\d{4})', full_text)

    # pair date and content
    seminars = []
    for i in range(1, len(seminar_pattern) - 1, 2):
        date = seminar_pattern[i].strip()  # Date
        content = seminar_pattern[i + 1].strip()  # Seminar content
        seminars.append((date, content))
    
    return seminars

# function to create a Jekyll post for each seminar
def create_jekyll_post(post_title, post_date, post_content, affiliation, venue, talk_title):
    formatted_date = post_date.strftime("%Y-%m-%d")
    # create the filename with only the speaker's name and date (no affiliation)
    file_name = f"{formatted_date}-{post_title.replace(' ', '-').lower()}.md"

    front_matter = f"""---
layout: post
title: "{post_title}"
date: {post_date.strftime("%Y-%m-%d %H:%M:%S")}
categories: seminar
---
"""

    # add Title, Venue, and Affiliation to the content
    full_content = (
        front_matter
        + f"\n**Talk Title**: {talk_title}\n"
        + f"**Affiliation**: {affiliation}\n"
        + f"**Venue**: {venue}\n\n"
        + post_content
    )

    # save the post using UTF-8 encoding
    with open(os.path.join('_posts', file_name), 'w', encoding='utf-8') as post_file:
        post_file.write(full_content)

    print(f"Post created: {file_name}")

# function to extract the speaker's name, affiliation, title, and venue
def extract_seminar_details(seminar_content):
    lines = seminar_content.split("\n")
    
    # extract the speaker's name and affiliation from the first line
    name_and_affiliation = lines[0]
    match = re.match(r'([^\(]+)\s+\((.+)\)', name_and_affiliation)
    
    if match:
        speaker_name = match.group(1).strip()
        affiliation = match.group(2).strip()
    else:
        speaker_name = name_and_affiliation.strip().split(",")[0].strip()
        affiliation = "Unknown Affiliation"

    # assume the talk title is on the next line
    talk_title = lines[1].strip() if len(lines) > 1 else "Unknown Title"

    # extract the venue (look for "Room" or similar keywords, typically 3rd or 4th line)
    venue = "Unknown Venue"
    for line in lines:
        if "Room" in line or "Berlin" in line:
            venue = line.strip()
            break

    return speaker_name, affiliation, talk_title, venue

# function to process PDF and create posts for each seminar
def process_pdf_to_jekyll_posts(pdf_path):
    # extract seminars from the PDF
    seminars = extract_seminars_from_pdf(pdf_path)

    # loop through each seminar and create individual posts
    for seminar in seminars:
        seminar_date, seminar_content = seminar

        # convert seminar_date to datetime
        post_date = datetime.strptime(seminar_date, "%d.%m.%Y")

        # extract seminar details (speaker name, affiliation, talk title, venue)
        speaker_name, affiliation, talk_title, venue = extract_seminar_details(seminar_content)

        # create the Jekyll post with all the details
        create_jekyll_post(speaker_name, post_date, seminar_content, affiliation, venue, talk_title)

# example PDF file path (update this to your actual file path)
pdf_files = [
    "agenda_file",
]
# loop through your PDF files and create individual posts
for pdf_file in pdf_files:
    process_pdf_to_jekyll_posts(pdf_file)


Post created: 2013-05-10-no-lecture.md
Post created: 2013-05-17-dr.-gregor-schwerhoff,-potsdam-institut-für-klimafolgenforschung.md
Post created: 2013-05-24-30.05.013-prof.-john-roemer.md
Post created: 2013-05-31-prof.-dr.-thomas-eichner.md
Post created: 2013-06-07-prof.-dr.-rainald-borck.md
Post created: 2013-06-14-prof.-dr.-christoph-böhringer.md
Post created: 2013-06-21-prof.-dr.-rick-van-der-ploeg.md
Post created: 2013-06-28-lydia-blaschtschak.md
Post created: 2013-07-05-.md
Post created: 2013-07-12-philipp-m.-richter.md
