In [1]:
pip install pdfplumber markdown

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting markdown
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)
  Downloading cryptography-43.0.1-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)
  Downloading cffi-1.17.1-cp312-cp312-win_amd64.whl.metadata (1.6 kB)
Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)
  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   -



In [31]:
import pdfplumber
import os
from datetime import datetime
import re

# Function to extract seminar content by date
def extract_seminars_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            full_text += page.extract_text()

    # Regex pattern to split by dates (dd.mm.yyyy format)
    seminar_pattern = re.split(r'(\d{2}\.\d{2}\.\d{4})', full_text)

    # Pair date and content
    seminars = []
    for i in range(1, len(seminar_pattern) - 1, 2):
        date = seminar_pattern[i].strip()  # Date
        content = seminar_pattern[i + 1].strip()  # Seminar content
        seminars.append((date, content))
    
    return seminars

# Function to create a Jekyll post for each seminar
def create_jekyll_post(post_title, post_date, post_content, affiliation, venue, talk_title):
    formatted_date = post_date.strftime("%Y-%m-%d")
    # Create the filename with only the speaker's name and date (no affiliation)
    file_name = f"{formatted_date}-{post_title.replace(' ', '-').lower()}.md"

    front_matter = f"""---
layout: post
title: "{post_title}"
date: {post_date.strftime("%Y-%m-%d %H:%M:%S")}
categories: seminar
---
"""

    # Add Title, Venue, and Affiliation to the content
    full_content = (
        front_matter
        + f"\n**Talk Title**: {talk_title}\n"
        + f"**Affiliation**: {affiliation}\n"
        + f"**Venue**: {venue}\n\n"
        + post_content
    )

    # Save the post using UTF-8 encoding
    with open(os.path.join('_posts', file_name), 'w', encoding='utf-8') as post_file:
        post_file.write(full_content)

    print(f"Post created: {file_name}")

# Function to extract the speaker's name, affiliation, title, and venue
def extract_seminar_details(seminar_content):
    lines = seminar_content.split("\n")
    
    # Extract the speaker's name and affiliation from the first line
    name_and_affiliation = lines[0]
    match = re.match(r'([^\(]+)\s+\((.+)\)', name_and_affiliation)
    
    if match:
        speaker_name = match.group(1).strip()
        affiliation = match.group(2).strip()
    else:
        speaker_name = name_and_affiliation.strip().split(",")[0].strip()
        affiliation = "Unknown Affiliation"

    # Assume the talk title is on the next line
    talk_title = lines[1].strip() if len(lines) > 1 else "Unknown Title"

    # Extract the venue (look for "Room" or similar keywords, typically 3rd or 4th line)
    venue = "Unknown Venue"
    for line in lines:
        if "Room" in line or "Berlin" in line:
            venue = line.strip()
            break

    return speaker_name, affiliation, talk_title, venue

# Function to process PDF and create posts for each seminar
def process_pdf_to_jekyll_posts(pdf_path):
    # Extract seminars from the PDF
    seminars = extract_seminars_from_pdf(pdf_path)

    # Loop through each seminar and create individual posts
    for seminar in seminars:
        seminar_date, seminar_content = seminar

        # Convert seminar_date to datetime
        post_date = datetime.strptime(seminar_date, "%d.%m.%Y")

        # Extract seminar details (speaker name, affiliation, talk title, venue)
        speaker_name, affiliation, talk_title, venue = extract_seminar_details(seminar_content)

        # Create the Jekyll post with all the details
        create_jekyll_post(speaker_name, post_date, seminar_content, affiliation, venue, talk_title)

# Example PDF file path (update this to your actual file path)
pdf_files = [
    "C:/Users/nikolaij/Nextcloud/RSERC/RSERC/Past RSERC Programs/Mail from Franziska Holz/RSERC-Programm Term 2013.pdf",
]
#"C:/Users/nikolaij/Nextcloud/RSERC/RSERC/Past RSERC Programs/Mail from Franziska Holz/Program_RSERC_WiSe2122_Dec"
# Loop through your PDF files and create individual posts
for pdf_file in pdf_files:
    process_pdf_to_jekyll_posts(pdf_file)


Post created: 2013-05-10-no-lecture.md
Post created: 2013-05-17-dr.-gregor-schwerhoff,-potsdam-institut-für-klimafolgenforschung.md
Post created: 2013-05-24-30.05.013-prof.-john-roemer.md
Post created: 2013-05-31-prof.-dr.-thomas-eichner.md
Post created: 2013-06-07-prof.-dr.-rainald-borck.md
Post created: 2013-06-14-prof.-dr.-christoph-böhringer.md
Post created: 2013-06-21-prof.-dr.-rick-van-der-ploeg.md
Post created: 2013-06-28-lydia-blaschtschak.md
Post created: 2013-07-05-.md
Post created: 2013-07-12-philipp-m.-richter.md
