In [4]:
import requests
import pdfplumber
import re

In [17]:


def clean_string(text):
    """
    Replaces newline characters with spaces, and removes forward slashes.

    Args:
      text: The string to clean.

    Returns:
      The cleaned string.
    """
    text = text.replace('\n', ' ')  # Replace newline characters with spaces
    text = text.replace('/', '')   # Remove forward slashes
    return text


def create_stufen_data(extracted_data):
    """
    Creates a list of Stellenart-Stufe combinations with corresponding salary data.

    Args:
        extracted_data: A list of lists containing salary data extracted from the PDF.

    Returns:
        A list of lists, where each inner list represents a Stellenart-Stufe combination with salary data.
    """
    stufen_data = []

    # Hardcoded Stufe ranges for each Stellenart
    stufen_ranges = {
        'E14': ['E14 Stufe 5', 'E14 Stufe 6', 'E15 Stufe 1', 'E15 Stufe 2', 'E15 Stufe 3', 'E15 Stufe 4'],
        'E13': ['E13 Stufe 3', 'E13 Stufe 4', 'E13 Stufe 5', 'E13 Stufe 6', 'E14 Stufe 1', 'E14 Stufe 2'],
        'Ä1': ['Ä1 Stufe 2', 'Ä1 Stufe 3', 'Ä1 Stufe 4', 'Ä1 Stufe 5', 'Ä1 Stufe 6', 'Ä2 Stufe 1'],
        'E12': ['E12 Stufe 2', 'E12 Stufe 3', 'E12 Stufe 4', 'E12 Stufe 5', 'E12 Stufe 6', 'E13 Stufe 1'],
        'E9': ['E9 Stufe 1', 'E9 Stufe 2', 'E9 Stufe 3', 'E9 Stufe 4', 'E9 Stufe 5', 'E9 Stufe 6',
               'E10 Stufe 1', 'E10 Stufe 2', 'E10 Stufe 3', 'E10 Stufe 4', 'E10 Stufe 5', 'E10 Stufe 6',
               'E11 Stufe 1', 'E11 Stufe 2', 'E11 Stufe 3', 'E11 Stufe 4', 'E11 Stufe 5', 'E11 Stufe 6',
               'E12 Stufe 1', 'E12 Stufe 2', 'E12 Stufe 3', 'E12 Stufe 4', 'E12 Stufe 5', 'E12 Stufe 6'],
        'E2': ['E2 Stufe 1', 'E2 Stufe 2', 'E2 Stufe 3', 'E2 Stufe 4', 'E2 Stufe 5', 'E2 Stufe 6',
               'E3 Stufe 1', 'E3 Stufe 2', 'E3 Stufe 3', 'E3 Stufe 4', 'E3 Stufe 5', 'E3 Stufe 6',
               'E4 Stufe 1', 'E4 Stufe 2', 'E4 Stufe 3', 'E4 Stufe 4', 'E4 Stufe 5', 'E4 Stufe 6',
               'E5 Stufe 1', 'E5 Stufe 2', 'E5 Stufe 3', 'E5 Stufe 4', 'E5 Stufe 5', 'E5 Stufe 6',
               'E6 Stufe 1', 'E6 Stufe 2', 'E6 Stufe 3', 'E6 Stufe 4', 'E6 Stufe 5', 'E6 Stufe 6',
               'E7 Stufe 1', 'E7 Stufe 2', 'E7 Stufe 3', 'E7 Stufe 4', 'E7 Stufe 5', 'E7 Stufe 6',
               'E8 Stufe 1', 'E8 Stufe 2', 'E8 Stufe 3', 'E8 Stufe 4', 'E8 Stufe 5', 'E8 Stufe 6',
               'E9 Stufe 1', 'E9 Stufe 2']
    }

    for i, row in enumerate(extracted_data):
        stellenart_key = row[2].split()[0] + row[2].split()[1]  # Extract the Stellenart key (e.g., 'E14')
        #print(stellenart_key)
        if stellenart_key in stufen_ranges:
            #print("Hey")
            for stufe in stufen_ranges[stellenart_key]:
                stellenart, stufe_num = stufe.split(' Stufe ')
                stufen_data.append([stellenart, stufe_num, row[0], row[1]])
                #print(f"Stellenart: {stellenart}, Stufe: {stufe_num}, Jahreskosten: {row[0]}, Monatskosten: {row[1]}")

    return stufen_data

In [20]:
url = 'https://www.dfg.de/formulare/60_12/v/60_12_-2023-_de.pdf'
url_2 = 'https://www.dfg.de/resource/blob/323036/2f7e7eb3e4110dd63ae9bad50a3602e4/60-12-2024-de-data.pdf'

def make_table(url):
    
    response = requests.get(url)
    with open("test_file.pdf", "wb") as pdf_file:
        pdf_file.write(response.content)

    with pdfplumber.open('test_file.pdf') as f:
        for i in f.pages:
            extracted_table = i.extract_tables()
    del extracted_table[0][1]
    table = []
    for i in range(2,8):
        data = extracted_table[0][i]
        table.append(data[1:4])

    #print(table)
    stufen_data = create_stufen_data(table)
    
    #print(extracted_table)
    return stufen_data



In [22]:
sd= make_table(url)
print(sd)

[['E14', '5', '94.500', '7.875'], ['E14', '6', '94.500', '7.875'], ['E15', '1', '94.500', '7.875'], ['E15', '2', '94.500', '7.875'], ['E15', '3', '94.500', '7.875'], ['E15', '4', '94.500', '7.875'], ['E13', '3', '80.100', '6.675'], ['E13', '4', '80.100', '6.675'], ['E13', '5', '80.100', '6.675'], ['E13', '6', '80.100', '6.675'], ['E14', '1', '80.100', '6.675'], ['E14', '2', '80.100', '6.675'], ['Ä1', '2', '102.300', '8.525'], ['Ä1', '3', '102.300', '8.525'], ['Ä1', '4', '102.300', '8.525'], ['Ä1', '5', '102.300', '8.525'], ['Ä1', '6', '102.300', '8.525'], ['Ä2', '1', '102.300', '8.525'], ['E13', '3', '74.100', '6.175'], ['E13', '4', '74.100', '6.175'], ['E13', '5', '74.100', '6.175'], ['E13', '6', '74.100', '6.175'], ['E14', '1', '74.100', '6.175'], ['E14', '2', '74.100', '6.175'], ['E9', '1', '61.500', '5.125'], ['E9', '2', '61.500', '5.125'], ['E9', '3', '61.500', '5.125'], ['E9', '4', '61.500', '5.125'], ['E9', '5', '61.500', '5.125'], ['E9', '6', '61.500', '5.125'], ['E10', '1', '6

In [None]:



# Example usage (assuming extracted_data is your list of lists from the PDF)
extracted_data = [
    ['94.500', '7.875', 'E 14 Stufe 5\nbis E 15\nStufe 4'],
    ['80.100', '6.675', 'E 13 Stufe 3\nbis E 14\nStufe 2'],
    ['102.300', '8.525', 'Ä 1 Stufe 2\nbis\nÄ 2 Stufe 1'],
    ['74.100', '6.175', 'E 13 Stufe 2\nbis E 14\nStufe 1'],
    ['61.500', '5.125', 'E 9 bis E 12'],
    ['54.300', '4.525', 'E 2 Stufe 1\nbis\nE 9 Stufe 2']
]

stufen_data = create_stufen_data(extracted_data)

# Print the data in a tabular format
print("Stellenart | Stufe | Jahreskosten | Monatskosten")
print("-" * 50)
for row in stufen_data:
    print(f"{row[0]:<12}|{row[1]:<7}|{row[2]:<14}|{row[3]:<15}")