In [1]:
# URL of the source PDF; it is downloaded with urllib and parsed with tabula
# in the cells below.
SRC = 'https://www.kyoukaikenpo.or.jp/~/media/Files/shared/hokenryouritu/h31/ippan/h31313tokyo.pdf'
# Path of the CSV file that the cleaned table is written to.
DST = '2019-04.csv'

In [2]:
import datetime
import hashlib
import io
import pathlib
import urllib.request

try:
    import tabula
except ModuleNotFoundError:
    !pip install tabula-py
    import tabula

datetime.datetime.now().astimezone(datetime.timezone.utc).isoformat()

'2022-05-07T14:58:06.492462+00:00'

In [3]:
# In-memory buffer that holds the downloaded PDF for the parsing step.
data = io.BytesIO()

with urllib.request.urlopen(SRC) as response:
    payload = response.read()
    data.write(payload)

# Show the SHA-256 of the downloaded bytes as the cell output.
digest = hashlib.sha256()
digest.update(payload)
digest.hexdigest()

'99d96249e4545015f23362f6c58c8c996dd5ea2e065965e039396f37258cd6db'

In [4]:
# Rewind the in-memory PDF so it is read from the first byte.
data.seek(0)

# Extract the tables on page 1 in lattice mode and keep the first one.
tables = tabula.read_pdf(data, lattice=True, pages=1)
df = tables[0]

# Retain five columns by position; they receive the names listed below,
# in the same order.
df = df.iloc[:, [1, 2, 3, 9, 10]]

header = """
標準報酬:等級
標準報酬:月額
報酬月額
全額
折半額
""".strip().split()
df.columns = header

grade_col = '標準報酬:等級'

# Discard rows with no grade cell, then keep only the rows whose grade cell
# contains a number in parentheses.
df = df.dropna(subset=[grade_col])
has_grade = df[grade_col].str.contains(r'\([0-9]+\)')
df = df.loc[has_grade]

# Replace everything up to and including the parenthesised number by the
# number itself, and convert the result to an integer.
grade_numbers = (
    df[grade_col]
    .str.replace(r'[^(]*\(([0-9]+)\)', lambda m: m.group(1), regex=True)
    .astype(int)
)

# Index the table by the numeric grade, in ascending order.
df = df.assign(**{grade_col: grade_numbers}).set_index(grade_col).sort_index()

# Sanity check: exactly 31 grade rows must remain.
assert df.shape[0] == 31, f'等級数 ({df.shape[0]}) に過不足があります。'

# Strip everything except digits and '~' from the salary range, then split on
# the first '~' into a lower-bound and an upper-bound column.
bounds = (
    df['報酬月額']
    .str.replace(r'[^0-9~]', '', regex=True)
    .str.split('~', n=1, expand=True)
    .rename(columns={0: '報酬月額:以上', 1: '報酬月額:未満'})
)

# Attach the two bound columns by index and drop the combined range column.
df = df.merge(bounds, left_index=True, right_index=True)
df = df.drop(columns='報酬月額')

# Remove thousands separators from every (still textual) column.
for column in list(df.columns):
    df[column] = df[column].str.replace(',', '')

# The standard monthly amount becomes an integer; the range bounds and the
# two premium columns become floats.
float_columns = """
報酬月額:以上
報酬月額:未満
全額
折半額
""".strip().split()
df = df.astype({'標準報酬:月額': int, **dict.fromkeys(float_columns, float)})

# Make the table open-ended: no lower bound for the lowest grade and no upper
# bound for the highest grade.
lowest_grade, highest_grade = df.index.min(), df.index.max()
df.loc[lowest_grade, '報酬月額:以上'] = float('-inf')
df.loc[highest_grade, '報酬月額:未満'] = float('inf')

# Derive the deducted amount from the half share: adding 0.49 and truncating
# to an integer rounds a fraction of .50 or less down and a larger fraction up.
# Assigned directly; a loop is unnecessary because only this one column is
# derived.
# NOTE(review): this relies on binary floating point, so an amount such as
# x.51 could land just below the next integer -- confirm the inputs only carry
# .0 / .5 fractions.
df['控除額'] = (df['折半額'] + .49).astype(int)

# Final column order of the exported table (the grade stays as the index).
column_order = """
標準報酬:月額
報酬月額:以上
報酬月額:未満
全額
折半額
控除額
""".strip().split()
df = df.loc[:, column_order]

df.to_csv(DST)

# Show the SHA-256 of the written file as the cell output.
csv_digest = hashlib.sha256()
csv_digest.update(pathlib.Path(DST).read_bytes())
csv_digest.hexdigest()

'83f73ff6862583289afdc163262042301bd84088b2e99cd00176c20cf274f98d'