In [None]:
import os
import tabula
import pandas as pd
import datetime
from pypdf import PdfReader
import re

In [2]:
# Racecourse names as they appear in the schedule's '場所' column, and the
# romanized names used inside the PDF file names. Both lists share one order.
place_list = [
    '札幌', '函館', '福島', '新潟', '東京',
    '中山', '中京', '京都', '阪神', '小倉',
]
place_list_eng = [
    'sapporo', 'hakodate', 'fukushima', 'niigata', 'tokyo',
    'nakayama', 'chukyo', 'kyoto', 'hanshin', 'kokura',
]

# Every entry found in download/, prefixed with the directory so it can be
# compared directly against the paths built later in the notebook.
file_names_raw = os.listdir('download/')
file_names = ['download/' + entry for entry in file_names_raw]

In [3]:
# ---------------------------------------- Load and shape the race-meeting schedule -------------------------------
# Read the schedule as a DataFrame and keep only its first six columns.
# Note that later dates come first in this file.
kaisai = pd.read_csv('kaisai.csv', encoding='shift-jis')
kaisai = kaisai.iloc[:, :6]

# Helper used to add a column of date objects
def add_datetime(row):
    """Build a ``datetime.date`` from one schedule row.

    ``row`` must provide '年' (year as an offset from 2000, e.g. 24 for 2024),
    '月' (month) and '日' (day of month).
    """
    return datetime.date(row['年'] + 2000, row['月'], row['日'])

# Apply add_datetime to every row and store the result in a new 'dt' column.
kaisai['dt'] = kaisai.apply(add_datetime, axis='columns')

# Helper used to add the day of the week
def get_weekday(row):
    """Return the one-character Japanese weekday name for ``row['dt']``.

    ``row['dt']`` must expose ``weekday()`` (0 = Monday ... 6 = Sunday).
    """
    # Ordered Monday-first so the tuple index matches date.weekday().
    names = ('月', '火', '水', '木', '金', '土', '日')
    return names[row['dt'].weekday()]

# Apply get_weekday to every row and store the result in a new '曜日' column.
kaisai['曜日'] = kaisai.apply(get_weekday, axis='columns')

In [4]:
# Convert full-width digits to half-width (ASCII) digit characters
def zenkaku_to_hankaku(s):
    """Return ``s`` with each full-width digit replaced by its ASCII digit.

    All other characters are left unchanged.
    """
    wide_digits = '０１２３４５６７８９'
    ascii_digits = '0123456789'
    # str.translate takes a mapping keyed by code point.
    digit_map = {ord(wide): narrow for wide, narrow in zip(wide_digits, ascii_digits)}
    return s.translate(digit_map)

def _store_track_readings(schedule, table, day_column, target_rows, with_cushion):
    """Copy one day's readings from a PDF table into ``schedule`` (in place).

    Parameters
    ----------
    schedule : DataFrame that receives the values (columns are created on
        first assignment).
    table : DataFrame extracted from the PDF; its 'Unnamed: 0' column holds
        the row labels and ``day_column`` holds that day's values.
    day_column : name of the weekday column to read, e.g. '土曜日'.
    target_rows : boolean mask selecting the rows of ``schedule`` to fill.
    with_cushion : when true, also copy the '芝コースクッション値' row.

    Raises
    ------
    KeyError if ``day_column`` or 'Unnamed: 0' is missing from ``table``;
    IndexError if an expected labelled row is missing.
    """
    labels = table['Unnamed: 0']
    schedule.loc[target_rows, '芝ゴール前'] = table.loc[labels == '芝コース含水率', day_column].values[0]
    schedule.loc[target_rows, 'ダートゴール前'] = table.loc[labels == 'ダートコース含水率', day_column].values[0]
    # The rows labelled '(パーセント)' are read by position:
    # the first goes to the turf column, the second to the dirt column.
    percent_rows = table.loc[labels == '(パーセント)']
    day_pos = percent_rows.columns.get_loc(day_column)
    schedule.loc[target_rows, '芝4コーナー'] = percent_rows.iloc[0, day_pos]
    schedule.loc[target_rows, 'ダート4コーナー'] = percent_rows.iloc[1, day_pos]
    if with_cushion:
        schedule.loc[target_rows, '芝クッション値'] = table.loc[labels == '芝コースクッション値', day_column].values[0]


# Matches "第N日" (meeting day number); compiled once for the whole scan.
day_number_re = re.compile(r"第(\d+)日")

# Walk every (year, racecourse, meeting number) present in the schedule and,
# when the matching PDF was downloaded, copy its readings into `kaisai`.
for year in range(18, 25):  # two-digit years 18..24, i.e. 2018-2024
    for place, place_eng in zip(place_list, place_list_eng):
        df_yp = kaisai.loc[(kaisai['年'] == year) & (kaisai['場所'] == place)]
        if df_yp.empty:
            continue
        kai_max = int(df_yp['回次'].max())
        for kai in range(1, kai_max + 1):
            file_name = f'download/{2000 + year}{place_eng}0{kai}.pdf'
            if file_name not in file_names:
                continue
            df_kai = df_yp.loc[df_yp['回次'] == kai]

            # Text of the first page; also used to decide whether the PDF
            # contains cushion-value data.
            pdftext = PdfReader(file_name).pages[0].extract_text()
            # Split into lines and drop any line containing '注記'.
            text_list = [text for text in pdftext.split('\n') if '注記' not in text]
            # One inner list per text line that mentions "第N日", holding the
            # day numbers found on that line (one line per week, according to
            # the original author's note).
            nichiji_list = [
                [int(zenkaku_to_hankaku(number)) for number in day_number_re.findall(text)]
                for text in text_list
                if day_number_re.search(text)
            ]

            # Tables extracted from the first page of the PDF.
            baba_table_list = tabula.read_pdf(file_name, pages=1, stream=True)

            has_cushion = 'クッション値' in pdftext
            # Rows of `kaisai` belonging to this meeting; narrowed per day below.
            kai_rows = (kaisai['年'] == year) & (kaisai['場所'] == place) & (kaisai['回次'] == kai)

            for j, nichijis_j in enumerate(nichiji_list):
                if has_cushion:
                    # With cushion data, entry j uses two consecutive tables,
                    # which are stacked into one frame.
                    df_j = pd.concat([baba_table_list[2*j], baba_table_list[2*j+1]])
                else:
                    df_j = baba_table_list[j]
                for nichiji in nichijis_j:
                    # Weekday of this meeting day selects the table column.
                    day = df_kai.loc[df_kai['日次'] == nichiji, '曜日'].values[0]
                    condition = kai_rows & (kaisai['日次'] == nichiji)
                    _store_track_readings(kaisai, df_j, day + '曜日', condition, has_cushion)


                
            
            


Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'
Got stderr: 1 18, 2026 11:44:29 午前 org.apache.fontbox.ttf.CmapSubtable processSubtype14
警告: Format 14 cmap table is not supported and will be ignored
1 18, 2026 11:44:29 午前 org.apache.fontbox.ttf.CmapSubtable processSubtype14
警告: Format 14 cmap table is not supported and will be ignored
1 18, 2026 11:44:29 午前 org.apache.fontbox.ttf.CmapSubtable processSubtype14
警告: Format 14 cmap table is not supported and will be ignored
1 18, 2026 11:44:29 午前 org.apache.fontbox.ttf.CmapSubtable processSubtype14
警告: Format 14 cmap table is not supported and will be ignored

Got stderr: 1 18, 2026 11:44:29 午前 org.apache.fontbox.ttf.CmapSubtable processSubtype14
警告: Format 14 cmap table is not supported and will be ignored
1 18, 2026 11:44:29 午前 org.apache.fontbox.ttf.CmapSubtable processSubtype14
警告: Format 14 cmap table is not supported and will be ignored
1 18, 2026 11:44:29 午前 org.apache.fontbox.ttf.CmapSubtable pro

In [5]:
# Keep only the rows for which a '芝ゴール前' value was filled in, then save them.
has_reading = kaisai['芝ゴール前'].notna()
kaisai = kaisai[has_reading]
kaisai.to_csv('cond.csv', encoding='utf-8')

In [6]:
# Show the resulting table as the cell's output.
kaisai

Unnamed: 0,年,月,日,回次,場所,日次,dt,曜日,芝ゴール前,ダートゴール前,芝4コーナー,ダート4コーナー,芝クッション値
0,24,12,28,5,中山,9,2024-12-28,土,12.5,2.1,12.6,2.2,9.5
1,24,12,28,7,京都,9,2024-12-28,土,9.0,1.2,7.6,1.6,11.4
2,24,12,22,5,中山,8,2024-12-22,日,12.4,1.6,13.3,1.7,10.0
3,24,12,22,7,京都,8,2024-12-22,日,9.1,2.1,7.4,1.8,11.3
4,24,12,21,5,中山,7,2024-12-21,土,12.3,1.9,12.2,2.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,18,7,29,2,新潟,2,2018-07-29,日,11.9,2.8,13.9,1.7,
1854,18,7,29,2,小倉,2,2018-07-29,日,7.1,1.8,7.5,2.6,
1855,18,7,28,1,札幌,1,2018-07-28,土,12.6,2.8,13.4,2.6,
1856,18,7,28,2,新潟,1,2018-07-28,土,11.9,2.3,13.9,1.9,
