In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import itertools
import tqdm
from datetime import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Collect the box-score URL of every game linked from the 2024 monthly
# schedule pages (months 03 through 10).
game_links = []
for month in range(3, 11):
    # Build the schedule URL for this month (zero-padded month number)
    url = f"https://npb.jp/games/2024/schedule_{month:02d}_detail.html"
    print(f"Fetching data from: {url}")

    # Fetch the page. The timeout (same value seiseki_log uses) keeps a stalled
    # connection from hanging the cell, and raise_for_status() stops the run on
    # an HTTP error instead of silently collecting zero links for that month,
    # which would make the positional trim below cut the wrong games.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep only anchors that point at a game directory (/scores/2024/MMDD/...)
    # and turn each one into the absolute URL of that game's box-score page
    game_links.extend(
        "https://npb.jp" + anchor['href'] + "box.html"
        for anchor in soup.find_all('a', href=True)
        if re.match(r"^/scores/2024/\d{4}/", anchor['href'])
    )

    # Per-month progress marker (the links themselves are not printed)
    print(f"Links for month {month:02d}:")

# NOTE(review): drops the last 19 collected links. The reason is not recorded
# here - presumably trailing games that should be excluded; confirm the count
# whenever the schedule pages change.
game_links = game_links[:-19]

Fetching data from: https://npb.jp/games/2024/schedule_03_detail.html
Links for month 03:
Fetching data from: https://npb.jp/games/2024/schedule_04_detail.html
Links for month 04:
Fetching data from: https://npb.jp/games/2024/schedule_05_detail.html
Links for month 05:
Fetching data from: https://npb.jp/games/2024/schedule_06_detail.html
Links for month 06:
Fetching data from: https://npb.jp/games/2024/schedule_07_detail.html
Links for month 07:
Fetching data from: https://npb.jp/games/2024/schedule_08_detail.html
Links for month 08:
Fetching data from: https://npb.jp/games/2024/schedule_09_detail.html
Links for month 09:
Fetching data from: https://npb.jp/games/2024/schedule_10_detail.html
Links for month 10:


In [3]:
def _cell_value(col):
    """Return the value of one table cell: the numeric id from its link, or its text."""
    link = col.find('a')
    if link is None:
        return col.get_text(strip=True)
    # Cells with a link are reduced to the digits of the ".../<digits>.html"
    # target; a link without such a target becomes an empty string.
    match = re.search(r'/(\d+)\.html$', link['href'])
    return match.group(1) if match else ''


def _table_to_frame(table):
    """Convert one HTML table into a DataFrame whose first row is the header."""
    rows = [
        # The first cell of every row is dropped
        [_cell_value(col) for col in row.find_all(['th', 'td'])][1:]
        for row in table.find_all('tr')
    ]
    # The last row of the table is dropped
    frame = pd.DataFrame(rows[:-1])
    # Promote the first row to column names, then remove it from the data
    frame.columns = frame.iloc[0]
    return frame.drop(frame.index[0]).reset_index(drop=True)


def seiseki_log(url):
    """Fetch one box-score page and return the rows of its two tables as a DataFrame.

    Parameters
    ----------
    url : str
        Box-score page URL. When it matches
        ``https://npb.jp/scores/<YYYY>/<MMDD>``, the date is stored in ``dt``.

    Returns
    -------
    pandas.DataFrame
        Rows of the tables with ids ``tablefix_t_b`` and ``tablefix_b_b``
        (whichever are present), with ``my_team``, ``oppo_team`` and ``dt``
        added. Empty, with only the template columns, when neither table exists.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an
        error page would parse into an empty frame and the game would be
        silently missing from the result.
    """
    # Fetch the page
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # The two tables of interest, in the order used for the team headings below
    tables = [
        soup.find('table', id='tablefix_t_b'),
        soup.find('table', id='tablefix_b_b'),
    ]
    # The i-th <h4> is used as the team name of the i-th table
    headings = soup.find_all('h4')

    # The game date comes from the URL, so it is parsed once rather than per table
    match = re.search(r"https://npb\.jp/scores/(\d{4})/(\d{4})", url)
    game_date = None
    if match:
        year = match.group(1)       # e.g. 2024
        month_day = match.group(2)  # e.g. 0521
        date_str = f"{year}-{month_day[:2]}-{month_day[2:]}"
        game_date = datetime.strptime(date_str, "%Y-%m-%d")

    # An empty template frame goes first so these columns lead the result
    frames = [pd.DataFrame(columns=['dt', 'my_team', 'oppo_team', '守備', '選手', '打数', '得点', '安打', '打点', '盗塁', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'])]
    for i, table in enumerate(tables):
        if table is None:
            continue
        frame = _table_to_frame(table)
        frame['my_team'] = headings[i].text
        # The opponent is the heading of the other table (index 1 - i)
        frame['oppo_team'] = headings[1 - i].text
        if game_date is not None:
            frame['dt'] = game_date
        frames.append(frame)

    # Concatenate once instead of growing the frame inside the loop
    return pd.concat(frames)

In [4]:
# Scrape every game and stack the per-game frames into df_base.
# Frames are collected in a list and concatenated once at the end; calling
# pd.concat inside the loop re-copies the whole accumulated frame each time.
game_frames = [seiseki_log(game_links[0])]
error_list = []  # URLs of games whose page could not be fetched or parsed
for game in tqdm.tqdm(game_links[1:]):
    try:
        game_frames.append(seiseki_log(game))
    except Exception:
        # Catch Exception rather than a bare except so that Ctrl-C
        # (KeyboardInterrupt) stops this long crawl instead of being logged as
        # one failed game and ignored.
        error_list.append(game)
        print(game)
    # Pause between requests
    time.sleep(1)
df_base = pd.concat(game_frames)
df_base

  4%|▍         | 39/887 [00:47<16:48,  1.19s/it]

https://npb.jp/scores/2024/0405/m-b-01/box.html


 28%|██▊       | 252/887 [05:17<12:50,  1.21s/it]

https://npb.jp/scores/2024/0521/h-e-08/box.html


 54%|█████▍    | 482/887 [09:57<08:09,  1.21s/it]

https://npb.jp/scores/2024/0710/m-e-13/box.html


100%|██████████| 887/887 [18:11<00:00,  1.23s/it]


Unnamed: 0,dt,my_team,oppo_team,守備,選手,打数,得点,安打,打点,盗塁,...,4,5,6,7,8,9,10,11,12,Unnamed: 21
0,2024-03-29,阪神タイガース,読売ジャイアンツ,(中),71075138,4,0,1,0,0,...,-,右　飛,-,-,遊ゴ失,-,,,,
1,2024-03-29,阪神タイガース,読売ジャイアンツ,(二),41445153,4,0,2,0,0,...,-,-,左　飛,-,中　飛,-,,,,
2,2024-03-29,阪神タイガース,読売ジャイアンツ,(右),43145157,3,0,0,0,0,...,-,-,四　球,-,二　飛,-,,,,
3,2024-03-29,阪神タイガース,読売ジャイアンツ,(一),21325134,4,0,0,0,0,...,三　振,-,中　飛,-,-,三　振,,,,
4,2024-03-29,阪神タイガース,読売ジャイアンツ,(三),41045153,4,0,0,0,0,...,三　振,-,三邪飛,-,-,三　振,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,2024-10-09,東北楽天ゴールデンイーグルス,埼玉西武ライオンズ,(二),61965151,4,0,0,0,0,...,-,-,三　振,三　振,-,三邪飛,-,投犠打,-,
14,2024-10-09,東北楽天ゴールデンイーグルス,埼玉西武ライオンズ,(捕),61165134,2,0,0,0,0,...,-,-,右　飛,-,-,-,-,-,-,
15,2024-10-09,東北楽天ゴールデンイーグルス,埼玉西武ライオンズ,打,61165132,1,0,0,0,0,...,-,-,-,左　飛,-,-,-,-,-,
16,2024-10-09,東北楽天ゴールデンイーグルス,埼玉西武ライオンズ,捕,71375138,1,0,0,0,0,...,-,-,-,-,-,右　飛,-,-,-,


In [5]:
# Write the combined log to a CSV file in the current working directory.
# The DataFrame index is written as the first, unnamed column.
output_csv = 'daseki_log_2024.csv'
df_base.to_csv(output_csv)