table of contents
1. 検索クエリの調査
1. テーブル取得

In [1]:
import os

import urllib.request
from datetime import datetime

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [2]:
# Path to the chromedriver binary. Set the CHROMEDRIVER_PATH environment variable to
# override it on another machine; the original local path remains the default.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/yamamuros83/Documents/Development/Kokudo/chromedriver',
)

In [3]:
# Page that is both fetched with urllib and opened in the Selenium-driven browser below.
URL = 'https://data.j-league.or.jp/SFPR01/'

# 検索クエリの調査

## ソースの表示

In [4]:
# Fetch the page once and parse it. The context manager closes the HTTP response
# as soon as BeautifulSoup has consumed it instead of leaving the connection open.
with urllib.request.urlopen(URL) as data:
    soup = BeautifulSoup(data, "html")

In [5]:
# Chrome options intended to let Selenium start in a wide range of environments
options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')
options.add_argument('--headless')

# Launch the browser. The `options=` keyword replaces the deprecated
# `chrome_options=` keyword, which triggers a DeprecationWarning.
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

# Open the search page in the browser session used by the following cells.
driver.get(URL)

  # This is added back by InteractiveShellApp.init_path()


In [6]:
# Collect the entries of the "competition_year" dropdown, skipping the '▼' entry
# (presumably the placeholder), then select one season.
# find_element(By.NAME, ...) is used instead of the legacy find_element_by_name helper.
selector = Select(driver.find_element(By.NAME, "competition_year"))
year_list = [option.text for option in selector.options if option.text != '▼']
print(year_list)
year = '2016年'
selector.select_by_visible_text(year)

['2020年', '2019年', '2018年', '2017年', '2016年', '2015年', '2014年']


In [7]:
# Collect the entries of the "competition_frame_id" dropdown, keeping only those whose
# label ends with 'リーグ' and is not the '▼' entry, then select the first one.
# find_element(By.NAME, ...) is used instead of the legacy find_element_by_name helper.
selector = Select(driver.find_element(By.NAME, 'competition_frame_id'))
competition_list = [option.text for option in selector.options if option.text != '▼' and option.text.endswith('リーグ')]
print(competition_list)
competition = competition_list[0]
selector.select_by_visible_text(competition)

['Ｊ１リーグ', 'Ｊ２リーグ', 'Ｊ３リーグ', 'Ｊサテライトリーグ']


In [12]:
# Collect the entries of the "competition_id" dropdown (skipping the '▼' entry)
# and select the first one.
# find_element(By.NAME, ...) is used instead of the legacy find_element_by_name helper.
stage_selector = Select(driver.find_element(By.NAME, 'competition_id'))
stage_list = [option.text for option in stage_selector.options if option.text != '▼']
print(stage_list)

stage = stage_list[0]
stage_selector.select_by_visible_text(stage)

['１ｓｔ', '２ｎｄ']


In [14]:
# Collect the entries of the "team_id" dropdown (skipping the '▼' entry)
# and select the first one.
# find_element(By.NAME, ...) is used instead of the legacy find_element_by_name helper.
selector = Select(driver.find_element(By.NAME, 'team_id'))
team_list = [option.text for option in selector.options if option.text != '▼']
print(team_list)
team = team_list[0]
selector.select_by_visible_text(team)

['仙台', '鹿島', '浦和', '大宮', '柏', 'FC東京', '川崎Ｆ', '横浜FM', '湘南', '甲府', '新潟', '磐田', '名古屋', 'Ｇ大阪', '神戸', '広島', '福岡', '鳥栖']


In [9]:
# Wait (up to 10 seconds) until the element with id "search" is clickable, then click it.
wait = WebDriverWait(driver, 10)
search_locator = (By.ID, 'search')
search_is_clickable = EC.element_to_be_clickable(search_locator)
search_btn = wait.until(search_is_clickable)
search_btn.click()

# テーブル取得

In [10]:
def _parse_result_table(page_source):
    """Parse the `search_result` table of one result page into a DataFrame.

    Args:
        page_source: HTML of the page currently shown by the browser.

    Returns:
        DataFrame indexed by the row-header texts (full-width spaces removed),
        with one column per `th.bg.wd02` header cell.
    """
    table = BeautifulSoup(page_source, "html").find('table', id='search_result')
    # Column headers
    c_list = [th.text for th in table.find_all('th', 'bg wd02')]
    # Appearance records: <td> cells whose class is exactly ['bd-l'] or that have no class
    v_array = np.array(
        [td.text.replace('\n', '').replace('\t', '')
         for td in table.find_all(
             lambda tag: tag.name == 'td'
             and (tag.get('class') == ['bd-l'] or tag.get('class') is None))]
    ).reshape(-1, len(c_list))
    # Player index (row headers)
    index_list = [th.text.replace('\u3000', '') for th in table.find_all('th', 'bd-l sort al-l name-c')]
    return pd.DataFrame(index=index_list, columns=c_list, data=v_array)


def get_participate_df(year, competition, team, data_dir):
    """Scrape the paged result table for one team and save it as a CSV file.

    Uses the module-level `driver`, which must already be showing the search
    result page. The table is read from three consecutive pages, the pages are
    concatenated column-wise and written to
    `<data_dir>/<year>/<competition>/<team>.csv`; finally the element with id
    "back" is clicked.

    Args:
        year: Season label, used as a directory name.
        competition: Competition label, used as a directory name.
        team: Team label, used as the CSV file stem.
        data_dir: Root directory for the saved data.

    Raises:
        TimeoutException: if a navigation button does not become clickable
            within 10 seconds.
    """
    # The result is read as three pages; NOTE(review): confirm this count holds
    # for every season/competition.
    n_pages = 3
    wait = WebDriverWait(driver, 10)

    # Going back to the first page takes two clicks on the "previous" button.
    for _ in range(n_pages - 1):
        prev_btn = wait.until(EC.element_to_be_clickable((By.ID, 'prevBtnB')))
        prev_btn.click()

    df_list = []
    for page in range(n_pages):
        df_list.append(_parse_result_table(driver.page_source))

        # Advance only while another page remains. The previous guard (`i != 3`)
        # was always true inside range(3), so "next" was also clicked after the
        # last page had been read.
        if page < n_pages - 1:
            next_btn = wait.until(EC.element_to_be_clickable((By.ID, 'nextBtnB')))
            next_btn.click()

    # Concatenate the pages side by side
    df = pd.concat(df_list, axis=1)

    # Save locally; create the target directory so the function works on its own.
    out_dir = os.path.join(data_dir, year, competition)
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, f'{team}.csv'))

    # Click the "back" button so the browser leaves the result page.
    home_btn = wait.until(EC.element_to_be_clickable((By.ID, 'back')))
    home_btn.click()

In [13]:
# Output root, relative to the notebook's working directory.
data_dir = os.path.join('..', 'data')
# exist_ok=True keeps this cell re-runnable once the directory already exists
# (a plain os.makedirs raises FileExistsError on the second run).
os.makedirs(os.path.join(data_dir, year, competition), exist_ok=True)
get_participate_df(year, competition, team, data_dir)