In [3]:
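# Crawling: dependencies required by the scraper cells below.
# Uncomment to install them into the kernel's environment (run once):
# %pip install selenium webdriver-manager pandas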

In [33]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [43]:
def init_driver():
    """Create and return a Chrome WebDriver configured for headless use.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``
    and Chrome is started with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` command-line flags.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)


In [44]:
def load_page(driver):
    """Open the Tool-Performance page and switch its filter to "All".

    Steps, in order:
      1. Navigate to the page.
      2. Wait (up to 10 s) for an ``iframe`` element and switch the driver
         context into it; the driver stays inside the frame on return.
      3. Wait (up to 10 s) for the ``<select>`` with id
         ``filter-value-tab1.2`` and choose the option whose value is "All".
      4. Sleep for a fixed 2 seconds to give the page time to load.

    Args:
        driver: Selenium WebDriver used to load and control the page.
    """
    wait = WebDriverWait(driver, 10)

    driver.get("https://football-observatory.com/Tool-Performance")

    iframe_locator = (By.CSS_SELECTOR, "iframe")
    wait.until(EC.frame_to_be_available_and_switch_to_it(iframe_locator))

    filter_locator = (By.ID, "filter-value-tab1.2")
    dropdown = Select(wait.until(EC.presence_of_element_located(filter_locator)))
    dropdown.select_by_value("All")

    # Fixed pause: wait for loading after the filter change.
    time.sleep(2)


In [45]:
def extract_data(driver):
    """Collect all rows and the column headers of the table inside ``#tab1``.

    The table holder is scrolled to its bottom repeatedly; after each scroll
    the rows currently present in the DOM are harvested. Scrolling stops once
    the ``style`` attribute of the ``.tabulator-table`` element is the same
    after two consecutive scrolls, which is taken as the sign that the end of
    the table has been reached.
    NOTE(review): this presumes the table renders rows lazily and updates the
    element's inline style while doing so -- confirm against the live page.

    Rows are de-duplicated by their full tuple of cell texts, so two rows
    whose visible text is identical in every cell are stored only once.

    Args:
        driver: Selenium WebDriver already positioned in the document (or
            frame) that contains the element with id ``tab1``.

    Returns:
        tuple: ``(all_data, headers)`` where ``all_data`` is a list of tuples
        of stripped cell texts in first-seen order, and ``headers`` is a list
        of stripped header texts.
    """
    tab = driver.find_element(By.ID, "tab1")
    table_holder = tab.find_element(By.CLASS_NAME, "tabulator-tableholder")
    table = tab.find_element(By.CLASS_NAME, "tabulator-table")

    all_data = []
    seen = set()  # O(1) membership test instead of scanning the result list

    def collect_rendered_rows():
        """Append every not-yet-seen row currently present in the DOM."""
        for row in table.find_elements(By.CLASS_NAME, "tabulator-row"):
            cells = row.find_elements(By.CLASS_NAME, "tabulator-cell")
            row_data = tuple(cell.text.strip() for cell in cells)
            if row_data not in seen:
                seen.add(row_data)
                all_data.append(row_data)

    # Harvest what is rendered at the initial scroll position first; otherwise
    # those rows are lost if the table drops them once it has been scrolled.
    collect_rendered_rows()

    prev_style = None
    while True:
        driver.execute_script(
            "arguments[0].scrollTop = arguments[0].scrollHeight", table_holder
        )
        time.sleep(0.7)  # give the table time to render after the scroll

        # Read the style before harvesting, as a snapshot of the scroll state.
        style = table.get_attribute("style")
        collect_rendered_rows()

        if style == prev_style:
            break
        prev_style = style

    # Look for headers inside this tab only, so that headers belonging to
    # other tables in the same document are not mixed in. Fall back to the
    # document-wide lookup if the tab itself contains none.
    header_elements = tab.find_elements(
        By.CSS_SELECTOR, ".tabulator-col"
    ) or driver.find_elements(By.CSS_SELECTOR, ".tabulator-col")
    headers = [h.text.strip() for h in header_elements]

    return all_data, headers


In [46]:
def save_csv(data, headers, filename):
    """Write the scraped rows to a CSV file and print a confirmation line.

    Args:
        data: Sequence of row records, one entry per table row.
        headers: Column names; passed to ``pd.DataFrame`` as ``columns``.
        filename: Destination path of the CSV file.

    The file is written without the index column and encoded as
    ``utf-8-sig`` (UTF-8 with a byte-order mark).
    """
    frame = pd.DataFrame(data, columns=headers)
    frame.to_csv(filename, encoding="utf-8-sig", index=False)

    total = len(data)
    print(f"✅ {filename} 저장 완료! (총 {total}건)")


In [47]:
def crawl_all_data(filename="football_observatory_tab1_all.csv"):
    """Run the full scrape: start a browser, load the page, extract, save.

    Args:
        filename: Path of the CSV file to write. Defaults to
            ``"football_observatory_tab1_all.csv"``, the name that was
            previously hard-coded, so existing calls without arguments
            behave as before.

    The browser is always shut down via ``driver.quit()``, including when
    loading, extraction or saving raises.
    """
    driver = init_driver()

    try:
        load_page(driver)
        data, headers = extract_data(driver)
        save_csv(data, headers, filename)
    finally:
        driver.quit()
