In [3]:
#crawling 
#!pip install selenium webdriver-manager pandas

In [2]:
import time
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
def init_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver


In [9]:
def load_page(driver):
    driver.get("https://football-observatory.com/Tool-Performance")
    WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe")))

    select_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "filter-value-tab1.2"))
    )
    select = Select(select_element)
    select.select_by_value("All")
    time.sleep(2)  # 로딩 대기


In [5]:
def extract_data(driver):
    tab = driver.find_element(By.ID, "tab1")
    table_holder = tab.find_element(By.CLASS_NAME, "tabulator-tableholder")
    table = tab.find_element(By.CLASS_NAME, "tabulator-table")

    all_data = []
    prev_style = None

    while True:
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", table_holder)
        time.sleep(0.7)

        style = table.get_attribute("style")
        rows = table.find_elements(By.CLASS_NAME, "tabulator-row")

        for row in rows:
            cells = row.find_elements(By.CLASS_NAME, "tabulator-cell")
            row_data = tuple(cell.text.strip() for cell in cells)
            if row_data not in all_data:
                all_data.append(row_data)

        if style == prev_style:
            break
        prev_style = style

    header_elements = driver.find_elements(By.CSS_SELECTOR, ".tabulator-col")
    headers = [h.text.strip() for h in header_elements]

    return all_data, headers


In [6]:
def save_csv(data, headers, filename):
    df = pd.DataFrame(data, columns=headers)
    df.to_csv(filename, index=False, encoding="utf-8-sig")
    print(f"✅ {filename} 저장 완료! (총 {len(data)}건)")


In [7]:
def crawl_all_data():
    driver = init_driver()

    try:
        load_page(driver)
        data, headers = extract_data(driver)
        save_csv(data, headers, "football_observatory_tab1_all.csv")

    finally:
        driver.quit()


In [8]:
df = pd.read_csv("football_observatory_tab1_all.csv", encoding="utf-8-sig")
print(df.shape)     # (행, 열) 확인
df.head()           # 상위 5개 미리보기

FileNotFoundError: [Errno 2] No such file or directory: 'football_observatory_tab1_all.csv'

# 마크다운 : transferkt 

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

def scrape_transfermarkt():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # 창 없이 실행
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        url = "https://www.transfermarkt.co.kr/marktwertetop/wertvollstemannschaften"
        driver.get(url)
        driver.implicitly_wait(10)

        # 주요 테이블 요소 찾기
        table = driver.find_element(By.CSS_SELECTOR, "table.items")

        # 헤더 추출
        headers = [th.text.strip() for th in table.find_elements(By.CSS_SELECTOR, "thead th")]

        # 데이터 행 추출
        data = []
        rows = table.find_elements(By.CSS_SELECTOR, "tbody tr")
        for row in rows:
            cells = row.find_elements(By.CSS_SELECTOR, "td")
            if len(cells) > 0:
                row_data = [cell.text.strip() for cell in cells]
                data.append(row_data)

        # 데이터프레임으로 정리
        df = pd.DataFrame(data, columns=headers)  # 첫 번째 열은 순위 번호 등 비어 있음
        return df

    finally:
        driver.quit()

# 사용 예시
df_transfermarkt = scrape_transfermarkt()
print(df_transfermarkt.head())


   #             클럽              경쟁        시장 가치
0  1    Real Madrid          LaLiga  1.33 bil. €
1  2     맨체스터 시티 FC  Premier League  1.32 bil. €
2  3     Arsenal FC  Premier League  1.09 bil. €
3                                               
4                                               
