In [1]:
pip install selenium

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [41]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from collections import defaultdict
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

DesiredCapabilities.CHROME["pageLoadStrategy"] = "none"
# Selenium 설정
option = Options()
option.add_argument('headless')
option.add_argument("disable-gpu")
driver = webdriver.Chrome(options=option)

# 기본 URL과 페이지 타입
base_url = 'https://fbref.com/en/players/92e7e919/matchlogs/{year1}-{year2}/'
page_types = ['summary', 'passing', 'passing_types', 'gca', 'defense', 'possession']

all_data = defaultdict(dict)

# 연도 범위 설정 (2011-2024)
start_year = 2011
end_year = 2024

# 연도별로 순회
for year in range(start_year, end_year, 2):
    year1 = str(year)
    year2 = str(year + 1)
    print(year)
    for page in page_types:
        url = base_url.format(year1=year1, year2=year2) + page + '/Son-Heung-min-Match-Logs'
        driver.get(url)
        
        print(page)
        
        # 페이지 로드
        WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, 'matchlogs_all')))
        
        # 테이블 데이터 추출
        table = driver.find_element(By.ID, 'matchlogs_all')
        headers = [header.get_attribute('data-stat') for header in table.find_elements(By.XPATH, './/thead/tr/th')]
        rows = table.find_elements(By.XPATH, './/tbody/tr[@data-row]')
        
        for row in rows:
            row_data = {}
            date = row.find_element(By.TAG_NAME, 'th').text
            
            
            cells = row.find_elements(By.TAG_NAME, 'td')
            for cell in cells:
                data_stat = cell.get_attribute('data-stat')
                if 'pct' in data_stat or 'xg' in data_stat or 'xa' in data_stat:
                    continue
                row_data[data_stat] = cell.text
            
            all_data[date].update(row_data)
    
# JSON 파일로 저장
with open('son_heung_min_match_stats.json', 'w', encoding='utf-8') as file:
    json.dump(all_data, file, ensure_ascii=False, indent=4)

# 출력 (확인용)
print(json.dumps(all_data, ensure_ascii=False, indent=4))

# 브라우저 종료
driver.quit()


2011
summary
passing
passing_types
gca
defense
possession
2013
summary
passing
passing_types
gca
defense
possession
2015
summary
passing
passing_types
gca
defense
possession
2017
summary
passing
passing_types
gca
defense
possession
2019
summary
passing
passing_types
gca
defense
possession
2021
summary
passing
passing_types
gca
defense
possession
2023
summary
passing
passing_types
gca
defense
possession
{
    "2011-08-13": {
        "dayofweek": "Sat",
        "comp": "Bundesliga",
        "round": "Matchweek 2",
        "venue": "Home",
        "result": "D 2–2",
        "team": "Hamburger SV",
        "opponent": "Hertha BSC",
        "game_started": "Y",
        "position": "",
        "minutes": "90",
        "goals": "1",
        "assists": "",
        "pens_made": "0",
        "pens_att": "",
        "shots": "",
        "shots_on_target": "",
        "cards_yellow": "0",
        "cards_red": "0",
        "fouls": "",
        "fouled": "",
        "offsides": "",
        "crosses"

In [None]:
#년도간 누락필드 추가

import json

# JSON 파일 경로
json_file = 'son_heung_min_match_stats.json'

# JSON 파일 불러오기
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 추가할 필드들
fields_to_add = ['offsides', 'fouled', 'fouls', 'pens_conceded', 'pens_won', 'own_goals']

# 각 객체에 필드가 없으면 추가
for date, details in data.items():
    print(details)
    for field in fields_to_add:
        if field not in details:
            print(date)
            details[field] = ""

# 변경된 데이터를 JSON 파일로 저장
with open('stats_uni', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print(f"필드가 추가된 JSON 파일 '{json_file}'이(가) 업데이트 되었습니다.")

In [51]:

import json

# JSON 파일 로드
input_file = 'stats_uni.json'
with open(input_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

# 결과와 점수 변환
for date, match_data in data.items():
    if 'result' in match_data:
        result = match_data['result']
        if result.startswith('W'):
            match_data['result'] = 'Win'
        elif result.startswith('L'):
            match_data['result'] = 'Loose'
        elif result.startswith('D'):
            match_data['result'] = 'Drew'
        
        # 점수 분리
        if '–' in result:
            scores = result.split('–')
            match_data['win_score'] = scores[0].strip().replace('W ', '').replace('L ', '').replace('D ', '')
            match_data['loose_score'] = scores[1].strip()

# 예측 데이터 포함하는 키 삭제
data_list = []
for match_data in data:
    keys_to_remove = []
    for key in match_data:
        if isinstance(key, str) and ('xg' in key or 'pct' in key or 'xa' in key):
            keys_to_remove.append(key)
    
    for key in keys_to_remove:
        del match_data[key]
    
    data_list.append(match_data)


# 수정된 데이터를 새로운 JSON 파일에 저장
output_file = 'f_son_heung_min_match_stats.json'
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

print(f"수정된 데이터를 {output_file}에 저장하였습니다.")


수정된 데이터를 f_son_heung_min_match_stats.json에 저장하였습니다.


In [26]:
#JSON obj형태 파일을 리스트로 변환

import json

# 입력 및 출력 파일명 설정
output_file = 'l_son_heung_min_match_stats.json'
input_file = 'f_son_heung_min_match_stats.json'

# JSON 파일 로드
with open(input_file, 'r', encoding='utf-8') as f_input:
    data = json.load(f_input)

data_list = []
for date, match_data in data.items():
    match_data['date'] = date
    data_list.append(match_data)



# 리스트로 변환된 데이터를 JSON 파일에 저장
with open(output_file, 'w', encoding='utf-8') as f_output:
    json.dump(data_list, f_output, ensure_ascii=False, indent=4)

print(f"수정된 데이터를 {output_file}에 저장하였습니다.")

수정된 데이터를 son_heung_min_god.json에 저장하였습니다.


In [49]:
## 골 어시 데이터 추가


import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from collections import defaultdict
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

DesiredCapabilities.CHROME["pageLoadStrategy"] = "none"
# Selenium 설정
options = Options()
#options.add_argument('headless')
options.add_argument("disable-gpu")
options.add_argument("--blink-setting=imagesEnable=false")
driver = webdriver.Edge(options=options)

# 기본 URL과 페이지 타입
base_url = 'https://fbref.com/en/players/92e7e919/matchlogs/{year1}-{year2}/'
page_types = ['summary', 'passing', 'passing_types', 'gca', 'defense', 'possession']

all_data = defaultdict(dict)

# 연도 범위 설정 (2011-2024)
start_year = 2011
end_year = 2024


logs = {}

# 연도별로 순회
for year in range(start_year, end_year, 2):
    year1 = str(year)
    year2 = str(year + 1)
    
    url = base_url.format(year1=year1, year2=year2) + '/Son-Heung-min-Match-Logs'
    driver.get(url)
        
    # 테이블이 로드될 때까지 잠시 대기
    WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "#matchlogs_all > thead > tr.over_header > th:nth-child(1)")))
        
    # 테이블 추출
    table = driver.find_element(By.ID, 'matchlogs_all')
        
    # 테이블 데이터 추출
    rows = table.find_elements(By.XPATH, './/tbody/tr[@data-row]')
    
    
    
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, 'td')
        match_date = row.find_element(By.TAG_NAME, 'th').text
        match_log= []

        print(match_date)
        try :
            match_ref = row.find_element(By.CSS_SELECTOR, "td.left.group_start > a").get_attribute('href')

        except :
            continue
        
        script = f"window.open('{match_ref}');"
        driver.execute_script(script)
        
        driver.switch_to.window( driver.window_handles[1] )
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#events_wrap")))

        match = driver.find_element(By.CSS_SELECTOR, '#events_wrap > div')
        match_data = match.find_elements(By.XPATH, 'div')

        for element in match_data:
            if 'Son Heung-min' in element.text:
                event_type = element.find_element(By.CSS_SELECTOR, 'div:nth-child(2) > div').get_attribute('class').replace('event_icon ', '')
                event_time = element.find_element(By.CSS_SELECTOR, 'div:nth-child(1)').text
                try:
                    small = element.find_element(By.CSS_SELECTOR, 'div:nth-child(2) > small').text
                    if small == 'Assist: Son Heung-min':
                        event_type = 'assist'
                    elif small == 'for Son Heung-min':
                        event_type = 'substitute_out'
                    elif small == 'Penalty Kick':
                        event_type == 'PK'
                except:
                    pass
                match_log.append({"time":event_time, "type":event_type})
        print(match_log)
            
        driver.close()
        logs[match_date] = match_log
        driver.switch_to.window( driver.window_handles[0] )
        
        
        
    
# JSON 파일로 저장
with open('match_logs.json', 'w', encoding='utf-8') as file:
    json.dump(logs, file, ensure_ascii=False, indent=4)

# 출력 (확인용)
print(json.dumps(logs, ensure_ascii=False, indent=4))

# 브라우저 종료
driver.quit()



TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF603FD8152+13538]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF603F61E09+595497]
	(No symbol) [0x00007FF603D7E6CF]
	(No symbol) [0x00007FF603DC2960]
	(No symbol) [0x00007FF603DC2A1F]
	(No symbol) [0x00007FF603DFD627]
	(No symbol) [0x00007FF603DE203F]
	(No symbol) [0x00007FF603DB8147]
	(No symbol) [0x00007FF603DFB1EE]
	(No symbol) [0x00007FF603DE1C63]
	(No symbol) [0x00007FF603DB766E]
	(No symbol) [0x00007FF603DB683C]
	(No symbol) [0x00007FF603DB7221]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF6041996F4+1099860]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF603EDD8FC+53532]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF603ED0E25+1605]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF604198685+1095653]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF603F6C981+27777]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF603F66D04+4100]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF603F66E3B+4411]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF603F5CFC0+575456]
	BaseThreadInitThunk [0x00007FFEAFEF257D+29]
	RtlUserThreadStart [0x00007FFEB0A6AF28+40]
