### transfermarkt 선수가치 크롤러

In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
from unidecode import unidecode

1. 포지션별 선수가치

In [14]:
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

position_urls = {
    'forwards': 'Sturm',
    'midfielders': 'Mittelfeld',
    'defenders': 'Abwehr',
    'goalkeepers': 'Torwart'
}

all_positions = []

for position_key, position_value in position_urls.items():
    for page in range(1, 21): 
        url = f'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop/plus/ausrichtung/{position_value}/spielerposition_id/alle/altersklasse/alle/jahrgang/0/land_id/0/kontinent_id/0/yt0/Anzeigen/0//page/{page}?ajax=yw1'
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        player_info = soup.find_all('tr', {'class': ['odd', 'even']})

        for info in player_info:
            player = info.find_all('td')
            all_positions.append({
                'name': unidecode(player[3].text.strip()),
                'position': player[4].text.strip(),
                'age': player[5].text.strip(),
                'nation': player[6].img['alt'] if player[6].find('img') else '',
                'club': player[7].img['alt'] if player[7].find('img') else '',
                'value': player[8].text.strip()
                })

positions_df = pd.DataFrame(all_positions)
positions_df = positions_df.drop_duplicates()

positions_df.to_csv('../data/tfm_positions.csv', index=False, encoding='utf-8-sig')
positions_df

Unnamed: 0,name,position,age,nation,club,value
0,Lamine Yamal,Right Winger,18,Spain,FC Barcelona,€200.00m
1,Erling Haaland,Centre-Forward,25,Norway,Manchester City,€180.00m
2,Kylian Mbappe,Centre-Forward,26,France,Real Madrid,€180.00m
3,Vinicius Junior,Left Winger,25,Brazil,Real Madrid,€170.00m
4,Bukayo Saka,Right Winger,23,England,Arsenal FC,€150.00m
...,...,...,...,...,...,...
1995,Brad Collins,Goalkeeper,28,England,Coventry City,€1.00m
1996,Juan Soriano,Goalkeeper,27,Spain,CD Leganés,€1.00m
1997,Alex Palmer,Goalkeeper,28,England,Ipswich Town,€1.00m
1998,Pierluigi Gollini,Goalkeeper,30,Italy,AS Roma,€1.00m


2. 연령대별 선수가치

In [3]:
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

age_groups = ['u21', '23-30', 'o34']
all_players = []

for age_group in age_groups:
    for page in range(1, 21):
        url = f'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop/plus/ausrichtung/alle/spielerposition_id/alle/altersklasse/{age_group}/jahrgang/0/land_id/0/kontinent_id/0/yt0/Anzeigen/0//page/{page}?ajax=yw1'
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        player_info = soup.find_all('tr', {'class': ['odd', 'even']})

        for info in player_info:
            player = info.find_all('td')
            if len(player) < 9:
                continue

            all_players.append({
                'name': unidecode(player[3].text.strip()),
                'position': player[4].text.strip(),
                'age': player[5].text.strip(),
                'nation': player[6].img['alt'] if player[6].find('img') else '',
                'club': player[7].img['alt'] if player[7].find('img') else '',
                'value': player[8].text.strip()
            })

        time.sleep(1) 

age_group_df = pd.DataFrame(all_players)
age_group_df = age_group_df.drop_duplicates(subset=['name', 'club']) 
age_group_df.to_csv('../data/tfm_age.csv', index=False, encoding='utf-8-sig')

age_group_df

Unnamed: 0,name,position,age,nation,club,value
0,Lamine Yamal,Right Winger,18,Spain,FC Barcelona,€200.00m
1,Desire Doue,Right Winger,20,France,Paris Saint-Germain,€90.00m
2,Pau Cubarsi,Centre-Back,18,Spain,FC Barcelona,€80.00m
3,Joao Neves,Defensive Midfield,20,Portugal,Paris Saint-Germain,€80.00m
4,Estevao,Right Winger,18,Brazil,Chelsea FC,€60.00m
...,...,...,...,...,...,...
1495,Ronivaldo,Centre-Forward,36,Austria,FC Blau-Weiss Linz,€350k
1496,Juanma Delgado,Centre-Forward,34,Spain,V-Varen Nagasaki,€350k
1497,Myung-joo Lee,Central Midfield,35,"Korea, South",Incheon United,€350k
1498,Michele Castagnetti,Defensive Midfield,35,Italy,US Cremonese,€350k


3. 대륙 별 선수가치

In [2]:
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

confederation_ids = {
    'UEFA': 1, # 유럽 축구 연맹
    'AFC': 2, # 아시아 축구 연맹
    'CAF': 3, # 아프리카 축구 연맹
    'CONCACAF': 4, # 북중미 축구 연맹
    'CONMEBOL': 5, # 남미 축구 연맹
    'OFC': 6 # 오세아니아 축구 연맹
}

confederation = []

for confederation_key, kontinent_id in confederation_ids.items():
    for page in range(1, 21):
        url = f'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop/plus/ausrichtung/alle/spielerposition_id/alle/altersklasse/alle/jahrgang/0/land_id/0/kontinent_id/{kontinent_id}/yt0/Anzeigen/0//page/{page}?ajax=yw1'
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        player_info = soup.find_all('tr', {'class': ['odd', 'even']})

        for info in player_info:
            player = info.find_all('td')
            confederation.append({
                'name': unidecode(player[3].text.strip()),
                'position': player[4].text.strip(),
                'age': player[5].text.strip(),
                'nation': player[6].find('img').get('alt') if player[6].find('img') else '',
                'club': player[7].find('img').get('alt') if player[7].find('img') else '',
                'value': player[8].text.strip(),
                'confederation': confederation_key
                })

confederation_df = pd.DataFrame(confederation)
confederation_df = confederation_df.drop_duplicates()

confederation_df.to_csv('../data/tfm_confederation.csv', index=False, encoding='utf-8-sig')
confederation_df

Unnamed: 0,name,position,age,nation,club,value,confederation
0,Kaoru Mitoma,Left Winger,28,Japan,Brighton & Hove Albion,€40.00m,UEFA
1,Min-jae Kim,Centre-Back,28,"Korea, South",Bayern Munich,€40.00m,UEFA
2,Abdukodir Khusanov,Centre-Back,21,Uzbekistan,Manchester City,€35.00m,UEFA
3,Takefusa Kubo,Right Winger,24,Japan,Real Sociedad,€30.00m,UEFA
4,Kang-in Lee,Right Winger,24,"Korea, South",Paris Saint-Germain,€25.00m,UEFA
...,...,...,...,...,...,...,...
2995,Illan Meslier,Goalkeeper,25,France,Leeds United,€16.00m,OFC
2996,Konstantin Tyukavin,Centre-Forward,23,Russia,Dynamo Moscow,€16.00m,OFC
2997,Finn Azaz,Attacking Midfield,24,Ireland,Middlesbrough FC,€16.00m,OFC
2998,Lloyd Kelly,Centre-Back,26,England,Juventus FC,€16.00m,OFC


4. 기타리그 별 선수가치

In [5]:
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

league_codes = {
    'MLS': 'MLS1',  # 미국 리그
    'Championship': 'GB2', # 잉글랜드 2부리그
    'Eredivisie': 'NL1',  # 네덜란드 리그
    'Süper Lig': 'TR1',  # 튀르키예 리그
    'Scottish Premiership': 'SC1',  # 스코틀랜드 리그
    'Swiss SuperLeague': 'C1',  # 스위스 리그
    'Austria Bundesliga': 'A1',  # 오스트리아 리그
    'Danish Superligaen': 'DK1',  # 덴마크 리그
    'Saudi ProLeague': 'SA1',  # 사우디아라비아 리그
    'Liga Portugal': 'PO1',  # 포르투갈 리그
    'Russian PremierLiga': 'RU1',  # 러시아 리그
    'Brasileiro Série A': 'BRA1',  # 브라질 리그
    'Liga Profesional': 'ARGC',  # 아르헨티나 리그
    'Liga MX Apertura': 'MEXA',  # 멕시코 리그
    'Jupiler ProLeague': 'BE1',  # 벨기에 리그
    'SuperLeague1 Greece': 'GR1',  # 그리스 리그
    'A-League Men': 'AUS1',  # 호주 리그
    'J1 League': 'JAP1',  # 일본 리그
    'K-League1': 'RSK1',  # K리그
    'CSL': 'CSL'  # 중국 리그
}

other_leagues = []

for league_name, code in league_codes.items():
    for page in range(1, 5):
        url = f'https://www.transfermarkt.com/jumplist/marktwerte/wettbewerb/{code}/page/{page}?ajax=yw1'
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        player_info = soup.find_all('tr', class_=['odd', 'even'])

        for info in player_info:
            try:
                inline_table = info.find('table', class_='inline-table')
                name = position = ''
                
                if inline_table:
                    trs = inline_table.find_all('tr')
                    name_tag = trs[0].find('a') if len(trs) > 0 else None
                    name = name_tag.text.strip() if name_tag else ''
                    name = unidecode(name)
                    
                    position_td = trs[1].find('td') if len(trs) > 1 else None
                    position = position_td.text.strip() if position_td else ''

                tds = info.find_all('td', class_='zentriert')
                age = nation = club = ''
                
                for td in tds:
                    if td.text.strip().isdigit():
                        age = td.text.strip()
                    elif td.find('img') and 'flaggenrahmen' in td.find('img').get('class', []):
                        nation_img = td.find('img')
                        nation = nation_img['alt'] if nation_img and nation_img.has_attr('alt') else ''
                    elif td.find('img'):
                        club_img = td.find('img')
                        club = club_img['alt'] if club_img and club_img.has_attr('alt') else ''

                value_td = info.find('td', class_='rechts hauptlink')
                value_tag = value_td.find('a') if value_td else None
                value = value_tag.text.strip() if value_tag else value_td.text.strip() if value_td else ''

                other_leagues.append({
                    'name': name,
                    'position': position,
                    'age': age,
                    'nation': nation,
                    'club': club,
                    'value': value,
                    'league': league_name
                })

            except Exception as e:
                print(f"[Error] {e}")

        time.sleep(1)

other_leagues_df = pd.DataFrame(other_leagues)
other_leagues_df = other_leagues_df[other_leagues_df['name'].str.strip() != '']

other_leagues_df.to_csv('../data/tfm_otherleagues.csv', index=False, encoding='utf-8-sig')
other_leagues_df

Unnamed: 0,name,position,age,nation,club,value,league
0,Riqui Puig,Central Midfield,25,Spain,Los Angeles Galaxy,€20.00m,MLS
1,Emmanuel Latte Lath,Centre-Forward,26,Cote d'Ivoire,Atlanta United FC,€18.00m,MLS
2,Lionel Messi,Right Winger,38,Argentina,Inter Miami CF,€18.00m,MLS
3,Evander,Attacking Midfield,27,Brazil,FC Cincinnati,€14.00m,MLS
4,Kevin Denkey,Centre-Forward,24,Togo,FC Cincinnati,€14.00m,MLS
...,...,...,...,...,...,...,...
2275,Jin Cheng,Attacking Midfield,30,China,Zhejiang FC,€350k,CSL
2276,Qianglong Tao,Centre-Forward,23,China,Zhejiang FC,€350k,CSL
2277,Shiqin Wang,Left-Back,22,China,Zhejiang FC,€350k,CSL
2278,Rodrigo Henrique,Left Winger,32,Brazil,Meizhou Hakka,€350k,CSL


#### 최종 병합

1. positions_df + age_group_df 병합과정

In [15]:
merged1_df = pd.concat([positions_df, age_group_df], ignore_index=True)

duplicate_rows = pd.merge(positions_df, age_group_df, on=['name', 'position', 'age', 'nation', 'club', 'value'])
print(f'positions_df 수: {len(positions_df)}, age_group_df 수: {len(age_group_df)}, 중복 데이터 수: {len(duplicate_rows)}')

merged1_df = merged1_df.drop_duplicates()
merged1_df

positions_df 수: 2000, age_group_df 수: 1500, 중복 데이터 수: 821


Unnamed: 0,name,position,age,nation,club,value
0,Lamine Yamal,Right Winger,18,Spain,FC Barcelona,€200.00m
1,Erling Haaland,Centre-Forward,25,Norway,Manchester City,€180.00m
2,Kylian Mbappe,Centre-Forward,26,France,Real Madrid,€180.00m
3,Vinicius Junior,Left Winger,25,Brazil,Real Madrid,€170.00m
4,Bukayo Saka,Right Winger,23,England,Arsenal FC,€150.00m
...,...,...,...,...,...,...
3495,Ronivaldo,Centre-Forward,36,Austria,FC Blau-Weiss Linz,€350k
3496,Juanma Delgado,Centre-Forward,34,Spain,V-Varen Nagasaki,€350k
3497,Myung-joo Lee,Central Midfield,35,"Korea, South",Incheon United,€350k
3498,Michele Castagnetti,Defensive Midfield,35,Italy,US Cremonese,€350k


2. (positions_df + age_group_df) + confederation_df 병합과정

In [16]:
# 인덱스 초기화
confederation_df = confederation_df.reset_index(drop=True)
merged1_df = merged1_df.reset_index(drop=True)

# 병합 기준 key 컬럼
key_cols = ['name', 'position', 'age', 'nation', 'club', 'value']

# 컬럼 타입 맞추기
for col in key_cols:
    if col in confederation_df.columns and col in merged1_df.columns:
        if col == 'age':
            confederation_df[col] = pd.to_numeric(confederation_df[col], errors='coerce')
            merged1_df[col] = pd.to_numeric(merged1_df[col], errors='coerce')
        else:
            confederation_df[col] = confederation_df[col].astype(str)
            merged1_df[col] = merged1_df[col].astype(str)

# 컬럼 순서 정렬
columns_order = ['name', 'position', 'age', 'nation', 'club', 'value', 'confederation', 'league']
for col in columns_order:
    if col not in confederation_df.columns:
        confederation_df[col] = None
    if col not in merged1_df.columns:
        merged1_df[col] = None
confederation_df = confederation_df[columns_order]
merged1_df = merged1_df[columns_order]

# 병합
merged2_df = pd.concat([confederation_df, merged1_df], ignore_index=True)

# 중복 데이터 찾기
duplicate2_rows = pd.merge(confederation_df, merged1_df, on=key_cols)
print(f'confederation_df 수: {len(confederation_df)}, pos+ages수: {len(merged1_df)}, 중복 데이터 수: {len(duplicate2_rows)}')

merged2_df = merged2_df.sort_values(by='confederation', ascending=False)
merged2_df = merged2_df.drop_duplicates(subset=key_cols)

merged2_df

confederation_df 수: 2871, pos+ages수: 2679, 중복 데이터 수: 1134


Unnamed: 0,name,position,age,nation,club,value,confederation,league
0,Kaoru Mitoma,Left Winger,28.0,Japan,Brighton & Hove Albion,€40.00m,UEFA,
330,So Kawahara,Defensive Midfield,27.0,Japan,Kawasaki Frontale,€850k,UEFA,
343,Ryuya Nishio,Centre-Back,24.0,Japan,Cerezo Osaka,€850k,UEFA,
342,Motohiko Nakajima,Second Striker,26.0,Japan,Cerezo Osaka,€850k,UEFA,
341,Hirokazu Ishihara,Right-Back,26.0,Japan,Urawa Red Diamonds,€850k,UEFA,
...,...,...,...,...,...,...,...,...
5545,Ronivaldo,Centre-Forward,36.0,Austria,FC Blau-Weiss Linz,€350k,,
5546,Juanma Delgado,Centre-Forward,34.0,Spain,V-Varen Nagasaki,€350k,,
5547,Myung-joo Lee,Central Midfield,35.0,"Korea, South",Incheon United,€350k,,
5548,Michele Castagnetti,Defensive Midfield,35.0,Italy,US Cremonese,€350k,,


3. (positions_df + age_group_df + confederation_df) + other_leagues_df 병합과정

In [17]:
# 인덱스 초기화
other_leagues_df = other_leagues_df.reset_index(drop=True)
merged2_df = merged2_df.reset_index(drop=True)

# 병합 기준 key 컬럼
key_cols = ['name', 'position', 'age', 'nation', 'club', 'value']

# 컬럼 타입 맞추기
for col in key_cols:
    if col in merged2_df.columns and col in other_leagues_df.columns:
        if col == 'age':
            merged2_df[col] = pd.to_numeric(merged2_df[col], errors='coerce')
            other_leagues_df[col] = pd.to_numeric(other_leagues_df[col], errors='coerce')
            
            merged2_df = merged2_df.dropna(subset=['age'])
            other_leagues_df = other_leagues_df.dropna(subset=['age'])
            merged2_df['age'] = merged2_df['age'].astype(int)
            other_leagues_df['age'] = other_leagues_df['age'].astype(int)
        else:
            merged2_df[col] = merged2_df[col].astype(str)
            other_leagues_df[col] = other_leagues_df[col].astype(str)

# 중복 데이터 찾기
duplicate3_rows = pd.merge(merged2_df, other_leagues_df, on=key_cols)
print(f'other_leagues_df 수: {len(other_leagues_df)}, (pos+ages+confs) 수: {len(merged2_df)}, 중복 데이터 수: {len(duplicate3_rows)}')

transfermarkt_df = pd.concat([merged2_df, other_leagues_df], ignore_index=True)
transfermarkt_df = transfermarkt_df.drop_duplicates(subset=key_cols)

transfermarkt_df.to_csv('../data/transfermarkt_data.csv', index=False, encoding='utf-8-sig')
transfermarkt_df

other_leagues_df 수: 2000, (pos+ages+confs) 수: 4371, 중복 데이터 수: 1042


Unnamed: 0,name,position,age,nation,club,value,confederation,league
0,Kaoru Mitoma,Left Winger,28,Japan,Brighton & Hove Albion,€40.00m,UEFA,
1,So Kawahara,Defensive Midfield,27,Japan,Kawasaki Frontale,€850k,UEFA,
2,Ryuya Nishio,Centre-Back,24,Japan,Cerezo Osaka,€850k,UEFA,
3,Motohiko Nakajima,Second Striker,26,Japan,Cerezo Osaka,€850k,UEFA,
4,Hirokazu Ishihara,Right-Back,26,Japan,Urawa Red Diamonds,€850k,UEFA,
...,...,...,...,...,...,...,...,...
6366,Jin Cheng,Attacking Midfield,30,China,Zhejiang FC,€350k,,CSL
6367,Qianglong Tao,Centre-Forward,23,China,Zhejiang FC,€350k,,CSL
6368,Shiqin Wang,Left-Back,22,China,Zhejiang FC,€350k,,CSL
6369,Rodrigo Henrique,Left Winger,32,Brazil,Meizhou Hakka,€350k,,CSL
