### 주제: 5대 리그 (PL, LaLiga, Serie A, Bundesliga, Ligue 1) 내 이적시장 목록 크롤링 및 자동화

In [28]:
import re
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

In [137]:
league_code = ['GB1', 'ES1', 'IT1', 'L1', 'FR1']    # PL, LaLiga, Serie A, Bundesliga, Ligue 1

transfer_info = []      # 이적 정보 추가할 리스트 생성

for league in league_code:
    start_page = 1      # 1 페이지부터 크롤링 시작

    while True:
        url = f'https://www.transfermarkt.com/transfers/neuestetransfers/statistik/plus/plus/1/galerie/0/wettbewerb_id/{league}/land_id//minMarktwert/500.000/maxMarktwert/500.000.000/minAbloese/0/maxAbloese/500.000.000/yt0/Show/page/{start_page}?ajax=yw1'
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        transfer_lists = soup.select('tr.odd, tr.even')
        end_page = int(soup.select('.tm-pagination__list-item > a')[-1].attrs['href'].split('/')[-1])   # league 별 이적시장 끝 페이지

        for transfer_list in transfer_lists:
            info = transfer_list.select('td')

            player_id = info[2].select_one('a').attrs['href'].split('/')[-1]
            name = info[2].text.strip()
            position = info[3].text.strip()
            age = info[4].text.strip()
            nation = info[5].img['alt']
            left = info[6].img['alt']
            joined = info[10].img['alt']
            transfer_date = info[14].text.strip()
            market_value = info[15].text.strip()
            fee = info[16].text.strip()
            transfer_id = info[16].select_one('a').attrs['href'].split('/')[-1]

            info_list = [player_id, name, position, age, nation, left, joined, transfer_date, market_value, fee, transfer_id]
            transfer_info.append(info_list)

        start_page += 1     # 다음 페이지 crawling
        if start_page > int(end_page):  # 마지막 페이지 도달 시 break
            break
        

In [135]:
np.array(transfer_info).shape

(453, 11)

In [138]:
df = pd.DataFrame(transfer_info, columns=['Player Id', 'Name', 'Position', 'Age', 'Nation', 'Left', 'Joined', 'Transfer Date', 'Market Value', 'Fee', 'Transfer Id'])
df

Unnamed: 0,Player Id,Name,Position,Age,Nation,Left,Joined,Transfer Date,Market Value,Fee,Transfer Id
0,557459,Micky van de Ven,Centre-Back,22,Netherlands,VfL Wolfsburg,Tottenham Hotspur,"Aug 8, 2023",€30.00m,€40.00m,4569107
1,503981,Tino Livramento,Right-Back,20,England,Southampton FC,Newcastle United,"Aug 8, 2023",€25.00m,€37.20m,4568858
2,401604,Michal Karbownik,Left-Back,22,Poland,Brighton & Hove Albion,Hertha BSC,"Aug 8, 2023",€3.00m,?,4568430
3,532541,Luke Cundle,Central Midfield,21,England,Wolverhampton Wanderers,Plymouth Argyle,"Aug 7, 2023",€3.00m,loan transfer,4567610
4,285845,Ainsley Maitland-Niles,Central Midfield,25,England,Arsenal FC,Olympique Lyon,"Aug 7, 2023",€8.00m,free transfer,4567252
...,...,...,...,...,...,...,...,...,...,...,...
448,494236,Andy Pelmard,Centre-Back,23,France,FC Basel 1893,Clermont Foot 63,"Jul 12, 2023",€4.30m,€1.90m,4497672
449,279570,Daler Kuzyaev,Central Midfield,30,Russia,Zenit St. Petersburg,Le Havre AC,"Jul 12, 2023",€10.00m,free transfer,4470357
450,804664,Elies Mahmoud,Right Midfield,22,France,Le Havre AC,FC Stade-Lausanne-Ouchy,"Jul 11, 2023",€800k,?,4496881
451,229006,Franck Honorat,Right Winger,26,France,Stade Brestois 29,Borussia Mönchengladbach,"Jul 11, 2023",€10.00m,€8.00m,4496080


### 데이터 전처리

In [139]:
# 5대 리그 사이에서 이적이 발생할 경우, 중복 발생하므로 Transfer Id와 Name 기준으로 중복 제거 후 reindexing
df.drop_duplicates(['Transfer Id'], inplace=True)
df.drop_duplicates(['Name'], inplace=True)
df.reset_index(inplace=True, drop=True)

In [140]:
# Player Id, Age, Transfer Id 컬럼 정수형으로 변환
df = df.astype({'Player Id': 'int', 'Age': 'int', 'Transfer Id': 'int'})

In [141]:
df.dtypes

Player Id         int32
Name             object
Position         object
Age               int32
Nation           object
Left             object
Joined           object
Transfer Date    object
Market Value     object
Fee              object
Transfer Id       int32
dtype: object

In [142]:
# 날짜 형식 변환 및 날짜 순으로 정렬
df['Transfer Date'] = pd.to_datetime(df['Transfer Date'])
df.sort_values(by='Transfer Date', ascending=False, inplace=True)

In [143]:
# 시장 가치 데이터 변환
df['Market Value'] = df['Market Value'].apply(lambda x: re.search('[0-9\.]{1,}[mk]', x).group())
df['Market Value'] = df['Market Value'].apply(lambda x: int(float(str(x)[:-1])*10e6) if str(x)[-1] == 'm' else int(float(str(x)[:-1])*10e3))

In [144]:
# 임대/이적 구분
df['Transfer Type'] = df['Fee'].apply(lambda x: 'loan' if re.search('[lL]oan.+', x) else 'unknown' if x == 'unknown' else 'transfer')

In [145]:
# 비용 데이터 변환
df['Fee'] = df['Fee'].apply(lambda x: re.search('[0-9\.]{1,}[mk]', x).group() if re.search('[0-9\.]{1,}[mk]', x) else np.NaN if x == 'unknown' else 0)
df['Fee'] = df['Fee'].apply(lambda x: int(float(str(x)[:-1])*10e6) if str(x)[-1] == 'm' else int(float(str(x)[:-1])*10e3) if str(x)[-1] == 'k'else x)

In [146]:
# Market Value, Fee column명 변경
df.rename(columns={'Market Value': 'Market Value (€)', 'Fee': 'Fee (€)'}, inplace=True)

In [147]:
df

Unnamed: 0,Player Id,Name,Position,Age,Nation,Left,Joined,Transfer Date,Market Value (€),Fee (€),Transfer Id,Transfer Type
0,557459,Micky van de Ven,Centre-Back,22,Netherlands,VfL Wolfsburg,Tottenham Hotspur,2023-08-08,300000000,400000000,4569107,transfer
94,68645,Martín Montoya,Right-Back,32,Spain,Real Betis Balompié,Without Club,2023-08-08,12000000,0,4568782,transfer
321,540664,Dilane Bakwa,Left Winger,20,France,FC Girondins Bordeaux,RC Strasbourg Alsace,2023-08-08,40000000,0,4568918,transfer
269,338668,Senne Lynen,Central Midfield,24,Belgium,Royale Union Saint Gilloise,SV Werder Bremen,2023-08-08,40000000,20000000,4568958,transfer
1,503981,Tino Livramento,Right-Back,20,England,Southampton FC,Newcastle United,2023-08-08,250000000,372000000,4568858,transfer
...,...,...,...,...,...,...,...,...,...,...,...,...
92,396638,Manor Solomon,Left Winger,24,Israel,Shakhtar Donetsk,Tottenham Hotspur,2023-07-11,180000000,0,4495375,transfer
172,331498,Jordi Mboula,Right Winger,24,Spain,RCD Mallorca,Hellas Verona,2023-07-11,12000000,0,4494446,transfer
171,215099,Daley Sinkgraven,Left-Back,28,Netherlands,Bayer 04 Leverkusen,UD Las Palmas,2023-07-11,25000000,0,4495634,transfer
170,65467,Vicente Iborra,Defensive Midfield,35,Spain,Villarreal CF,Olympiacos Piraeus,2023-07-11,12000000,0,4495715,transfer


In [148]:
import pymysql

In [149]:
def read_config(p:str) -> dict:
    """
    p: config file 경로
    """
    with open(p, 'r') as f:
        lines = f.readlines()

    config_dict = {}
    for l in lines:
        idx = l.index('=')
        k = l[:idx]
        v = l[idx+1:]
        config_dict[k] = v.strip()
    config_dict["port"] = 3307
    
    return config_dict

In [150]:
db_config = read_config('./db_config')
db_config

{'host': 'localhost',
 'port': 3307,
 'user': 'leoni',
 'password': 'password',
 'database': 'transfer_market',
 'charset': 'utf8mb4'}

In [153]:
try:
    conn = pymysql.connect(**db_config)
    # conn = pymysql.connect(**config_dict)
    print("연결 성공")
except Exception as e:
    print("연결 실패", e)

연결 성공


In [157]:
# 10분마다 반복
from crawler import crawler
from preprocessing import preprocessing
import time

# df_new = crawler()
# df_new = preprocessing(df_new)
# while True:
    # 1. 크롤링
    df_new = crawler()
    df_new = preprocessing(df_new)
    # 2. 신규 아이디 체크
    for transfer_id in df_new['Transfer Id']:
        # 신규 이적이 발생했을때,
        if transfer_id not in list(df['Transfer Id']):
            with conn.cursor() as cur:
                # Player table에 선수가 있는지 먼저 체크 (playerid)
                for player_id in df_new['Player Id']:
                    # 있으면 pass
                    if player_id in list(df['Player Id']):
                        pass
                    # 없으면 Transfer Id 값에 맞게 Player table에 선수 정보 insert
                    else:
                        sql_player = 'INSERT INTO `Player` (`PlayerId`, `PlayerName`, `Age`, `Nation`, `Position`, `MarketValue`) VALUES (%s, %s, %s, %s, %s, %s)'
                        cur.executemany(sql_player, df[df_new['Transfer Id']==transfer_id][['Player Id', 'Name', 'Age', 'Nation', 'Position', 'Market Value (€)']].values.tolist() )
                    # Transfer table에 신규 이적 정보 insert
                    sql_transfer = 'INSERT INTO `Transfer` (`TransferId`, `PlayerId`, `PlayerName`, `TransferDate`, `Fee`, `Left`, `Joined`, `TransferType`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                    cur.executemany(sql_transfer, df[df_new['Transfer Id']==transfer_id][['Transfer Id', 'Player Id', 'Name', 'Transfer Date', 'Fee (€)', 'Left', 'Joined', 'Transfer Type']].values.tolist() )
            conn.commit()
        # 신규 이적 없을 시, break
        else:
            break
    # 10분 뒤 다시 신규 이적 발생 여부 check
    time.sleep(600)

    df_new

KeyboardInterrupt: 

In [None]:
df_new['Transfer Id']

0      4568430
25     4567479
88     4567324
87     4567542
70     4566134
        ...   
49     4545177
104    4544810
85     4535848
84     4536390
86     4534738
Name: Transfer Id, Length: 105, dtype: int32

In [27]:
for i in df_new['Transfer Id']:
    print(i in list(df['Transfer Id']))

NameError: name 'df_new' is not defined

In [22]:
df_new['Transfer Id']

25    4563677
89    4564271
71    4561536
27    4561082
50    4562414
       ...   
84    4536390
85    4535848
86    4534738
88    4472618
87    4534426
Name: Transfer Id, Length: 106, dtype: int32

In [None]:
df[df['Transfer Id']==transfer_id][['Player Id', 'Name', 'Age', 'Nation', 'Position', 'Market Value (€)']].values.tolist()

In [None]:
df_new