### 전처리 코드
1. 대륙별 축구연맹 결측치 채우기
2. 국가명 표기 통일
3. 선수 포지션 표기 통일
4. 선수 몸값 (유로 -> 원화 표기변경)
5. 팀 표기명 하나로 통일

In [16]:
import pandas as pd
import numpy as np
from unidecode import unidecode

In [6]:
# Load the player table from CSV; 'utf-8-sig' drops a leading BOM if the file has one.
player_df = pd.read_csv(
    '../data/transfermarkt_data.csv',
    encoding='utf-8-sig',
)
player_df

Unnamed: 0,name,position,age,nation,club,value,confederation,league
0,Kaoru Mitoma,Left Winger,28,Japan,Brighton & Hove Albion,€40.00m,UEFA,
1,So Kawahara,Defensive Midfield,27,Japan,Kawasaki Frontale,€850k,UEFA,
2,Ryuya Nishio,Centre-Back,24,Japan,Cerezo Osaka,€850k,UEFA,
3,Motohiko Nakajima,Second Striker,26,Japan,Cerezo Osaka,€850k,UEFA,
4,Hirokazu Ishihara,Right-Back,26,Japan,Urawa Red Diamonds,€850k,UEFA,
...,...,...,...,...,...,...,...,...
5324,Jin Cheng,Attacking Midfield,30,China,Zhejiang FC,€350k,,CSL
5325,Qianglong Tao,Centre-Forward,23,China,Zhejiang FC,€350k,,CSL
5326,Shiqin Wang,Left-Back,22,China,Zhejiang FC,€350k,,CSL
5327,Rodrigo Henrique,Left Winger,32,Brazil,Meizhou Hakka,€350k,,CSL


In [None]:
# Per-column overview: number of distinct values, dtypes, and number of missing values.
print(
    player_df.nunique(),
    player_df.dtypes,
    player_df.isna().sum(),
    sep='\n',
)

name             5283
position           16
age                26
nation            145
club              759
value             108
confederation       6
league             20
dtype: int64
name             object
position         object
age               int64
nation           object
club             object
value            object
confederation    object
league           object
dtype: object
name                0
position            0
age                 0
nation              0
club                0
value               0
confederation    2503
league           4371
dtype: int64


In [17]:
# Run every player name through unidecode (ASCII transliteration), element by element.
player_df['name'] = player_df['name'].map(unidecode)

In [None]:
# Nations grouped by continental football confederation.

# UEFA (Europe)
UEFA = [ "Albania", "Andorra", "Armenia", "Austria", "Azerbaijan",
    "Belarus", "Belgium", "Bosnia-Herzegovina", "Bulgaria",
    "Croatia", "Cyprus", "Czech Republic", "Denmark", "England",
    "Estonia", "Faroe Islands", "Finland", "France", "Georgia",
    "Germany", "Gibraltar", "Greece", "Hungary", "Iceland",
    "Israel", "Italy", "Kazakhstan", "Kosovo", "Latvia", "Liechtenstein",
    "Lithuania", "Luxembourg", "Malta", "Moldova", "Monaco",
    "Montenegro", "Netherlands", "North Macedonia", "Northern Ireland",
    "Norway", "Poland", "Portugal", "Republic of Ireland", "Romania",
    "Russia", "San Marino", "Scotland", "Serbia", "Slovakia",
    "Slovenia", "Spain", "Sweden", "Switzerland", "Türkiye",
    "Ukraine", "Wales" ]

# AFC (Asia)
AFC = [ "Afghanistan", "Australia", "Bahrain", "Bangladesh", "Bhutan",
    "Brunei", "Cambodia", "China", "Chinese Taipei", "Guam",
    "Hong Kong", "India", "Indonesia", "Iran", "Iraq",
    "Japan", "Jordan", "Kuwait", "Kyrgyzstan", "Laos",
    "Lebanon", "Macau", "Malaysia", "Maldives", "Mongolia",
    "Myanmar", "Nepal", "Korea, North", "Oman", "Pakistan",
    "Palestine", "Philippines", "Qatar", "Saudi Arabia", "Singapore",
    "Korea, South", "Sri Lanka", "Syria", "Tajikistan", "Thailand",
    "Timor-Leste", "Turkmenistan", "United Arab Emirates", "Uzbekistan", "Vietnam",
    "Yemen" ]

# CAF (Africa)
CAF = [ "Algeria", "Angola", "Benin", "Botswana", "Burkina Faso",
    "Burundi", "Cabo Verde", "Cameroon", "Central African Republic", "Chad",
    "Comoros", "Congo", "Democratic Republic of the Congo", "Djibouti", "Egypt",
    "Equatorial Guinea", "Eritrea", "Eswatini", "Ethiopia", "Gabon",
    "Gambia", "Ghana", "Guinea", "Guinea-Bissau", "Ivory Coast",
    "Kenya", "Lesotho", "Liberia", "Libya", "Madagascar",
    "Malawi", "Mali", "Mauritania", "Mauritius", "Morocco",
    "Mozambique", "Namibia", "Niger", "Nigeria", "Rwanda",
    "São Tomé and Príncipe", "Senegal", "Seychelles", "Sierra Leone", "Somalia",
    "South Africa", "South Sudan", "Sudan", "Tanzania", "Togo",
    "Tunisia", "Uganda", "Zambia", "Zimbabwe" ]

# CONCACAF (North/Central America and the Caribbean)
CONCACAF = [ "Anguilla", "Antigua and Barbuda", "Aruba", "Bahamas", "Barbados",
    "Belize", "Bermuda", "Bonaire", "British Virgin Islands", "Canada",
    "Cayman Islands", "Costa Rica", "Cuba", "Curacao", "Dominica",
    "Dominican Republic", "El Salvador", "French Guiana", "Grenada", "Guadeloupe",
    "Guatemala", "Guyana", "Haiti", "Honduras", "Jamaica",
    "Martinique", "Mexico", "Montserrat", "Nicaragua", "Panama",
    "Puerto Rico", "Saint Kitts and Nevis", "Saint Lucia", "Saint Martin", "Saint Vincent and the Grenadines",
    "Sint Maarten", "Suriname", "Trinidad and Tobago", "Turks and Caicos Islands", "United States",
    "US Virgin Islands" ]

# CONMEBOL (South America)
CONMEBOL = [ "Argentina", "Bolivia", "Brazil", "Chile", "Colombia",
    "Ecuador", "Paraguay", "Peru", "Uruguay", "Venezuela" ]

# OFC (Oceania)
OFC = [ "American Samoa", "Cook Islands", "Fiji", "New Caledonia", "New Zealand",
    "Papua New Guinea", "Samoa", "Solomon Islands", "Tahiti", "Tonga",
    "Vanuatu" ]

# Confederation code -> list of member nation names. Insertion order is the
# lookup priority used by get_confederation.
confederation_map = {
    "UEFA": UEFA,
    "AFC": AFC,
    "CAF": CAF,
    "CONCACAF": CONCACAF,
    "CONMEBOL": CONMEBOL,
    "OFC": OFC
}


def _fold_nation(text):
    """Return *text* trimmed, with accents removed and case folded, for loose comparison."""
    import unicodedata  # function-scope import so the cell needs no extra top-level import

    decomposed = unicodedata.normalize("NFKD", text.strip())
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch)).casefold()


def get_confederation(nation):
    """Return the confederation code for a nation name.

    Args:
        nation: Nation name. Surrounding whitespace is ignored. Values that
            are not strings (e.g. NaN or None from a missing cell) are
            accepted and treated as unmatched.

    Returns:
        A key of ``confederation_map`` ('UEFA', 'AFC', ...), or 'Unknown'
        when the name is not listed under any confederation.
    """
    # A missing cell reaches here as float NaN / None; it has no .strip().
    if not isinstance(nation, str):
        return 'Unknown'

    name = nation.strip()

    # First pass: exact spelling. The lists are read on every call, so edits
    # made to them later in the notebook are picked up.
    for confed, country_list in confederation_map.items():
        if name in country_list:
            return confed

    # Second pass, only for names that missed: ignore accents and letter case
    # so that e.g. "Turkiye" or "Curaçao" still resolve.
    folded = _fold_nation(name)
    for confed, country_list in confederation_map.items():
        if any(_fold_nation(country) == folded for country in country_list):
            return confed

    return 'Unknown'

# Derive the confederation of every row from its nation. The whole column is
# reassigned, so values already present in it are replaced as well.
player_df['confederation'] = player_df['nation'].map(get_confederation)
player_df

Unnamed: 0,name,position,age,nation,club,value,confederation,league
0,Kaoru Mitoma,Left Winger,28,Japan,Brighton & Hove Albion,€40.00m,AFC,
1,So Kawahara,Defensive Midfield,27,Japan,Kawasaki Frontale,€850k,AFC,
2,Ryuya Nishio,Centre-Back,24,Japan,Cerezo Osaka,€850k,AFC,
3,Motohiko Nakajima,Second Striker,26,Japan,Cerezo Osaka,€850k,AFC,
4,Hirokazu Ishihara,Right-Back,26,Japan,Urawa Red Diamonds,€850k,AFC,
...,...,...,...,...,...,...,...,...
5324,Jin Cheng,Attacking Midfield,30,China,Zhejiang FC,€350k,AFC,CSL
5325,Qianglong Tao,Centre-Forward,23,China,Zhejiang FC,€350k,AFC,CSL
5326,Shiqin Wang,Left-Back,22,China,Zhejiang FC,€350k,AFC,CSL
5327,Rodrigo Henrique,Left Winger,32,Brazil,Meizhou Hakka,€350k,CONMEBOL,CSL


In [21]:
# Re-check missing values per column and the row count per confederation.
print(
    player_df.isna().sum(),
    player_df['confederation'].value_counts(),
    sep='\n',
)


name                0
position            0
age                 0
nation              0
club                0
value               0
confederation       0
league           4371
dtype: int64
confederation
UEFA        2310
CONMEBOL     859
AFC          642
CONCACAF     466
CAF          463
OFC          452
Unknown      137
Name: count, dtype: int64


In [None]:
## Data preprocessing - position
#
# Each list collects the raw spellings (long labels and short codes) that are
# treated as one position. The player table labels midfield roles in the
# singular ('Attacking Midfield', 'Defensive Midfield'), so the singular
# spellings are listed next to the plural ones; they are appended at the end
# so that existing entries keep their positions.
# NOTE(review): 'Winger' and 'Full-Back' appear only in the broad groups and in
# no detailed group below - confirm whether that is intended.

# Broad position groups
Forwards = ['Centre-Forward', 'Second Striker', 'Winger','Right Winger','Left Winger','ST','CF','LW','RW']
Midfielders = ['Attacking Midfielders','Right Midfielders','Left Midfielders','Central Midfielders','Defensive Midfielders','MOC','CM','CDM','LM','RM',
               'Attacking Midfield','Right Midfield','Left Midfield','Central Midfield','Defensive Midfield']
Defenders = ['Centre-Back','Full-Back','Right-Back','Left-Back','Sweeper','CB','RB','LB']
Goalkeepers = ['Goalkeeper','GK']

# Detailed positions
Striker = ['Centre-Forward','Second Striker','ST','CF']
Right_Winger = ['Right Winger','RW']
Left_Winger = ['Left Winger','LW']

Attacking_Midfielder = ['Attacking Midfielders','MOC','Attacking Midfield']
Central_Midfielder = ['Central Midfielders','CM','Central Midfield']
Side_Midfielder = ['Right Midfielders','Left Midfielders','LM','RM','Right Midfield','Left Midfield']
Defensive_Midfielder = ['Defensive Midfielders','CDM','Defensive Midfield']

Centre_Back = ['Centre-Back','Sweeper','CB']
Right_Back = ['Right-Back','RB']
Left_Back = ['Left-Back','LB']

Goalkeeper = ['Goalkeeper','GK']