# Matching Transfermarkt Players with Football Manager Players

Our goal is to match every Transfermarkt player who was active in 2020 with the corresponding Football Manager 2020 (FM20) player.

In [17]:
from src.libs import *

# Read the five source CSVs in one statement; each name on the left is bound
# to the file at the same position in the tuple of paths below.
players_df, valuations_df, clubs_df, leagues_df, fm20_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/players2020.csv",
        "datasets/player_valuations.csv",
        "datasets/clubs.csv",
        "datasets/leagues.csv",
        "datasets/datafm20.csv",
    )
)

Let's have a look at the columns of both the FM20 dataframe and the players dataframe.

In [9]:
# Show the column names available in the FM20 dataframe.
fm20_df.columns

Index(['Unnamed: 0', 'Name', 'Position', 'Club', 'Division', 'Based', 'Nation',
       'Height', 'Weight', 'Age', 'Preferred Foot', 'Best Pos', 'Best Role',
       'Value', 'Wage', 'CA', 'PA', 'Wor', 'Vis', 'Thr', 'Tec', 'Tea', 'Tck',
       'Str', 'Sta', 'TRO', 'Ref', 'Pun', 'Pos', 'Pen', 'Pas', 'Pac', '1v1',
       'OtB', 'Nat', 'Mar', 'L Th', 'Lon', 'Ldr', 'Kic', 'Jum', 'Hea', 'Han',
       'Fre', 'Fla', 'Fir', 'Fin', 'Ecc', 'Dri', 'Det', 'Dec', 'Cro', 'Cor',
       'Cnt', 'Cmp', 'Com', 'Cmd', 'Bra', 'Bal', 'Ant', 'Agi', 'Agg', 'Aer',
       'Acc'],
      dtype='object')

In [10]:
# Show the column names available in the players dataframe.
players_df.columns

Index(['player_id', 'club_id', 'name', 'pretty_name', 'country_of_birth',
       'country_of_citizenship', 'date_of_birth', 'position', 'sub_position',
       'foot', 'height_in_cm', 'url'],
      dtype='object')

Some relevant features for matching are apparent (FM20 column first, with the corresponding Transfermarkt column or source in parentheses):
- Name (pretty_name)
- Position / Best Pos (position)
- Club (club_id)
- Based (according to the player club in 2020)
- Nation (country_of_birth or country_of_citizenship)
- Height (height_in_cm)
- Preferred Foot (foot)
- Age (can be calculated from the date_of_birth)

Beyond the player data itself, we only need the 2020 club information, which is available in the clubs dataframe:

In [11]:
# Show the column names available in the clubs dataframe.
clubs_df.columns

Index(['club_id', 'name', 'pretty_name', 'domestic_competition_id',
       'total_market_value', 'squad_size', 'average_age', 'foreigners_number',
       'foreigners_percentage', 'national_team_players', 'stadium_name',
       'stadium_seats', 'net_transfer_record', 'coach_name', 'url'],
      dtype='object')

Now let's merge the datasets of players and clubs

In [21]:
# Expose the readable names under the column names used by the merges:
# 'club' mirrors clubs_df['pretty_name'] and 'league' mirrors leagues_df['name'].
clubs_df = clubs_df.assign(club=clubs_df["pretty_name"])
leagues_df = leagues_df.assign(league=leagues_df["name"])

# Attach the club name and its domestic competition id to every player.
# merge() defaults to an inner join, so only players whose club_id is present
# in clubs_df are kept.
player_clubs_df = pd.merge(
    players_df,
    clubs_df.loc[:, ["club_id", "club", "domestic_competition_id"]],
    on="club_id",
)

player_clubs_df.head()

Unnamed: 0,player_id,club_id,name,pretty_name,country_of_birth,country_of_citizenship,date_of_birth,position,sub_position,foot,height_in_cm,url,club,domestic_competition_id
0,25727,16,lukasz-piszczek,Lukasz Piszczek,Poland,Poland,1985-06-03,Defender,Right-Back,Right,184,https://www.transfermarkt.co.uk/lukasz-piszcze...,Borussia Dortmund,L1
1,91849,16,thomas-delaney,Thomas Delaney,Denmark,Denmark,1991-09-03,Midfield,Defensive Midfield,Left,182,https://www.transfermarkt.co.uk/thomas-delaney...,Borussia Dortmund,L1
2,59027,16,roman-burki,Roman Burki,Switzerland,Switzerland,1990-11-14,Goalkeeper,,Right,187,https://www.transfermarkt.co.uk/roman-burki/pr...,Borussia Dortmund,L1
3,388513,16,mateu-morey-bauza,Mateu Morey Bauza,Spain,Spain,2000-03-02,Defender,Right-Back,Right,173,https://www.transfermarkt.co.uk/mateu-morey-ba...,Borussia Dortmund,L1
4,274461,16,felix-passlack,Felix Passlack,Germany,Germany,1998-05-29,Defender,Right-Back,Right,170,https://www.transfermarkt.co.uk/felix-passlack...,Borussia Dortmund,L1


We can also use the domestic_competition_id to get the name of the competition

In [29]:
# Resolve each player's competition id to the league name. The key has a
# different column name on each side, hence left_on/right_on; the join is the
# default inner join, so both key columns appear in the result.
player_matching_df = pd.merge(
    player_clubs_df,
    leagues_df.loc[:, ["league_id", "league"]],
    left_on="domestic_competition_id",
    right_on="league_id",
)

player_matching_df.head()

Unnamed: 0,player_id,club_id,name,pretty_name,country_of_birth,country_of_citizenship,date_of_birth,position,sub_position,foot,height_in_cm,url,club,domestic_competition_id,league_id,league
0,25727,16,lukasz-piszczek,Lukasz Piszczek,Poland,Poland,1985-06-03,Defender,Right-Back,Right,184,https://www.transfermarkt.co.uk/lukasz-piszcze...,Borussia Dortmund,L1,L1,bundesliga
1,91849,16,thomas-delaney,Thomas Delaney,Denmark,Denmark,1991-09-03,Midfield,Defensive Midfield,Left,182,https://www.transfermarkt.co.uk/thomas-delaney...,Borussia Dortmund,L1,L1,bundesliga
2,59027,16,roman-burki,Roman Burki,Switzerland,Switzerland,1990-11-14,Goalkeeper,,Right,187,https://www.transfermarkt.co.uk/roman-burki/pr...,Borussia Dortmund,L1,L1,bundesliga
3,388513,16,mateu-morey-bauza,Mateu Morey Bauza,Spain,Spain,2000-03-02,Defender,Right-Back,Right,173,https://www.transfermarkt.co.uk/mateu-morey-ba...,Borussia Dortmund,L1,L1,bundesliga
4,274461,16,felix-passlack,Felix Passlack,Germany,Germany,1998-05-29,Defender,Right-Back,Right,170,https://www.transfermarkt.co.uk/felix-passlack...,Borussia Dortmund,L1,L1,bundesliga


Notes on missing values in the Transfermarkt data:

- `market_value_in_gbp` is the critical feature, so we will try to fill its 7152 missing records using the player valuations dataframe.
- Check the one record that has no country of citizenship. If it also has no country of birth, it will be removed.
- The 26 records without a date of birth will be kept, as this information is also available in the FM20 dataset (if the player gets matched).
- Sub-position is not a critical feature, so it is acceptable for it to be null.
- The 1884 records without a preferred foot will be kept, as this information is also available in the FM20 dataset (if the player gets matched).



In [30]:
# Preview the first rows of the FM20 dataframe.
fm20_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Position,Club,Division,Based,Nation,Height,Weight,Age,...,Cmp,Com,Cmd,Bra,Bal,Ant,Agi,Agg,Aer,Acc
0,0,Lionel Messi,"AM (RC), ST (C)",Barcelona,Spanish First Division,Spain (First Division),ARG,170,72,32,...,18,2,2,10,19,19,19,7,2,18
1,1,Cristiano Ronaldo,"AM (RL), ST (C)",Juventus,Italian Serie A,Italy (Serie A),POR,185,83,34,...,14,4,1,16,14,18,13,6,2,15
2,2,Kylian MbappÃ©,"AM (RL), ST (C)",Paris SG,Ligue 1 Conforama,France (Ligue 1 Conforama),FRA,178,73,20,...,18,2,1,12,14,17,16,6,4,20
3,3,Manuel Neuer,GK,FC Bayern,Bundesliga,Germany (Bundesliga),GER,192,90,33,...,20,12,17,14,14,19,15,8,14,13
4,4,Neymar,"M (L), AM (LC), ST (C)",Paris SG,Ligue 1 Conforama,France (Ligue 1 Conforama),BRA,175,68,27,...,17,3,1,14,14,16,18,10,4,17
