In [1]:
# Import all src.libs

from src.libs import *

In [2]:
# Load the three raw CSV exports (FM20 attributes, Transfermarkt players, Transfermarkt valuation history)

fm20_dataset, transfermarkt_players_dataset, transfermarkt_valuations_dataset = (
    pd.read_csv(PROJECT_FOLDER + relative_path)
    for relative_path in (
        "datasets/datafm20.csv",
        "datasets/players.csv",
        "datasets/player_valuations.csv",
    )
)

In [3]:
# Prepare the two base player frames (Transfermarkt and FM20) that are merged further down

# Transfermarkt side: outfield players born in 1992 or later whose last recorded season is 2021 or later.
# Every step returns a new frame, so transfermarkt_players_dataset itself is left untouched.
transfermarkt_base_players_df = (
    transfermarkt_players_dataset
    .query("last_season >= 2021 & date_of_birth >= '1992-01-01' & position != 'Goalkeeper'")
    # dropna() returns a new frame; the result has to be kept, otherwise players
    # without a market value stay in the data set.
    .dropna(subset=['market_value_in_gbp'])
    .sort_values(by='market_value_in_gbp', ascending=False)
)

# FM20 side: outfield players aged 27 or younger, most valuable first.
fm20_base_players_df = (
    fm20_dataset
    .query("Age <= 27 & `Best Pos` != 'GK'")
    .sort_values(by='Value', ascending=False)
)

In [4]:
# Map country_of_citizenship (Transfermarkt) to the nation codes used by FM20

# Take up to five players per FM20 nation and derive the name key used to look them up in Transfermarkt.
# assign() builds the column on a fresh frame instead of writing into a column selection of another frame.
fm20_name_nation_mapping = fm20_dataset.groupby('Nation').head(5)[['Name', 'Nation']].assign(
    Transfermarkt_Name=lambda sample: sample['Name'].apply(name_to_id).apply(lambda x: x.replace('-scaron-', 's'))
)

transfermarkt_name_nation_mapping = transfermarkt_players_dataset[['name', 'country_of_citizenship']]

# One row per FM20 nation: the citizenship of the first sampled player that was found in Transfermarkt.
player_nationalities_mapping = fm20_name_nation_mapping.merge(
    transfermarkt_name_nation_mapping, left_on='Transfermarkt_Name', right_on='name'
).drop_duplicates(subset='Nation')

# Attach the Transfermarkt citizenship to every FM20 base player.
# The merge is an inner join, so FM20 players whose nation has no mapping are dropped here.
# A country_of_citizenship column left over from a previous run of this cell is removed first;
# without that, re-running the cell would produce country_of_citizenship_x/_y columns.
fm20_base_players_df = fm20_base_players_df.drop(columns='country_of_citizenship', errors='ignore').merge(
    player_nationalities_mapping[['Nation', 'country_of_citizenship']], on='Nation'
)

In [5]:
# Derive, for every FM20 player, the name key that is compared with the Transfermarkt `name` column

fm20_base_players_df = fm20_base_players_df.assign(
    Transfermarkt_Name=fm20_base_players_df['Name'].apply(name_char_replacer).apply(name_to_id)
)

In [6]:
# Exact merge: pair FM20 and Transfermarkt players on (name key, citizenship),
# keeping only the first row for each pair

transfermarkt_fm20_merged_df = (
    fm20_base_players_df
    .merge(
        transfermarkt_base_players_df,
        left_on=['Transfermarkt_Name', 'country_of_citizenship'],
        right_on=['name', 'country_of_citizenship'],
    )
    .drop_duplicates(subset=['Transfermarkt_Name', 'country_of_citizenship'])
)

In [7]:
# Anti-join: Transfermarkt base players that did not end up in the exact merge.
# The indicator column 'i' marks the origin of every row of the outer merge;
# rows tagged "left_only" exist only in transfermarkt_base_players_df.
missing = (
    transfermarkt_base_players_df
    .merge(transfermarkt_fm20_merged_df, how='outer', indicator='i')
    .query('i == "left_only"')
    [transfermarkt_base_players_df.columns]
)

In [8]:
# Most valuable unmatched players first, each with an approximate 2020 age derived from the birth date
missing = (
    missing
    .sort_values(by='market_value_in_gbp', ascending=False)
    .assign(Age_2020=lambda unmatched: unmatched['date_of_birth'].apply(approximate_age_2020))
)

# Pool of FM20 candidates for fuzzy matching; find_match removes a row from it once that row is matched
missing_fm = fm20_base_players_df[:]

def find_match(player):
    """Fuzzy-match one Transfermarkt player against the remaining FM20 pool.

    Candidates are the rows of the global ``missing_fm`` frame that have the
    same ``country_of_citizenship``, an ``Age`` equal to the player's
    ``Age_2020`` or one year less, and a ``Value`` of at least one fifth of
    the player's ``market_value_in_gbp``. Among those, the FM20 ``Name``
    closest to the player's ``name`` (difflib ratio of at least 0.5) wins;
    if several rows carry that name, the one with the highest ``Value`` is taken.

    Side effect: the matched row is removed from the global ``missing_fm``,
    so an FM20 row can be handed out only once per pool.

    Args:
        player: mapping-like row providing 'name', 'country_of_citizenship',
            'Age_2020' and 'market_value_in_gbp'.

    Returns:
        The one-element index of the matched ``missing_fm`` row, or ``None``
        when no candidate name is close enough.
    """
    global missing_fm

    country = player['country_of_citizenship']
    age = player['Age_2020']
    previous_age = age - 1
    min_value = player['market_value_in_gbp'] / 5

    candidates = missing_fm.query(
        'country_of_citizenship == @country and (Age == @previous_age or Age == @age) and Value >= @min_value'
    )

    # get_close_matches returns its matches best-first; only the best one is used.
    close_names = difflib.get_close_matches(player['name'], candidates['Name'], cutoff=0.5)
    best_name = close_names[0] if close_names else None
    if not best_name:
        return None

    best_row = (
        candidates[candidates['Name'] == best_name]
        .sort_values(by='Value', ascending=False)
        .head(1)
    )
    missing_fm = missing_fm.drop(best_row.index)
    return best_row.index


# Row-wise fuzzy matching. find_match removes every matched FM20 row from the shared pool,
# so the row order of `missing` decides which player gets first pick.
matches = missing.apply(find_match, axis='columns')


In [9]:
# Attach each fuzzy match (an FM20 row label, or None) to its Transfermarkt player.
# Only the match column decides whether a row is kept: a bare dropna() would also
# discard matched players that merely have some other empty Transfermarkt field.
tfmk_missing = (
    missing
    .merge(matches.to_frame('fm_player_index'), left_index=True, right_index=True)
    .dropna(subset=['fm_player_index'])
)

# find_match returns a one-element index; unwrap it to the plain row label.
tfmk_missing['fm_player_index'] = tfmk_missing['fm_player_index'].apply(lambda x: x.tolist()[0])

# Bring in the FM20 attributes of the matched player.
# Both frames carry country_of_citizenship (find_match only pairs rows where the two are equal),
# so the FM20 copy is dropped. Otherwise the merge would emit country_of_citizenship_x/_y and the
# rows would no longer line up with the single country_of_citizenship column of the exact merge.
tfmk_missing = tfmk_missing.merge(
    fm20_base_players_df.drop(columns='country_of_citizenship'),
    left_on='fm_player_index',
    right_index=True,
)

In [10]:
# Stack the fuzzy-matched players under the exactly-matched ones.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat gives the same
# result (original row labels kept, union of columns, no column sorting) on every pandas version.
transfermarkt_fm20_merged_df = pd.concat([transfermarkt_fm20_merged_df, tfmk_missing])

In [11]:
# Insert target values (current valuation vs. the valuation around the 2020 season)

# The current Transfermarkt valuation under an explicit name
transfermarkt_fm20_merged_df = transfermarkt_fm20_merged_df.assign(
    value_2022=transfermarkt_fm20_merged_df['market_value_in_gbp']
)

# Per player, the earliest valuation dated between 2019-07-01 and 2020-12-31
# (sorted by date ascending, then the first row per player_id is kept)
player_values_2020 = (
    transfermarkt_valuations_dataset
    .query("date >= '2019-07-01' & date <= '2020-12-31'")
    .sort_values(by='date')
    .drop_duplicates(subset='player_id')
    .assign(value_2020=lambda valuations: valuations['market_value'])
)

# Inner merge: players without a valuation in that window are left out.
# NOTE(review): value_2022 comes from `market_value_in_gbp`, value_2020 from `market_value` -
# confirm that both columns use the same currency before relying on value_diff.
players_with_values = (
    transfermarkt_fm20_merged_df
    .merge(player_values_2020, on='player_id')
    .assign(value_diff=lambda merged: merged['value_2022'] - merged['value_2020'])
)

In [12]:
# Persist the final frame for later use (the row index is written as the first, unnamed CSV column)

players_with_values.to_csv(path_or_buf=PROJECT_FOLDER + 'assets/base_players_dataset.csv')