In [1]:
import ray
import ray.data

In [2]:
from hdfs import Config

client = Config(path="./config/.hdfscli.cfg").get_client(
    "dev"
)

In [3]:
# Insert files, taken from populate_hdfs
files_to_upload = ["transfers.csv","competitions.csv", "clubs.csv", "games.csv", "players.csv"]

remote_path = "/data/"

In [4]:
# make sure remote path exists
client.makedirs(remote_path)

# Insert files, taken from populate_hdfs

# Check if the file exists
for file in files_to_upload:
    local_path = f"./data/{file}"
    print(f"Checking if {file} exists in {remote_path}...")
    if client.status(remote_path + file, strict=False):
        print(f"{file} exists in {remote_path}!")
        continue

    print(f"{file} does not exist in {remote_path}!")
    print(f"Uploading {file} to {remote_path}...")
    # Upload a file to tmp, to be processed further
    client.upload(remote_path, local_path)

print(f"contents in {remote_path}: ", client.list("/data"))

Checking if transfers.csv exists in /data/...
transfers.csv exists in /data/!
Checking if competitions.csv exists in /data/...
competitions.csv exists in /data/!
Checking if clubs.csv exists in /data/...
clubs.csv exists in /data/!
Checking if games.csv exists in /data/...
games.csv exists in /data/!
Checking if players.csv exists in /data/...
players.csv exists in /data/!
contents in /data/:  ['appearances.csv', 'clubs.csv', 'competitions.csv', 'games.csv', 'players.csv', 'transfers.csv']


## Variables

Dependant - `to_club_id`

We are creating a classifier model, that would classify based on the independent variables below which club is the most likely for a future transfer.

When using as a service, it'd be nice if `player_id` and `to_club_name` were only necessary inputs and the rest read from HDFS/other data storage.
Let's presume that in these scenarios, the `transfer_season` would be the current one (24/25).

## Pre-processing

1. Remove entries where `transfer_fee == NaN`, since these entries are usually internal transfers (or from lower league youth teams).
2. Filter entries where `market_value_in_eur == Nan`, since we assume it's hard to find any info about these players

For now, we already execute/apply the filtering. But in the future, we will do all the processing first and then train our model on the batches, (hopefully) never applying `take_all`.

(Potential additional steps)

3. Remove retired players
4. Drop `transfer_date` column, as we don't need it for anything (the `transfer_season` should be enough for everything time-related).
5. Drop one of `from_club_name` or `from_club_id` (and the same for `to_club_...`).

### Joining tables

Other useful tables and their attributes:

appearances.csv - minutes played, goals, assists
(Would be hard to map to individual players playing, e.g. how do we know who was on the pitch when a goal was scored or conceded?) 

club_games.csv - own_position, opponent_goals, opponent_position

clubs.csv - domestic_competition_id, squad_size, average_age, foreigners_percentage, national_team_players, net_transfer_record, (maybe to filter outdated clubs) last_season

(IMO useless) competitions.csv
game_events.csv - player_id, type (goal, assist, card)

(To know no. of games started) game_lineups.csv - player_id, position, type (substitute, starter)

(IMO useless) games.csv

(Useful for training, to know the valuation at the time of transfer, maybe 1 year prior?) player_valuations.csv - date, market_value_in_eur, current_club_id, player_id

players.csv - last_season (filter retired players), country_of_birth, country_of_citizenship, position, sub_position, foot, height_in_cm, contract_expiration_date, agent_name, market_value_in_eur, highest_market_value_in_eur

Representing club names/ids the best way possible:
- initially as IDs, but that could be interpreted as ordinality by the model
- ideally as embeddings - either of the club name or combinations such as "club country + league + club name"

## Ray (Unused ATM)

In [None]:
# Initialize Ray
ray.init(dashboard_host="0.0.0.0")

In [None]:
# Read files
import pyarrow as pa
import pyarrow.csv as csv

# Helper function to read CSV files from HDFS in chunks
def read_csv_from_hdfs(client, file_path):
    with client.read(file_path) as reader:
        file_contents = reader.read()
    
    # Use pyarrow to read the CSV data from memory
    table = csv.read_csv(pa.py_buffer(file_contents))
    
    # Convert the pyarrow Table to a Ray Dataset
    return ray.data.from_arrow(table)

transfers_ds = read_csv_from_hdfs(client, "/data/transfers.csv")
clubs_ds = read_csv_from_hdfs(client, "/data/clubs.csv")
competitions_ds = read_csv_from_hdfs(client, "/data/competitions.csv")
players_ds = read_csv_from_hdfs(client, "/data/players.csv")

In [None]:
# Filter out rows with null values in 'transfer_fee' and 'market_value_in_eur'
def filter_transfers(batch):
    return batch[batch['transfer_fee'].notna() & batch['market_value_in_eur'].notna()]

transfers_ds = transfers_ds.map_batches(filter_transfers, batch_format="pandas")

In [None]:
def join_transfers_clubs(transfers_batch):
    transfers_df = pd.DataFrame(transfers_batch)
    clubs_df = clubs_ds.select_columns(['club_id', 'domestic_competition_id']).to_pandas()
    
    merged = transfers_df.merge(clubs_df, left_on='from_club_id', right_on='club_id', how='left', suffixes=('', '_from'))
    merged = merged.rename(columns={'domestic_competition_id': 'from_competition_id'})
    
    merged = merged.merge(clubs_df, left_on='to_club_id', right_on='club_id', how='left', suffixes=('', '_to'))
    merged = merged.rename(columns={'domestic_competition_id': 'to_competition_id'})
    
    return merged.drop(columns=['club_id', 'club_id_to', 'transfer_date'])

transfers_ds = transfers_ds.map_batches(join_transfers_clubs, batch_format="pandas")

# Filter out null competition IDs
transfers_ds = transfers_ds.filter(lambda row: row['from_competition_id'] is not None and row['to_competition_id'] is not None)

def join_transfers_competitions(transfers_batch):
    transfers_df = pd.DataFrame(transfers_batch)
    competitions_df = competitions_ds.select_columns(['competition_id', 'country_name', 'sub_type']).to_pandas()
    
    merged = transfers_df.merge(competitions_df, left_on='from_competition_id', right_on='competition_id', how='left', suffixes=('', '_from'))
    merged = merged.rename(columns={'country_name': 'from_country_name', 'sub_type': 'from_sub_type'})
    
    merged = merged.merge(competitions_df, left_on='to_competition_id', right_on='competition_id', how='left', suffixes=('', '_to'))
    merged = merged.rename(columns={'country_name': 'to_country_name', 'sub_type': 'to_sub_type'})
    
    return merged.drop(columns=['competition_id', 'competition_id_to'])

transfers_ds = transfers_ds.map_batches(join_transfers_competitions, batch_format="pandas")


# Join players_ds to transfers_ds
def join_transfers_players(transfers_batch):
    transfers_df = pd.DataFrame(transfers_batch)
    players_df = players_ds.select_columns(['player_id', 'last_season', 'country_of_citizenship', 'position', 'sub_position', 'contract_expiration_date', 'highest_market_value_in_eur']).to_pandas()
    
    merged = transfers_df.merge(players_df, on='player_id', how='left')
    return merged[merged['last_season'] > 2023]  # Filter 'retired' players

transfers_ds = transfers_ds.map_batches(join_transfers_players, batch_format="pandas")

In [None]:
def prepare_transfers(batch):
    df = batch.copy()
    
    # Replace transfer_season with transfer_season_num
    df['transfer_season_end_year'] = df['transfer_season'].apply(lambda x: int(x.split('/')[0]) + 1)

    # Replace countries with IDs
    country_columns = ['from_country_name', 'to_country_name', 'country_of_citizenship']
    all_countries = set()
    for col in country_columns:
        all_countries.update(df[col].dropna().unique())
    country_id_mapping = {country: idx for idx, country in enumerate(sorted(all_countries))}

    for col in country_columns:
        df[f'{col}_id'] = df[col].map(country_id_mapping)
        df = df.drop(columns=[col])

    # Replace position with IDs
    all_positions = df['position'].dropna().unique()
    position_mapping = {position: idx for idx, position in enumerate(sorted(all_positions))}
    df['position_id'] = df['position'].map(position_mapping)
    df = df.drop(columns=['position'])

    # Replace sub_position with IDs
    all_sub_positions = df['sub_position'].dropna().unique()
    sub_position_mapping = {sub_position: idx for idx, sub_position in enumerate(sorted(all_sub_positions))}
    df['sub_position_id'] = df['sub_position'].map(sub_position_mapping)
    df = df.drop(columns=['sub_position'])

    # Convert contract_expiration_date
    df['contract_expiration_date'] = pd.to_datetime(df['contract_expiration_date'], errors='coerce')
    df['contract_expiration_date'] = df['contract_expiration_date'].dt.year

    return df

prepared_transfers_ds = transfers_ds.map_batches(prepare_transfers, batch_format="pandas")

In [None]:
train_features = ['player_id', 'from_club_id', 'market_value_in_eur', 
                  'transfer_season_end_year', 'from_country_name_id', 
                  'country_of_citizenship_id', 'position_id', 'sub_position_id', 
                  'contract_expiration_date', 'highest_market_value_in_eur']

X = prepared_transfers_ds.select_columns(train_features)
y = prepared_transfers_ds.select_columns(['to_club_id'])

In [None]:
import ray
from ray import tune
from ray.train import ScalingConfig
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define a function to prepare data and train the model
def train_func(config):
    # Get the Ray datasets
    X = ray.get(config["X"])
    y = ray.get(config["y"])
    
    # Convert to pandas (this will happen in parallel across workers)
    X_pd = X.to_pandas()
    y_pd = y.to_pandas()
    
    # Train the model
    clf = RandomForestClassifier(n_estimators=config["n_estimators"], random_state=42)
    clf.fit(X_pd, y_pd.values.ravel())
    
    # Calculate accuracy
    y_pred = clf.predict(X_pd)
    accuracy = accuracy_score(y_pd, y_pred)
    
    # Report results
    tune.report(accuracy=accuracy, model=clf)

# Define the search space
config = {
    "n_estimators": tune.choice([50, 100, 200]),
    "X": ray.put(X),  # Your Ray Dataset for features
    "y": ray.put(y)   # Your Ray Dataset for labels
}

# Create the tuner
tuner = tune.Tuner(
    train_func,
    param_space=config,
    tune_config=tune.TuneConfig(num_samples=1),  # Increase for hyperparameter tuning
    run_config=ray.train.RunConfig()
)

# Run the tuning
results = tuner.fit()

# Get the best result
best_result = results.get_best_result(metric="accuracy", mode="max")
print(best_result)
best_model = best_result.checkpoint.to_dict()["model"]

# Now you can use best_model for predictions

In [None]:
best_model

In [None]:
# Display the first few rows
print(transfers_ds.take(5))

# If you need the final result as a Pandas DataFrame:
# final_df = transfers_ds.to_pandas()

## Pandas

In [17]:
import modin.pandas as pd

# Initialize Ray
ray.init(dashboard_host="0.0.0.0")

RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

In [22]:
import os
import tempfile

def read_csv_with_modin(client, hdfs_path):
    with client.read(hdfs_path) as reader:
        file_contents = reader.read()
    
    # Create a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as temp_file:
        temp_file.write(file_contents)
        temp_file_path = temp_file.name
    
    # Read the CSV file using Modin
    df = pd.read_csv(temp_file_path)
    
    # Delete the temporary file
    os.unlink(temp_file_path)
    
    return df

# Load the data
transfers_df = read_csv_with_modin(client, "/data/transfers.csv")
clubs_df = read_csv_with_modin(client, "/data/clubs.csv")
competitions_df = read_csv_with_modin(client, "/data/competitions.csv")
players_df = read_csv_with_modin(client, "/data/players.csv")

In [23]:
# Filter transfers
# 1. Remove entries where `transfer_fee == NaN`, since these entries are usually internal transfers (or from lower league youth teams).
# 2. Filter entries where `market_value_in_eur == Nan`, since we assume it's hard to find any info about these players

transfers_df = transfers_df[transfers_df['transfer_fee'].notna()]
transfers_df = transfers_df[transfers_df['market_value_in_eur'].notna()]

In [24]:
# Join tables
transfers_df = transfers_df.merge(clubs_df[['club_id', 'domestic_competition_id']], left_on='from_club_id', right_on='club_id', how='left', validate='m:m')
transfers_df = transfers_df.rename(columns={'domestic_competition_id': 'from_competition_id'})

transfers_df = transfers_df.merge(clubs_df[['club_id', 'domestic_competition_id']], left_on='to_club_id', right_on='club_id', how='left', validate='m:m')
transfers_df = transfers_df.rename(columns={'domestic_competition_id': 'to_competition_id'})

transfers_df = transfers_df.drop(columns=['club_id_x', 'club_id_y', 'transfer_date'])
transfers_df = transfers_df.dropna(subset=['from_competition_id', 'to_competition_id'])

transfers_df = transfers_df.merge(competitions_df[['competition_id', 'country_name', 'sub_type']], left_on='from_competition_id', right_on='competition_id', how='left', validate='m:m')
transfers_df = transfers_df.rename(columns={'country_name': 'from_country_name', 'sub_type': 'from_sub_type'})

transfers_df = transfers_df.merge(competitions_df[['competition_id','country_name', 'sub_type']], left_on='to_competition_id', right_on='competition_id', how='left', validate='m:m')
transfers_df = transfers_df.rename(columns={'country_name': 'to_country_name', 'sub_type': 'to_sub_type'})

transfers_df = transfers_df.drop(columns=['competition_id_x', 'competition_id_y'])

In [25]:
# Join players_df to transfers_df (columns last_season, country_of_birth, position, sub_position, contract_expiration_date, highest_market_value_in_eur)
# using the player_id
transfers_df = transfers_df.merge(players_df[['player_id', 'last_season', 'country_of_citizenship', 'position', 'sub_position', 'contract_expiration_date', 'highest_market_value_in_eur']], on='player_id', how='left', validate='m:m')

# Filter 'retired' players 
transfers_df = transfers_df[transfers_df['last_season'] > 2023]

transfers_df

Unnamed: 0,player_id,transfer_season,from_club_id,to_club_id,from_club_name,to_club_name,transfer_fee,market_value_in_eur,player_name,from_competition_id,...,from_country_name,from_sub_type,to_country_name,to_sub_type,last_season,country_of_citizenship,position,sub_position,contract_expiration_date,highest_market_value_in_eur
0,195778,25/26,79,27,VfB Stuttgart,Bayern Munich,0.0,12000000.0,Alexander Nübel,L1,...,Germany,first_tier,Germany,first_tier,2024,Germany,Goalkeeper,Goalkeeper,2026-06-30 00:00:00,16000000.0
1,569033,25/26,39,27,1.FSV Mainz 05,Bayern Munich,0.0,4000000.0,Armindo Sieb,L1,...,Germany,first_tier,Germany,first_tier,2024,Germany,Attack,Second Striker,2026-06-30 00:00:00,4000000.0
2,626913,25/26,398,380,Lazio,Salernitana,0.0,10000000.0,Boulaye Dia,IT1,...,Italy,first_tier,Italy,first_tier,2024,Senegal,Attack,Centre-Forward,2026-06-30 00:00:00,25000000.0
3,278343,25/26,167,114,FC Augsburg,Besiktas,5000000.0,7000000.0,Felix Uduokhai,L1,...,Germany,first_tier,Turkey,first_tier,2024,Germany,Defender,Centre-Back,2025-06-30 00:00:00,16000000.0
4,301238,25/26,2919,506,Monza,Juventus,14300000.0,18000000.0,Michele Di Gregorio,IT1,...,Italy,first_tier,Italy,first_tier,2024,Italy,Goalkeeper,Goalkeeper,2029-06-30 00:00:00,18000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18949,33829,05/06,1085,294,Vitória Setúbal,Benfica,0.0,500000.0,José Fonte,PO1,...,Portugal,first_tier,Portugal,first_tier,2024,Portugal,Defender,Centre-Back,2025-06-30 00:00:00,7000000.0
18951,3333,05/06,762,405,Newcastle,Aston Villa,0.0,5500000.0,James Milner,GB1,...,England,first_tier,England,first_tier,2024,England,Midfield,Central Midfield,2025-06-30 00:00:00,21000000.0
18953,7825,05/06,1050,31,Villarreal,Liverpool,9800000.0,4500000.0,Pepe Reina,ES1,...,Spain,first_tier,England,first_tier,2024,Spain,Goalkeeper,Goalkeeper,2025-06-30 00:00:00,22000000.0
18956,15452,04/05,3709,1049,Getafe,Valencia,0.0,800000.0,Raúl Albiol,ES1,...,Spain,first_tier,Spain,first_tier,2024,Spain,Defender,Centre-Back,2025-06-30 00:00:00,18000000.0


### Preparing data

In [26]:
import hashlib
import numpy as np

def create_embedding(item, size=10):
    # Create a hash of the item
    hash_object = hashlib.md5(str(item).encode())
    hash_hex = hash_object.hexdigest()
    
    # Convert the hash to a list of floats
    return [int(hash_hex[i:i+2], 16) / 255.0 for i in range(0, size*2, 2)]

def apply_embeddings(df, column, size=10):
    embedding_cols = [f'{column}_emb_{i}' for i in range(size)]
    
    # Create a dictionary of embeddings
    unique_values = df[column].unique()
    embeddings = {str(val): create_embedding(val, size) for val in unique_values}
    
    # Apply embeddings
    for i in range(size):
        df[f'{column}_emb_{i}'] = df[column].astype(str).map(lambda x: embeddings[x][i])
    
    return df.drop(columns=[column])

prepared_transfers_df = transfers_df.copy()

# Replace transfer_season with transfer_season_end_year
prepared_transfers_df['transfer_season_end_year'] = prepared_transfers_df['transfer_season'].apply(lambda x: int(x.split('/')[0]) + 1)

# Apply embeddings to country columns
country_columns = ['from_country_name', 'to_country_name', 'country_of_citizenship']
for col in country_columns:
    prepared_transfers_df = apply_embeddings(prepared_transfers_df, col)

# Apply embeddings to position and sub_position
prepared_transfers_df = apply_embeddings(prepared_transfers_df, 'position')
prepared_transfers_df = apply_embeddings(prepared_transfers_df, 'sub_position')

# Convert contract_expiration_date
prepared_transfers_df['contract_expiration_date'] = pd.to_datetime(prepared_transfers_df['contract_expiration_date'])
prepared_transfers_df['contract_expiration_date'] = prepared_transfers_df['contract_expiration_date'].dt.year

# Print the first few rows to verify the changes
print(prepared_transfers_df.head())

# Print the column names to verify the new embedding columns
print(prepared_transfers_df.columns)

   player_id transfer_season  from_club_id  to_club_id  from_club_name  \
0     195778           25/26            79          27   VfB Stuttgart   
1     569033           25/26            39          27  1.FSV Mainz 05   
2     626913           25/26           398         380           Lazio   
3     278343           25/26           167         114     FC Augsburg   
4     301238           25/26          2919         506           Monza   

    to_club_name  transfer_fee  market_value_in_eur          player_name  \
0  Bayern Munich           0.0           12000000.0      Alexander Nübel   
1  Bayern Munich           0.0            4000000.0         Armindo Sieb   
2    Salernitana           0.0           10000000.0          Boulaye Dia   
3       Besiktas     5000000.0            7000000.0       Felix Uduokhai   
4       Juventus    14300000.0           18000000.0  Michele Di Gregorio   

  from_competition_id  ... sub_position_emb_0  sub_position_emb_1  \
0                  L1  ...   

In [27]:
# Update train_features
train_features = ['player_id', 'from_club_id', 'market_value_in_eur', 
                  'transfer_season_end_year',
                  'contract_expiration_date', 
                  'highest_market_value_in_eur']

# Add embedding columns
for col in ['from_country_name', 'to_country_name', 'country_of_citizenship', 'position', 'sub_position']:
    train_features.extend([f'{col}_emb_{i}' for i in range(10)])

X = prepared_transfers_df[train_features]
y = prepared_transfers_df['to_club_id']

In [28]:
X

Unnamed: 0,player_id,from_club_id,market_value_in_eur,transfer_season_end_year,contract_expiration_date,highest_market_value_in_eur,from_country_name_emb_0,from_country_name_emb_1,from_country_name_emb_2,from_country_name_emb_3,...,sub_position_emb_0,sub_position_emb_1,sub_position_emb_2,sub_position_emb_3,sub_position_emb_4,sub_position_emb_5,sub_position_emb_6,sub_position_emb_7,sub_position_emb_8,sub_position_emb_9
0,195778,79,12000000.0,26,2026.0,16000000.0,0.847059,0.690196,0.035294,0.160784,...,0.011765,0.294118,0.647059,0.443137,0.007843,0.572549,0.635294,0.650980,0.600000,0.380392
1,569033,39,4000000.0,26,2026.0,4000000.0,0.847059,0.690196,0.035294,0.160784,...,0.278431,0.556863,0.141176,0.047059,0.313725,0.929412,0.682353,0.486275,0.858824,0.858824
2,626913,398,10000000.0,26,2026.0,25000000.0,0.062745,0.027451,0.882353,0.717647,...,0.262745,0.250980,0.101961,0.600000,0.066667,0.270588,0.172549,1.000000,0.478431,0.278431
3,278343,167,7000000.0,26,2025.0,16000000.0,0.847059,0.690196,0.035294,0.160784,...,0.462745,0.133333,0.278431,0.078431,0.925490,0.874510,0.717647,0.243137,0.478431,0.819608
4,301238,2919,18000000.0,26,2029.0,18000000.0,0.062745,0.027451,0.882353,0.717647,...,0.011765,0.294118,0.647059,0.443137,0.007843,0.572549,0.635294,0.650980,0.600000,0.380392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18949,33829,1085,500000.0,6,2025.0,7000000.0,0.917647,0.443137,0.701961,0.384314,...,0.462745,0.133333,0.278431,0.078431,0.925490,0.874510,0.717647,0.243137,0.478431,0.819608
18951,3333,762,5500000.0,6,2025.0,21000000.0,0.392157,0.964706,0.027451,0.564706,...,0.643137,0.764706,0.941176,0.192157,0.164706,0.007843,0.639216,0.666667,0.592157,0.368627
18953,7825,1050,4500000.0,6,2025.0,22000000.0,0.564706,0.494118,0.729412,0.196078,...,0.011765,0.294118,0.647059,0.443137,0.007843,0.572549,0.635294,0.650980,0.600000,0.380392
18956,15452,3709,800000.0,5,2025.0,18000000.0,0.564706,0.494118,0.729412,0.196078,...,0.462745,0.133333,0.278431,0.078431,0.925490,0.874510,0.717647,0.243137,0.478431,0.819608


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

In [30]:
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))

Accuracy: 0.2504604051565378


In [31]:
y_pred

array([  38, 1184,  533, ...,  724,  131, 1005])

In [32]:
y_pred_df = pd.DataFrame({'player_id': X_test['player_id'], 'predicted_club_id': y_pred})

# Join y_pred_df with players_df on player_id
y_pred_df = y_pred_df.merge(players_df[['player_id', 'name']], on='player_id', how='left')

# Join y_pred_df with clubs_df on predicted_club_id
y_pred_df = y_pred_df.merge(clubs_df[['club_id', 'name']], left_on='predicted_club_id', right_on='club_id', how='left')

# Rename columns for clarity
y_pred_df = y_pred_df.rename(columns={'name_x': 'player_name', 'name_y': 'predicted_club_name'})

# Print y_pred_df
print(y_pred_df[['player_id', 'player_name', 'predicted_club_id', 'predicted_club_name']].head(10))

   player_id           player_name  predicted_club_id  \
0     295630  Dimitrios Giannoulis                 38   
1     272855          Paul Onuachu               1184   
2     361260            Ozan Kabak                533   
3     226261       César de la Hoz               1531   
4     479638         Tiago Gouveia               2995   
5     505674        Sergio Camello               1108   
6     441170       Valentin Rosier                336   
7     353948         Lucas Tousart                 44   
8     177476            Nathan Aké                631   
9     423752           Robbie Deas                 43   

                              predicted_club_name  
0                              Fortuna Düsseldorf  
1                    Koninklijke Racing Club Genk  
2  TSG 1899 Hoffenheim Fußball-Spielbetriebs GmbH  
3                                        Elche CF  
4                            FC Paços de Ferreira  
5                         Deportivo Alavés S.A.D.  
6       

In [33]:
import pandas as pd

def predict_transfer_probability(player_data, target_club_id, model, feature_names):
    # Ensure player_data has all necessary features
    for feature in feature_names:
        if feature not in player_data.index:
            raise ValueError(f"Missing feature: {feature}")
    
    # Create a DataFrame with a single row
    input_df = pd.DataFrame([player_data[feature_names].values], columns=feature_names)
    
    # Get probabilities for all classes
    probabilities = model.predict_proba(input_df)[0]
    
    # Find the index of the target club ID in the classes
    target_index = np.where(model.classes_ == target_club_id)[0]

    # Return the probability for the target club
    if len(target_index) > 0:
        return probabilities[target_index[0]]
    else:
        return 0.0  # Return 0 if the club ID is not in the training data

In [34]:
# Select the example player
example_player = X[X['player_id'] == 195778].iloc[0]

target_club_ids = [27, 31, 40, 984, 418, 114, 11, 506, 148]  # Example club IDs

for club_id in target_club_ids:
    probability = predict_transfer_probability(example_player, club_id, clf, X.columns)
    club_name = clubs_df[clubs_df['club_id'] == club_id]['name'].values[0]
    print(f"Probability of transfer to {club_name} (ID: {club_id}): {probability:.2%}")

Probability of transfer to FC Bayern München (ID: 27): 5.00%
Probability of transfer to Liverpool Football Club (ID: 31): 0.00%
Probability of transfer to FC Girondins Bordeaux (ID: 40): 0.00%
Probability of transfer to West Bromwich Albion (ID: 984): 0.00%
Probability of transfer to Real Madrid Club de Fútbol (ID: 418): 0.00%
Probability of transfer to Beşiktaş Jimnastik Kulübü (ID: 114): 0.00%
Probability of transfer to Arsenal Football Club (ID: 11): 0.00%
Probability of transfer to Juventus Football Club (ID: 506): 0.00%
Probability of transfer to Tottenham Hotspur Football Club (ID: 148): 0.00%


In [33]:
mbappe = X[X['player_id'] == 342229].iloc[0]

for club_id in target_club_ids:
    probability = predict_transfer_probability(mbappe, club_id, clf, X.columns)
    club_name = clubs_df[clubs_df['club_id'] == club_id]['name'].values[0]
    print(f"Probability of transfer to {club_name} (ID: {club_id}): {probability:.2%}")

Probability of transfer to FC Bayern München (ID: 27): 0.00%
Probability of transfer to Liverpool Football Club (ID: 31): 0.00%
Probability of transfer to FC Girondins Bordeaux (ID: 40): 0.00%
Probability of transfer to West Bromwich Albion (ID: 984): 0.00%
Probability of transfer to Real Madrid Club de Fútbol (ID: 418): 76.00%
Probability of transfer to Beşiktaş Jimnastik Kulübü (ID: 114): 0.00%
Probability of transfer to Arsenal Football Club (ID: 11): 0.00%
Probability of transfer to Juventus Football Club (ID: 506): 0.00%
Probability of transfer to Tottenham Hotspur Football Club (ID: 148): 0.00%


In [34]:
sterling = X[X['player_id'] == 134425].iloc[0]

for club_id in target_club_ids:
    probability = predict_transfer_probability(sterling, club_id, clf, X.columns)
    club_name = clubs_df[clubs_df['club_id'] == club_id]['name'].values[0]
    print(f"Probability of transfer to {club_name} (ID: {club_id}): {probability:.2%}")

Probability of transfer to FC Bayern München (ID: 27): 0.00%
Probability of transfer to Liverpool Football Club (ID: 31): 2.00%
Probability of transfer to FC Girondins Bordeaux (ID: 40): 0.00%
Probability of transfer to West Bromwich Albion (ID: 984): 0.00%
Probability of transfer to Real Madrid Club de Fútbol (ID: 418): 0.00%
Probability of transfer to Beşiktaş Jimnastik Kulübü (ID: 114): 0.00%
Probability of transfer to Arsenal Football Club (ID: 11): 3.00%
Probability of transfer to Juventus Football Club (ID: 506): 0.00%
Probability of transfer to Tottenham Hotspur Football Club (ID: 148): 7.00%


In [35]:
udokhai = X[X['player_id'] == 278343].iloc[0]

for club_id in target_club_ids:
    probability = predict_transfer_probability(udokhai, club_id, clf, X.columns)
    club_name = clubs_df[clubs_df['club_id'] == club_id]['name'].values[0]
    print(f"Probability of transfer to {club_name} (ID: {club_id}): {probability:.2%}")

Probability of transfer to FC Bayern München (ID: 27): 0.00%
Probability of transfer to Liverpool Football Club (ID: 31): 0.00%
Probability of transfer to FC Girondins Bordeaux (ID: 40): 0.00%
Probability of transfer to West Bromwich Albion (ID: 984): 0.00%
Probability of transfer to Real Madrid Club de Fútbol (ID: 418): 0.00%
Probability of transfer to Beşiktaş Jimnastik Kulübü (ID: 114): 67.00%
Probability of transfer to Arsenal Football Club (ID: 11): 0.00%
Probability of transfer to Juventus Football Club (ID: 506): 0.00%
Probability of transfer to Tottenham Hotspur Football Club (ID: 148): 0.00%


In [36]:
di_gregorio = X[X['player_id'] == 301238].iloc[0]

for club_id in target_club_ids:
    probability = predict_transfer_probability(di_gregorio, club_id, clf, X.columns)
    club_name = clubs_df[clubs_df['club_id'] == club_id]['name'].values[0]
    print(f"Probability of transfer to {club_name} (ID: {club_id}): {probability:.2%}")


Probability of transfer to FC Bayern München (ID: 27): 0.00%
Probability of transfer to Liverpool Football Club (ID: 31): 0.00%
Probability of transfer to FC Girondins Bordeaux (ID: 40): 0.00%
Probability of transfer to West Bromwich Albion (ID: 984): 0.00%
Probability of transfer to Real Madrid Club de Fútbol (ID: 418): 0.00%
Probability of transfer to Beşiktaş Jimnastik Kulübü (ID: 114): 0.00%
Probability of transfer to Arsenal Football Club (ID: 11): 0.00%
Probability of transfer to Juventus Football Club (ID: 506): 62.00%
Probability of transfer to Tottenham Hotspur Football Club (ID: 148): 0.00%


In [38]:
gonzalez = X[X['player_id'] == 486031].iloc[0]

for club_id in target_club_ids:
    probability = predict_transfer_probability(gonzalez, club_id, clf, X.columns)
    club_name = clubs_df[clubs_df['club_id'] == club_id]['name'].values[0]
    print(f"Probability of transfer to {club_name} (ID: {club_id}): {probability:.2%}")


NameError: name 'vuskovic' is not defined

In [35]:
import joblib

joblib.dump(clf, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [None]:
# Shutdown Ray
# ray.shutdown()