In [None]:
import ray
import ray.data
import pandas as pd

In [None]:
from hdfs import Config

# Load the HdfsCLI configuration file, then build the client registered
# under the "dev" entry.
hdfs_config = Config(path="./config/.hdfscli.cfg")
client = hdfs_config.get_client("dev")

In [None]:
# Collect the local files to upload and the HDFS directory they go to
# (taken from populate_hdfs).
import os

# os.listdir returns entries in arbitrary order; sort them so the upload
# order and the log output are the same on every run.
files_to_upload = sorted(os.listdir('data/'))

# HDFS target directory. Later cells build paths by plain string
# concatenation, so the trailing slash is required.
remote_path = "/data/"

In [None]:
# Make sure the remote directory exists before uploading into it.
client.makedirs(remote_path)

# Upload every local file that is not already present in HDFS
# (taken from populate_hdfs).
for file in files_to_upload:
    local_path = f"./data/{file}"
    print(f"Checking if {file} exists in {remote_path}...")
    # With strict=False, status() returns None for a missing path instead of
    # raising, so the result can be used directly as an existence check.
    if client.status(remote_path + file, strict=False):
        print(f"{file} exists in {remote_path}!")
        continue

    print(f"{file} does not exist in {remote_path}!")
    print(f"Uploading {file} to {remote_path}...")
    # remote_path is an existing directory (created above), so the file is
    # placed inside it under its own name.
    client.upload(remote_path, local_path)

# List remote_path itself so the listing always matches the printed label,
# even if remote_path is changed.
print(f"contents in {remote_path}: ", client.list(remote_path))

In [None]:
# Initialize Ray. ignore_reinit_error=True keeps this cell safe to re-run:
# without it, a second ray.init() in the same kernel raises an error.
# NOTE(review): dashboard_host="0.0.0.0" makes the dashboard listen on all
# network interfaces — confirm this exposure is intended (e.g. when running
# inside a container).
ray.init(dashboard_host="0.0.0.0", ignore_reinit_error=True)

In [None]:
from io import StringIO

# Pull the raw bytes of transfers.csv out of HDFS.
transfers_path = f"{remote_path}transfers.csv"
with client.read(transfers_path) as reader:
    raw_bytes = reader.read()

# Decode the bytes to text and parse the text into a Pandas DataFrame.
file_contents = raw_bytes.decode('utf-8')
df = pd.read_csv(StringIO(file_contents))

# Hand the DataFrame over to Ray as a Dataset.
dataset = ray.data.from_pandas(df)

## Pre-processing

1. Remove entries where `transfer_fee == NaN`, since these entries are usually internal transfers (or from lower league youth teams).
2. Remove entries where `market_value_in_eur == NaN`, since we assume it is hard to find any information about these players.

For now, we execute the filtering immediately. In the future, we will define all the processing steps first and then train our model on the resulting batches, (hopefully) without ever calling `take_all`.

In [None]:
import time

# Define a simple filter function
def filter_func(batch):
    """Drop rows that lack a transfer fee or a market value.

    Args:
        batch: A pandas DataFrame with `transfer_fee` and
            `market_value_in_eur` columns.

    Returns:
        A DataFrame holding only the rows of `batch` in which both columns
        are non-null. Row index labels and the column set are unchanged, and
        `batch` itself is not modified.

    Raises:
        KeyError: If either column is missing from `batch`.
    """
    # notna() and ~isna() are the same test, so one null check per column is
    # enough; dropna(subset=...) expresses exactly that.
    return batch.dropna(subset=['transfer_fee', 'market_value_in_eur'])

# Apply the filter with map_batches so Ray can process batches in parallel.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
filtered_ds = dataset.map_batches(
    filter_func,
    batch_format="pandas",
    num_cpus=1  # Reserve one CPU per map task; how many batches run at once depends on the CPUs Ray has available
)

# Materialize the results. The timer deliberately spans take_all() as well,
# since that call is what pulls the filtered rows back to the driver.
result = filtered_ds.take_all()
end_time = time.perf_counter()

print(f"Filtering time: {end_time - start_time} seconds")
print(f"Original dataset size: {dataset.count()}")
print(f"Filtered dataset size: {len(result)}")

(Potential additional steps)

3. Remove retired players
4. Drop `transfer_date` column, as we don't need it for anything (the `transfer_season` should be enough for everything time-related).
5. Drop one of `from_club_name` or `from_club_id` (and the same for `to_club_...`).

In [None]:
# From this table
# player_id, transfer_season, from_club_id, to_club_id, market_value_in_eur, transfer_fee

# Other useful tables and their attributes

# appearances.csv - minutes played, goals, assists
# (Would be hard to map to individual players playing, e.g. how do we know who was on the pitch when a goal was scored or conceded?) 
# club_games.csv - own_position, opponent_goals, opponent_position
# clubs.csv - domestic_competition_id, squad_size, average_age, foreigners_percentage, national_team_players, net_transfer_record, (maybe to filter outdated clubs) last_season
# (IMO useless) competitions.csv
# game_events.csv - player_id, type (goal, assist, card)
# (To know no. of games started) game_lineups.csv - player_id, position, type (substitute, starter)
# (IMO useless) games.csv
# (Useful for training, to know the valuation at the time of transfer, maybe 1 year prior?) player_valuations.csv - date, market_value_in_eur, current_club_id, player_id
# players.csv - last_season (filter retired players), country_of_birth, country_of_citizenship, position, sub_position, foot, height_in_cm, contract_expiration_date, agent_name, market_value_in_eur, highest_market_value_in_eur

## Variables

Independent - player information (from other tables), `market_value_in_eur`, `from_club_name`/`from_club_id`

Dependent - `to_club_id`/`to_club_name`, `transfer_fee`

Because we have multiple dependent variables, there would be two models - a regression model predicting the transfer fee and another one (most likely a classifier) predicting the club ID/name.

When using this as a service, it would be nice if `player_id` and `to_club_name` were the only necessary inputs and the rest were read from HDFS/other data storage.
Let's presume that in these scenarios, the `transfer_season` would be the current one (24/25).

Representing club names/ids the best way possible:
- initially as IDs, but that could be interpreted as ordinality by the model
- ideally as embeddings - either of the club name or combinations such as "club country + league + club name"

In [None]:
# Shutdown Ray
# ray.shutdown()