In [1]:
import pandas as pd
import numpy as np
import os
import ftfy

In [None]:
df = pd.read_csv('player_stats.csv')
df.head()

Unnamed: 0,id,name,dateOfBirth,Age,Height,Foot,Position,OtherPosition,National,MarketValue,...,25MP,25AP,Ranking,2020AvgMV,2021AvgMV,2022AvgMV,2023AvgMV,2024AvgMV,2025AvgMV,TotalCups
0,591949,Aaron Hickey,"Jun 10, 2002",22.0,"1,85m",both,Right-Back,"['Left-Back', 'Left Midfield']",Scotland,22000000.0,...,0.0,0,432.0,1845000.0,8666667.0,17000000.0,28333330.0,25000000.0,0.0,0
1,434207,Aaron Connolly,"Jan 28, 2000",25.0,"1,74m",right,Centre-Forward,['Second Striker'],Ireland,2500000.0,...,505.0,14,3.644,5500000.0,7000000.0,5500000.0,3500000.0,2500000.0,0.0,2
2,578459,Aaron Bastiaans,"Apr 4, 2002",22.0,"1,84m",right,Left Winger,"['Right Winger', 'Centre-Forward']",Netherlands,0.0,...,0.0,0,0.0,150000.0,125000.0,175000.0,0.0,0.0,0.0,0
3,884244,AJ Marcucci,"Jul 31, 1999",25.0,"1,91m",,Goalkeeper,[],United States,250000.0,...,0.0,0,0.0,0.0,100000.0,125000.0,150000.0,216666.7,0.0,0
4,401362,AarÃ³n Herrera,"Jun 6, 1997",27.0,"1,80m",right,Right-Back,"['Left-Back', 'Right Midfield']",United States,2000000.0,...,0.0,0,4.274,966666.7,1600000.0,2000000.0,2000000.0,2000000.0,0.0,1


#### Handling missing values

In [3]:
#Check for misssing values in columns and store in df
missing = df.isnull().sum()
missing[missing > 0]

dateOfBirth          16
Age                  16
Height              196
Foot                691
Position             61
National            239
Outfitter         10330
Club_name            16
ContractExpiry       16
ContractOption    12061
dtype: int64

In [4]:
#Drop null columns
df.dropna(subset=['Height'], inplace=True)
df.dropna(subset=['Age'], inplace=True)
df.dropna(subset=['Club_name'], inplace=True)
df.dropna(subset=['Position'], inplace=True)
df.dropna(subset=['National'], inplace=True)


#Set foot as 'hand' for goalkeepers
df.loc[df['Position'] == 'Goalkeeper', 'Foot'] = 'hand'

#Find average values for foot using player position
foot_dict = df.groupby('Position')['Foot'].agg(lambda x: x.value_counts().index[0]).to_dict()
#Use the dictionary to fill in missing values
df['Foot'] = df.apply(lambda row: foot_dict[row['Position']] if pd.isnull(row['Foot']) else row['Foot'], axis=1)

#Fill in missing values for other columns
df.fillna({"Outfitter": "Unknown", "ContractOption": "None"}, inplace=True)


missing = df.isnull().sum()
missing[missing > 0]

Outfitter          9894
ContractOption    11630
dtype: int64

#### Transforming data

In [5]:
#Fix player names
df['name'] = df['name'].apply(lambda x: ftfy.fix_text(x))
#Fix club names
df['Club_name'] = df['Club_name'].apply(lambda x: ftfy.fix_text(x))

#Convert age to int
df['Age'] = df['Age'].astype(int)

#Convert height to float
df['Height'] = df['Height'].str.replace(',', '.').str.replace('m', '').astype(float)

#### Save cleaned data

In [None]:
df.to_csv('cleaned_player_stats.csv', index=False)

### Create Classification Data

In [None]:
def categorize_value(value):
    if value < 5000000:
        return 0  # Not valuable
    elif value <= 20000000:
        return 1  # Valuable
    else:
        return 2  # Highly valuable
    
    
data = pd.read_csv('Data/player_stats_cleaned.csv')
data.drop(columns=['id'], inplace=True)
#Filter with market value above 10000000
# data = data[data['MarketValue'] > 5000000]
data['ValueCategory'] = data['MarketValue'].apply(categorize_value)
data['Outfitter'] = data['Outfitter'].fillna('None')
data['ContractOption'] = data['ContractOption'].fillna('None')
data.to_csv('Data/Classification_data.csv', index=False)
