In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# One raw CSV per season: players_15.csv through players_20.csv.
player_file_list = [f'players_{season}.csv' for season in range(15, 21)]
# To process a single file while debugging, narrow the list instead, e.g.:
#player_file_list = ['players_19.csv']

# Raw CSVs are read from the first directory; cleaned CSVs go to the second.
input_directory, output_directory = 'data/', 'cleaned_data/'

In [3]:
# Module-level registry of feature column names shared by the cleaning helpers.
# cleandata_classes() adds the binary columns it creates and removes the raw
# categorical names; the per-file loop rebinds this to a fresh set for each
# input file, so this empty set only acts as a placeholder until then.
target_cols = set()

In [4]:
def cleandata_number_values(df, columns=None):
    """Strip the "+N" / "-N" suffix from text-encoded rating columns.

    Every string cell in the selected columns that starts with digits
    (e.g. ``'68+2'``) is replaced by those leading digits as an ``int``.
    Cells that are not strings (numbers, NaN) and strings that do not start
    with a digit are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Player table. It is not modified; a cleaned copy is returned.
    columns : iterable of str, optional
        Columns to clean. Defaults to the built-in list of attribute and
        per-position rating columns defined below.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with the selected columns cleaned. A column that
        ends up holding only integers gets an integer dtype instead of
        staying ``object``.

    Raises
    ------
    KeyError
        If one of the selected columns is missing from ``df``.
    """
    df = df.copy(deep=True)

    if columns is None:
        columns = [
            'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing',
            'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing',
            'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
            'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina',
            'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions',
            'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure',
            'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving',
            'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes',
            'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
            'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
        ]

    # fix the +/- within number cols: keep only the leading run of digits
    leading_digits = re.compile(r'^[\d]+')

    def _leading_int(value):
        # Only genuine strings are parsed; numbers and NaN pass through as-is.
        if type(value) is str:
            match = leading_digits.match(value)
            if match:
                return int(match[0])
        return value

    for column in columns:
        series = df[column]
        # A numeric column cannot contain strings, so there is nothing to fix.
        if pd.api.types.is_numeric_dtype(series):
            continue
        # Map once per column instead of writing cell by cell through
        # iterrows(); going through object dtype lets pandas re-infer a
        # numeric dtype when every value turned into a number.
        df[column] = series.astype(object).map(_leading_int)

    return df

In [5]:
def cleandata_classes(df, feature_cols=None):
    """Expand the categorical player columns into binary indicator columns.

    Three encodings are added to a copy of ``df``:

    * ``team_position_<pos>``: 1 where ``team_position`` equals that value.
    * ``player_positions_<pos>``: 1 where the comma-separated
      ``player_positions`` cell lists exactly that position.
    * ``preferred_foot_bin``: 1 where ``preferred_foot`` is ``'Left'``.

    The names of the new columns are added to the feature-column registry and
    the three raw categorical column names are removed from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Player table containing ``team_position``, ``player_positions`` and
        ``preferred_foot``. It is not modified.
    feature_cols : set of str, optional
        Registry of feature column names to update in place. Defaults to the
        module-level ``target_cols`` (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with the indicator columns appended.
    """
    df = df.copy(deep=True)

    if feature_cols is None:
        # Resolve the global at call time so a rebinding of target_cols made
        # between calls (the per-file loop does this) is picked up.
        feature_cols = target_cols

    # make team_position binary (missing values get no column of their own)
    team_pos_col = 'team_position'
    for position in df[team_pos_col].dropna().unique():
        print(f'Processing team position: {position}...')
        col_name = team_pos_col + '_' + position.lower()
        df[col_name] = 1 * (df[team_pos_col] == position)

        feature_cols.add(col_name)
        print(f'Added binary column: {col_name}')

    feature_cols.discard(team_pos_col)
    print('Removed team_position.')

    # get player_positions values: split every cell once into its tokens
    player_pos_col = 'player_positions'
    position_tokens = [
        cell.split(', ') if isinstance(cell, str) else []
        for cell in df[player_pos_col]
    ]
    player_pos_values = set()
    for tokens in position_tokens:
        player_pos_values.update(tokens)

    # make player_positions binary; sorted so the column order is reproducible
    for player_pos_value in sorted(player_pos_values):
        print(f'Processing player position: {player_pos_value}...')
        col_name = player_pos_col + '_' + player_pos_value.lower()
        # Compare whole tokens, not substrings: a substring test would mark
        # 'LW' as present in 'LWB, LB' (and 'RW' in 'RWB').
        df[col_name] = [int(player_pos_value in tokens) for tokens in position_tokens]

        feature_cols.add(col_name)
        print(f'Added column: {col_name}')

    feature_cols.discard(player_pos_col)
    print('Removed player_positions.')

    # set preferred foot
    preferred_foot_col = 'preferred_foot'
    preferred_foot_col_new = 'preferred_foot_bin'
    df[preferred_foot_col_new] = 1 * (df[preferred_foot_col] == 'Left')
    feature_cols.add(preferred_foot_col_new)
    feature_cols.discard(preferred_foot_col)
    print('Removed preferred_foot.')

    return df


In [6]:
def cleandata_check_na(df, columns=None):
    """Drop rows without position/foot data and report columns that hold NaN.

    Rows whose ``team_position``, ``player_positions`` or ``preferred_foot``
    is missing are removed. For every remaining feature column that still
    contains at least one missing value, one line is printed; those values
    are only reported here, not changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Player table. It is not modified.
    columns : iterable of str, optional
        Columns to inspect for missing values. Defaults to the module-level
        ``target_cols`` (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        New frame without the dropped rows; the original index labels of the
        surviving rows are kept.
    """
    if columns is None:
        columns = target_cols

    # drop empty rows for positions; dropna() returns a new frame, so the
    # caller's frame stays untouched without an extra deep copy
    df = df.dropna(subset=['team_position', 'player_positions', 'preferred_foot'])

    # check for other NaN values. NaN never compares equal to anything (not
    # even itself), so an equality test cannot detect it; use isna() instead.
    for target_col in columns:
        if df[target_col].isna().any():
            print(f'Column {target_col} has NaN in it.')

    return df

In [7]:
def cleandata_align_goalkeeper_data(df, columns=None):
    """Replace missing or empty-string cells in the feature columns with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Player table. It is not modified.
    columns : iterable of str, optional
        Columns to fill. Defaults to the module-level ``target_cols``
        (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` in which every NaN/None/``''`` cell of the
        selected columns is 0. Other columns are left as they are.
    """
    df = df.copy(deep=True)

    if columns is None:
        columns = target_cols

    # fill in the gaps for goalkeepers & inverse as zeros
    for col in columns:
        series = df[col]
        # Positional boolean mask: unlike a label-based write per row, this
        # stays correct when index labels repeat.
        gaps = series.isna().to_numpy()
        if not pd.api.types.is_numeric_dtype(series):
            # Only non-numeric columns can contain the empty string.
            gaps = gaps | series.eq('').fillna(False).to_numpy(dtype=bool)
        if gaps.any():
            df.loc[gaps, col] = 0

    return df

In [8]:
def clean_file_data(df):
    """Run the full cleaning pipeline on one player table.

    The stages run in a fixed order, each announced with a progress line and
    each receiving the previous stage's result: NaN check, "+/-" number fix,
    class-column expansion, then gap filling.

    Returns the frame produced by the last stage.
    """
    pipeline = (
        # check/fix NaN
        ('Checking & fixing NaN values...', cleandata_check_na),
        # fix the +/- within number cols
        ('Fixing row +/- values...', cleandata_number_values),
        # split position columns
        ('Fixing class columns (e.g. positions)...', cleandata_classes),
        # fill any gaps
        ('Filling any remaining holes, such as with goalkeepers...', cleandata_align_goalkeeper_data),
    )

    cleaned = df
    for announcement, stage in pipeline:
        print(announcement)
        cleaned = stage(cleaned)

    return cleaned

In [9]:
# Feature columns every input file starts with, before the class columns are
# expanded into binary indicators.
# NOTE(review): 'mentality_penalties' is cleaned by cleandata_number_values()
# but is not listed here -- confirm whether the omission is intentional.
base_target_cols = frozenset({
    'age', 'height_cm', 'weight_kg', 'value_eur', 'wage_eur', 'player_positions', 'preferred_foot', 'international_reputation',
    'weak_foot', 'skill_moves', 'release_clause_eur', 'team_position', 'pace',
    'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning',
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys',
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration',
    'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping',
    'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_composure', 'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle',
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes',
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm',
    'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
})

for file_name in player_file_list:

    # The cleaning helpers read and mutate the module-level target_cols, so
    # each file must start from a fresh, mutable copy of the base set.
    target_cols = set(base_target_cols)

    input_path = input_directory + file_name
    df = pd.read_csv(input_path)

    df = clean_file_data(df)

    # Sets have no stable iteration order; sort so the report and the column
    # manifest come out identical from run to run.
    ordered_target_cols = sorted(target_cols)

    print('\nColumn values & types after cleaning:')
    for target_col in ordered_target_cols:
        print(f'   {target_col}: {df[target_col].dtypes}')

    output_path = output_directory + file_name
    # NOTE(review): the row index is written as an extra unnamed first column;
    # pass index=False here if no downstream reader relies on it.
    df.to_csv(output_path)
    print('\nCleaned file written.')

    # NOTE(review): this manifest is rewritten for every file, so after the
    # loop it describes the columns of the last file processed only.
    with open(output_directory + 'player_data_cols.csv', 'w') as outfile:
        outfile.writelines(f'{target_col}\n' for target_col in ordered_target_cols)
    print('\nWrote target player columns file.')

Checking & fixing NaN values...
Fixing row +/- values...
Fixing class columns (e.g. positions)...
Processing team position: CF...
Added binary column: team_position_cf
Processing team position: LW...
Added binary column: team_position_lw
Processing team position: SUB...
Added binary column: team_position_sub
Processing team position: ST...
Added binary column: team_position_st
Processing team position: GK...
Added binary column: team_position_gk
Processing team position: RES...
Added binary column: team_position_res
Processing team position: LM...
Added binary column: team_position_lm
Processing team position: RS...
Added binary column: team_position_rs
Processing team position: RW...
Added binary column: team_position_rw
Processing team position: LS...
Added binary column: team_position_ls
Processing team position: RCM...
Added binary column: team_position_rcm
Processing team position: LCB...
Added binary column: team_position_lcb
Processing team position: CAM...
Added binary column: 

In [10]:
# Sanity check: reload one cleaned file from disk and flag every target column
# that did not come back as a numeric dtype.
# NOTE: target_cols is whatever the cleaning loop left behind, i.e. the column
# set of the last file processed, which may differ from this file's columns.
df = pd.read_csv('cleaned_data/players_19.csv')
print('\nColumn values & types after cleaning:')
for target_col in sorted(target_cols):
    if target_col not in df.columns:
        # Report instead of raising KeyError so the remaining columns are
        # still checked.
        print(f'[MISSING]   {target_col}: not present in this file')
        continue
    is_error_str = ''
    # Any non-numeric dtype (object or a dedicated string dtype) means that
    # text survived the cleaning.
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        is_error_str = '[ERROR] '
    print(f'{is_error_str}   {target_col}: {df[target_col].dtypes}')


Column values & types after cleaning:
   team_position_rm: int64
   attacking_crossing: int64
   movement_balance: int64
   team_position_rf: int64
   mentality_vision: int64
   lb: int64
   goalkeeping_diving: int64
   player_positions_lm: int64
   cf: int64
   team_position_rs: int64
   pace: float64
   dribbling: float64
   lwb: int64
   skill_dribbling: int64
   attacking_finishing: int64
   cm: int64
   rcm: int64
   cam: int64
   team_position_rdm: int64
   player_positions_rwb: int64
   team_position_rcb: int64
   gk_positioning: float64
   skill_fk_accuracy: int64
   goalkeeping_kicking: int64
   cb: int64
   rw: int64
   team_position_st: int64
   player_positions_lwb: int64
   player_positions_cm: int64
   player_positions_cf: int64
   team_position_ldm: int64
   ls: int64
   team_position_lm: int64
   physic: float64
   weak_foot: int64
   defending_sliding_tackle: int64
   team_position_gk: int64
   rwb: int64
   team_position_cam: int64
   lf: int64
   team_position_lf: i

  interactivity=interactivity, compiler=compiler, result=result)
