In [12]:
import pandas as pd
import numpy as np

# Render every float with two decimal places when frames are displayed
_two_decimal_format = '{:.2f}'.format
pd.options.display.float_format = _two_decimal_format

# Load the raw file; its first column becomes the index
df = pd.read_csv(filepath_or_buffer='data/raw/data.csv', index_col=0)

In [13]:
# Canonical position -> every raw label that collapses into it
_position_families = {
    'GK': ('GK',),
    'CB': ('CB', 'LCB', 'RCB'),
    'LB': ('LB', 'LWB'),
    'RB': ('RB', 'RWB'),
    'CDM': ('CDM', 'LDM', 'RDM'),
    'CM': ('CM', 'LCM', 'RCM'),
    'CAM': ('CAM', 'LAM', 'RAM'),
    'LM': ('LM',),
    'RM': ('RM',),
    'LW': ('LW', 'LF'),
    'RW': ('RW', 'RF'),
    'ST': ('ST', 'CF', 'LS', 'RS'),
}

# Invert the grouping into a flat raw-label -> canonical-label lookup
_position_lookup = {
    raw_label: canonical
    for canonical, raw_labels in _position_families.items()
    for raw_label in raw_labels
}

df['Position'] = df['Position'].map(_position_lookup)

# Labels outside the lookup (and missing ones) are NaN after the mapping; drop those rows
df.dropna(subset=['Position'], inplace=True)

In [14]:
def money_string_replace(variable_name, frame=None):
    """Convert a column of money strings such as "€100M" to plain numbers.

    The first character of every value is treated as the currency symbol and
    dropped. An optional trailing ``K`` or ``M`` scales the number by 10**3
    or 10**6; values without a suffix are taken as they are. Missing values
    stay NaN. The column is overwritten in place with floats.

    Calling the function on a column that is already numeric does nothing,
    so the cell can be re-run without failing on the ``.str`` accessor.

    Parameters
    ----------
    variable_name : str
        Name of the column to convert.
    frame : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``df``.

    Raises
    ------
    ValueError
        If a value is not a number once the leading symbol and the suffix
        have been removed.
    """
    target = df if frame is None else frame
    column = target[variable_name]

    # Already converted (e.g. the cell was executed twice): nothing to do
    if pd.api.types.is_numeric_dtype(column):
        return

    # Map the suffix to its factor first and fill the gaps afterwards, so the
    # multiplier is numeric throughout instead of an object column mixing
    # strings and integers
    suffix = column.str.extract(r'([KM])$', expand=False)
    multiplier = suffix.map({'K': 10**3, 'M': 10**6}).astype(float).fillna(1.0)

    # Drop the currency symbol and the suffix, leaving only the number
    amount = column.str[1:].str.replace(r'[KM]$', '', regex=True).astype(float)

    target[variable_name] = amount * multiplier

# Convert every money-formatted column to plain numbers
for money_column in ('Value', 'Wage', 'Release Clause'):
    money_string_replace(money_column)

In [15]:
# Dummy variable: 1 for left-footed players, 0 for right-footed ones.
# Rows with any other (or missing) foot value are not assigned.
foot = df['Preferred Foot']
dummy_column = 'Preferred Foot = Left'

df.loc[foot.eq("Left"), dummy_column] = 1
df.loc[foot.eq("Right"), dummy_column] = 0

In [16]:
# Ordinal code for each work-rate label
WORK_RATE_CODES = {'Low': 1, 'Medium': 2, 'High': 3}


def _work_rate_component(split_rates, slot):
    """Return the numeric code of one component of the split work rate.

    split_rates : Series whose values are the lists produced by splitting
        'Work Rate' on '/'.
    slot : position inside each list (0 or 1 in this notebook).

    Surrounding whitespace is stripped before the lookup. Labels that are
    missing or not in WORK_RATE_CODES become <NA>. The result uses the
    nullable 'Int64' dtype, so the column is truly numeric instead of an
    object column holding integers.
    """
    labels = split_rates.str.get(slot).str.strip()
    return labels.map(WORK_RATE_CODES).astype('Int64')


# Splitting the work rate column into its '/'-separated parts.
# The helper column is kept here; it is removed with the other dropped columns later.
df['Work Rates'] = df['Work Rate'].str.split('/')

# First part is the attacking rate, second part the defensive rate
df['Attacking Rate'] = _work_rate_component(df['Work Rates'], 0)
df['Defensive Rate'] = _work_rate_component(df['Work Rates'], 1)

In [17]:
# Conversion factors used below
LBS_PER_KG = 2.205
INCHES_PER_FOOT = 12
CM_PER_INCH = 2.54

# Converting pounds to kg.
# The whole leading number is extracted rather than a fixed three-character
# slice, so weights with fewer or more than three digits parse as well.
# Values without a leading number become NaN.
df["Weight"] = (
    df["Weight"].str.extract(r'^(\d+(?:\.\d+)?)', expand=False).astype(float)
    / LBS_PER_KG
)

## Converting Feet and Inches string to a CM value
# Splitting feet from inches at the apostrophe
df['HeightCM'] = df['Height'].str.split("'")

# Numeric feet and inches as separate helper columns
df['feet'] = df['HeightCM'].str.get(0).astype(float)
df['inch'] = df['HeightCM'].str.get(1).astype(float)

# Replacing the old string with the new CM value
df['Height'] = (df['feet'] * INCHES_PER_FOOT + df['inch']) * CM_PER_INCH


In [22]:
# Numeric code for every recognised body-type label. The player-named labels
# are folded into the three base codes (1 = Lean, 2 = Normal, 3 = Stocky).
BODY_TYPE_CODES = {
    'Lean': 1,
    'Normal': 2,
    'Stocky': 3,
    'Messi': 2,
    'C. Ronaldo': 2,
    'Neymar': 1,
    'Courtois': 2,
    'Shaqiri': 3,
    'Akinfenwa': 3,
}

# Encode in one pass into a nullable integer column instead of writing
# integers cell-by-cell into a string column. Skipped when the column is
# already numeric, so re-running the cell does not turn the codes into <NA>.
if not pd.api.types.is_numeric_dtype(df['Body Type']):
    df['Body Type'] = df['Body Type'].map(BODY_TYPE_CODES).astype('Int64')

# Labels absent from the table, such as "PLAYER_BODY_TYPE_25", are <NA> after
# the mapping and are removed together with rows whose body type is missing
df.dropna(subset=['Body Type'], inplace=True)

In [24]:
# Dropping unnecessary columns

# Raw columns that are not kept in the cleaned data set
_unused_raw_columns = [
    'Photo', 'Flag', 'Club Logo', 'Special', 'ID',
    'Real Face', 'Jersey Number', 'Loaned From',
]

# Columns named after individual pitch positions
_per_position_columns = [
    'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
    'LAM', 'CAM', 'RAM', 'LM', 'LB', 'LCB', 'CB', 'RCB', 'RB',
    'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
]

# Source and helper columns whose information now lives in derived columns
_superseded_columns = [
    'Preferred Foot', 'Work Rate', 'Work Rates', 'HeightCM', 'feet', 'inch',
]

df.drop(
    columns=_unused_raw_columns + _per_position_columns + _superseded_columns,
    inplace=True,
)

In [25]:
# Write the cleaned frame, index included, to the processed-data folder
df.to_csv(path_or_buf='data/processed/clean_dataset.csv')