### 1. Data Review & Cleaning

#### 1.1. Initial review - remove redundant columns, standardize column names

In [None]:
# Import libraries
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface; importing the
# top-level `matplotlib` package under that name exposes no plotting functions.
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when displaying wide dataframes
pd.set_option('display.max_columns', None)

In [None]:
# Read the dataset from the CSV file into a dataframe
dataset_path = "fifa21_male2.csv"
data = pd.read_csv(dataset_path)

In [None]:
# Preview the first five rows of the data:
data.head(n=5)

We notice immediately that the columns `Player Photo`, `Club Logo`, and `Flag Photo` contain *sofifa* links, which will not help in the current analysis, so we can drop them.

In [None]:
# Drop the three link columns in place
link_columns = ['Player Photo', 'Club Logo', 'Flag Photo']
data.drop(columns=link_columns, inplace=True)

We also notice there is an ID column which we can use as our index column, after ensuring it doesn't have any duplicate values:

In [None]:
# If every ID is distinct, the ratio of unique IDs to rows is exactly 1.0,
# in which case the column can be set as the index.
unique_id_count = len(data['ID'].unique())
row_count = data.shape[0]
print(unique_id_count / row_count)

In [None]:
# Use the ID column as the index (the column itself is removed from the data)
data.set_index(keys='ID', drop=True, inplace=True)

We also notice that many columns are named with abbreviations that are unfamiliar to a non-player, so we will rename them to the full names of the features. We will do so by using a `.csv` file containing the values for the abbreviations as seen on the *sofifa* website, and standardize the column names at the same time:

In [None]:
# Standardize the column names: lowercase, with underscores instead of spaces
lowercased_names = data.columns.str.lower()
data.columns = lowercased_names.str.replace(" ", "_")

In [None]:
# Read positions.csv into a dataframe; the file has no header row and its
# first column becomes the index (the current column names to replace)
positions = pd.read_csv('positions.csv', header=None, index_col=0)

# Change dataframe to series and then dictionary so it can be used to rename columns:
positions = positions.squeeze().to_dict()

# Rename all matching columns in a single call. `rename` leaves labels that
# are missing from the mapping untouched, so no per-column loop is needed;
# calling it once also guarantees the mapping is applied exactly one time.
data.rename(columns=positions, inplace=True)

We can quickly check which columns have at least 75% null values, so we can discard them from our analysis:

In [None]:
def check_null_values(df, threshold=75):
    """Return the columns of `df` whose null percentage reaches `threshold`.

    The percentage of null values per column is rounded to one decimal
    place before being compared with `threshold` (inclusive).

    Returns a dict mapping column name -> rounded null percentage, in the
    column order of `df`.
    """
    total_rows = df.shape[0]
    shares = {
        name: round(df[name].isna().sum() * 100 / total_rows, 1)
        for name in df
    }
    return {name: share for name, share in shares.items() if share >= threshold}

# List the columns at or above the default 75% null threshold
check_null_values(df=data)

In [None]:
# `loan_date_end` consists mostly of `NaN` values, so the column is discarded:
data.drop(columns='loan_date_end', inplace=True)

We can have a quick look at the maximum percentage of `NaN` values in any column for reference:

In [None]:
def max_nulls(df):
    """Return the highest per-column null percentage found in `df`.

    Each column's percentage is rounded to one decimal place before the
    maximum is taken.
    """
    total_rows = df.shape[0]
    return max(
        round(df[name].isna().sum() * 100 / total_rows, 1)
        for name in df
    )

# Highest null percentage across the remaining columns
max_nulls(df=data)

As the maximum amount of nulls in any column is 2.5%, we can continue with our initial review of the data. We can have a look at the number of unique values per column to see if there are any columns that have only one value:

In [None]:
def check_unique_values(df):
    """Return the names of the columns in `df` that hold one distinct value.

    Distinct values are counted with `Series.unique()`, so a column made up
    entirely of nulls also counts as single-valued.
    """
    return [name for name in df if len(df[name].unique()) == 1]

# List the columns that contain a single distinct value
check_unique_values(df=data)

In [None]:
# Inspect the distinct values of the gender column
gender_values = data['gender'].unique()
gender_values

In [None]:
# The data shows only male players, so the gender column is removed
data.drop(columns='gender', inplace=True)

We can see the `team_&_contract` column seems to have the same information as the `club` & `contract` columns, so we might be able to remove it as well after checking that our assumption is correct.

In [None]:
# Join club and contract with a single space so the result can be compared
# to the team & contract column
club_with_separator = data['club'] + ' '
data['club_&_contract'] = club_with_separator + data['contract']

In [None]:
# Print the first rows of both columns for a visual comparison
for compared_column in ('club_&_contract', 'team_&_contract'):
    print(data[compared_column].head())

In [None]:
# Check that the columns are identical
def check_identical_columns(col1, col2, df):
    """Compare two columns of `df` row by row.

    Parameters:
        col1, col2: names of the columns to compare.
        df: dataframe containing both columns.

    Returns:
        'Columns are identical.' when every row matches. Otherwise the
        mismatching value pairs are printed as a two-column dataframe
        (fresh 0..k-1 index) and None is returned.

    Missing values never compare equal, so a row where both columns are
    NaN is reported as a difference.
    """
    left, right = df[col1], df[col2]

    # One vectorised comparison replaces the per-row `.iloc` lookups and the
    # row-by-row appends, which made the original loop quadratic.
    mismatch = (left != right).to_numpy()

    if not mismatch.any():
        return 'Columns are identical.'

    diff_values = pd.DataFrame({
        col1: left.to_numpy()[mismatch],
        col2: right.to_numpy()[mismatch],
    })
    print(diff_values)
    return None
            
# Compare the joined column with the original team & contract column
check_identical_columns('club_&_contract', 'team_&_contract', data)

In [None]:
# Print the number of null values in each of the two compared columns:
for compared_column in ('club_&_contract', 'team_&_contract'):
    print(data[compared_column].isna().sum())

In [None]:
# Repeat the comparison on a separate dataframe that excludes the rows whose
# joined club & contract value is null (the original data is left untouched):
data_test = data.dropna(axis=0, how="any", subset=['club_&_contract'])

check_identical_columns(col1='club_&_contract', col2='team_&_contract', df=data_test)

In [None]:
# We can remove Team & Contract column, given the same information is present in the Club
# and Contract columns
#data.drop('team_&_contract', axis=1, inplace=True)

#### 1.2. Data Cleaning

##### 1.2.1. Numerical Data Cleaning

`value`, `wage`, and `release_clause` columns:

In [None]:
# Columns cleaned with `clean_value`
financials = [
    'value',
    'wage',
    'release_clause',
]

def clean_value(i):
    """Convert a formatted money string such as "€1.5M" into a float.

    The euro sign is removed and a trailing "K" or "M" multiplies the
    number by one thousand or one million. The decimal point is kept as a
    decimal point, so "€10M", "€1.5M" and "€1.25M" all scale correctly
    (stripping the point and appending a fixed number of zeros only works
    for amounts with exactly one decimal digit).

    Values that are not strings (already numeric, or NaN for missing
    data) are passed through `float()` unchanged.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    if not isinstance(i, str):
        # Nothing to parse: numeric input or a missing value (NaN)
        return float(i)

    multipliers = {"K": 1000, "M": 1000000}
    amount = i.replace("€", "").strip()
    scale = 1
    if amount and amount[-1] in multipliers:
        scale = multipliers[amount[-1]]
        amount = amount[:-1]

    # Round away binary floating point noise (e.g. 1.1 * 1000000)
    return round(float(amount) * scale, 2)

# Convert each financial column element by element
for column in financials:
    raw_amounts = data[column]
    data[column] = raw_amounts.apply(clean_value)

`weight` column:

In [None]:
def clean_weight(i):
    """Remove the 'lbs' unit from a weight string and return the number as a float."""
    number_text = i.replace('lbs', '')
    return float(number_text)

# Convert every weight string to a float
weight_strings = data["weight"]
data["weight"] = weight_strings.apply(clean_weight)

`height` column:

In [None]:
def convert_height(i):
    """Convert a height string written as feet'inches" into centimetres.

    The result is rounded to zero decimal places and returned as a float.
    """
    cm_per_inch = 2.54
    # The foot mark (') becomes a separator and the inch mark (") is removed;
    # they are different quote characters, hence the two replacements.
    cleaned = i.replace("'", " ").replace('"', '')
    parts = cleaned.split()
    feet = float(parts[0])
    inches = float(parts[1])
    return round((feet * 12 + inches) * cm_per_inch, 0)

# Convert every height string to centimetres, then display the converted column
height_strings = data['height']
data['height'] = height_strings.apply(convert_height)
data['height']

Cleaning the positions columns, i.e. `left_striker`, `goalkeeper`, etc.:

In [None]:
def cleaning_positions(i):
    """Turn a position rating string into a float.

    Every "+" is replaced by a decimal point and every "-" is removed
    before the text is parsed, so "89+3" becomes 89.3.
    """
    with_decimal_point = i.replace("+", ".")
    without_minus = with_decimal_point.replace("-", "")
    return float(without_minus)

# Convert every column in the label slice from `left_striker` to `goalkeeper`
# (both ends included), then preview the result
position_columns = data.loc[:, 'left_striker':'goalkeeper'].columns
for col in position_columns:
    data[col] = data[col].apply(cleaning_positions)
data.head()

`weak_foot`, `skill_moves`, and `international_reputation` columns:

In [None]:
# Columns processed together as the "star" columns
star_columns = [
    'weak_foot',
    'skill_moves',
    'international_reputation',
]

# Print the distinct raw values of each of these columns
for column in star_columns:
    distinct_values = data[column].unique()
    print(distinct_values)

In [None]:
# Keep only the first character of each string, convert it to a number
# (raising on anything that cannot be parsed), and print the resulting dtype
# to confirm that the operation was successful
for column in star_columns:
    leading_character = data[column].str[0]
    data[column] = pd.to_numeric(leading_character, errors='raise')
    print(data[column].dtypes)

In [None]:
# Check that all edited 

##### 1.2.2. Categorical Data Cleaning

Extracting the contract end year from the `contract` column:

In [None]:
# Explore the distinct raw values of the contract column
raw_contract_values = data['contract'].unique()
raw_contract_values

# As the end of the contract is typically represented by the last 4 characters of the 
# strings, we will extract those where possible:
def clean_contract(x):
    """Return the last four characters of `x` as an int when possible.

    When the value cannot be sliced or its last four characters are not a
    number, `x` is returned unchanged so that the remaining cases can be
    inspected afterwards.
    """
    try:
        return int(x[-4:])
    except (TypeError, IndexError, ValueError):
        # TypeError/IndexError: the value is not sliceable (e.g. NaN or a
        # number); ValueError: the last four characters are not digits.
        # Only these are caught, so unrelated errors are no longer hidden.
        return x

contract_strings = data['contract']
data['contract'] = contract_strings.apply(clean_contract)

# Display the distinct values to see which non-integer entries remain:
data['contract'].unique()

In [None]:
# Many values end with 'On Loan', so we look at some data samples to check
# that they are not equivalent to the joined date. The same samples also show
# whether the "Country Free" values are related to the player's nationality:
sample_columns = ['contract', 'joined', 'nationality']
print(data.loc[:, sample_columns].head(50))
print(data.loc[:, sample_columns].tail(50))

In [None]:
# We'll now remove the 'On Loan' string from the contract column to extract the year and
# replace all 'Country Free' values with NaN:
def clean_loans(x):
    """Extract the year from an '... On Loan' contract string.

    Numeric values (years that were already converted to numbers, or NaN)
    are returned unchanged. Strings have " On Loan" removed and their last
    four characters parsed as an int; anything that cannot be parsed
    becomes NaN.
    """
    if isinstance(x, (int, float, np.integer, np.floating)):
        # Already a number: numbers have no `.replace`, so treating them
        # like strings would raise and turn every parsed year into NaN.
        return x
    try:
        return int(x.replace(" On Loan", "")[-4:])
    except (AttributeError, TypeError, ValueError):
        # Not a string, or no year in the last four characters
        return np.nan

contract_values = data['contract']
data['contract'] = contract_values.apply(clean_loans)

# Display the distinct values to check that the operation was successful
data['contract'].unique()

`nationality` column:

In [None]:
def clean_nationality(x):
    """Replace abbreviated or HTML-escaped nationality names.

    Rules are checked in order on the string form of `x`:
    names starting with "DR" -> "Democratic Republic of the Congo";
    names ending with "DPR" -> "North Korea" (checked before "PR", which
    it also ends with); names ending with "PR" -> "China";
    names containing "&amp;" get it replaced by "and".
    Any other value, including a missing value, is returned unchanged.
    """
    name = str(x)
    if name.startswith("DR"):
        return "Democratic Republic of the Congo"
    if name.endswith("DPR"):
        return "North Korea"
    if name.endswith("PR"):
        return "China"
    # Test the string form: `"&amp;" in x` raises TypeError for non-string
    # values such as NaN.
    if "&amp;" in name:
        return name.replace("&amp;", "and")
    return x


data["nationality"] = data["nationality"].apply(clean_nationality)
# data['nationality'].unique()

In [None]:
# Preview the first five rows of the cleaned data
data.head(n=5)