In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import squarify  # to draw treemap in matplotlib

# Lift pandas' display truncation so every column and every row is shown
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the cleaned dataframe written by the previous notebook
data = pd.read_csv('fifa21_male2-post-cleaning.csv')

### 2. Exploratory Data Analysis

#### 2.1. Categorical & Discrete Numerical data

In [None]:
# Select categorical and discrete numerical data:
# any column with at most 10 distinct values, or any text column.
# `np.object` was removed in NumPy 1.24, so the dtype test goes through
# pandas' own helpers (which also recognise the dedicated string dtype).
cat_cols = []

for column in data.columns:
    is_text = (pd.api.types.is_object_dtype(data[column])
               or pd.api.types.is_string_dtype(data[column]))
    if data[column].nunique(dropna=False) <= 10 or is_text:
        cat_cols.append(column)

# Save bar plots for categorical data with few values
for column in cat_cols:
    if data[column].nunique(dropna=False) < 100:
        # Labels and heights must come from the SAME value_counts() result:
        # unique() lists values in order of appearance while value_counts()
        # sorts by frequency, so pairing the two attaches counts to the wrong
        # labels (and their lengths differ when the column contains NaN).
        counts = data[column].value_counts()
        fig, ax = plt.subplots(figsize=(12, 9))
        ax.set_title(column)
        ax.bar(counts.index, counts.values)
        fig.savefig('cat_graphs/{}.png'.format(column))
        plt.close(fig)  # free the figure; many columns are plotted in this loop

From these plots, we see that:
* most players in the dataset are in center positions (*CM*, *CAM*, *CF*)
* players' position rating is mostly from 1-3 stars
* about 3/4 of players prefer playing with their right foot
* most players play relatively well with their weak foot (4-5 ratings)
* most players have a 3-4 rating for skill moves
* most players have a moderate amount of wins away from home, as well as draws
* the vast majority of the players in the dataset have an international reputation of 1 star
* the vast majority of players play for the team they've been contracted in, rather than on loan

Therefore, we'll remove the `international_reputation` and `on_loan` columns:

In [None]:
# Drop the two near-constant columns. Reassigning (instead of inplace=True) with
# errors='ignore' keeps this cell safe to re-run once the columns are already gone;
# the in-place version raised KeyError on a second execution.
data = data.drop(columns=['international_reputation', 'on_loan'], errors='ignore')

Given there are too many categories in the `nationality` and `club` columns, we cannot easily visualize them using (small) bar plots. However, inspired by [4m4n5's graph](https://github.com/4m4n5/fifa18-all-player-statistics) for representing nationality data, we used a treemap:

In [None]:
# Count players per nationality as a two-column frame:
# 'nationality' (country name) and 'number_of_players' (count).
# value_counts() already returns the counts sorted in descending order.
# Naming the index and the values explicitly makes the result independent of the
# pandas version: since pandas 2.0 value_counts() names its result 'count' and
# its index after the column, which broke the previous sort/rename sequence.
nationality = (
    data['nationality']
    .value_counts()
    .rename_axis('nationality')
    .reset_index(name='number_of_players')
)

In [None]:
# Merge countries with fewer than MIN_PLAYERS players into a single 'Other' row
MIN_PLAYERS = 30

# Boolean selection instead of dropping rows while looping over their labels
is_small = nationality['number_of_players'] < MIN_PLAYERS
other_count = int(nationality.loc[is_small, 'number_of_players'].sum())

# Keep only the larger countries and renumber the index
nationality = nationality.loc[~is_small].reset_index(drop=True)

# Add back the consolidated 'Other' row. It is skipped when nothing was merged,
# so an empty 'Other' tile is never plotted and re-running the cell does not
# append a second, zero-sized 'Other' row.
if other_count > 0:
    other_row = pd.DataFrame({'nationality': ['Other'],
                              'number_of_players': [other_count]})
    nationality = pd.concat([nationality, other_row], ignore_index=True)

In [None]:
# Draw the nationality treemap on an explicit axes, one labelled tile per row
fig, ax = plt.subplots(figsize=(20, 9))
squarify.plot(
    sizes=nationality['number_of_players'],
    label=nationality['nationality'],
    alpha=.8,
    ax=ax,
)
ax.axis('off')
plt.show()

#### 2.2. Continuous numerical data

We will have a look at the overall and potential scores of the players, to get an idea of whether or not they would be collinear:

In [None]:
# Rank every player by overall score, highest first, and show the top ten
best_players = data.sort_values(by="overall_score", ascending=False)
rank = best_players.loc[:, ["name", "overall_score"]]
rank.head(10)

In [None]:
# Sort the best potential players
best_pot = data.sort_values(["potential_score"], ascending=[False])
# Build the ranking from `best_pot`. Slicing `best_players` (sorted by overall
# score) here would simply reproduce the overall-score ranking, whatever the
# potential scores are.
rank_2 = best_pot[["name", "potential_score"]]
rank_2.head(10)  # compare against the top 10 by overall score

In [None]:
# Would there be any difference between players with the worst overall and potential score?
# The sort is descending, so the lowest overall scores sit at the tail.
worst_players = data.sort_values(by="overall_score", ascending=False)
rank = worst_players.loc[:, ["name", "overall_score"]]
rank.tail(10)

In [None]:
# Sort by potential score (descending), so the lowest potentials sit at the tail
worst_pot = data.sort_values(["potential_score"], ascending=[False])
# Build the ranking from `worst_pot`. Slicing `worst_players` (sorted by overall
# score) here would always list the same players as the overall-score ranking.
rank_2 = worst_pot[["name", "potential_score"]]
rank_2.tail(10)  # compare against the bottom 10 by overall score

In [None]:
# How many hits do the best players have?
# Sort by overall score, breaking ties by number of hits (both descending)
best_hits = data.sort_values(["overall_score", "hits"], ascending=[False, False])
# Use the frame sorted above; `best_players` ignores the tie-break on hits
rank_3 = best_hits[["name", "overall_score", "hits"]]
rank_3.head(3)

Now, we will have a look at the data distribution:

In [None]:
# Select the numerical data
num = data.select_dtypes(np.number)

# Extract the continuous numerical data (more than 10 distinct values) and plot it
for column in num.columns:
    if len(data[column].unique()) > 10:
        # displot is figure-level: every call opens a brand-new figure
        sns.displot(data[column])
        plt.savefig('num_graphs/{}-barplot.png'.format(column))
        # Render the figure now, then release it. Otherwise one figure per
        # column stays open until the cell ends (memory growth, and a
        # matplotlib warning once more than 20 figures are open).
        plt.show()
        plt.close()

We can see that many of the attributes are approximately normally distributed, except for the `wage`, `value`, `release_clause`, `hits`, and `joined` columns, which are heavily skewed towards lower values. We'll also represent the data using boxplots, to get a picture of the number of outliers present:

In [None]:
# Extract the continuous numerical data (more than 10 distinct values) and
# draw one boxplot per column
for column in num.columns:
    if len(data[column].unique()) > 10:
        # sns.boxplot is axes-level: without a fresh figure each call draws onto
        # the current axes, so every saved file would contain all the previous
        # boxplots superimposed. Give each column its own figure and axes.
        fig, ax = plt.subplots()
        sns.boxplot(x=column, data=data, ax=ax)
        fig.savefig('num_graphs/{}-boxplot.png'.format(column))
        plt.show()      # render inline
        plt.close(fig)  # release the figure before the next column

We can see there will be a considerable number of outliers in:
* distributions that have two modes (bimodal)
* non-Gaussian distributions, e.g. `value`, `wage`, `release_clause`

#### 2.3. Correlation between different attributes

In [None]:
# Calculate the correlation matrix once, over numeric and boolean columns only.
# Text columns make DataFrame.corr() raise in pandas >= 2.0; older versions
# silently skipped them, which this selection reproduces.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()

# Create heatmap
# Boolean mask hiding the upper triangle (diagonal included), which only
# repeats the lower half of the symmetric matrix
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
fig, ax = plt.subplots(figsize=(80, 76))
sns.heatmap(corr_matrix, mask=mask, annot=True, ax=ax)
fig.savefig('corr_matrix.png')

We can notice that:
* `value`, `wage` and `release_clause` are highly correlated with each other (around 0.6)
* positions are very highly correlated (>0.95) with stats relevant to those positions, i.e. attacking positions are correlated with attacking stats

Whilst these were easy to spot, there are many other features that are highly correlated between themselves. Therefore, we created a function to find them and remove the feature that is the least correlated with a chosen target, which we'll use when we begin modelling:

In [None]:
def corr_destroyer(data, target, max_threshold=0.95):
    """List the features to drop because they duplicate another feature.

    For every pair of features whose absolute pairwise correlation exceeds
    ``max_threshold``, the feature that is less correlated (in absolute value)
    with ``target`` is marked for removal.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the features and the target. Only numeric and boolean
        columns take part; other columns are ignored.
    target : str
        Name of the target column. It is never returned.
    max_threshold : float, default 0.95
        Absolute correlation above which two features count as redundant.

    Returns
    -------
    list
        Unique column names to drop, in the order they were first flagged.

    Raises
    ------
    KeyError
        If ``target`` is not a numeric/boolean column of ``data``.
    """
    # Restrict to numeric/boolean columns: DataFrame.corr() raises on text
    # columns in pandas >= 2.0.
    corr_data = data.select_dtypes(include=['number', 'bool']).corr()

    # Correlation strength is the magnitude: a strongly negative correlation is
    # as redundant (or as informative about the target) as a positive one.
    # The target is removed from BOTH objects so that position i refers to the
    # same feature in `corr_target` and in `corr_data`; leaving it in
    # `corr_target` shifts every position after the target by one.
    corr_target = corr_data[target].drop(target).abs()
    corr_data = corr_data.drop(index=target, columns=target).abs()

    features = corr_data.columns
    pair_corr = corr_data.to_numpy()
    target_corr = corr_target.to_numpy()
    column_no = len(features)
    to_drop = []

    # Visit each unordered pair once (upper triangle, diagonal excluded)
    for i in range(column_no):
        for j in range(i + 1, column_no):
            if pair_corr[i, j] > max_threshold:
                if target_corr[i] > target_corr[j]:
                    to_drop.append(features[j])
                else:
                    to_drop.append(features[i])

    # De-duplicate while keeping a deterministic (first-seen) order
    return list(dict.fromkeys(to_drop))

In [None]:
# Write the explored dataframe to CSV, without the index column
output_path = 'fifa21_male2-post-exploration.csv'
data.to_csv(output_path, index=False)