## Data Cleaning

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing required Libraries**

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import missingno as msno
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer

In [None]:
# Load the raw fifa21.csv table from the Kaggle input directory; the trailing
# expression displays its (rows, columns) shape.
data = pd.read_csv('/kaggle/input/fifa-21-player-and-formation-analysis/fifa21.csv')
data.shape

In [None]:
# Show every column name available in the raw table.
data.columns

Let's first consider the features which may influence the model.

In [None]:
# Columns of `data` kept for the working frame: identity/contract fields first,
# then the per-skill rating columns, ending with the five goalkeeper ('GK ...') columns.
features = ['Name', 'Age', 'Nationality', 'Overall', 'Potential', 'Club', 'Value', 'Wage', 'Special',
       'Preferred Foot', 'Weak Foot', 'Skill Moves','International Reputation', 'Work Rate', 'Body Type', 
        'Position', 'Height', 'Weight', 'Likes', 'Dislikes', 'Following', 'Crossing', 'Finishing',
       'Heading Accuracy', 'Short Passing', 'Volleys', 'Dribbling', 'Curve','FK Accuracy', 'Long Passing', 
        'Ball Control', 'Acceleration','Sprint Speed', 'Agility', 'Reactions', 'Balance', 'Shot Power',
        'Jumping', 'Stamina', 'Strength', 'Long Shots', 'Aggression','Interceptions', 'Positioning', 
        'Vision', 'Penalties', 'Composure', 'Standing Tackle', 'Sliding Tackle', 
        'GK Diving','GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes']

In [None]:
# Work on an explicit, independent copy of the selected columns: later cells
# assign into `df` column by column, and doing that on a bare selection of
# `data` triggers pandas' SettingWithCopy warnings.
df = data[features].copy()
df.shape

In [None]:
# Cleaning the Value column
def clean_money(column, frame=None):
    """Convert money strings such as '€1.5M' or '€500K' into millions.

    Parameters
    ----------
    column : str
        Name of the column holding the money strings.
    frame : pandas.DataFrame, optional
        Table to read `column` from. Defaults to the notebook-level `data`.

    Returns
    -------
    list of float
        One value per row, expressed in millions. Missing entries and
        strings that do not end in 'M' or 'K' are returned as 0.0.
    """
    source = data if frame is None else frame
    values = []
    for value in source[column].fillna(''):
        text = str(value).strip()
        # Slicing (rather than indexing) keeps empty strings from raising IndexError.
        suffix = text[-1:]
        # text[1:-1] drops the first character (assumed to be the currency
        # symbol) and the magnitude suffix.
        if suffix == 'M':
            money = 1000000 * float(text[1:-1])
        elif suffix == 'K':
            money = 1000 * float(text[1:-1])
        else:
            money = 0
        values.append(money / 1000000)
    return values

# Cleaning Weight column
def clean_weight(frame=None):
    """Convert the 'Weight' column (strings such as '159lbs') to whole numbers.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table to read 'Weight' from. Defaults to the notebook-level `data`.

    Returns
    -------
    list
        One entry per row: an int, or np.nan where the weight is missing.
        Values that are already numeric are passed through (as int), so the
        function can be re-applied to an already-cleaned column.
    """
    source = data if frame is None else frame
    weights = []
    for weight in source['Weight']:
        if isinstance(weight, str):
            text = weight.strip()
            if text == '':
                weights.append(np.nan)
                continue
            # Drop the trailing unit before converting.
            if text.lower().endswith('lbs'):
                text = text[:-3]
            weights.append(int(text))
        elif pd.isna(weight):
            weights.append(np.nan)
        else:
            # Already numeric (e.g. the cell was run twice).
            weights.append(int(weight))
    return weights

# Cleaning Height Column
def clean_height(frame=None):
    """Convert the 'Height' column (feet'inches strings such as 5'10") to inches.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table to read 'Height' from. Defaults to the notebook-level `data`.

    Returns
    -------
    list
        One entry per row: total inches as an int, or np.nan where the
        height is missing. Values that are already numeric are passed
        through (as int), so the function can be re-applied safely.
    """
    source = data if frame is None else frame
    heights = []
    for height in source['Height']:
        if isinstance(height, str):
            text = height.strip()
            if text == '':
                heights.append(np.nan)
                continue
            # Split on the feet mark so two-digit inch values (10, 11) are read
            # in full; indexing a single character would turn 5'10" into 5'1".
            feet_part, _, inch_part = text.partition("'")
            inch_part = inch_part.strip().rstrip('"').strip()
            inches = int(inch_part) if inch_part else 0
            heights.append(int(feet_part) * 12 + inches)
        elif pd.isna(height):
            heights.append(np.nan)
        else:
            # Already numeric (e.g. the cell was run twice).
            heights.append(int(height))
    return heights

# # Cleaning Release Clause
def clean_release_clause(frame=None):
    """Convert 'Release Clause' strings such as '€2.5M' or '€500K' into millions.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table to read 'Release Clause' from. Defaults to the notebook-level `data`.

    Returns
    -------
    list of float
        One value per row, expressed in millions. Missing entries and
        strings that do not end in 'M' or 'K' are returned as 0.0.
    """
    source = data if frame is None else frame
    release_clause = []
    for clause in source['Release Clause'].fillna(''):
        text = str(clause).strip()
        suffix = text[-1:]
        # float (not int) so decimal amounts like '€2.5M' parse instead of raising.
        # text[1:-1] drops the first character (assumed to be the currency
        # symbol) and the magnitude suffix.
        if suffix == 'M':
            money = 1000000 * float(text[1:-1])
        elif suffix == 'K':
            money = 1000 * float(text[1:-1])
        else:
            money = 0
        release_clause.append(money / 1000000)
    return release_clause

In [None]:
# Parse the raw 'Weight' / 'Height' strings into numbers on `data`.
# The dtype guards make the cell safe to re-run: the parsers expect the raw
# text columns, not the numeric result of a previous run.
if data['Weight'].dtype == object:
    data['Weight'] = clean_weight()
if data['Height'].dtype == object:
    data['Height'] = clean_height()

# `df` was selected from `data` before this cell ran, so copy the parsed
# columns across; otherwise `df` keeps the raw text values.
df['Weight'] = data['Weight']
df['Height'] = data['Height']

In [None]:
# Inspect the 'Height' column of `data` after the cleaning cell above.
data['Height']

In [None]:
# Inspect the 'Release Clause' column (no cell shown so far has applied clean_release_clause to it).
data['Release Clause']

In [None]:
# Inspect the 'Wage' column (no cell shown so far has applied clean_money to it).
data['Wage']

In [None]:
# `clean_height` is already defined in the cleaning-helpers cell and has been
# applied to `data`; redefining it here would only shadow that definition.
# Just inspect the converted column.
data['Height']

In [None]:
# Missing-value count per column. This must stay the last expression of the
# cell so the counts are displayed; `features` is already defined with the
# same contents in an earlier cell, so it is not reassigned here.
df.isna().sum()

In [None]:
# Columns of `df` treated as numeric in the analysis and imputation below.
numerical_features =['Age', 'Overall', 'Potential', 'Value', 'Wage', 'Special', 'Height',
                   'Weight', 'Likes', 'Dislikes','Following', 'Crossing', 'Finishing', 'Heading Accuracy', 'Short Passing',
                   'Volleys', 'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing',
                    'Ball Control', 'Acceleration', 'Sprint Speed', 'Agility', 'Reactions',
                   'Balance', 'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
                   'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
                   'Composure', 'Standing Tackle', 'Sliding Tackle', 'GK Diving',
                   'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes']

# Columns of `df` treated as categorical (this includes the small-integer rating
# columns 'International Reputation', 'Weak Foot' and 'Skill Moves').
categorical_features = ['Name','Nationality', 'Club', 'Preferred Foot', 'Work Rate','Body Type', 
                        'Position','International Reputation', 'Weak Foot', 'Skill Moves']

In [None]:
# Summary statistics of the numeric columns, transposed so each feature is one row.
df[numerical_features].describe().T

In [None]:
# Numeric subset of `df`; display the dtype pandas holds for each column.
data_num = df[numerical_features]
data_num.dtypes

In [None]:
# Categorical subset of `df`; display the dtype pandas holds for each column.
data_cat = df[categorical_features]
data_cat.dtypes

* We can see that the standard deviation varies widely across the features in the dataset.

## Data Visualisation

* Missing Values Visualisation

In [None]:
# One row per player, one column per feature; highlighted cells mark missing values.
fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(df.isna(), yticklabels=False, cmap='YlGnBu', ax=ax)

The club has no relation to the other features, and imputing it with any existing club name would introduce bias, so I am imputing it with 'No Club'. We can also see a line of missing values running across the rows of several features, both categorical and numerical. So let's view the categorical and numerical variables separately.

* Missing Categorical Values Visualisation

In [None]:
# Two side-by-side missing-value maps. A 1x2 grid sized for two panels replaces
# the 100x100 inch canvas with a 7x7 grid of which only two slots were used.
fig, (ax_cat, ax_num) = plt.subplots(1, 2, figsize=(20, 7))
fig.subplots_adjust(wspace=0.1)

sns.heatmap(df[categorical_features].isna(), yticklabels=False, cmap='YlGnBu', ax=ax_cat)
ax_cat.set_title('Missing values: categorical features')

sns.heatmap(df[numerical_features].isna(), yticklabels=False, cmap='YlGnBu', ax=ax_num)
ax_num.set_title('Missing values: numerical features')

In [None]:
# One box plot per numeric feature, laid out four to a row.
grid_rows = len(numerical_features) // 4 + 1
fig = plt.figure(figsize=(20, 30))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for count, feature in enumerate(numerical_features, start=1):
    ax = fig.add_subplot(grid_rows, 4, count)
    sns.boxplot(x=data[feature], ax=ax)

* If we try to impute these missing values with simple statistics, the imputation will be biased for many features.
* Many features have outliers, so if we impute the missing values with an imputer model, the model may give incorrect results.
* If we remove or correct these outliers first, the models will give somewhat more meaningful results.
* But then the question arises: should we remove the outliers or correct them?

In [None]:
# Correlate only the columns that actually hold numbers. Columns still stored
# as text (e.g. 'Value' / 'Wage', which no cell shown here converts) make
# DataFrame.corr() raise on pandas 2.x, so they are excluded explicitly.
corr_ = df[numerical_features].select_dtypes(include='number').corr()

f, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(corr_, annot=True, linewidths=0.5, cmap="YlGnBu", fmt='.1f', ax=ax)
plt.show()

* We can clearly see from this heatmap that the 'Special' feature is fairly strongly correlated with many of the features that have missing values:
   *  'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing','Volleys', 'Dribbling', 'Curve', 
      'FKAccuracy', 'LongPassing','BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 
      'Reactions','Balance', 'ShotPower','Stamina','LongShots','Aggression', 'Interceptions', 
      'Positioning', 'Vision', 'Penalties','Composure', 'Marking','GKDiving','GKHandling', 'GKKicking', 
      'GKPositioning', 'GKReflexes'
* Height and Weight don't correlate well with most other features, so we will impute Height with a statistical imputation technique after getting a visual view of the data below.
* Weight is correlated (-0.7) with
   * 'Balance'       
* Interception is Correlation(0.9) with
   * 'StandingTackle', 'SlidingTackle'

In [None]:
# Skill columns to compare against 'Special'. The order matters: later cells
# slice off the last five entries (the goalkeeper columns).
features = [
    'Crossing', 'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
    'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control',
    'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
    'Shot Power', 'Stamina', 'Long Shots', 'Aggression', 'Interceptions',
    'Positioning', 'Vision', 'Penalties', 'Composure',
    'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes',
]

# One scatter plot of each skill against 'Special', four to a row.
grid_rows = len(features) // 4 + 1
fig = plt.figure(figsize=(20, 30))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for count, feature in enumerate(features, start=1):
    ax = fig.add_subplot(grid_rows, 4, count)
    sns.scatterplot(x=df['Special'], y=df[feature], ax=ax)

We don't see any outliers affecting these relationships.

* Let's visualise Balance and Weight

In [None]:
# Scatter of 'Weight' against 'Balance' taken from the working frame `df`.
sns.scatterplot(x= 'Balance', y = 'Weight', data = df)

In [None]:
# Pairwise scatter plots between 'Interceptions' and the two tackle ratings.
tackle_pairs = [
    ('Interceptions', 'Standing Tackle'),
    ('Interceptions', 'Sliding Tackle'),
    ('Standing Tackle', 'Sliding Tackle'),
]

fig = plt.figure(figsize=(15, 5))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for panel, (x_col, y_col) in enumerate(tackle_pairs, start=1):
    ax = fig.add_subplot(1, 3, panel)
    sns.scatterplot(x=df[x_col], y=df[y_col], ax=ax)


* Let's visualise Height

In [None]:
# Distribution of the parsed heights. `sns.distplot` is deprecated; `histplot`
# with a KDE overlay and density scaling draws the equivalent picture.
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(x=data['Height'].dropna(), kde=True, stat='density', ax=ax)
ax.set_xlabel('Height')

## So let's now impute the features using Linear Imputation and the KNN Imputer
*   Before proceeding let's list down what Imputer will be used in which feature

As we can see from the scatter plots above, in which many features are compared with the 'Special' feature, the relationship is linear except for a few features ('GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes'). So the other features can be imputed by Linear Imputation.
We can also see that features like ('GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes') form 2 groups, so we can impute these by KNN imputation.
We can impute Weight from Balance using Linear Imputation.
We can impute StandingTackle and SlidingTackle from Interceptions using Linear Imputation.
Height will be imputed with a statistical imputation technique.
* Splitting train and test data

Let's create a training and testing set for Linear Imputation and use a LinearRegression model to predict the values of the features.

In [None]:
# Columns involved in the 'Special'-based regression imputation
# (every skill column except the five goalkeeper ones).
linear_cols = ['Special'] + features[:-5]

# Complete rows are used for fitting; rows with any gap in these columns are
# the ones to predict. `.copy()` makes `test_df` an independent frame, because
# the next cell writes predictions into it.
train_df = data[linear_cols].dropna()
test_df = data[data[linear_cols].isnull().any(axis=1)].copy()

In [None]:
# For each skill, fit a degree-2 polynomial regression on 'Special' and use
# the rounded predictions to fill that skill's missing values in `df`.
for feature in features[:-5]:
    polyreg = make_pipeline(PolynomialFeatures(2), LinearRegression())
    polyreg.fit(X=train_df[['Special']], y=train_df[feature])

    predicted_output = polyreg.predict(test_df[['Special']])
    test_df[feature] = np.round(predicted_output)
    # Assign the result back: `df[feature].fillna(..., inplace=True)` acts on a
    # temporary Series and leaves `df` unchanged under pandas Copy-on-Write.
    df[feature] = df[feature].fillna(test_df[feature])

In [None]:
# Overlay the rows used for fitting (train_df) with the rows that received
# predictions (test_df), one panel per regression-imputed skill.
linear_features = features[:-5]
grid_rows = len(linear_features) // 4 + 1

fig = plt.figure(figsize=(20, 30))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for count, feature in enumerate(linear_features, start=1):
    ax = fig.add_subplot(grid_rows, 4, count)
    sns.scatterplot(x=train_df[feature], y=train_df['Special'])
    sns.scatterplot(x=test_df[feature], y=test_df['Special'], palette="deep")

As we can see, the imputation is quite good. So let's now do the KNN imputation for the remaining numerical variables.

* Let's create a training and testing set for KNN Imputation and use the KNNImputer model to predict the values of the features

In [None]:
# 'Special' plus the five goalkeeper columns: complete rows form the reference
# set, rows with any goalkeeper value missing are the ones to impute.
train_df = data.loc[:, ['Special'] + features[-5:]].dropna()
test_df = data.loc[data[features[-5:]].isnull().any(axis=1), ['Special'] + features[-5:]]

In [None]:
# Fit a 1-nearest-neighbour imputer on the complete rows, then fill the gaps
# in test_df and round the result, keeping test_df's index and column labels.
imputer = KNNImputer(n_neighbors=1).fit(train_df)
predicted_df = pd.DataFrame(
    imputer.transform(test_df),
    index=test_df.index,
    columns=test_df.columns,
).round()

In [None]:
# Copy the KNN-imputed goalkeeper values into `df` (column 0 is 'Special',
# which is only the predictor). The result is assigned back because an
# inplace fillna on `df[col]` does not update `df` under pandas Copy-on-Write.
for col in test_df.columns[1:]:
    df[col] = df[col].fillna(predicted_df[col])

In [None]:
# Compare observed goalkeeper ratings (train_df) with the KNN-imputed ones
# (predicted_df), one panel per goalkeeper column.
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for count, feature in enumerate(features[-5:], start=1):
    ax = fig.add_subplot(2, 3, count)
    sns.scatterplot(x=train_df[feature], y=train_df['Special'])
    sns.scatterplot(x=predicted_df[feature], y=predicted_df['Special'])

* Let's impute Weight feature by Balance using LinearRegression

In [None]:
train_df = data[['Weight', 'Balance']].dropna()
test_df = data[data['Balance'].isna()][['Weight', 'Balance']]
test_df

In [None]:
polyreg=make_pipeline(PolynomialFeatures(1),LinearRegression(1))
polyreg.fit(X = train_df[['Balance']], y = train_df['Weight'])

test_df['Balance'] = np.round(polyreg.predict(test_df[['Weight']]))   #.reshape(-1,1)
test_df

In [None]:
df['Weight'].fillna(test_df['Weight'], inplace=True)

In [None]:
sns.scatterplot(train_df['Weight'], train_df['Balance'])
sns.scatterplot(test_df['Weight'], test_df['Balance'])

In [None]:
tackle_cols = ['Standing Tackle', 'Sliding Tackle', 'Interceptions']

# Complete rows are used for fitting.
train_df = df[tackle_cols].dropna()

# Rows to impute: either tackle rating missing (not only 'Sliding Tackle') and
# a known 'Interceptions' value to predict from. `.copy()` because the next
# cell writes predictions into this frame.
needs_tackle = (
    df[['Standing Tackle', 'Sliding Tackle']].isna().any(axis=1)
    & df['Interceptions'].notna()
)
test_df = df.loc[needs_tackle, tackle_cols].copy()

In [None]:
# Fit a degree-2 polynomial regression of each tackle rating on
# 'Interceptions' and fill that rating's missing values in `df`.
for feature in ['Standing Tackle', 'Sliding Tackle']:
    polyreg = make_pipeline(PolynomialFeatures(2), LinearRegression())
    polyreg.fit(X=train_df[['Interceptions']], y=train_df[feature])

    test_df[feature] = np.round(polyreg.predict(test_df[['Interceptions']]))
    # Assign back instead of an inplace fillna on `df[feature]`, which does
    # not update `df` under pandas Copy-on-Write.
    df[feature] = df[feature].fillna(test_df[feature])

In [None]:
# Observed rows (train_df) versus imputed rows (test_df) for each tackle rating.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
fig = plt.figure(figsize=(20, 5))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for count, feature in enumerate(['Standing Tackle', 'Sliding Tackle'], start=1):
    ax = fig.add_subplot(1, 2, count)
    sns.scatterplot(x=train_df['Interceptions'], y=train_df[feature], ax=ax)
    sns.scatterplot(x=test_df['Interceptions'], y=test_df[feature], ax=ax)

In [None]:
# Height: fill gaps with the median of the parsed heights in `data` rather
# than a hard-coded 61.0 (5'1"), so the fill value follows the actual data.
height_inches = pd.to_numeric(data['Height'], errors='coerce')
df['Height'] = height_inches.fillna(height_inches.median())

# Jumping / Strength: fill gaps with the column mean. Results are assigned
# back because an inplace fillna on `df[col]` does not update `df` under
# pandas Copy-on-Write.
df['Jumping'] = df['Jumping'].fillna(df['Jumping'].mean())
df['Strength'] = df['Strength'].fillna(df['Strength'].mean())

In [None]:
# Remaining missing-value count for each numeric column.
df[numerical_features].isna().sum()

In [None]:
# Second regression pass for 'Standing Tackle' only, using the same
# train_df / test_df split as the tackle imputation cell above.
for feature in ['Standing Tackle']:
    polyreg = make_pipeline(PolynomialFeatures(2), LinearRegression())
    polyreg.fit(X=train_df[['Interceptions']], y=train_df[feature])

    test_df[feature] = np.round(polyreg.predict(test_df[['Interceptions']]))
    # Assign back instead of an inplace fillna on `df[feature]`, which does
    # not update `df` under pandas Copy-on-Write.
    df[feature] = df[feature].fillna(test_df[feature])

In [None]:
# Fallback: any 'Standing Tackle' value still missing gets the column mean.
# Assigned back because an inplace fillna on `df[col]` does not update `df`
# under pandas Copy-on-Write.
df['Standing Tackle'] = df['Standing Tackle'].fillna(df['Standing Tackle'].mean())

In [None]:
# Re-check the missing-value count for each numeric column after the fills above.
df[numerical_features].isna().sum()

## Now that all the numerical features have been imputed, let's fill the categorical features

### Let's first visualise the category features distribution.

In [None]:
# Missing-value count for each categorical column.
df[categorical_features].isna().sum()

In [None]:
# Number of players per position. The column is passed as `x=`: a positional
# argument is interpreted as `data` by recent seaborn releases.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x=df['Position'], ax=ax)

In [None]:
# Number of players per body type. The column is passed as `x=`: a positional
# argument is interpreted as `data` by recent seaborn releases.
fig, ax = plt.subplots(figsize=(100, 20))
sns.countplot(x=df['Body Type'], ax=ax)

### Imputing some Categorical variable by viewing the graphs above.

In [None]:
# Fill the categorical gaps with explicit placeholder labels. Results are
# assigned back because an inplace fillna on `df[col]` does not update `df`
# under pandas Copy-on-Write.
df['Club'] = df['Club'].fillna('No Club')
df['Body Type'] = df['Body Type'].fillna('Normal')
df['Position'] = df['Position'].fillna('NA')

### After all imputations let's visualise the Dataset using Heatmap

In [None]:
# Same missing-value map as before the imputations, for a visual before/after check.
fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(df.isna(), yticklabels=False, cmap='YlGnBu', ax=ax)

In [None]:
# Final missing-value count per column of the working frame.
df.isna().sum()

In [None]:
# Write the working frame to fifa21_clean.csv in the current directory, without the index column.
df.to_csv('fifa21_clean.csv',index=False)