In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px

In [None]:
fifa21 = pd.read_csv('../input/fifa-21-complete-player-dataset/fifa21_male2.csv')

# Data Preprocessing

## Data Cleaning

In [None]:
# Share of missing cells per column, as a percentage rounded to two decimals.
null_share = fifa21.isnull().sum().mul(100).div(len(fifa21)).round(2)

missing = pd.DataFrame({'column': fifa21.columns, 'percent': null_share.to_numpy()})
# Keep only columns that actually have gaps, smallest share first.
missing = missing.loc[missing['percent'] > 0].sort_values('percent')

fig = px.bar(
    missing,
    x='percent',
    y="column",
    orientation='h',
    title='Percent of Missing Values in each Column',
    height=1300,
    width=800,
)
fig.show()


### Dropping problematic columns

In [None]:
# Remove the four columns judged problematic; rebinding keeps the step explicit.
fifa21 = fifa21.drop(columns=['Loan Date End', 'Joined', 'Hits', 'Gender'])

In [None]:
fifa21.head()

### Looks like we have Object Types...

In [None]:
print(fifa21.Value.dtype)
print(fifa21.Wage.dtype)
print(fifa21['Release Clause'].dtype)
print(fifa21.Height.dtype)
print(fifa21.Weight.dtype)

### We must omit the Euro symbol and multiply the number in the object by either 1000 for 'K' or 1000000 for 'M'
### Height and Weight will be formatted as well

In [None]:
fifa21.Value.head(),fifa21.Wage.head(),fifa21['Release Clause'].head()

In [None]:
fifa21['Weight'].head(),fifa21['Height'].head()

## Formatting Object Types

In [None]:
# Formatting the money columns (used for both Value and Wage)
def format_money(column, frame=None):
    """Convert a euro-formatted money column (e.g. '€1.5M', '€500K') to numbers.

    Parameters
    ----------
    column : str
        Name of the column to convert.
    frame : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``fifa21`` so existing calls keep working.

    Returns
    -------
    list
        One number per row. A trailing 'M' multiplies by 1,000,000 and a
        trailing 'K' by 1,000; strings with any other ending, empty strings
        and missing cells become 0. Cells that are already numeric are passed
        through as floats, so re-running the conversion cell is harmless.
    """
    source = fifa21 if frame is None else frame
    multipliers = {'M': 1000000, 'K': 1000}
    values = []
    for value in source[column]:
        if isinstance(value, str):
            # value[-1:] (not value[-1]) so an empty string cannot raise IndexError.
            scale = multipliers.get(value[-1:])
            # value[1:-1] strips the leading currency symbol and the suffix.
            money = scale * float(value[1:-1]) if scale else 0
        elif pd.isna(value):
            # Missing cells arrive as NaN/None; indexing them would raise TypeError.
            money = 0
        else:
            money = float(value)
        values.append(money)
    return values

# Formatting the Weight column
def format_weight():
    """Return ``fifa21['Weight']`` as a list of integers.

    The last three characters of every non-missing entry are cut off and the
    remainder is parsed with ``int``; missing entries become ``np.nan``.
    """
    return [
        int(entry[:-3]) if entry != '' else np.nan
        for entry in fifa21['Weight'].fillna('')
    ]

# Formatting the Height column
def format_height(frame=None):
    """Convert feet-and-inches height strings to total inches.

    The original parser read a single character for the inches, so two-digit
    inch values such as 5'10" were converted to 61 instead of 70. This version
    splits on the apostrophe and parses the whole inch part.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame holding the ``'Height'`` column. Defaults to the notebook-level
        ``fifa21`` so existing calls keep working.

    Returns
    -------
    list
        Total inches (``int``) per row, ``np.nan`` for missing or empty cells.
        Cells that are already numeric are returned unchanged.
    """
    # NOTE(review): assumes the raw format is <feet>'<inches>" — confirm against the CSV.
    source = fifa21 if frame is None else frame
    heights = []
    for height in source['Height']:
        if isinstance(height, str):
            text = height.strip().rstrip('"')
            if not text:
                heights.append(np.nan)
                continue
            feet, _, inches = text.partition("'")
            heights.append(int(feet) * 12 + int(inches or 0))
        elif pd.isna(height):
            heights.append(np.nan)
        else:
            # Already converted (e.g. the cell was run twice): keep the value.
            heights.append(height)
    return heights

# Formatting the Release Clause column

def format_release_clause():
    """Return ``fifa21['Release Clause']`` as a list of numbers.

    Missing entries become 0.0. Entries ending in 'M' are scaled by 1,000,000
    and entries ending in 'K' by 1,000, after dropping the first and last
    character; anything else becomes 0.
    """
    multipliers = {'M': 1000000, 'K': 1000}

    def to_amount(text):
        # Missing clauses were replaced by '' before iteration.
        if text == '':
            return 0.0
        scale = multipliers.get(text[-1])
        if scale is None:
            return 0
        return scale * float(text[1:-1])

    return [to_amount(clause) for clause in fifa21['Release Clause'].fillna('')]

In [None]:
# Overwrite the raw text columns with the numeric lists produced by the
# formatting helpers. The originals are replaced in place, so run this cell
# once on freshly loaded data.
fifa21['Value'] =  format_money('Value')
fifa21['Wage'] = format_money('Wage')
fifa21['Weight'] = format_weight()
fifa21['Height'] = format_height()
fifa21['Release Clause'] = format_release_clause()

### Looks like we have 0s in place of some 'Wage' and 'Value' observations. I cross-checked this against sofifa.com, and it turns out the website simply doesn't have data on those players yet. Essentially, every 0 in 'Value' coincides with a 0 in 'Wage', so we could just drop the observations where Wage = 0, but the following is another way to drop them using numpy

In [None]:
(fifa21).describe()

### Removing zeroes in wage,release clause, and value columns

In [None]:
# Rows where any of the three money columns is zero.
has_zero_money = (
    (fifa21['Value'] == 0)
    | (fifa21['Wage'] == 0)
    | (fifa21['Release Clause'] == 0)
)
# Sorted, de-duplicated index labels of those rows.
dropdex = np.unique(fifa21.index[has_zero_money])

In [None]:
# Drop the collected rows by index label.
fifa21 = fifa21.drop(index=dropdex)

In [None]:
print(sum(fifa21.Value==0),sum(fifa21.Wage==0),sum(fifa21['Release Clause']==0))

In [None]:
fifa21.shape

### We lost ~4500 observations in the cleaning process... It's okay, though - our data is still just as comprehensive

# Feature Engineering

## General Position 

#### In FIFA, there are 24 different positions... Our goal here is to create a new variable in the data denoting each player's general position.

#### The Four General Positions are:
- Forward
- Midfielder
- Defender
- Goalkeeper

In [None]:
# Best-position ('BP') codes grouped by the general role they are mapped to.
forward_codes = ['RF', 'ST', 'LW', 'LF', 'RS', 'LS', 'RW', 'CF']
midfield_codes = ['RCM', 'LCM', 'LDM', 'CAM', 'CDM', 'RM', 'LAM', 'LM', 'RDM', 'CM', 'RAM']
defender_codes = ['RCB', 'CB', 'LCB', 'LB', 'RB', 'LWB', 'RWB']

best_position = fifa21['BP']
conditions = [
    best_position.isin(forward_codes),
    best_position.isin(midfield_codes),
    best_position.isin(defender_codes),
    best_position == 'GK',
]

choices = ['FW','MID','DEF',"GK"]
# Rows matching none of the groups get the placeholder 'N/A'.
fifa21['GPOS'] = np.select(conditions, choices, default='N/A')
fifa21['GPOS']

 ## Encoding GPOS

In [None]:
from sklearn import preprocessing

# Replace the text labels in 'GPOS' with integer codes. The fitted encoder is
# kept in `label_encoder` so the codes can be mapped back to labels later.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(fifa21['GPOS'])
fifa21['GPOS'] = label_encoder.transform(fifa21['GPOS'])

fifa21['GPOS'].unique()

In [None]:
#Inverse the encoding
label_encoder.inverse_transform(fifa21.GPOS)

## BMI
### Since we have player Height and Weight, why not calculate their BMI? Perhaps it can be a helpful variable in the future...

In [None]:
import math

In [None]:
# BMI = mass / height squared, after scaling Weight by 0.453592 and Height by 2.54 / 100.
weight_kg = fifa21['Weight'] * 0.453592
height_m = (fifa21['Height'] * 2.54) / 100
fifa21['BMI'] = weight_kg / pow(height_m, 2)

# Modeling

## Feature Selection

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
df = fifa21.copy()

In [None]:
categorical = [
    'Nationality', 
    'Club', 
    'foot',
    'A/W',
    'D/W',
    'W/F',
    'BP',
    'SM',
    'IR'
]

In [None]:
# Integer-encode every categorical column; missing values are first replaced
# by the string '0' so they form their own category.
for column_name in categorical:
    filled = df[column_name].fillna('0')
    le = preprocessing.LabelEncoder()
    df[column_name] = le.fit_transform(filled)

In [None]:
# Columns removed from `df` before it is used as the model's feature matrix.
# Note that most of the label-encoded categorical columns (Nationality, Club,
# foot, A/W, D/W, W/F, BP) are listed here as well, so of the `categorical`
# list only 'SM' and 'IR' remain afterwards. The commented-out entries
# ('Growth', 'PAC') are deliberately kept as features.
drop = [
    'ID', 'Name', 'Player Photo' 
     ,'POT', 'Club Logo', 'Flag Photo' 
      ,'Team & Contract','Contract' , 'Position', 'BOV'
    
    ,'Release Clause'
    ,'Wage' 
    #,'Growth'
    ,'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF' 
    ,'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM' 
    ,'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB','GK'
    
    #,'PAC' 
    ,'SHO', 'PAS', 'DRI', 'DEF', 'PHY'
    , 'Total Stats','Base Stats' 
    ,'Skill'
    ,'GPOS'
    , 'BMI'
,'Nationality'
,'Club'
,'BP'
,'Height'
,'Weight'
,'foot'
,'Balance'
,'Jumping'
,'Strength'
,'Interceptions'
,'Defending'
,'Marking'
,'Standing Tackle'
,'Sliding Tackle'
,'Goalkeeping'
,'GK Diving'
,'GK Handling'
,'GK Kicking'
,'GK Positioning'
,'GK Reflexes'
,'W/F'
,'A/W'
,'D/W'
]

# Raises KeyError if any listed column is absent (e.g. when the cell is re-run).
df = df.drop(drop, axis=1)

# PCA & Clustering

In [None]:
# Attribute columns used for PCA / clustering, plus the encoded general
# position ('GPOS') kept for comparing clusters against positions afterwards.
# .copy() makes this an independent frame: later cells add columns to it, which
# on a plain slice of fifa21 triggers SettingWithCopyWarning.
fifacluster=fifa21[["Attacking","Crossing","Finishing"
,"Heading Accuracy","Short Passing","Volleys"
,"Skill","Dribbling","Curve"
,"FK Accuracy","Long Passing","Ball Control"
,"Movement","Acceleration","Sprint Speed"
,"Agility","Reactions","Balance"
,"Power","Shot Power","Jumping"
,"Stamina","Strength","Long Shots"
,"Mentality","Aggression","Interceptions"
,"Positioning","Vision","Penalties"
,"Composure","Defending","Marking"
,"Standing Tackle","Sliding Tackle","Goalkeeping"
,"GK Diving","GK Handling","GK Kicking"
,"GK Positioning","GK Reflexes","GPOS"]].copy()
fifacluster.head()

In [None]:
from sklearn.decomposition import PCA

# Project every column except the position label onto two principal components.
cluster_features = fifacluster.drop(columns=['GPOS'])
pca = PCA(n_components=2)
principalComponents1 = pca.fit_transform(cluster_features)

In [None]:
# Creating a dataframe featuring the two Principal components that we acquired through PCA.
PCA_dataset1 = pd.DataFrame(data = principalComponents1, columns = ['component1', 'component2'] )
PCA_dataset1.head()

In [None]:
principal_component1 = PCA_dataset1['component1']
principal_component2 = PCA_dataset1['component2']

In [None]:
# Visualizing PCA in 2 Dimensions.
# Scatter of the two principal components. A single figure is created; the
# original opened an extra, empty figure before the sized one.
pca_fig, pca_ax = plt.subplots(figsize=(10, 10))
pca_ax.scatter(PCA_dataset1['component1'], PCA_dataset1['component2'])
pca_ax.set_xlabel('Component 1')
pca_ax.set_ylabel('Component 2')
pca_ax.set_title('Two Dimensional PCA')
plt.show()

#### Interestingly, we see a large piece of scatter (to the left) which looks to be made up of 2 separate clusters, and a smaller, distant piece of scatter (to the right)
#### It would be interesting to see how these attributes cluster...

### How many Clusters do we need?

In [None]:
from yellowbrick.cluster import KElbowVisualizer

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 4, init = 'k-means++', random_state = 1)
y_kmeans = kmeans.fit_predict(principalComponents1)

In [None]:
# Elbow plot over k = 1..9 on the raw attribute columns.
# random_state is fixed so the curve is the same on every Restart & Run All;
# the seed matches the one used for the 4-cluster KMeans fit.
model = KMeans(random_state=1)
fig = KElbowVisualizer(model,k=(1,10))
# Exclude the label column; 'BMI' and 'eGPOS' are filtered defensively and are
# simply ignored when absent.
fig.fit(fifacluster[fifacluster.columns[~fifacluster.columns.isin(['GPOS','BMI','eGPOS'])]])
fig.show();

In [None]:
# Attach the PCA coordinates and cluster labels positionally (row i of the
# PCA output belongs to row i of fifacluster). The previous version assigned
# pandas Series carrying a fresh 0..n-1 index; because rows were dropped from
# fifa21 earlier without resetting the index, index alignment matched the wrong
# players and left NaN for labels that no longer exist.
fifacluster = fifacluster.assign(**{
    'principal component 1': principalComponents1[:, 0],
    'principal component 2': principalComponents1[:, 1],
    'cluster': y_kmeans,
})

## Visualizing Clusters

In [None]:
from matplotlib import colors as mcolors

In [None]:
plt.figure(figsize=(10,7))
plt.scatter(principalComponents1[y_kmeans == 0, 0], principalComponents1[y_kmeans == 0, 1], s = 100, c = 'blue', label = 'Cluster 1')
plt.scatter(principalComponents1[y_kmeans == 1, 0], principalComponents1[y_kmeans == 1, 1], s = 100, c = 'red', label = 'Cluster 2')
plt.scatter(principalComponents1[y_kmeans == 2, 0], principalComponents1[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Cluster 3')
plt.scatter(principalComponents1[y_kmeans == 3, 0], principalComponents1[y_kmeans == 3, 1], s = 100, c = 'cyan', label = 'Cluster 4')
#plt.scatter(principalComponents1[y_kmeans == 4, 0], principalComponents1[y_kmeans == 4, 1], s = 100, c = 'magenta', label = 'Cluster 5')
#plt.scatter(principalComponents1[y_kmeans == 5, 0], principalComponents1[y_kmeans == 5, 1], s = 100, c = 'limegreen', label = 'Cluster 6')
#plt.scatter(principalComponents1[y_kmeans == 6, 0], principalComponents1[y_kmeans == 6, 1], s = 100, c = 'silver', label = 'Cluster 7')
#plt.scatter(principalComponents1[y_kmeans == 7, 0], principalComponents1[y_kmeans == 7, 1], s = 100, c = 'rosybrown', label = 'Cluster 8')
#plt.scatter(principalComponents1[y_kmeans == 8, 0], principalComponents1[y_kmeans == 8, 1], s = 100, c = 'mediumpurple', label = 'Cluster 9')

plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 25, c = 'yellow', label = 'Centroids')
plt.title('Clusters of Players')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

In [None]:
fifacluster.groupby("GPOS")["cluster"].value_counts().plot(kind="barh",color="teal",figsize=(14,6))
plt.ylabel('General Positions')
plt.xlabel('Count')
plt.title(' Count of Positions in Each Cluster')
plt.show()

In [None]:
pd.crosstab( index=fifacluster['cluster'],columns=fifacluster['GPOS'])

### It seems as if our clusters form generally/mostly around each player position. Given the attributes fed to the model, and the discerning of the four clusters, we can confidently say that players along the borders of the three neighboring clusters are position-fluid - this means that we can potentially try to find 'well-rounded' players along that general area, and perhaps between the centroids of the three clusters.

### It would even make sense to continue to cluster to try to pinpoint these groups of players

# Normalizing 'Value'

In [None]:
# Keep the raw target for plotting, model on log(1 + Value), and remove the
# target column from the feature frame.
original_target = df['Value']
target = np.log1p(original_target)
df = df.drop(columns=['Value'])

### 'Value' is not normally distributed; it is skewed to the right

In [None]:
plt.hist(original_target,color='green')
plt.title('Distrubution of Value')
plt.show()

### However, 'Value' is log normal.

In [None]:
plt.hist(target,color='green')
plt.title('Distrubution of Log Normalized Value')
plt.show()

# Linear Regression

## Train/Test Split

In [None]:
scaler = MinMaxScaler()

X = df.copy()
y = target.copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=99, shuffle=True
)

# Learn the min/max from the training data only ...
X_train = scaler.fit_transform(X_train)

# ... and apply that same mapping to the test data. Calling fit_transform here
# would refit the scaler on the test set, so train and test would be scaled
# differently and test information would leak into preprocessing.
X_test = scaler.transform(X_test)

In [None]:
# Fit an ordinary least-squares model on the scaled training features; the
# target is log1p(Value).
reg = LinearRegression()

reg.fit(X_train,y_train)

# In-sample predictions (log scale).
pred = reg.predict(X_train)

# Cross-validation on the full, unscaled X / y.
# NOTE(review): `scores` is not used anywhere else in the visible notebook.
scores = cross_val_score(reg,X,y)
 
# Held-out predictions (log scale).
pred_1 = reg.predict(X_test)

# expm1 inverts the log1p applied to the target, giving values on the original scale.
respred=(np.expm1(pred_1))
restrue=(np.expm1(y_test))

#Defining MAPE function
def MAPE(Y_actual,Y_Predicted):
    """Return the mean absolute percentage error of predictions against actuals.

    Computed as the mean of |(actual - predicted) / actual|, multiplied by 100.
    """
    relative_error = (Y_actual - Y_Predicted) / Y_actual
    return np.mean(np.abs(relative_error)) * 100
# MAPE on the original (back-transformed) scale, the same scale used for RMSE
# and MAE below. The previous call passed the log-scale y_test / pred_1.
LR_MAPE = MAPE(restrue, respred)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n and p taken from
# the actual test matrix instead of hard-coded counts.
n_obs, n_feat = X_test.shape
train_r2 = reg.score(X_train, y_train)
test_r2 = reg.score(X_test, y_test)
adj_r2 = 1 - (1 - test_r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print('MAPE:', '{:0.3f}'.format(LR_MAPE)+'%')
print('Train r2 Score:','{:0.3f}'.format(train_r2*100)+'%')
print('Test r2 Score:','{:0.3f}'.format(test_r2*100)+'%')
print('Adjusted r2 Score:','{:0.3f}'.format(adj_r2*100)+'%')
# np.sqrt(MSE) instead of squared=False, which newer scikit-learn no longer accepts.
print('RMSE:', np.sqrt(mean_squared_error(restrue, respred)))
print('MAE:', mean_absolute_error(restrue, respred))

respred=np.array(np.expm1(pred_1)).reshape(-1,1)
restrue=np.array(np.expm1(y_test)).reshape(-1,1)
# Cross-validate the model on features -> target. The previous call regressed
# the predictions on the actual values, which does not evaluate the model.
print('Cross Validation Score:','{:0.3f}'.format(cross_val_score(reg,X_train,y_train,cv=2).mean()*100)+'%')

In [None]:
plt.figure(figsize=(10,8))
sns.regplot(x=y_test, y=pred_1)
plt.xlabel('Actual Values ')
plt.ylabel('Predictions ')
plt.title('Linear Model Predictions vs Actual')
plt.show()

# Random Forest

In [None]:
# Seeded so the fitted forest, its scores and the feature importances are
# reproducible across runs (same seed as the train/test split).
random = RandomForestRegressor(random_state=99)

random.fit(X_train, y_train)

# Held-out predictions on the log1p(Value) scale.
y_pred = random.predict(X_test)

In [None]:
# Back-transform predictions and actuals from log1p to the original scale.
y_pred1=np.array(np.expm1(y_pred)).reshape(-1,1)
y_test1=np.array(np.expm1(y_test)).reshape(-1,1)

# MAPE on the original scale, matching the RMSE / MAE reported for this model
# (previously it was computed on the log-scale values).
RF_MAPE = MAPE(y_test1, y_pred1)

# Cross-validate the forest on features -> target. The previous call fitted
# the actual values against the predictions, which does not evaluate the model.
cvs = cross_val_score(random, X_train, y_train).mean()

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n and p read from
# the test matrix rather than the hard-coded 4188 / 35.
n_obs, n_feat = X_test.shape
rf_test_r2 = random.score(X_test, y_test)
rf_adj_r2 = 1 - (1 - rf_test_r2) * (n_obs - 1) / (n_obs - n_feat - 1)

print('Train r2 Score:',random.score(X_train, y_train))
print('Test r2 Score:',rf_test_r2)
print('Adjusted r2 Score:',rf_adj_r2)
# np.sqrt(MSE) instead of squared=False, which newer scikit-learn no longer accepts.
print('RMSE:', np.sqrt(mean_squared_error(y_true=y_test1, y_pred=y_pred1)))
print('MAE:', mean_absolute_error(y_test1, y_pred1))
print('MAPE:', RF_MAPE)
print('Cross Validation Score:',cvs)

In [None]:
plt.figure(figsize=(10,7))

sns.regplot(x=y_test, y=y_pred)
plt.xlabel('Actual Values ')
plt.ylabel('Predictions ')
plt.title('Random Forest Predictions vs Actual')
plt.show()


# Feature Importance

In [None]:
importances = random.feature_importances_
importances

In [None]:
features = df.columns
imp = pd.DataFrame({'Features': features, 'Importance': importances})
imp.sort_values(by='Importance',ascending=False)