The aim of this notebook is to:
1. Conduct exploratory data analysis of each player's value, wage, and overall ability
2. Analyze correlation between player's value, player's wage and player's overall ability
3. Predict player's value and player's wage determined by his overall ability using regression analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
%matplotlib inline

In [None]:
# Allow up to 79 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 79

# Load the FIFA 20 player table from the Kaggle input directory.
df = pd.read_csv(
    "../input/fifa-20-complete-player-dataset-for-manager-mode/fifa20_data.csv"
)
df.head()

Add new columns that hold each player's value and wage converted from text to floats

In [None]:
# Convert the textual 'Value' column into plain floats.
#
# Each entry is parsed as: one leading character (dropped, presumably the
# currency symbol -- TODO confirm), a number, and an optional magnitude suffix
# where 'M' means millions and 'K' means thousands. Entries without a suffix
# are taken at face value.
#
# The parsing is vectorized with the pandas `.str` accessor, replacing the
# previous row-by-row loop whose two inner loops iterated over one-element
# lists. Missing entries now stay NaN instead of raising a TypeError.
value_list = df['Value']

# Drop the first character of every entry.
value_text = value_list.str[1:]

# Last character -> multiplier; anything that is not 'M'/'K' scales by 1.
value_scale = value_text.str[-1].map({'M': 1000000, 'K': 1000}).fillna(1)

# Remove the magnitude suffix (if any) and parse the remaining number.
value_number = value_text.str.rstrip('MK').astype(float)

# The result shares df's index, so it aligns row-for-row when inserted into df.
float_value = (value_number * value_scale).rename('F_Value')
float_value.head()

In [None]:
# Add the numeric value column to df at column position 19.
# DataFrame.insert raises ValueError if the column already exists, so drop a
# previous copy first; this keeps the cell safe to re-run.
if 'F_Value' in df.columns:
    df = df.drop(columns='F_Value')
df.insert(19, 'F_Value', float_value)
df.head()

In [None]:
# Convert the textual 'Wage' column into plain floats.
#
# Each entry is parsed as: one leading character (dropped, presumably the
# currency symbol -- TODO confirm), a number, and an optional magnitude suffix
# where 'M' means millions and 'K' means thousands. Entries without a suffix
# are taken at face value.
#
# The parsing is vectorized with the pandas `.str` accessor, replacing the
# previous row-by-row loop whose two inner loops iterated over one-element
# lists. Missing entries now stay NaN instead of raising a TypeError.
wage_list = df['Wage']

# Drop the first character of every entry.
wage_text = wage_list.str[1:]

# Last character -> multiplier; anything that is not 'M'/'K' scales by 1.
wage_scale = wage_text.str[-1].map({'M': 1000000, 'K': 1000}).fillna(1)

# Remove the magnitude suffix (if any) and parse the remaining number.
wage_number = wage_text.str.rstrip('MK').astype(float)

# The result shares df's index, so it aligns row-for-row when inserted into df.
float_wage = (wage_number * wage_scale).rename('F_Wage')
float_wage.head()

In [None]:
# Add the numeric wage column to df at column position 21.
# DataFrame.insert raises ValueError if the column already exists, so drop a
# previous copy first; this keeps the cell safe to re-run.
if 'F_Wage' in df.columns:
    df = df.drop(columns='F_Wage')
df.insert(21, 'F_Wage', float_wage)
df.head(10)

Make a new dataset

In [None]:
# Keep only the columns used by the analysis, in this fixed order.
dfc = df.loc[
    :,
    ['Club', 'Overall', 'Full_Name', 'Country', 'F_Value', 'F_Wage', 'foot'],
]
dfc.head(n=10)

Generating a pairplot to see the shape of the correlation

In [None]:
# Pairwise relationships between the numeric columns of dfc, coloured by foot.
pair_grid = sns.pairplot(dfc, hue='foot', height=4)

# plt.ticklabel_format() only touches the current axes, which would leave all
# but one subplot in scientific notation. Apply the plain style to every
# axes of the grid instead.
for grid_ax in pair_grid.axes.flat:
    grid_ax.ticklabel_format(style='plain')

Generating a heatmap to see the correlation in scale

In [None]:
# Correlation heatmap of the numeric columns of dfc.
plt.figure(figsize=(10,5))

# dfc also holds text columns (Club, Full_Name, Country, foot). Recent pandas
# versions raise on DataFrame.corr() when non-numeric columns are present, so
# restrict the frame to numeric columns explicitly (works on old and new pandas).
numeric_corr = dfc.select_dtypes(include='number').corr()

sns.heatmap(numeric_corr, annot=True)
plt.title("Heatmap Correlation")

Top 10 Most Expensive Clubs

In [None]:
# Total player value and total wage per club.
# aggfunc is given as the string "sum": passing the callable np.sum emits a
# FutureWarning on recent pandas, while the string keeps the same behaviour.
dfr = df.pivot_table(index='Club', values=['F_Value', 'F_Wage'], aggfunc='sum')
dfr.head(10)

In [None]:
# Rank clubs by total player value, highest first, and keep the first ten rows.
clubs_by_value = dfr.sort_values('F_Value', ascending=False)
dft = clubs_by_value.iloc[:10]
dft

In [None]:
# Horizontal bar chart: one bar per club in dft, bar length = total value.
fig_value_bar, ax_value_bar = plt.subplots(figsize=(15, 7))
sns.barplot(
    data=dft,
    x=dft['F_Value'],
    y=dft.index,
    palette='rocket',
    ax=ax_value_bar,
)
ax_value_bar.set_ylabel("Club")
ax_value_bar.set_xlabel("Total Value")
ax_value_bar.set_title("Top 10 Most Expensive Clubs")

Top 10 Most Spending Clubs

In [None]:
# Rank clubs by total wage bill, highest first, and keep the first ten rows.
clubs_by_wage = dfr.sort_values('F_Wage', ascending=False)
dfm = clubs_by_wage.iloc[:10]
dfm

In [None]:
# Horizontal bar chart: one bar per club in dfm, bar length = total wage.
fig_wage_bar, ax_wage_bar = plt.subplots(figsize=(15, 7))
sns.barplot(
    data=dfm,
    x=dfm['F_Wage'],
    y=dfm.index,
    palette='rocket',
    ax=ax_wage_bar,
)
ax_wage_bar.set_ylabel("Club")
ax_wage_bar.set_xlabel("Total Spending")
ax_wage_bar.set_title("Top 10 Most Spending Clubs")

Plot scatterplots to decide which regression method suits the model

In [None]:
# Scatter of overall ability against numeric value, coloured by foot.
fig_value_sc, ax_value_sc = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=df, x="Overall", y="F_Value", hue='foot', ax=ax_value_sc)
ax_value_sc.set_xlabel("Overall Ability")
ax_value_sc.set_ylabel("Player's Value in Euro")
# Plain tick labels instead of scientific notation.
ax_value_sc.ticklabel_format(style='plain')
ax_value_sc.set_title("Overall Ability to Player's Value")
plt.show()

In [None]:
# Scatter of overall ability against numeric wage, coloured by foot.
fig_wage_sc, ax_wage_sc = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=df, x="Overall", y="F_Wage", hue='foot', ax=ax_wage_sc)
ax_wage_sc.set_xlabel("Overall Ability")
ax_wage_sc.set_ylabel("Player's Wage in Euro")
ax_wage_sc.set_title("Overall Ability to Player's Wage")
plt.show()

Because the scatterplots show a curved rather than a linear relationship, polynomial regression is the most suitable regression method.

Importing libraries for polynomial regression analysis

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Defining test and training set for player's value prediction

In [None]:
# Feature (overall ability) and target (numeric value) as 2-D arrays of shape (n, 1).
# Columns are selected by name rather than by position so the selection does
# not silently change if dfc's column order changes.
xv = dfc[['Overall']].values
yv = dfc[['F_Value']].values

# train_test_split returns the splits in the order
# (X_train, X_test, y_train, y_test). The previous unpacking order put X_test
# into yv_train and y_train into xv_test.
xv_train, xv_test, yv_train, yv_test = train_test_split(
    xv, yv, test_size=0.2, random_state=0
)

Setting the formula and plotting the fitted line on the scatterplot

In [None]:
# Fit a degree-8 polynomial regression of player value on overall ability.
# NOTE(review): the model is fit on all of xv/yv; the train/test split is not
# used here, so no held-out evaluation is performed.
poly_regv = PolynomialFeatures(degree=8)
xv_poly = poly_regv.fit_transform(xv)
pol_regv = LinearRegression()
pol_regv.fit(xv_poly, yv)

# Evaluate the fitted curve on the sorted distinct ability values. Plotting a
# line through every (unsorted, duplicated) sample would draw segments back
# and forth across the figure. transform() is used because the expansion is
# already fitted; it must not be re-fit at prediction time.
xv_grid = np.unique(xv).reshape(-1, 1)
yv_curve = pol_regv.predict(poly_regv.transform(xv_grid))

# Plot the samples with the fitted curve on top.
plt.figure(figsize=(8,8))
sns.scatterplot(data=df, x="Overall", y="F_Value", hue='foot')
plt.plot(xv_grid, yv_curve, color='g')
plt.xlabel("Overall Ability")
plt.ylabel("Player's Value in Euro")
plt.title("Player's Value Plot")
plt.ticklabel_format(style='plain')
plt.show()

Defining test and training set for player's wage prediction

In [None]:
# Feature (overall ability) and target (numeric wage) as 2-D arrays of shape (n, 1).
# Columns are selected by name rather than by position so the selection does
# not silently change if dfc's column order changes.
xw = dfc[['Overall']].values
yw = dfc[['F_Wage']].values

# train_test_split returns the splits in the order
# (X_train, X_test, y_train, y_test). The previous unpacking order put X_test
# into yw_train and y_train into xw_test.
xw_train, xw_test, yw_train, yw_test = train_test_split(
    xw, yw, test_size=0.2, random_state=0
)

Setting the formula and plotting the fitted line on the scatterplot

In [None]:
# Fit a degree-7 polynomial regression of player wage on overall ability.
# NOTE(review): the model is fit on all of xw/yw; the train/test split is not
# used here, so no held-out evaluation is performed.
poly_regw = PolynomialFeatures(degree=7)
xw_poly = poly_regw.fit_transform(xw)
pol_regw = LinearRegression()
pol_regw.fit(xw_poly, yw)

# Evaluate the fitted curve on the sorted distinct ability values. Plotting a
# line through every (unsorted, duplicated) sample would draw segments back
# and forth across the figure. transform() is used because the expansion is
# already fitted; it must not be re-fit at prediction time.
xw_grid = np.unique(xw).reshape(-1, 1)
yw_curve = pol_regw.predict(poly_regw.transform(xw_grid))

# Plot the samples with the fitted curve on top.
plt.figure(figsize=(8,8))
sns.scatterplot(data=df, x="Overall", y="F_Wage", hue='foot')
plt.plot(xw_grid, yw_curve, color='g')
plt.xlabel("Overall Ability")
plt.ylabel("Player's Wage in Euro")
plt.title("Player's Wage Plot")
plt.ticklabel_format(style='plain')
plt.show()


Predicting player's value based on his overall ability

In [None]:
# Predicted value for a player with overall ability 92.
# transform() applies the already-fitted polynomial expansion; calling
# fit_transform() here would re-fit the transformer on the query point.
pol_regv.predict(poly_regv.transform([[92]]))

In [None]:
# Predicted wage for a player with overall ability 92.
# transform() applies the already-fitted polynomial expansion; calling
# fit_transform() here would re-fit the transformer on the query point.
pol_regw.predict(poly_regw.transform([[92]]))