# Get Data

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import math
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Change Notebook Settings
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 70)

In [3]:
data = pd.read_csv('fifa21_training.csv')

#Removing Rows with Inactive Players
data.dropna(inplace=True)

#Drop Nationality, Club, and Name Columns
data = data.drop(['Nationality', 'Club', 'Name'], axis = 1)
    
#Drop Players where GK is Default Position
data.drop(data.loc[data['Default Position']=='GK'].index, inplace=True)

#Remove Goalkeepers
data = data.drop(['GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes'], axis =1)

data.head()
#OVA column is our target

Unnamed: 0,Age,Default Position,Height,Weight,Value,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,Total Stats,Base Stats,W/F,SM,AW,DW,IR,PAC,SHO,PAS,DRI,DEF,PHY,OVA
0,26,CM,"5'9""",161lbs,€525K,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,1682,357,4 ★,2★,High,Medium,1 ★,69,51,63,63,51,60,64
1,30,ST,"6'0""",159lbs,€8.5M,365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,1961,412,3 ★,4★,High,Low,2 ★,83,75,68,82,33,71,77
2,33,CAM,"5'4""",134lbs,€9M,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,1925,404,4 ★,4★,High,Medium,2 ★,80,77,78,86,27,56,80
3,22,CDM,"5'10""",152lbs,€275K,242,44,42,58,62,36.0,259,54,41.0,46,57,61,282,54,59,59.0,55,55.0,277,57,60.0,64,58,38,257,61,57.0,31.0,54.0,54,48.0,168,55,58,55.0,42,1527,329,2 ★,2★,Medium,Medium,1 ★,57,44,54,57,57,60,59
4,23,CDM,"5'11""",150lbs,€725K,249,49,37,61,68,34.0,280,64,44.0,45,61,66,324,66,66,61.0,62,69.0,280,61,34.0,81,61,43,294,66,60.0,55.0,64.0,49,58.0,185,58,61,66.0,52,1664,360,2 ★,3★,Low,Medium,1 ★,66,44,60,64,60,66,65


In [4]:
data['Value'] = [x.strip("€") for x in data['Value']]
data['Value'] = [x.strip("M") for x in data['Value']]
data['Value'] = [x.strip("K") for x in data['Value']]
data['Value'] = pd.to_numeric(data['Value'])

# Clean Data

In [5]:
#Define a Function for Cleaning 
def clean(x):
    
    data = x
    
    #Stripping " from the Height row so that it can be split on '
    data['Height'] = data['Height'].str.strip("\"")
    
    #Creating a New Dataframe with the split columns
    new = data['Height'].str.split("\'", n = 1, expand = True)
    
    #Define Foot > Inches Conversion
    def ftin(x):
        return int (x)*12
    
    #Create a New Column in the Main DF with the converted Feet
    data['Inches2'] = list(map(ftin, new[0]))
    
    #Create a New Column in the Main DF with split inches
    data["Inches"]= new[1]
    
    # Sum the two and replace values in Height
    data['Height'] = data[['Inches', 'Inches2']].sum(axis = 1)
    
    #Define Inches to CM Conversion
    def toCM(x):
        return float (x)*2.54
    
    #Convert Corrected Heights into CM
    data['Height'] = list(map(toCM, data['Height']))
    
    #Drop Columns used for Manipulation
    data.drop(['Inches', 'Inches2'], axis=1, inplace=True)
    
    #Create Value Multiply Function

    def valueMultiply(x):
        if (x) >= 100:
            return int(x)*1000
        else:
            return int(x)*1000000
            
    #Apply the Function to the Value Column

    data['Value'] = list(map(valueMultiply, data['Value']))
    
    #Clean Weight Function - Jessie
    
    def cleanWeight(x):
        data = x
        data['Weight'].apply(lambda x: x.strip("lbs"))
        data['Weight'] = [x.strip("lbs") for x in data['Weight']]
    
    cleanWeight(data)
        
    def convert (x):
            x = int(x)/2.2046
            return int(x)
    
    data['Weight'] = list(map(convert, data['Weight']))
    
    return data  

    
    #reindex DataFrame
    data = data.reset_index()
    data = data.drop(['index'], axis=1)
    return data

In [6]:
clean(data)
data['Default Position'].unique()

array(['CM', 'ST', 'CAM', 'CDM', 'LM', 'RB', 'CB', 'RM', 'LB', 'RW',
       'LWB', 'LW', 'RWB', 'CF'], dtype=object)

In [7]:
data.head()

Unnamed: 0,Age,Default Position,Height,Weight,Value,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,Total Stats,Base Stats,W/F,SM,AW,DW,IR,PAC,SHO,PAS,DRI,DEF,PHY,OVA
0,26,CM,152.4,73,525000,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,1682,357,4 ★,2★,High,Medium,1 ★,69,51,63,63,51,60,64
1,30,ST,182.88,72,8000000,365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,1961,412,3 ★,4★,High,Low,2 ★,83,75,68,82,33,71,77
2,33,CAM,152.4,60,9000000,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,1925,404,4 ★,4★,High,Medium,2 ★,80,77,78,86,27,56,80
3,22,CDM,152.4,68,275000,242,44,42,58,62,36.0,259,54,41.0,46,57,61,282,54,59,59.0,55,55.0,277,57,60.0,64,58,38,257,61,57.0,31.0,54.0,54,48.0,168,55,58,55.0,42,1527,329,2 ★,2★,Medium,Medium,1 ★,57,44,54,57,57,60,59
4,23,CDM,152.4,68,725000,249,49,37,61,68,34.0,280,64,44.0,45,61,66,324,66,66,61.0,62,69.0,280,61,34.0,81,61,43,294,66,60.0,55.0,64.0,49,58.0,185,58,61,66.0,52,1664,360,2 ★,3★,Low,Medium,1 ★,66,44,60,64,60,66,65


In [8]:
data.shape

(12104, 55)

In [9]:
#cleaning functions for star ratings
def cleanStar (x):
    if x in ['4 ★','4★']:
        x = '4 Star'
    elif x in ['5 ★','5★']:
        x = '5 Star'
    elif x in ['3 ★','3★']:
        x = '3 Star'
    elif x in ['2 ★','2★']:
        x = '2 Star'
    elif x in ['1 ★','1★']:
        x = '1 Star'
    return x

In [10]:
#apply clean functions
data['SM'] = list(map(cleanStar, data['SM']))
data['IR'] = list(map(cleanStar, data['IR']))
data['W/F'] = list(map(cleanStar, data['W/F']))

In [11]:
data.shape

(12104, 55)

In [12]:
#If needed, standardize header names
cols = []
for i in range(len(data.columns)):
    cols.append(data.columns[i].lower())
data.columns = cols
data.columns = [e.lower().replace(' ','_')for e in data.columns]
data.head()

Unnamed: 0,age,default_position,height,weight,value,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,total_stats,base_stats,w/f,sm,aw,dw,ir,pac,sho,pas,dri,def,phy,ova
0,26,CM,152.4,73,525000,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,1682,357,4 Star,2 Star,High,Medium,1 Star,69,51,63,63,51,60,64
1,30,ST,182.88,72,8000000,365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,1961,412,3 Star,4 Star,High,Low,2 Star,83,75,68,82,33,71,77
2,33,CAM,152.4,60,9000000,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,1925,404,4 Star,4 Star,High,Medium,2 Star,80,77,78,86,27,56,80
3,22,CDM,152.4,68,275000,242,44,42,58,62,36.0,259,54,41.0,46,57,61,282,54,59,59.0,55,55.0,277,57,60.0,64,58,38,257,61,57.0,31.0,54.0,54,48.0,168,55,58,55.0,42,1527,329,2 Star,2 Star,Medium,Medium,1 Star,57,44,54,57,57,60,59
4,23,CDM,152.4,68,725000,249,49,37,61,68,34.0,280,64,44.0,45,61,66,324,66,66,61.0,62,69.0,280,61,34.0,81,61,43,294,66,60.0,55.0,64.0,49,58.0,185,58,61,66.0,52,1664,360,2 Star,3 Star,Low,Medium,1 Star,66,44,60,64,60,66,65


In [13]:
data = data.reset_index()
data = data.drop(['index'], axis=1)

In [14]:
data.shape

(12104, 55)

In [15]:
X_num = data.select_dtypes(include = np.number)
X_cat = data.select_dtypes(include = np.object)

In [16]:
X_num.head()

Unnamed: 0,age,height,weight,value,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,total_stats,base_stats,pac,sho,pas,dri,def,phy,ova
0,26,152.4,73,525000,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,1682,357,69,51,63,63,51,60,64
1,30,182.88,72,8000000,365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,1961,412,83,75,68,82,33,71,77
2,33,152.4,60,9000000,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,1925,404,80,77,78,86,27,56,80
3,22,152.4,68,275000,242,44,42,58,62,36.0,259,54,41.0,46,57,61,282,54,59,59.0,55,55.0,277,57,60.0,64,58,38,257,61,57.0,31.0,54.0,54,48.0,168,55,58,55.0,42,1527,329,57,44,54,57,57,60,59
4,23,152.4,68,725000,249,49,37,61,68,34.0,280,64,44.0,45,61,66,324,66,66,61.0,62,69.0,280,61,34.0,81,61,43,294,66,60.0,55.0,64.0,49,58.0,185,58,61,66.0,52,1664,360,66,44,60,64,60,66,65


In [17]:
X_cat.head()

Unnamed: 0,default_position,w/f,sm,aw,dw,ir
0,CM,4 Star,2 Star,High,Medium,1 Star
1,ST,3 Star,4 Star,High,Low,2 Star
2,CAM,4 Star,4 Star,High,Medium,2 Star
3,CDM,2 Star,2 Star,Medium,Medium,1 Star
4,CDM,2 Star,3 Star,Low,Medium,1 Star


In [18]:
Y = X_num['ova']
X_num = X_num.drop(['ova'], axis=1)

In [19]:
X_num.head()

Unnamed: 0,age,height,weight,value,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,total_stats,base_stats,pac,sho,pas,dri,def,phy
0,26,152.4,73,525000,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,1682,357,69,51,63,63,51,60
1,30,182.88,72,8000000,365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,1961,412,83,75,68,82,33,71
2,33,152.4,60,9000000,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,1925,404,80,77,78,86,27,56
3,22,152.4,68,275000,242,44,42,58,62,36.0,259,54,41.0,46,57,61,282,54,59,59.0,55,55.0,277,57,60.0,64,58,38,257,61,57.0,31.0,54.0,54,48.0,168,55,58,55.0,42,1527,329,57,44,54,57,57,60
4,23,152.4,68,725000,249,49,37,61,68,34.0,280,64,44.0,45,61,66,324,66,66,61.0,62,69.0,280,61,34.0,81,61,43,294,66,60.0,55.0,64.0,49,58.0,185,58,61,66.0,52,1664,360,66,44,60,64,60,66


In [20]:
#Create normalized dataframe

from sklearn.preprocessing import Normalizer
transformer = Normalizer().fit(X_num)
x_normalized = transformer.transform(X_num)
print(x_normalized.shape)

(12104, 48)


In [21]:
#Create standardized dataframe

from sklearn.preprocessing import StandardScaler
transformer = StandardScaler().fit(X_num)
x_standardized = transformer.transform(X_num)
print(x_standardized.shape)

(12104, 48)


In [22]:
#Encoding the Dummies using OneHot
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(X_cat)
encoded = encoder.transform(X_cat).toarray()
encoded.shape

(12104, 28)

In [23]:
#
X = np.concatenate((x_normalized, encoded), axis=1)
pd.DataFrame(X).shape


(12104, 76)

In [24]:
Y.shape

(12104,)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

In [26]:
#Perform a regression
lm = linear_model.LinearRegression()
model = lm.fit(X_train,y_train)
predictions  = lm.predict(X_test)

#Calculate
mse = mean_squared_error(y_test, predictions) 
rmse = math.sqrt(mse) 
r2 = r2_score(y_test, predictions) #r^2
n = len(X_test) 
p = X_test.shape[1]
adj_r2 = 1-((1-r2)*(n-1)/(n-p-1))

#Print

print("\nTest Results:\n")
print("\nMSE: %s\n" % mse)
print("\nRMSE: %s\n" % rmse)
print("\nR2: %s\n" % r2)
print("\nAdj. R2: %s\n" % adj_r2)


Test Results:


MSE: 14.679155217867262


RMSE: 3.831338567376585


R2: 0.6783367501393713


Adj. R2: 0.6714601237007194

