In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
import numpy as np
import statistics as stats
import sklearn
import re
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_columns', None)



%matplotlib inline

In [5]:
# Get the csv

df = pd.read_csv('fifa21_train.csv')


In [6]:
# Define a function


def preprocess(data):

    # Fix the weights (as kg)
    Weight = [] 
    for i in data['Weight']:
        Weight.append(str(i).replace('lbs' , ''))

    data['Weight'] = Weight
    data['Weight'] = data['Weight'].astype(np.number)
    data['Weight'] = data['Weight']/2.2046

    
    # Fix the heights (as cm)
    

    cmheight = []
    for item in data['Height']:
        item = str(item).replace('\"', '')
        feetinches = item.split('\'')
        cmheight.append(round(int(feetinches[0]) * 30.48) + int(feetinches[1]) * 2.54)
    data['Height'] = cmheight
        
    # Remove the stars from columns 'IR', 'W/F', and 'SM'
    newvalues = []
    for item in data['IR']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['IR']=newvalues
  
    newvalues = []
    for item in data['W/F']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['W/F']=newvalues
    
    newvalues = []
    for item in data['SM']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['SM']=newvalues
    
  
    # Transform the 'Contract' column to only give the end date of the contract 
    contractend = []
    for item in data['Contract']:
        if 'Free' in item:
            contractend.append('DELETE')
        else:
            item = item.replace(' On Loan', '')
            item = item[-4:]
            contractend.append(int(item))
        
    data['Contract']=contractend

    # Transform the 'loan date end' column to an on loan (yes/no) column
    data['Loan Date End'] = data['Loan Date End'].fillna(0)

    loanyesno = []
    for item in data['Loan Date End']:
        if item == 0:
            loanyesno.append(item)
        else: 
            loanyesno.append(1)

    data['Loan Date End'] = loanyesno
    data = data.rename(columns={'Loan Date End':'On Loan'})

    data['On Loan']
    
    # Remove the plusses and minuses on the last few rows   
    toclean = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']

    superlist = []

    for column in toclean: 
        values = []
        for item in data[column]:
            if item[1] == '-' or item[1] == '+':
                values.append(int(item[0]))
            else: 
                values.append(int(item[:2]))
        superlist.append(values)


    for i in range(len(toclean)):
        data[toclean[i]] = superlist[i]

    # Clean up currency cells by removing € sign and converting 'M' and 'K' to numbers
    
    newvalues1 = []
    for item in data['Value']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item = item * 1000
            newvalues1.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item = item * 1000000      
            newvalues1.append(int(item))
        else: 
            item = int(item)
            newvalues1.append(int(item))
    data['Value']=newvalues1
    
    
    newvalues2 = []
    for item in data['Wage']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues2.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues2.append(int(item))
        else: 
            item = int(item)
            newvalues2.append(int(item))
    data['Wage']=newvalues2
    
    
    newvalues3 = []
    for item in data['Release Clause']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues3.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues3.append(int(item))
        else: 
            item = int(item)
            newvalues3.append(int(item))
    data['Release Clause']=newvalues3
    
    
    # Clean up Hits column (which contains some values with K in them)¶
    newhits = []
    for item in data['Hits']:
        if 'K' in item:
            item = item.replace('K','')
            item = float(item) * 1000
            newhits.append(int(item))
        else: 
            newhits.append(int(item))
    data['Hits'] = newhits

    # Dropping stuff
    
    # Drop columns with irrelevant attributes
    #Some atributes are used to calculate the rating per category, that we already have, so individual attributes are not necessary here

    data = data.drop(['Nationality','Club','Team & Contract', 'Position', 'ID','Growth','Joined', 'Contract','Name','Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
           'Dribbling','Curve','FK Accuracy','Long Passing','Ball Control',
           'Acceleration','Sprint Speed','Agility','Reactions','Balance',
          'Shot Power','Jumping','Stamina','Strength','Long Shots',
          'Aggression','Interceptions','Positioning','Vision','Penalties','Composure',
          'Marking','Standing Tackle','Sliding Tackle',
          'GK Diving','GK Handling','GK Kicking','GK Positioning','GK Reflexes'], axis=1)
    

    #Drop the rows with less than 5% of NaN
    data = data[data['A/W'].isna()==False]
    data = data[data['D/W'].isna()==False]
    
    return data

data = preprocess(df)

In [7]:
# And a function to do the X-Y split

def xysplit(dataframe): 
    
    #Initial split (y will remain unchanged, X will undergo more operations)
    y = dataframe['OVA']
    X = dataframe.drop(['OVA'], axis = 1)
    
    #Split numerical and categorical data
    X_num = X.select_dtypes(np.number)
    X_cat = X.select_dtypes(object)

    #Normalize the numerical data
    transformer = MinMaxScaler().fit(X_num)
    x_normalized = transformer.transform(X_num)
    X_normal = pd.DataFrame(x_normalized, columns=X_num.columns)

    #Encode categorical data
    encoder = OneHotEncoder().fit(X_cat)
    encoded = encoder.transform(X_cat).toarray()

    # And get relevant headers for the encoded categorical data
    headers = []

    for category in encoder.categories_:
        for unit in category: 
            headers.append(unit)

    categ_encoded=pd.DataFrame(encoded, columns=headers)
    
    #Finally, concatenate the (normalized) numerical and (encoded) categorical data
    X = pd.concat([X_normal, categ_encoded], axis = 1)
    
    return X, y

In [8]:
X, y = xysplit(data)

In [None]:


#X/Y split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [13]:
# Build the model 

lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

NameError: name 'X_train' is not defined

In [18]:
# Get the R2 score for the training and test data

from sklearn.metrics import r2_score
predictions = lm.predict(X_train)
print(r2_score(y_train, predictions))

predictions_test = lm.predict(X_test)
print(r2_score(y_test, predictions_test))

0.8990542240860389
0.9034322259280747


In [21]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_train,predictions))
print(mean_squared_error(y_test,predictions_test))

4.730319874813769
4.5643977766745385


0.9034322259280747