In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
import numpy as np
import statistics as stats
import sklearn
import re

pd.set_option('display.max_columns', None)



%matplotlib inline

In [2]:
# Get the csv

df = pd.read_csv('fifa21_train.csv')


In [3]:
# Define a function


def preprocess(data):

    # Fix the weights (as kg)
    Weight = [] 
    for i in data['Weight']:
        Weight.append(str(i).replace('lbs' , ''))

    data['Weight'] = Weight
    data['Weight'] = data['Weight'].astype(np.number)
    data['Weight'] = data['Weight']/2.2046

    
    # Fix the heights (as cm)
    

    cmheight = []
    for item in data['Height']:
        item = str(item).replace('\"', '')
        feetinches = item.split('\'')
        cmheight.append(round(int(feetinches[0]) * 30.48) + int(feetinches[1]) * 2.54)
    data['Height'] = cmheight
        
    # Remove the stars from columns 'IR', 'W/F', and 'SM'
    newvalues = []
    for item in data['IR']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['IR']=newvalues
  
    newvalues = []
    for item in data['W/F']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['W/F']=newvalues
    
    newvalues = []
    for item in data['SM']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['SM']=newvalues
    
  
    # Transform the 'Contract' column to only give the end date of the contract 
    contractend = []
    for item in data['Contract']:
        item = item.replace(' On Loan', '')
        contractend.append(item[-4:])

    data['Contract']=contractend

    # Transform the 'loan date end' column to an on loan (yes/no) column
    data['Loan Date End'] = data['Loan Date End'].fillna(0)

    loanyesno = []
    for item in data['Loan Date End']:
        if item == 0:
            loanyesno.append(item)
        else: 
            loanyesno.append(1)

    data['Loan Date End'] = loanyesno
    data = data.rename(columns={'Loan Date End':'On Loan'})

    data['On Loan']
    
    # Remove the plusses and minuses on the last few rows   
    toclean = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']

    superlist = []

    for column in toclean: 
        values = []
        for item in data[column]:
            if item[1] == '-' or item[1] == '+':
                values.append(int(item[0]))
            else: 
                values.append(int(item[:2]))
        superlist.append(values)


    for i in range(len(toclean)):
        data[toclean[i]] = superlist[i]

    # Clean up currency cells by removing € sign and converting 'M' and 'K' to numbers
    
    column = 'Value'
    newvalues = []
    for item in data[column]:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues.append(int(item))
        else: 
            item = int(item)
            newvalues.append(int(item))
    data[column]=newvalues
    
    
    column = 'Wage'
    newvalues = []
    for item in data[column]:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues.append(int(item))
        else: 
            item = int(item)
            newvalues.append(int(item))
    data[column]=newvalues
    
    
    column = 'Release Clause'
    newvalues = []
    for item in data[column]:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues.append(int(item))
        else: 
            item = int(item)
            newvalues.append(int(item))
    data[column]=newvalues
    
    
    # Clean up Hits column (which contains some values with K in them)¶
    newhits = []
    for item in data['Hits']:
        if 'K' in item:
            item = item.replace('K','')
            item = float(item) * 1000
            newhits.append(int(item))
        else: 
            newhits.append(int(item))
    data['Hits'] = newhits

    # Dropping stuff
    
    # Drop columns with irrelevant attributes
    #Some atributes are used to calculate the rating per category, that we already have, so individual attributes are not necessary here

    data.drop(['Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
           'Dribbling','Curve','FK Accuracy','Long Passing','Ball Control',
           'Acceleration','Sprint Speed','Agility','Reactions','Balance',
          'Shot Power','Jumping','Stamina','Strength','Long Shots',
          'Aggression','Interceptions','Positioning','Vision','Penalties','Composure',
          'Marking','Standing Tackle','Sliding Tackle',
          'GK Diving','GK Handling','GK Kicking','GK Positioning','GK Reflexes'], axis=1)
    

    #Drop the rows with less than 5% of NaN
    data = data[data['Club'].isna()==False]
    data = data[data['Position'].isna()==False]
    data = data[data['Joined'].isna()==False]
    data = data[data['A/W'].isna()==False]
    data = data[data['D/W'].isna()==False]

    
    return data

data = preprocess(df)

In [4]:
data.head()

Unnamed: 0,ID,Name,Age,Nationality,Club,BP,Position,Team & Contract,Height,Weight,foot,Growth,Joined,On Loan,Value,Wage,Release Clause,Contract,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Total Stats,Base Stats,W/F,SM,A/W,D/W,IR,PAC,SHO,PAS,DRI,DEF,PHY,Hits,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,OVA
0,184383,A. Pasche,26,Switzerland,FC Lausanne-Sport,CM,CM CDM,FC Lausanne-Sport 2015 ~ 2020,174.86,73.029121,Right,1,"Jul 1, 2015",0,525000,4000,801000,2020,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,7,12,14,9,6,1682,357,4,2,High,Medium,1,69,51,63,63,51,60,3,58,58,58,61,62,62,62,61,63,63,63,63,63,63,63,63,59,59,59,59,59,58,54,54,54,58,15,64
1,188044,Alan Carvalho,30,China PR,Beijing Sinobo Guoan FC,ST,ST LW LM,"Beijing Sinobo Guoan FC Dec 31, 2020 On Loan",183.0,72.121927,Right,0,"Jan 16, 2015",1,8500000,23000,0,2020,365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,11,7,14,7,16,1961,412,3,4,High,Low,2,83,75,68,82,33,71,44,77,77,77,77,77,77,77,77,76,76,76,76,68,68,68,76,57,53,53,53,57,53,48,48,48,53,18,77
2,184431,S. Giovinco,33,Italy,Al Hilal,CAM,CAM CF,Al Hilal 2019 ~ 2022,162.16,60.782001,Right,0,"Jan 31, 2019",0,9000000,49000,15300000,2022,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,6,3,6,3,3,1925,404,4,4,High,Medium,2,80,77,78,86,27,56,73,73,73,73,80,79,79,79,80,80,80,80,79,74,74,74,79,59,56,56,56,59,53,41,41,41,53,12,80
3,233796,J. Evans,22,Wales,Swansea City,CDM,CDM CM,Swansea City 2016 ~ 2021,177.4,68.946748,Right,13,"Jul 1, 2016",0,275000,4000,694000,2021,242,44,42,58,62,36.0,259,54,41.0,46,57,61,282,54,59,59.0,55,55.0,277,57,60.0,64,58,38,257,61,57.0,31.0,54.0,54,48.0,168,55,58,55.0,42,8,9,6,7,12,1527,329,2,2,Medium,Medium,1,57,44,54,57,57,60,7,50,50,50,51,51,51,51,51,53,53,53,53,56,56,56,53,56,58,58,58,56,57,58,58,58,57,14,59
4,234799,Y. Demoncy,23,France,US Orléans Loiret Football,CDM,CDM CM,US Orléans Loiret Football 2018 ~ 2021,179.94,68.039554,Right,8,"Jul 1, 2018",0,725000,2000,1400000,2021,249,49,37,61,68,34.0,280,64,44.0,45,61,66,324,66,66,61.0,62,69.0,280,61,34.0,81,61,43,294,66,60.0,55.0,64.0,49,58.0,185,58,61,66.0,52,8,9,15,5,15,1664,360,2,3,Low,Medium,1,66,44,60,64,60,66,4,56,56,56,59,59,59,59,59,61,61,61,62,63,63,63,62,64,64,64,64,64,63,61,61,61,63,15,65


Unnamed: 0,ID,Name,Age,Nationality,Club,BP,Position,Team & Contract,Height,Weight,foot,Growth,Joined,Loan Date End,Value,Wage,Release Clause,Contract,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Total Stats,Base Stats,W/F,SM,A/W,D/W,IR,PAC,SHO,PAS,DRI,DEF,PHY,Hits,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,OVA
0,184383,A. Pasche,26,Switzerland,FC Lausanne-Sport,CM,CM CDM,FC Lausanne-Sport 2015 ~ 2020,"5'9""",161lbs,Right,1,"Jul 1, 2015",,€525K,€4K,€801K,2015 ~ 2020,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,7,12,14,9,6,1682,357,4 ★,2★,High,Medium,1 ★,69,51,63,63,51,60,3,58+1,58+1,58+1,61+0,62+0,62+0,62+0,61+0,63+1,63+1,63+1,63+1,63+1,63+1,63+1,63+1,59+1,59+1,59+1,59+1,59+1,58+1,54+1,54+1,54+1,58+1,15+1,64
1,188044,Alan Carvalho,30,China PR,Beijing Sinobo Guoan FC,ST,ST LW LM,"Beijing Sinobo Guoan FC Dec 31, 2020 On Loan","6'0""",159lbs,Right,0,"Jan 16, 2015","Dec 31, 2020",€8.5M,€23K,€0,"Dec 31, 2020 On Loan",365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,11,7,14,7,16,1961,412,3 ★,4★,High,Low,2 ★,83,75,68,82,33,71,44,77+0,77+0,77+0,77+0,77+0,77+0,77+0,77+0,76+1,76+1,76+1,76+1,68+2,68+2,68+2,76+1,57+2,53+2,53+2,53+2,57+2,53+2,48+2,48+2,48+2,53+2,18+2,77
2,184431,S. Giovinco,33,Italy,Al Hilal,CAM,CAM CF,Al Hilal 2019 ~ 2022,"5'4""",134lbs,Right,0,"Jan 31, 2019",,€9M,€49K,€15.3M,2019 ~ 2022,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,6,3,6,3,3,1925,404,4 ★,4★,High,Medium,2 ★,80,77,78,86,27,56,73,73+2,73+2,73+2,80+0,79+0,79+0,79+0,80+0,80+0,80+0,80+0,79+1,74+2,74+2,74+2,79+1,59+2,56+2,56+2,56+2,59+2,53+2,41+2,41+2,41+2,53+2,12+2,80
3,233796,J. Evans,22,Wales,Swansea City,CDM,CDM CM,Swansea City 2016 ~ 2021,"5'10""",152lbs,Right,13,"Jul 1, 2016",,€275K,€4K,€694K,2016 ~ 2021,242,44,42,58,62,36.0,259,54,41.0,46,57,61,282,54,59,59.0,55,55.0,277,57,60.0,64,58,38,257,61,57.0,31.0,54.0,54,48.0,168,55,58,55.0,42,8,9,6,7,12,1527,329,2 ★,2★,Medium,Medium,1 ★,57,44,54,57,57,60,7,50+2,50+2,50+2,51+0,51+0,51+0,51+0,51+0,53+2,53+2,53+2,53+2,56+2,56+2,56+2,53+2,56+2,58+2,58+2,58+2,56+2,57+2,58+2,58+2,58+2,57+2,14+2,59
4,234799,Y. Demoncy,23,France,US Orléans Loiret Football,CDM,CDM CM,US Orléans Loiret Football 2018 ~ 2021,"5'11""",150lbs,Right,8,"Jul 1, 2018",,€725K,€2K,€1.4M,2018 ~ 2021,249,49,37,61,68,34.0,280,64,44.0,45,61,66,324,66,66,61.0,62,69.0,280,61,34.0,81,61,43,294,66,60.0,55.0,64.0,49,58.0,185,58,61,66.0,52,8,9,15,5,15,1664,360,2 ★,3★,Low,Medium,1 ★,66,44,60,64,60,66,4,56+2,56+2,56+2,59+0,59+0,59+0,59+0,59+0,61+2,61+2,61+2,62+2,63+2,63+2,63+2,62+2,64+2,64+2,64+2,64+2,64+2,63+2,61+2,61+2,61+2,63+2,15+2,65
