In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
import numpy as np
import statistics as stats
import sklearn
import re
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

pd.set_option('display.max_columns', None)



%matplotlib inline

In [2]:
# Get the csv

df = pd.read_csv('fifa21_train.csv')


In [3]:
# Define a function


def preprocess(data):

    # Fix the weights (as kg)
    Weight = [] 
    for i in data['Weight']:
        Weight.append(str(i).replace('lbs' , ''))

    data['Weight'] = Weight
    data['Weight'] = data['Weight'].astype(np.number)
    data['Weight'] = data['Weight']/2.2046

    
    # Fix the heights (as cm)
    

    cmheight = []
    for item in data['Height']:
        item = str(item).replace('\"', '')
        feetinches = item.split('\'')
        cmheight.append(round(int(feetinches[0]) * 30.48) + int(feetinches[1]) * 2.54)
    data['Height'] = cmheight
        
    # Remove the stars from columns 'IR', 'W/F', and 'SM'
    newvalues = []
    for item in data['IR']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['IR']=newvalues
  
    newvalues = []
    for item in data['W/F']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['W/F']=newvalues
    
    newvalues = []
    for item in data['SM']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['SM']=newvalues
    
  
    # Transform the 'Contract' column to only give the end date of the contract 
#     contractend = []
#     for item in data['Contract']:
#         if 'Free' in item:
#             contractend.append('DELETE')
#         else:
#             item = item.replace(' On Loan', '')
#             item = item[-4:]
#             contractend.append(int(item))
        
#     data['Contract']=contractend

    # Transform the 'loan date end' column to an on loan (yes/no) column
    data['Loan Date End'] = data['Loan Date End'].fillna(0)

    loanyesno = []
    for item in data['Loan Date End']:
        if item == 0:
            loanyesno.append(item)
        else: 
            loanyesno.append(1)

    data['Loan Date End'] = loanyesno
    data = data.rename(columns={'Loan Date End':'On Loan'})

    data['On Loan']
    
    # Remove the plusses and minuses on the last few rows   
    toclean = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']

    superlist = []

    for column in toclean: 
        values = []
        for item in data[column]:
            if item[1] == '-' or item[1] == '+':
                values.append(int(item[0]))
            else: 
                values.append(int(item[:2]))
        superlist.append(values)


    for i in range(len(toclean)):
        data[toclean[i]] = superlist[i]

    # Clean up currency cells by removing € sign and converting 'M' and 'K' to numbers
    
    newvalues1 = []
    for item in data['Value']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item = item * 1000
            newvalues1.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item = item * 1000000      
            newvalues1.append(int(item))
        else: 
            item = int(item)
            newvalues1.append(int(item))
    data['Value']=newvalues1
    
    
    newvalues2 = []
    for item in data['Wage']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues2.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues2.append(int(item))
        else: 
            item = int(item)
            newvalues2.append(int(item))
    data['Wage']=newvalues2
    
    
    newvalues3 = []
    for item in data['Release Clause']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues3.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues3.append(int(item))
        else: 
            item = int(item)
            newvalues3.append(int(item))
    data['Release Clause']=newvalues3
    
    
    # Clean up Hits column (which contains some values with K in them)¶
    newhits = []
    for item in data['Hits']:
        if 'K' in item:
            item = item.replace('K','')
            item = float(item) * 1000
            newhits.append(int(item))
        else: 
            newhits.append(int(item))
    data['Hits'] = newhits

    # Dropping stuff
    
    # Drop columns with irrelevant attributes
    #Some atributes are used to calculate the rating per category, that we already have, so individual attributes are not necessary here

    # Different versions for commenting out and experimenting
    
#     # First, a version with only the summed up columns (Attacking etc)
#     data = data.drop(['Nationality','Club','Team & Contract', 'Position', 'ID','Growth','Joined', 'Contract','Name','Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
#            'Dribbling','Curve','FK Accuracy','Long Passing','Ball Control',
#            'Acceleration','Sprint Speed','Agility','Reactions','Balance',
#           'Shot Power','Jumping','Stamina','Strength','Long Shots',
#           'Aggression','Interceptions','Positioning','Vision','Penalties','Composure',
#           'Marking','Standing Tackle','Sliding Tackle',
#           'GK Diving','GK Handling','GK Kicking','GK Positioning','GK Reflexes'], axis=1)
    
    # Then, a version with only the component columns
    data = data.drop(['Nationality','Club','Team & Contract', 'Position', 'ID','Growth','Joined', 'Contract','Name','Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
           'Attacking', 'Skill', 'Movement', 'Power', 'Mentality', 'Defending', 'Defending'], axis=1)
    
    
    
    #data = data.drop(['Age', 'Height', 'Weight', 'On Loan', 'Value', 'Wage', 'Release Clause'], axis=1)
    #data = data.drop(['Value', 'Wage', 'Release Clause'], axis=1)
    #data = data.drop(['Age', 'Height', 'Weight'], axis=1)

    for column in data.columns: 
        data = data[data[column].isna()==False]

    #Drop the rows with less than 5% of NaN
#     data = data[data['A/W'].isna()==False]
#     data = data[data['D/W'].isna()==False]
    
    return data

data = preprocess(df)

In [6]:
# And a function to do the X-Y split
y = data['OVA']
X = data.drop(['OVA'], axis = 1)

#Split numerical and categorical data
X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object)


transformer = MinMaxScaler().fit(X_num)
# encoder = OneHotEncoder().fit(X_cat)


def xysplit(dataframe): 
    
    #Initial split (y will remain unchanged, X will undergo more operations)
    y = dataframe['OVA']
    X = dataframe.drop(['OVA'], axis = 1)
    
    #Split numerical and categorical data
    X_num = X.select_dtypes(np.number)
    X_cat = X.select_dtypes(object)

    #Normalize the numerical data
    x_normalized = transformer.transform(X_num)
    X_normal = pd.DataFrame(x_normalized, columns=X_num.columns)

    #Encode categorical data
    encoder = OneHotEncoder().fit(X_cat)

    encoded = encoder.transform(X_cat).toarray()

    # And get relevant headers for the encoded categorical data
    headers = []

    for category in encoder.categories_:
        for unit in category: 
            headers.append(unit)

    categ_encoded=pd.DataFrame(encoded, columns=headers)
    
    #Finally, concatenate the (normalized) numerical and (encoded) categorical data
    X = pd.concat([X_normal, categ_encoded], axis = 1)
    
    return X, y

In [7]:
X, y = xysplit(data)

In [8]:
#Test train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

In [9]:
# Build the model 

lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

In [10]:
# A function to generate scores to evaluate

def scores(y_data, predictions):
    print("r2 score:",r2_score(y_data, predictions))
    print("MSE score:",mean_squared_error(y_data, predictions))
    print("RMSE score:",np.sqrt(mean_squared_error(y_data, predictions)))
    print("MAE score:", mean_absolute_error(y_data, predictions))


In [11]:
# Get the R2 score for the training and test data

predictions = lm.predict(X_train)
predictions_test = lm.predict(X_test)

print("Training data")
scores(y_train, predictions)

print("Test data")
scores(y_test, predictions_test)


Training data
r2 score: 0.9204477752628422
MSE score: 3.7003128809102765
RMSE score: 1.923619733967781
MAE score: 1.490929949997264
Test data
r2 score: 0.9229749236306197
MSE score: 3.7086299119945165
RMSE score: 1.9257803384588068
MAE score: 1.512179465262582


# Validate new data

In [12]:
df2 = pd.read_csv('fifa21_validate.csv')

In [13]:
data2 = preprocess(df2)

In [14]:
# y2 here gives a meaningless value, but we'll ignore that and keep working with the original y

X2, y2 = xysplit(data2)

In [15]:
X

Unnamed: 0,Age,Height,Weight,On Loan,Value,Wage,Release Clause,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Acceleration,Sprint Speed,Agility,Reactions,Balance,Shot Power,Jumping,Stamina,Strength,Long Shots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Total Stats,Base Stats,W/F,SM,IR,PAC,SHO,PAS,DRI,DEF,PHY,Hits,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,CAM.1,CB.1,CDM.1,CF.1,CM.1,GK.1,LB.1,LM.1,LW.1,LWB.1,RB.1,RM.1,RW.1,RWB.1,ST.1,Left,Right,High,Low,Medium,High.1,Low.1,Medium.1
0,0.370370,0.395947,0.349206,0.0,0.005833,0.007143,0.004811,0.615385,0.444444,0.561798,0.642857,0.637363,0.607143,0.729412,0.573171,0.591549,0.8125,0.602410,0.685714,0.694118,0.454545,0.455556,0.517241,0.552941,0.645161,0.682353,0.547619,0.500000,0.516854,0.595238,0.440476,0.084309,0.056818,0.116279,0.139535,0.078652,0.045455,0.604577,0.492366,0.75,0.25,0.00,0.614286,0.447368,0.558824,0.522388,0.506494,0.515625,0.000000,0.565789,0.565789,0.565789,0.602564,0.602564,0.602564,0.602564,0.602564,0.610390,0.610390,0.610390,0.631579,0.643836,0.643836,0.643836,0.631579,0.642857,0.614286,0.614286,0.614286,0.642857,0.637681,0.527778,0.527778,0.527778,0.637681,0.075949,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.518519,0.554560,0.333333,1.0,0.094444,0.041071,0.000000,0.857143,0.822222,0.752809,0.642857,0.813187,0.833333,0.847059,0.902439,0.718310,0.7250,0.746988,0.800000,0.741176,0.701299,0.711111,0.517241,0.329412,0.817204,0.729412,0.809524,0.690476,0.359551,0.166667,0.190476,0.100703,0.102273,0.058140,0.139535,0.056180,0.159091,0.781945,0.702290,0.50,0.75,0.25,0.814286,0.763158,0.632353,0.805970,0.272727,0.687500,0.004458,0.815789,0.815789,0.815789,0.807692,0.794872,0.794872,0.794872,0.807692,0.779221,0.779221,0.779221,0.802632,0.712329,0.712329,0.712329,0.802632,0.614286,0.528571,0.528571,0.528571,0.614286,0.565217,0.444444,0.444444,0.444444,0.565217,0.113924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.629630,0.148480,0.134921,0.0,0.100000,0.087500,0.091892,0.879121,0.944444,0.966292,0.773810,0.879121,0.845238,0.764706,0.963415,0.760563,0.9500,0.807229,0.128571,0.741176,0.285714,0.822222,0.758621,0.247059,0.838710,0.800000,0.773810,0.833333,0.224719,0.273810,0.261905,0.021077,0.045455,0.011628,0.046512,0.011236,0.011364,0.759059,0.671756,0.75,0.75,0.25,0.771429,0.789474,0.779412,0.865672,0.194805,0.453125,0.007611,0.763158,0.763158,0.763158,0.846154,0.820513,0.820513,0.820513,0.846154,0.831169,0.831169,0.831169,0.842105,0.794521,0.794521,0.794521,0.842105,0.642857,0.571429,0.571429,0.571429,0.642857,0.565217,0.347222,0.347222,0.347222,0.565217,0.037975,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.222222,0.445440,0.277778,0.0,0.003056,0.007143,0.004168,0.538462,0.411111,0.460674,0.571429,0.615385,0.488095,0.564706,0.548780,0.436620,0.4750,0.542169,0.500000,0.611765,0.493506,0.377778,0.597701,0.611765,0.311828,0.517647,0.547619,0.428571,0.584270,0.619048,0.583333,0.070258,0.068182,0.081395,0.046512,0.056180,0.113636,0.506039,0.385496,0.25,0.25,0.00,0.442857,0.355263,0.426471,0.432836,0.584416,0.515625,0.000435,0.460526,0.460526,0.460526,0.474359,0.461538,0.461538,0.461538,0.474359,0.480519,0.480519,0.480519,0.500000,0.547945,0.547945,0.547945,0.500000,0.600000,0.600000,0.600000,0.600000,0.600000,0.623188,0.583333,0.583333,0.583333,0.623188,0.063291,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.259259,0.494934,0.261905,0.0,0.008056,0.003571,0.008408,0.648352,0.444444,0.449438,0.619048,0.670330,0.630952,0.647059,0.573171,0.535211,0.6500,0.590361,0.128571,0.811765,0.532468,0.433333,0.655172,0.647059,0.569892,0.635294,0.488095,0.547619,0.617978,0.654762,0.714286,0.093677,0.068182,0.081395,0.151163,0.033708,0.147727,0.593134,0.503817,0.25,0.50,0.00,0.571429,0.355263,0.514706,0.537313,0.623377,0.609375,0.000109,0.539474,0.539474,0.539474,0.576923,0.564103,0.564103,0.564103,0.576923,0.584416,0.584416,0.584416,0.618421,0.643836,0.643836,0.643836,0.618421,0.714286,0.685714,0.685714,0.685714,0.714286,0.710145,0.625000,0.625000,0.625000,0.710145,0.075949,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11417,0.481481,0.296960,0.293651,0.0,0.006111,0.000893,0.007207,0.626374,0.577778,0.573034,0.440476,0.637363,0.642857,0.635294,0.756098,0.661972,0.9000,0.626506,0.471429,0.517647,0.519481,0.677778,0.471264,0.329412,0.666667,0.317647,0.738095,0.571429,0.426966,0.154762,0.166667,0.112412,0.113636,0.127907,0.069767,0.123596,0.136364,0.576605,0.416031,0.50,0.50,0.00,0.571429,0.671053,0.323529,0.567164,0.272727,0.468750,0.000109,0.657895,0.657895,0.657895,0.602564,0.615385,0.615385,0.615385,0.602564,0.571429,0.571429,0.571429,0.565789,0.506849,0.506849,0.506849,0.565789,0.457143,0.414286,0.414286,0.414286,0.457143,0.434783,0.375000,0.375000,0.375000,0.434783,0.126582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
11418,0.333333,0.604053,0.404762,0.0,0.006944,0.005357,0.005351,0.571429,0.500000,0.269663,0.535714,0.593407,0.773810,0.729412,0.658537,0.408451,0.5500,0.228916,0.671429,0.764706,0.623377,0.233333,0.620690,0.635294,0.526882,0.517647,0.369048,0.404762,0.629213,0.690476,0.654762,0.114754,0.125000,0.139535,0.139535,0.044944,0.136364,0.545455,0.454198,0.50,0.25,0.00,0.700000,0.144737,0.455882,0.462687,0.623377,0.656250,0.000000,0.434211,0.434211,0.434211,0.525641,0.474359,0.474359,0.474359,0.525641,0.493506,0.493506,0.493506,0.565789,0.547945,0.547945,0.547945,0.565789,0.700000,0.628571,0.628571,0.628571,0.700000,0.710145,0.625000,0.625000,0.625000,0.710145,0.088608,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
11419,0.407407,0.604053,0.452381,0.0,0.017778,0.007143,0.017417,0.131868,0.122222,0.157303,0.214286,0.142857,0.321429,0.341176,0.292683,0.577465,0.4625,0.457831,0.485714,0.200000,0.571429,0.122222,0.241379,0.200000,0.139785,0.058824,0.130952,0.523810,0.101124,0.119048,0.083333,0.784543,0.795455,0.755814,0.744186,0.764045,0.795455,0.249205,0.606870,0.25,0.00,0.00,0.657143,0.657895,0.602941,0.656716,0.363636,0.671875,0.000000,0.171053,0.171053,0.171053,0.128205,0.141026,0.141026,0.141026,0.128205,0.103896,0.103896,0.103896,0.131579,0.123288,0.123288,0.123288,0.131579,0.157143,0.142857,0.142857,0.142857,0.157143,0.144928,0.138889,0.138889,0.138889,0.144928,0.759494,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
11420,0.222222,0.247467,0.206349,0.0,0.012222,0.003571,0.010210,0.725275,0.588889,0.370787,0.583333,0.681319,0.761905,0.764706,0.878049,0.535211,0.8750,0.590361,0.657143,0.611765,0.220779,0.455556,0.206897,0.235294,0.688172,0.564706,0.678571,0.547619,0.382022,0.071429,0.083333,0.105386,0.125000,0.093023,0.104651,0.056180,0.159091,0.554355,0.416031,0.50,0.50,0.00,0.714286,0.565789,0.514706,0.656716,0.181818,0.250000,0.000326,0.618421,0.618421,0.618421,0.679487,0.641026,0.641026,0.641026,0.679487,0.636364,0.636364,0.636364,0.671053,0.575342,0.575342,0.575342,0.671053,0.500000,0.400000,0.400000,0.400000,0.500000,0.449275,0.263889,0.263889,0.263889,0.449275,0.101266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [16]:
predictions2 = lm.predict(X2)

In [17]:
print(predictions2)

[65.78515625 65.81640625 53.70703125 ... 73.95703125 64.21484375
 61.12109375]


In [18]:
scores(y2, predictions2)

r2 score: 0.9186238254730352
MSE score: 3.6814679634638354
RMSE score: 1.918715185603073
MAE score: 1.492321783606978
