In [121]:
import os
os.chdir('/Users/sophiaperides/Desktop/Thinkful')
import numpy as np
import pandas as pd
import datetime as dt
import pylab
from matplotlib import pyplot as plt
%matplotlib inline
import scipy as sc
from scipy.stats import ttest_ind
from scipy.stats import boxcox
import re
import seaborn as sns
import scipy.stats as stats
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import power_transform
from sklearn.decomposition import PCA
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn import linear_model
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings(action="ignore")

In [122]:
fifa = pd.read_csv('2019fifadata.csv')
fifa = pd.DataFrame(fifa)
print(fifa.columns)
print(fifa.info())

Index(['Unnamed: 0', 'ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag',
       'Overall', 'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Preferred Foot', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Position',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
       'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Crossing',
       'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
       'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingT

## Introduction 
We'll look at how to predict a player's wage using the FIFA dataset . I'll remove columns that don't provide useful information (Unnamed:0, ID, Name, Photo, Flag, Club Logo, and Real Face). As we have 18,207 entries and multiple columns are missing 48 values, my instinct is that these rows are missing information in these columns. As such, I'll remove rows missing information in 48 cells.

Columns LS through RB are missing values in over two thousand cells and while it's not obvious what each variable represents (there's no information on the Kaggle page and it's not immediately clear on the sofifa.com page), they could contain valuable information. Each value is a string of a number plus another number, so I'm going to strip the addition sign and the second number, and populate these cells with the first number. I will then fill the missing values with the mean.

Joined, Jersey Number, Loaned From, and Release Clause

After these updates, we're still missing values in Club, Contract Valid Until, and Position columns. While I could go through and find the information on each of these, I don't think that would be a great use of time for this project, so I'm going to drop rows for which there aren't values for Club/Contract valid until and hope this takes care of rows lacking a value in Position as well. Finally, Club is a categorical variable with over 650 categories, which will be a pain to create/utilize dummies for, so I will remove this column.

We've managed to clean up our data and keep over 98% of the rows. If we have trouble coming up with a model, we'll look at adding back some columns we've removed.

In [123]:
fifa = fifa.dropna(thresh=48, axis=0)

variables = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
       'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB']

data = fifa[variables]
df = data.astype(str).apply(lambda x: x.str.split('+').str[0])
df = df.astype(float)
df.fillna(df.mean(), inplace=True)
df.head(10)


Unnamed: 0,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
0,88.0,88.0,88.0,92.0,93.0,93.0,93.0,92.0,93.0,93.0,93.0,91.0,84.0,84.0,84.0,91.0,64.0,61.0,61.0,61.0,64.0,59.0,47.0,47.0,47.0,59.0
1,91.0,91.0,91.0,89.0,90.0,90.0,90.0,89.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,53.0,53.0,53.0,61.0
2,84.0,84.0,84.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,88.0,81.0,81.0,81.0,88.0,65.0,60.0,60.0,60.0,65.0,60.0,47.0,47.0,47.0,60.0
3,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786
4,82.0,82.0,82.0,87.0,87.0,87.0,87.0,87.0,88.0,88.0,88.0,88.0,87.0,87.0,87.0,88.0,77.0,77.0,77.0,77.0,77.0,73.0,66.0,66.0,66.0,73.0
5,83.0,83.0,83.0,89.0,88.0,88.0,88.0,89.0,89.0,89.0,89.0,89.0,82.0,82.0,82.0,89.0,66.0,63.0,63.0,63.0,66.0,60.0,49.0,49.0,49.0,60.0
6,77.0,77.0,77.0,85.0,84.0,84.0,84.0,85.0,87.0,87.0,87.0,86.0,88.0,88.0,88.0,86.0,82.0,81.0,81.0,81.0,82.0,79.0,71.0,71.0,71.0,79.0
7,87.0,87.0,87.0,86.0,87.0,87.0,87.0,86.0,85.0,85.0,85.0,84.0,79.0,79.0,79.0,84.0,69.0,68.0,68.0,68.0,69.0,66.0,63.0,63.0,63.0,66.0
8,73.0,73.0,73.0,70.0,71.0,71.0,71.0,70.0,71.0,71.0,71.0,72.0,75.0,75.0,75.0,72.0,81.0,84.0,84.0,84.0,81.0,84.0,87.0,87.0,87.0,84.0
9,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786


In [124]:
fifa = fifa.drop(columns=['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
       'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB'])

fifa = pd.concat([fifa, df], axis=1)
pd.options.display.max_columns = None
fifa.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,Wage,Special,Preferred Foot,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Real Face,Position,Jersey Number,Joined,Loaned From,Contract Valid Until,Height,Weight,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,https://cdn.sofifa.org/teams/2/light/241.png,€110.5M,€565K,2202,Left,5.0,4.0,4.0,Medium/ Medium,Messi,Yes,RF,10.0,"Jul 1, 2004",,2021,5'7,159lbs,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M,88.0,88.0,88.0,92.0,93.0,93.0,93.0,92.0,93.0,93.0,93.0,91.0,84.0,84.0,84.0,91.0,64.0,61.0,61.0,61.0,64.0,59.0,47.0,47.0,47.0,59.0
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,https://cdn.sofifa.org/teams/2/light/45.png,€77M,€405K,2228,Right,5.0,4.0,5.0,High/ Low,C. Ronaldo,Yes,ST,7.0,"Jul 10, 2018",,2022,6'2,183lbs,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M,91.0,91.0,91.0,89.0,90.0,90.0,90.0,89.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,53.0,53.0,53.0,61.0
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,https://cdn.sofifa.org/teams/2/light/73.png,€118.5M,€290K,2143,Right,5.0,5.0,5.0,High/ Medium,Neymar,Yes,LW,10.0,"Aug 3, 2017",,2022,5'9,150lbs,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M,84.0,84.0,84.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,88.0,81.0,81.0,81.0,88.0,65.0,60.0,60.0,60.0,65.0,60.0,47.0,47.0,47.0,60.0
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,https://cdn.sofifa.org/teams/2/light/11.png,€72M,€260K,1471,Right,4.0,3.0,1.0,Medium/ Medium,Lean,Yes,GK,1.0,"Jul 1, 2011",,2020,6'4,168lbs,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,https://cdn.sofifa.org/teams/2/light/10.png,€102M,€355K,2281,Right,4.0,5.0,4.0,High/ High,Normal,Yes,RCM,7.0,"Aug 30, 2015",,2023,5'11,154lbs,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M,82.0,82.0,82.0,87.0,87.0,87.0,87.0,87.0,88.0,88.0,88.0,88.0,87.0,87.0,87.0,88.0,77.0,77.0,77.0,77.0,77.0,73.0,66.0,66.0,66.0,73.0


In [125]:
fifa = fifa.drop(['Unnamed: 0', 'ID', 'Name', 'Photo', 'Flag', 'Club Logo', 'Real Face', 'Joined', 'Loaned From'], axis=1)

fifa.columns

Index(['Age', 'Nationality', 'Overall', 'Potential', 'Club', 'Value', 'Wage',
       'Special', 'Preferred Foot', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Work Rate', 'Body Type', 'Position', 'Jersey Number',
       'Contract Valid Until', 'Height', 'Weight', 'Crossing', 'Finishing',
       'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
       'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Release Clause', 'LS',
       'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM',
       'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB',
       'LCB', 'CB', 'RCB', 'RB'],

In [126]:
fifa = fifa[pd.notnull(fifa['Position'])]
fifa = fifa.drop(['Club', 'Contract Valid Until'], axis=1)

## Update Variables

There are quite a few columns that are currently being read as objects but should be integers or floats. I'll remove the €, K, and M from the Wage and Value columns and update the values as necessary. I will update the Height column into inches and remove the lbs from the Weight column. I'll perform one hot encoding on the remaining values (Nationality, Preferred Foot Work Rate, Body Type, and Position) to obtain dummies.

In [127]:
fifa.head(5)

Unnamed: 0,Age,Nationality,Overall,Potential,Value,Wage,Special,Preferred Foot,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Position,Jersey Number,Height,Weight,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
0,31,Argentina,94,94,€110.5M,€565K,2202,Left,5.0,4.0,4.0,Medium/ Medium,Messi,RF,10.0,5'7,159lbs,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M,88.0,88.0,88.0,92.0,93.0,93.0,93.0,92.0,93.0,93.0,93.0,91.0,84.0,84.0,84.0,91.0,64.0,61.0,61.0,61.0,64.0,59.0,47.0,47.0,47.0,59.0
1,33,Portugal,94,94,€77M,€405K,2228,Right,5.0,4.0,5.0,High/ Low,C. Ronaldo,ST,7.0,6'2,183lbs,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M,91.0,91.0,91.0,89.0,90.0,90.0,90.0,89.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,53.0,53.0,53.0,61.0
2,26,Brazil,92,93,€118.5M,€290K,2143,Right,5.0,5.0,5.0,High/ Medium,Neymar,LW,10.0,5'9,150lbs,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M,84.0,84.0,84.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,88.0,81.0,81.0,81.0,88.0,65.0,60.0,60.0,60.0,65.0,60.0,47.0,47.0,47.0,60.0
3,27,Spain,91,93,€72M,€260K,1471,Right,4.0,3.0,1.0,Medium/ Medium,Lean,GK,1.0,6'4,168lbs,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786
4,27,Belgium,91,92,€102M,€355K,2281,Right,4.0,5.0,4.0,High/ High,Normal,RCM,7.0,5'11,154lbs,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M,82.0,82.0,82.0,87.0,87.0,87.0,87.0,87.0,88.0,88.0,88.0,88.0,87.0,87.0,87.0,88.0,77.0,77.0,77.0,77.0,77.0,73.0,66.0,66.0,66.0,73.0


In [128]:
fifa['Value'] = fifa['Value'].str.replace('€', '')
fifa['Value'] = fifa['Value'].str.replace('.', '')
fifa['Value'] = fifa['Value'].apply(lambda row: row.replace('K', '000') if 'K' in row else row.replace('M', '000000')).astype(float)

fifa['Wage'] = fifa['Wage'].str.replace('€', '')
fifa['Wage'] = fifa['Wage'].str.replace('.', '')
fifa['Wage'] = fifa['Wage'].apply(lambda row: row.replace('K', '000') if 'K' in row else row.replace('M', '000000')).astype(float)
fifa.head(5)

Unnamed: 0,Age,Nationality,Overall,Potential,Value,Wage,Special,Preferred Foot,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Position,Jersey Number,Height,Weight,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
0,31,Argentina,94,94,1105000000.0,565000.0,2202,Left,5.0,4.0,4.0,Medium/ Medium,Messi,RF,10.0,5'7,159lbs,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M,88.0,88.0,88.0,92.0,93.0,93.0,93.0,92.0,93.0,93.0,93.0,91.0,84.0,84.0,84.0,91.0,64.0,61.0,61.0,61.0,64.0,59.0,47.0,47.0,47.0,59.0
1,33,Portugal,94,94,77000000.0,405000.0,2228,Right,5.0,4.0,5.0,High/ Low,C. Ronaldo,ST,7.0,6'2,183lbs,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M,91.0,91.0,91.0,89.0,90.0,90.0,90.0,89.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,53.0,53.0,53.0,61.0
2,26,Brazil,92,93,1185000000.0,290000.0,2143,Right,5.0,5.0,5.0,High/ Medium,Neymar,LW,10.0,5'9,150lbs,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M,84.0,84.0,84.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,88.0,81.0,81.0,81.0,88.0,65.0,60.0,60.0,60.0,65.0,60.0,47.0,47.0,47.0,60.0
3,27,Spain,91,93,72000000.0,260000.0,1471,Right,4.0,3.0,1.0,Medium/ Medium,Lean,GK,1.0,6'4,168lbs,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786
4,27,Belgium,91,92,102000000.0,355000.0,2281,Right,4.0,5.0,4.0,High/ High,Normal,RCM,7.0,5'11,154lbs,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M,82.0,82.0,82.0,87.0,87.0,87.0,87.0,87.0,88.0,88.0,88.0,88.0,87.0,87.0,87.0,88.0,77.0,77.0,77.0,77.0,77.0,73.0,66.0,66.0,66.0,73.0


In [129]:
fifa['Release Clause'] = fifa['Release Clause'].str[1:]
print(fifa[:5]['Release Clause'])
fifa.loc[fifa['Release Clause'].str.contains('M', na=False), 'Release Clause'] = fifa[fifa['Release Clause'].str.contains('M', na=False)]['Release Clause'].str[:-1].astype(float)*1000000
fifa.loc[fifa['Release Clause'].str.contains('K', na=False), 'Release Clause'] = fifa[fifa['Release Clause'].str.contains('K', na=False)]['Release Clause'].str[:-1].astype(float)*1000
print(fifa[:5]['Release Clause'])
fifa['Release Clause'].isnull().value_counts()
fifa['Release Clause'].fillna(fifa['Release Clause'].mean(), inplace=True)
print(fifa.info())


0    226.5M
1    127.1M
2    228.1M
3    138.6M
4    196.4M
Name: Release Clause, dtype: object
0    2.265e+08
1    1.271e+08
2    2.281e+08
3    1.386e+08
4    1.964e+08
Name: Release Clause, dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18147 entries, 0 to 18206
Data columns (total 78 columns):
Age                         18147 non-null int64
Nationality                 18147 non-null object
Overall                     18147 non-null int64
Potential                   18147 non-null int64
Value                       18147 non-null float64
Wage                        18147 non-null float64
Special                     18147 non-null int64
Preferred Foot              18147 non-null object
International Reputation    18147 non-null float64
Weak Foot                   18147 non-null float64
Skill Moves                 18147 non-null float64
Work Rate                   18147 non-null object
Body Type                   18147 non-null object
Position                    18147

In [130]:
def parse_ht(ht):
    # format: 7' 0.0"
    feet = float(ht[0])
    inches = float(ht[2:])
    return 12*feet + inches


fifa['Height'] = fifa['Height'].apply(lambda x: parse_ht(x))

In [131]:
fifa['Weight'] = fifa['Weight'].apply(lambda x: float(str(x).replace('lbs', '')))
pd.options.display.max_columns = None
fifa.head()

Unnamed: 0,Age,Nationality,Overall,Potential,Value,Wage,Special,Preferred Foot,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Position,Jersey Number,Height,Weight,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
0,31,Argentina,94,94,1105000000.0,565000.0,2202,Left,5.0,4.0,4.0,Medium/ Medium,Messi,RF,10.0,67.0,159.0,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,226500000.0,88.0,88.0,88.0,92.0,93.0,93.0,93.0,92.0,93.0,93.0,93.0,91.0,84.0,84.0,84.0,91.0,64.0,61.0,61.0,61.0,64.0,59.0,47.0,47.0,47.0,59.0
1,33,Portugal,94,94,77000000.0,405000.0,2228,Right,5.0,4.0,5.0,High/ Low,C. Ronaldo,ST,7.0,74.0,183.0,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,127100000.0,91.0,91.0,91.0,89.0,90.0,90.0,90.0,89.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,53.0,53.0,53.0,61.0
2,26,Brazil,92,93,1185000000.0,290000.0,2143,Right,5.0,5.0,5.0,High/ Medium,Neymar,LW,10.0,69.0,150.0,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,228100000.0,84.0,84.0,84.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,88.0,81.0,81.0,81.0,88.0,65.0,60.0,60.0,60.0,65.0,60.0,47.0,47.0,47.0,60.0
3,27,Spain,91,93,72000000.0,260000.0,1471,Right,4.0,3.0,1.0,Medium/ Medium,Lean,GK,1.0,76.0,168.0,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,138600000.0,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786
4,27,Belgium,91,92,102000000.0,355000.0,2281,Right,4.0,5.0,4.0,High/ High,Normal,RCM,7.0,71.0,154.0,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,196400000.0,82.0,82.0,82.0,87.0,87.0,87.0,87.0,87.0,88.0,88.0,88.0,88.0,87.0,87.0,87.0,88.0,77.0,77.0,77.0,77.0,77.0,73.0,66.0,66.0,66.0,73.0


In [132]:
fifa.describe()

Unnamed: 0,Age,Overall,Potential,Value,Wage,Special,International Reputation,Weak Foot,Skill Moves,Jersey Number,Height,Weight,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
count,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0,18147.0
mean,25.121122,66.253926,71.324076,13673950.0,9759.02353,1598.002645,1.113297,2.947154,2.361492,19.546096,71.361988,165.982972,49.738414,45.550229,52.300766,58.695432,42.912217,55.375158,47.176283,42.866038,52.721386,58.374828,64.612829,64.726236,63.501295,61.839147,63.964292,55.465201,65.091034,63.221579,65.31862,47.113187,55.876068,46.702761,49.962198,53.407781,48.546371,58.651127,47.286053,47.701879,45.666336,16.616906,16.393839,16.233041,16.389651,16.712019,4585061.0,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786
std,4.669796,6.91332,6.132286,39934340.0,22030.250349,272.882706,0.39415,0.660498,0.756274,15.947765,2.649738,15.593388,18.364255,19.527445,17.381753,14.696075,17.6959,18.912224,18.396009,17.480034,15.325211,16.685643,14.93032,14.651776,14.768956,9.011056,14.136073,17.235534,11.822327,15.896381,12.552479,19.263142,17.366534,20.697462,19.530469,14.146594,15.703113,11.437138,19.90045,21.66363,21.287961,17.698612,16.909971,16.504103,17.037031,17.957521,10647970.0,8.585184,8.585184,8.585184,9.293333,9.255877,9.255877,9.255877,9.293333,9.188846,9.188846,9.188846,8.701323,8.276264,8.276264,8.276264,8.701323,8.441609,9.489691,9.489691,9.489691,8.441609,8.969557,11.03037,11.03037,11.03037,8.969557
min,16.0,46.0,48.0,0.0,0.0,731.0,1.0,1.0,1.0,1.0,61.0,110.0,5.0,2.0,4.0,7.0,4.0,4.0,6.0,3.0,9.0,5.0,12.0,12.0,14.0,21.0,16.0,2.0,15.0,12.0,17.0,3.0,11.0,3.0,2.0,10.0,5.0,3.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,13000.0,31.0,31.0,31.0,25.0,27.0,27.0,27.0,25.0,27.0,27.0,27.0,27.0,30.0,30.0,30.0,27.0,30.0,28.0,28.0,28.0,30.0,29.0,25.0,25.0,25.0,29.0
25%,21.0,62.0,67.0,300000.0,1000.0,1457.0,1.0,3.0,2.0,8.0,69.0,154.0,38.0,30.0,44.0,54.0,30.0,49.0,34.0,31.0,43.0,54.0,57.0,57.0,55.0,56.0,56.0,45.0,58.0,56.0,58.0,33.0,44.0,26.0,38.0,44.0,39.0,51.0,30.0,27.0,24.0,8.0,8.0,8.0,8.0,8.0,569000.0,53.0,53.0,53.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,55.0,53.0,53.0,53.0,55.0,52.0,51.0,51.0,51.0,52.0,51.0,47.0,47.0,47.0,51.0
50%,25.0,66.0,71.0,675000.0,3000.0,1635.0,1.0,3.0,2.0,17.0,71.0,165.0,54.0,49.0,56.0,62.0,44.0,61.0,48.0,41.0,56.0,63.0,67.0,67.0,66.0,62.0,66.0,59.0,66.0,66.0,67.0,51.0,59.0,52.0,55.0,55.0,49.0,60.0,53.0,55.0,52.0,11.0,11.0,11.0,11.0,11.0,1300000.0,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786
75%,28.0,71.0,75.0,13000000.0,9000.0,1787.0,1.0,3.0,3.0,26.0,73.0,176.0,64.0,62.0,64.0,68.0,57.0,68.0,62.0,57.0,64.0,69.0,75.0,75.0,74.0,68.0,74.0,68.0,73.0,74.0,74.0,62.0,69.0,64.0,64.0,64.0,60.0,67.0,64.0,66.0,64.0,14.0,14.0,14.0,14.0,14.0,4585061.0,64.0,64.0,64.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,63.0,63.0,63.0,65.0,63.0,63.0,63.0,63.0,63.0,63.0,64.0,64.0,64.0,63.0
max,45.0,94.0,95.0,1185000000.0,565000.0,2346.0,5.0,5.0,5.0,99.0,81.0,243.0,93.0,95.0,94.0,93.0,90.0,97.0,94.0,94.0,93.0,96.0,97.0,96.0,96.0,96.0,96.0,95.0,95.0,96.0,97.0,94.0,95.0,92.0,95.0,94.0,92.0,96.0,94.0,93.0,91.0,90.0,92.0,91.0,90.0,94.0,228100000.0,91.0,91.0,91.0,92.0,93.0,93.0,93.0,92.0,93.0,93.0,93.0,91.0,88.0,88.0,88.0,91.0,85.0,87.0,87.0,87.0,85.0,84.0,87.0,87.0,87.0,84.0


In [133]:
fifa = pd.concat([fifa, pd.get_dummies(fifa['Nationality'])], axis=1)
fifa = pd.concat([fifa, pd.get_dummies(fifa["Preferred Foot"])], axis=1)
fifa = pd.concat([fifa, pd.get_dummies(fifa["Work Rate"])], axis=1)
fifa = pd.concat([fifa, pd.get_dummies(fifa["Body Type"])], axis=1)
fifa = pd.concat([fifa, pd.get_dummies(fifa["Position"])], axis=1)

In [134]:
corr_mat = np.abs((fifa.corr().loc[:, ['Wage']])).sort_values(by='Wage', ascending=False)
corr_mat[:50]

Unnamed: 0,Wage
Wage,1.0
Release Clause,0.828363
International Reputation,0.668612
Value,0.623457
Overall,0.571795
Reactions,0.495598
Potential,0.486608
LCM,0.430297
RCM,0.430297
CM,0.430297


## First Model
This model was run using features with absolute values of the correlation with the target greater that 0.4. I will first run it with cross validation and then with training and testing sets.

The cross validation is clearly terrible. However, the model with testing and training sets has a very high R-squared and a lower MSE than the model with cross validation. I'm concerned because these variables were highly correlated with the feature. Perhaps I didn't use enough variables, so I'll add some new variables to see if we can make it any better.

In [137]:
rfr = ensemble.RandomForestRegressor(n_estimators=200)
Y = fifa['Wage']
X = fifa[['International Reputation', 'Value', 'Overall', 'Reactions', 'Potential', 'RCM', 'CM', 'LCM', 'Composure']]

rfr.fit(X, Y)
cross_val_scores =  cross_val_score(rfr, X, Y, cv=10)
nmse_cross_val_score = cross_val_score(rfr, X, Y, cv=10, scoring='neg_mean_squared_error')
print('Mean R-Squared: ', cross_val_scores.mean())
print('Cross Validation Scores: ', cross_val_scores)
print('\n')
print('Mean Negative MSE: ', nmse_cross_val_score.mean())
print('Cross Validation MSE Scores: ', nmse_cross_val_score)
print('\n')
print('Feature Importances: ', rfr.feature_importances_)

Mean R-Squared:  -0.0641447120717421
Cross Validation Scores:  [-0.08807448 -0.01146194 -0.08247439  0.03231962 -0.06440305  0.01478216
  0.01456156 -0.04033929 -0.0679791  -0.34837822]


Mean Negative MSE:  -319220468.90178823
Cross Validation MSE Scores:  [-2.91617169e+09 -1.56713031e+08 -6.53156570e+07 -2.80042825e+07
 -1.28123271e+07 -6.45073699e+06 -2.86551052e+06 -2.12291727e+06
 -1.11377559e+06 -6.34764236e+05]


Feature Importances:  [0.01365503 0.05343957 0.77085552 0.03494443 0.02422837 0.01656937
 0.00133545 0.01754413 0.00376844 0.0165002  0.00168434 0.04547515]


## Second Model
This model uses the fifty features most correlated with the target. The performance of the cross validation model has improved slightly - the R-squared is a touch higher and the MSE a bit smaller. The evaluation metrics for the training and test sets are essentially the same.

In [21]:
rfr = ensemble.RandomForestRegressor(n_estimators=200)
Y = fifa['Wage']
X = fifa[['International Reputation', 'Value' , 'Overall', 'Reactions', 'Potential' , 'RCM' ,
          'CM', 'LCM', 'Composure', 'ST', 'LS', 'RS', 'CAM', 'RAM', 'LAM', 'LM','RM', 'CF', 
          'RF', 'LF', 'RW', 'LW', 'Special', 'RWB', 'LWB', 'Vision', 'RDM', 'LDM', 'CDM', 'ShortPassing',
          'LB', 'RB', 'BallControl', 'LongPassing', 'Skill Moves', 'Curve', 'ShotPower', 'Volleys',
          'LongShots', 'Dribbling', 'FKAccuracy', 'Crossing', 'Positioning', 'Penalties', 'RCB', 'CB', 
          'LCB', 'Finishing', 'Aggression']]

rfr.fit(X, Y)
cross_val_scores =  cross_val_score(rfr, X, Y, cv=10)
nmse_cross_val_score = cross_val_score(rfr, X, Y, cv=10, scoring='neg_mean_squared_error')
print('Mean R-Squared: ', cross_val_scores.mean())
print('Cross Validation Scores: ', cross_val_scores)
print('\n')
print('Mean Negative MSE: ', nmse_cross_val_score.mean())
print('Cross Validation MSE Scores: ', nmse_cross_val_score)
print('\n')
print('Feature Importances: ', rfr.feature_importances_)

Mean R-Squared:  -0.02740073928175999
Cross Validation Scores:  [-0.01970344  0.06397895  0.03063819  0.06092469  0.00240496  0.02851426
 -0.01748112  0.00178256  0.0073663  -0.43243275]


Mean Negative MSE:  -293928678.1341866
Cross Validation MSE Scores:  [-2.68235242e+09 -1.46607636e+08 -5.79380548e+07 -2.72057196e+07
 -1.21724373e+07 -6.30186884e+06 -2.92955450e+06 -2.06955520e+06
 -1.04229130e+06 -6.67238988e+05]


Feature Importances:  [6.57624514e-03 1.76117940e-02 7.56669641e-01 7.80506085e-03
 8.27481186e-03 1.96597761e-03 2.09371346e-04 1.78162276e-03
 1.31685235e-03 2.31270284e-03 2.37496254e-04 9.63497582e-03
 1.82995954e-03 4.82153269e-04 2.19594401e-03 1.13559401e-04
 2.37871349e-03 1.32914870e-04 1.47600451e-03 6.24154677e-04
 1.59556743e-03 1.01695552e-05 1.99453142e-03 1.01424609e-05
 1.52714387e-03 4.10693599e-04 1.51130457e-03 5.89149794e-04
 1.88015246e-03 4.52557785e-05 1.17900131e-03 5.48869153e-05
 1.28106119e-03 1.03562031e-04 1.66875840e-03 3.02152444e-03
 1.71

## Transforming to a Classification Problem
Because these models are performing so poorly under cross validation, I'm going to transform this problem from regression to classification. I will create a new target variable called 'Income Level' that will be determined by the wage - each additional €100K will earn an additional point - €0-€99,999 will have a score of 0, €100,000-199,999 will have a score of 1, etc. Hopefully this will increase the performance of the models.

In [141]:
fifa['Income Level'] = fifa['Wage'].apply(lambda x: 0 if 0 <= x <100000 else 1 if 100000 <= x <200000 
                                          else 2 if 200000 <= x <300000 else 3 if 3000000 <= x <400000
                                          else 4 if 4000000 <= x <500000 else 5)

fifa.head(5)

Unnamed: 0,Age,Overall,Potential,Value,Wage,Special,International Reputation,Weak Foot,Skill Moves,Jersey Number,Height,Weight,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua & Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Barbados,Belarus,Belgium,Belize,Benin,Bermuda,Bolivia,Bosnia Herzegovina,Botswana,Brazil,Bulgaria,Burkina Faso,Burundi,Cameroon,Canada,Cape Verde,Central African Rep.,Chad,Chile,China PR,Colombia,Comoros,Congo,Costa Rica,Croatia,Cuba,Curacao,Cyprus,Czech Republic,DR Congo,Denmark,Dominican Republic,Ecuador,Egypt,El Salvador,England,Equatorial Guinea,Eritrea,Estonia,Ethiopia,FYR Macedonia,Faroe Islands,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guam,Guatemala,Guinea,Guinea Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Israel,Italy,Ivory Coast,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Korea DPR,Korea Republic,Kosovo,Kuwait,Latvia,Lebanon,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Mali,Malta,Mauritania,Mauritius,Mexico,Moldova,Montenegro,Montserrat,Morocco,Mozambique,Namibia,Netherlands,New Caledonia,New Zealand,Nicaragua,Niger,Nigeria,Northern Ireland,Norway,Oman,Palestine,Panama,Paraguay,Peru,Philippines,Poland,Portugal,Puerto Rico,Qatar,Republic of Ireland,Romania,Russia,Rwanda,Saudi Arabia,Scotland,Senegal,Serbia,Sierra Leone,Slovakia,Slovenia,South Africa,South Sudan,Spain,St Kitts Nevis,St Lucia,Sudan,Suriname,Sweden,Switzerland,Syria,São Tomé & Príncipe,Tanzania,Thailand,Togo,Trinidad & Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United States,Uruguay,Uzbekistan,Venezuela,Wales,Zambia,Zimbabwe,Left,Right,High/ High,High/ Low,High/ Medium,Low/ High,Low/ Low,Low/ Medium,Medium/ High,Medium/ Low,Medium/ Medium,Akinfenwa,C. Ronaldo,Courtois,Lean,Messi,Neymar,Normal,PLAYER_BODY_TYPE_25,Shaqiri,Stocky,CAM.1,CB.1,CDM.1,CF.1,CM.1,GK,LAM.1,LB.1,LCB.1,LCM.1,LDM.1,LF.1,LM.1,LS.1,LW.1,LWB.1,RAM.1,RB.1,RCB.1,RCM.1,RDM.1,RF.1,RM.1,RS.1,RW.1,RWB.1,ST.1,Income Level
0,31,94,94,1105000000.0,565000.0,2202,5.0,4.0,4.0,10.0,67.0,159.0,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,226500000.0,88.0,88.0,88.0,92.0,93.0,93.0,93.0,92.0,93.0,93.0,93.0,91.0,84.0,84.0,84.0,91.0,64.0,61.0,61.0,61.0,64.0,59.0,47.0,47.0,47.0,59.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5
1,33,94,94,77000000.0,405000.0,2228,5.0,4.0,5.0,7.0,74.0,183.0,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,127100000.0,91.0,91.0,91.0,89.0,90.0,90.0,90.0,89.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,53.0,53.0,53.0,61.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5
2,26,92,93,1185000000.0,290000.0,2143,5.0,5.0,5.0,10.0,69.0,150.0,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,228100000.0,84.0,84.0,84.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,88.0,81.0,81.0,81.0,88.0,65.0,60.0,60.0,60.0,65.0,60.0,47.0,47.0,47.0,60.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2
3,27,91,93,72000000.0,260000.0,1471,4.0,3.0,1.0,1.0,76.0,168.0,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,138600000.0,57.81547,57.81547,57.81547,59.03765,58.71939,58.71939,58.71939,59.03765,58.983129,58.983129,58.983129,59.73471,58.220878,58.220878,58.220878,59.73471,57.513274,56.825146,56.825146,56.825146,57.513274,56.806786,55.614626,55.614626,55.614626,56.806786,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
4,27,91,92,102000000.0,355000.0,2281,4.0,5.0,4.0,7.0,71.0,154.0,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,196400000.0,82.0,82.0,82.0,87.0,87.0,87.0,87.0,87.0,88.0,88.0,88.0,88.0,87.0,87.0,87.0,88.0,77.0,77.0,77.0,77.0,77.0,73.0,66.0,66.0,66.0,73.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,5


In [None]:
# Finding the best parameters for the classifier.

Y = fifa['Income Level']
X = fifa[['Overall',  'Value', 'RB', 'Aggression']]

rfc_gsc = GridSearchCV(
        estimator=ensemble.RandomForestClassifier(),
        param_grid={
            'max_depth': [None, 10, 30, 50],
            'max_features': [2, 3, 4],
            'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [2, 4, 6],
            'n_estimators': [50, 100, 500, 1000]
        },
        cv=10, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

rfc_gsc.fit(X, Y)
best_params = rfc_gsc.best_params_

best_rfc = ensemble.RandomForestClassifier(max_depth=best_params['max_depth'], max_features=best_params["max_features"],
                                         min_samples_leaf=best_params["min_samples_leaf"], min_samples_split=best_params["min_samples_split"],
                                         n_estimators=best_params['n_estimators'],verbose=1)

print('Parameters for the best Support Vector Model: ', best_rfc)

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 27.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 62.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 117.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 221.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 370.1min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 564.2min


## Third Model
This model is much better - clearly, classification was the way to go. Our R-squared is now 0.915 with cross validation and our MSE is below zero.

In [145]:
rfc = ensemble.RandomForestClassifier(bootstrap=True,  class_weight=None,
                       criterion='gini', max_depth=30, max_features=4,
                       max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Y = fifa['Income Level']
X = fifa[['Overall',  'Value', 'RB', 'Aggression']]

rfc.fit(X, Y)
cross_val_scores =  cross_val_score(rfc, X, Y, cv=10)
nmse_cross_val_score = cross_val_score(rfc, X, Y, cv=10, scoring='neg_mean_squared_error')
print('Mean R-Squared: ', cross_val_scores.mean())
print('Cross Validation Scores: ', cross_val_scores)
print('\n')
print('Mean Negative MSE: ', nmse_cross_val_score.mean())
print('Cross Validation MSE Scores: ', nmse_cross_val_score)
print('\n')
print('Feature Importances: ', rfc.feature_importances_)

Mean R-Squared:  0.9153984873152792
Cross Validation Scores:  [0.22839846 0.99779736 0.99504405 0.99063361 0.99062845 0.99062845
 0.99007718 0.99007718 0.99062845 0.9900717 ]


Mean Negative MSE:  -0.09104722063824708
Cross Validation MSE Scores:  [-0.78756192 -0.01376652 -0.00440529 -0.02203857 -0.01819184 -0.0137817
 -0.01433297 -0.00937155 -0.00937155 -0.0176503 ]


Feature Importances:  [0.58481607 0.18400363 0.10508509 0.00236581 0.12372939]


# Fourth Model

I'm going to perform PCA on the data and see if I can come up with an even better model. I'll remove Natioanality, Preferred Foot, Work Rate, Body Type, and Position as I've created dummy variables for them.

This model performs great! With ten folds, we acheive an R-squared of 0.983 and a very small MSE. 

In [138]:
fifa = fifa.drop(['Nationality', 'Preferred Foot', 'Work Rate', 'Body Type', 'Position'], axis=1)
fifa_scaled = StandardScaler().fit_transform(fifa)

In [144]:
rfc = ensemble.RandomForestClassifier(max_depth=2, random_state=0)

pca = PCA(n_components=5)
X = pca.fit_transform(fifa)
Y = fifa['Income Level']

rfc.fit(X, Y)
cross_val_scores =  cross_val_score(rfc, X, Y, cv=10)
nmse_cross_val_score = cross_val_score(rfc, X, Y, cv=10, scoring='neg_mean_squared_error')
print('Mean R-Squared: ', cross_val_scores.mean())
print('Cross Validation Scores: ', cross_val_scores)
print('\n')
print('Mean Negative MSE: ', nmse_cross_val_score.mean())
print('Cross Validation MSE Scores: ', nmse_cross_val_score)
print('\n')
print('Feature Importances: ', rfc.feature_importances_)


Mean R-Squared:  0.9825962873810065
Cross Validation Scores:  [0.91414419 0.98898678 0.99118943 0.99063361 0.99062845 0.99007718
 0.99007718 0.99007718 0.99007718 0.9900717 ]


Mean Negative MSE:  -0.03404344545816122
Cross Validation MSE Scores:  [-0.10181618 -0.04240088 -0.02863436 -0.0292011  -0.02260198 -0.02315325
 -0.02149945 -0.02646086 -0.01984564 -0.02482074]


Feature Importances:  [0.27233768 0.29964562 0.36030093 0.06771577 0.        ]
