# Initial setting

## libraries

In [7]:
#RUN THIS CELL
# Fetch the course stylesheet and inject it into the notebook. The styling is
# purely cosmetic, so a network problem must not stop the analysis.
import requests
from IPython.display import HTML

try:
    # A timeout keeps an unreachable host from hanging the kernel; the status
    # check prevents an HTTP error page from being rendered as if it were CSS.
    response = requests.get(
        "https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css",
        timeout=10,
    )
    response.raise_for_status()
    styles = response.text
except requests.RequestException as err:
    print(f"Could not load the notebook stylesheet ({err}); continuing without it.")
    styles = ""
HTML(styles)

In [8]:
import functools
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBRegressor

%matplotlib inline
# Notebook-wide display configuration: seaborn 'whitegrid' theme for plots and
# wider pandas output (console width 1500, up to 100 columns shown).
sns.set(style='whitegrid')
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

In [9]:
# create a progressbar function
def progressbar(n_step, n_total):
    """Prints self-updating progress bar to stdout to track for-loop progress

    There are entire 3rd-party libraries dedicated to custom progress-bars.
    A simple function like this is often more than enough to get the job done.

    The bar is always exactly 50 characters wide: the filled fraction is
    clamped to [0, 1], so calling with ``n_step >= n_total`` cannot make the
    bar overflow, and ``n_total <= 0`` is rendered as a full bar instead of
    raising ``ZeroDivisionError``.

    :param n_step: current iteration number, starting at 0
    :type n_step: int
    :param n_total: total number of expected for-loop iterations
    :type n_total: int

    .. example::

        for i in range(n_iterations):
            progressbar(i, n_iterations)

    .. source:

        This function is a simplified version of code found here:
        https://stackoverflow.com/questions/3160699/python-progress-bar/15860757#15860757
    """
    # callers pass a 0-based index; display it as a 1-based count
    n_step = n_step + 1
    barlen = 50

    # fraction completed, guarded against a zero/negative total and clamped so
    # that neither "=" * block nor "-" * (barlen - block) leaves the bar width
    progress = n_step / n_total if n_total > 0 else 1.0
    progress = min(max(progress, 0.0), 1.0)
    block = int(round(barlen * progress))

    # only the final step appends the closing message and line breaks
    status = "Done...\r\n\n" if n_step == n_total else ""

    # the leading "\r" moves the cursor back so each call overwrites the last
    text = "\r [{0}] {1}/{2} {3}".format(
        "=" * block + "-" * (barlen - block),
        n_step,
        n_total,
        status,
    )
    sys.stdout.write(text)
    sys.stdout.flush()

## dataset

In [10]:
# Load data
# Reads the player table from a path relative to the notebook's working directory.
# NOTE(review): the displayed head contains an 'Unnamed: 0' column whose values
# match the row position, so it looks like a row index that was saved into the
# CSV -- confirm, and keep it out of the model features.
df = pd.read_csv('data/df_fifa.csv')

In [11]:
# (rows, columns) of the loaded table, covering all years
df.shape

(100995, 146)

In [12]:
# preview the first five rows to check column names and value formats
df.head()

Unnamed: 0,sofifa_id,short_name,age,dob,height_cm,weight_kg,nationality,club,overall,value_eur,wage_eur,preferred_foot,international_reputation,weak_foot,skill_moves,work_rate,body_type,release_clause_eur,team_position,team_jersey_number,loaned_from,joined,contract_valid_until,year,league_name,d_trait_Cautious_With_Crosses,d_trait_Injury_Prone,d_trait_Avoids_Using_Weaker_Foot,d_trait_Backs_Into_Player,d_trait_Flair,d_trait_Saves_with_Feet,d_trait_Leadership,d_trait_Inflexible,d_trait_Finesse_Shot,d_trait_Team_Player,d_trait_1-on-1_Rush,d_trait_Set_Play_Specialist,d_trait_Rushes_Out_Of_Goal,d_trait_Beat_Offside_Trap,d_trait_Early_Crosser,d_trait_Second_Wind,d_trait_Through_Ball,d_trait_Long_Throw-in,d_trait_Giant_Throw-in,d_trait_Diver,d_trait_Acrobatic_Clearance,d_trait_Selfish,d_trait_GK_Up_for_Corners,d_trait_Argues_with_Officials,d_trait_Takes_Finesse_Free_Kicks,...,d_pos_GK,ab_pace,ab_shooting,ab_passing,ab_dribbling,ab_defending,ab_physic,ab_skill_moves,ab_gk_diving,ab_gk_handling,ab_gk_kicking,ab_gk_reflexes,ab_gk_speed,ab_gk_positioning,ab_attacking_crossing,ab_attacking_finishing,ab_attacking_heading_accuracy,ab_attacking_short_passing,ab_attacking_volleys,ab_skill_dribbling,ab_skill_curve,ab_skill_fk_accuracy,ab_skill_long_passing,ab_skill_ball_control,ab_movement_acceleration,ab_movement_sprint_speed,ab_movement_agility,ab_movement_reactions,ab_movement_balance,ab_power_shot_power,ab_power_jumping,ab_power_stamina,ab_power_strength,ab_power_long_shots,ab_mentality_aggression,ab_mentality_interceptions,ab_mentality_positioning,ab_mentality_vision,ab_mentality_penalties,ab_mentality_composure,ab_defending_marking,ab_defending_standing_tackle,ab_defending_sliding_tackle,ab_goalkeeping_diving,ab_goalkeeping_handling,ab_goalkeeping_kicking,ab_goalkeeping_positioning,ab_goalkeeping_reflexes,d_foot_left,d_div1_league
0,158023,L. Messi,27,1987-06-24,169,67,Argentina,FC Barcelona,93,0,0,Left,5,3,4,Medium/Low,Normal,,CF,10.0,,2004-07-01,2018.0,2015,Spain Primera Division,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,93.0,89.0,86.0,96.0,27.0,63.0,4,,,,,,,84,94,71,89,85,96,89,90,76,96,96,90,94,94,95,80,73,77,60,88,48,22,92,90,76,,25,21,20,6,11,15,14,8,1,1
1,20801,Cristiano Ronaldo,29,1985-02-05,185,80,Portugal,Real Madrid,92,0,0,Right,5,4,5,High/Low,Normal,,LW,7.0,,2009-07-01,2018.0,2015,Spain Primera Division,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,93.0,93.0,81.0,91.0,32.0,79.0,5,,,,,,,83,95,86,82,87,93,88,79,72,92,91,94,93,90,63,94,94,89,79,93,63,24,91,81,85,,22,31,23,7,11,15,14,11,0,1
2,9014,A. Robben,30,1984-01-23,180,80,Netherlands,FC Bayern München,90,0,0,Left,5,2,4,High/Low,Normal,,SUB,10.0,,2009-08-28,2017.0,2015,German 1. Bundesliga,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,...,0,93.0,86.0,83.0,92.0,32.0,64.0,4,,,,,,,80,85,50,86,86,93,85,83,76,90,93,93,93,89,91,86,61,78,65,90,47,39,89,84,80,,29,26,26,10,8,11,5,15,1,1
3,41236,Z. Ibrahimović,32,1981-10-03,195,95,Sweden,Paris Saint-Germain,90,0,0,Right,5,4,4,Medium/Low,Normal,,ST,10.0,,2012-07-01,2016.0,2015,French Ligue 1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,76.0,91.0,81.0,86.0,34.0,86.0,4,,,,,,,76,91,76,84,92,88,80,80,76,90,74,77,86,85,41,93,72,78,93,88,84,20,86,83,91,,25,41,27,13,15,10,9,12,0,1
4,167495,M. Neuer,28,1986-03-27,193,92,Germany,FC Bayern München,90,0,0,Right,5,4,1,Medium/Medium,Normal,,GK,1.0,,2011-07-01,2019.0,2015,German 1. Bundesliga,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,1,,,,,,,1,87.0,85.0,92.0,86.0,60.0,90.0,25,25,25,42,25,25,25,25,41,31,58,61,43,89,35,42,78,44,83,25,29,30,25,20,37,,25,25,25,87,85,92,90,86,0,1


To do
- merge overall 2020 & all predictors from 2019
- divide the dataset into train and test sets
- conduct PCA regression
- compute train and test MSE

# Data handling

## create the dataset for Part A

In [13]:
# Test rows: 2020 records of players from the seven selected clubs.
select_clubs = ['FC Barcelona','FC Bayern München','Real Madrid','Paris Saint-Germain','Juventus','Manchester City','Liverpool']
is_2020 = df['year'] == 2020
in_selected_club = df['club'].isin(select_clubs)
df_a_2020 = df[is_2020 & in_selected_club]

# Training rows: every 2019 record, regardless of club.
df_a_2019 = df[df['year'] == 2019]

# Stack 2019 on top of 2020 and drop the pre-computed left-foot dummy.
df_a = pd.concat([df_a_2019, df_a_2020]).drop(columns=['d_foot_left'])
print(df_a_2020.shape, df_a_2019.shape, df_a.shape)

(221, 146) (17770, 146) (17991, 145)


In [14]:
# create dummies for nationality, work rate, preferred foot, and club
df_a_nationality = pd.get_dummies(df_a[['nationality']], prefix='d_nationality')
df_a_workrate = pd.get_dummies(df_a[['work_rate']], prefix='d_workrate')
df_a_foot = pd.get_dummies(df_a[['preferred_foot']], prefix='d_foot')
# club dummies get their own prefix; they were previously emitted under the
# copy-pasted 'd_foot' prefix, which mixed them with the preferred-foot dummies.
# NOTE(review): anything that reads data/df_a.csv and expects 'd_foot_<club>'
# column names needs to be updated to 'd_club_<club>'.
df_a_club = pd.get_dummies(df_a[['club']], prefix='d_club')

# concatenate them onto df_a
# (this reassigns df_a, so re-running the cell appends the dummy columns again)
df_a = pd.concat([df_a, df_a_nationality, df_a_workrate, df_a_foot, df_a_club], axis=1)
df_a.shape

(17991, 995)

In [15]:
# persist the merged 2019/2020 table with its dummy columns; the DataFrame
# index is not written (index=False)
df_a.to_csv('data/df_a.csv', index=False)

## drop columns, imputation, and train-test divide

In [16]:
# drop unnecessary columns: identifiers, dates, free text, and the raw
# categorical columns that are represented by dummy columns instead
drop_vars = ['sofifa_id','short_name','dob','nationality','work_rate','body_type','team_position','loaned_from',
            'preferred_foot','joined','contract_valid_until','league_name','main_position','team_jersey_number',
            'club']
df_a_all = df_a.drop(drop_vars, axis=1)

# 'Unnamed: 0' appears to be the row index that was saved into the source CSV
# (its values match the row position in the displayed head). It is not a player
# attribute, so it must not reach the models as a predictor.
# errors='ignore' keeps the cell working if the column is already absent.
df_a_all = df_a_all.drop(columns=['Unnamed: 0'], errors='ignore')

# impute zero values for field-player and goalkeeping abilities
# this is because goalkeeping ability is not available for field players and vice versa
# (missing release clauses are set to zero as well)
impute_vars = ['ab_pace','ab_shooting','ab_passing','ab_dribbling','ab_defending','ab_physic','release_clause_eur']
impute_vars = impute_vars + [x for x in df_a_all.columns if x.startswith('ab_gk')]
df_a_all[impute_vars] = df_a_all[impute_vars].fillna(0)

# impute mean value for ab_mentality_composure
# Composure is a player attribute in FIFA that describes how calm a player stays
# and how well they control their frustration during matches. (from FIFAplay)
# The result is assigned back explicitly: a chained
# `df[col].fillna(..., inplace=True)` acts on a temporary Series and does not
# update the frame under pandas Copy-on-Write.
# NOTE(review): the mean is computed over the 2019 and 2020 rows together.
composure_mean = df_a_all['ab_mentality_composure'].mean()
df_a_all['ab_mentality_composure'] = df_a_all['ab_mentality_composure'].fillna(composure_mean)

# split by season: 2020 rows (selected clubs) are the test set, 2019 rows the training set
df_a_te = df_a_all[df_a_all['year'] == 2020].drop(['year'], axis=1)
df_a_tr = df_a_all[df_a_all['year'] == 2019].drop(['year'], axis=1)
print(df_a_tr.shape, df_a_te.shape)

(17770, 979) (221, 979)


# Regression

In [17]:
# Target is the 'overall' rating; every remaining column is a predictor.
y_tr, y_te = df_a_tr['overall'], df_a_te['overall']
X_tr = df_a_tr.drop(columns=['overall'])
X_te = df_a_te.drop(columns=['overall'])

# Standardise with statistics learned from the training rows only, then apply
# the same transformation to both sets.
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_stan, X_te_stan = scaler.transform(X_tr), scaler.transform(X_te)

## Lasso regression

In [12]:
# candidate regularisation strengths
la_alphas = [1e-2, 1e-1, 1, 1e+1, 1e+2]

# mean train / validation MSE per alpha, in the same order as la_alphas
la_tr_err, la_val_err = [], []

for i, alpha in enumerate(la_alphas):
    # advance the progress bar before the (slow) cross-validation starts
    progressbar(i, len(la_alphas))

    # 5-fold cross-validation on the standardised training data
    lasso = Lasso(alpha=alpha, max_iter=10000)
    scores = cross_validate(
        estimator=lasso,
        X=X_tr_stan,
        y=y_tr,
        cv=5,
        scoring='neg_mean_squared_error',
        return_train_score=True,
        n_jobs=-1,
    )

    # the scorer returns negated MSE, so flip the sign to get plain MSE
    la_tr_err.append(-scores['train_score'].mean())
    la_val_err.append(-scores['test_score'].mean())

# pick the alpha with the smallest mean validation MSE (first one on ties)
la_min_val_err = min(la_val_err)
la_best_alpha = la_alphas[la_val_err.index(la_min_val_err)]
print(la_min_val_err, la_best_alpha)


18.67553714870375 0.1


In [13]:
# refit Lasso on the full training set with the selected alpha
lasso_best = Lasso(alpha=la_best_alpha, max_iter=10000).fit(X_tr_stan, y_tr)

# train MSE and test MSE of the refitted model
lasso_tr_pred = lasso_best.predict(X_tr_stan)
lasso_mse_tr = mean_squared_error(y_tr, lasso_tr_pred)

lasso_te_pred = lasso_best.predict(X_te_stan)
lasso_mse_te = mean_squared_error(y_te, lasso_te_pred)

print(lasso_mse_tr, lasso_mse_te)

4.945287764552433 7.987019287381537


## Random Forest regression (m=p/3)

In [14]:
# search grid: 100..400 trees in steps of 50, tree depth 5..15
rf_trees = [*range(100, 450, 50)]
rf_depths = [*range(5, 16)]
rf_params = dict(n_estimators=rf_trees, max_depth=rf_depths)

# each split considers a third of the predictors (m = p/3)
# NOTE(review): GridSearchCV fits clones of `rf`, so warm_start=True presumably
# has no effect here -- confirm before relying on it.
n_split_features = X_tr_stan.shape[1] // 3
rf = RandomForestRegressor(warm_start=True, max_features=n_split_features, random_state=81)

# exhaustive search over the grid, scored by negated MSE, using all cores
rf_gs = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
rf_gs.fit(X_tr_stan, y_tr)

# keep the winning parameter combination and the estimator refitted with it
rf_best_param, rf_best_estimator = rf_gs.best_params_, rf_gs.best_estimator_

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 133.1min


KeyboardInterrupt: 

In [None]:
# evaluate the best random forest found by the grid search
rf_tr_pred = rf_best_estimator.predict(X_tr_stan)
rf_mse_tr = mean_squared_error(y_tr, rf_tr_pred)

rf_te_pred = rf_best_estimator.predict(X_te_stan)
rf_mse_te = mean_squared_error(y_te, rf_te_pred)

print(rf_mse_tr, rf_mse_te)

## XGBoost

### grid search

In [18]:
# set parameters
# hyperopt search space for the XGBRegressor-based search:
#   - loguniform draws are given their bounds as logs of the real range
#   - quniform draws are rounded to the step given as the last argument;
#     max_depth comes back as a float and is cast with int() where it is used
param_space = {'min_child_weight': hp.loguniform('min_child_weight', np.log(1), np.log(10)),
               'max_depth': hp.quniform('max_depth', 3, 9, 1),
               'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),
               'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
               'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
               'reg_alpha': hp.loguniform('reg_alpha', np.log(1e-8), np.log(1.0)),
               'reg_lambda': hp.loguniform('reg_lambda', np.log(1e-6), np.log(10.0))}

# define score function
def score(params):
    """Return the mean 5-fold cross-validated MSE of an XGBRegressor.

    :param params: one sampled point of ``param_space``; ``max_depth`` arrives
        as a float and is cast to int here
    :type params: dict
    :return: mean validation MSE over the folds (lower is better)
    """
    regressor = XGBRegressor(
        random_state=81,
        min_child_weight=params['min_child_weight'],
        max_depth=int(params['max_depth']),
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        gamma=params['gamma'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
    )
    cv_result = cross_validate(
        regressor,
        X_tr_stan,
        y_tr,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    # the scorer is negated MSE; undo the sign so the optimiser minimises MSE
    mean_neg_mse = cv_result['test_score'].mean()
    return -1 * mean_neg_mse

In [19]:
# run the hyperopt (TPE) search and find the best parameters
max_evals = 100
trials = Trials()
history = []
rstate = np.random.RandomState(81)  # fixed seed for the search
best_params = fmin(
    fn=score,
    space=param_space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials,
    rstate=rstate,
)

# refit on the whole training set with the best parameters
# (max_depth comes back as a float from quniform, hence the int cast)
xgb_best = XGBRegressor(
    random_state=81,
    min_child_weight=best_params['min_child_weight'],
    max_depth=int(best_params['max_depth']),
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    gamma=best_params['gamma'],
    reg_alpha=best_params['reg_alpha'],
    reg_lambda=best_params['reg_lambda'],
)
xgb_best.fit(X_tr_stan, y_tr)

 54%|█████████████████████████▉                      | 54/100 [46:02<39:13, 51.16s/trial, best loss: 9.486956623152892]


KeyboardInterrupt: 

In [None]:
# train MSE and test MSE of the refitted XGBoost regressor
xgb_tr_pred = xgb_best.predict(X_tr_stan)
xgb_mse_tr = mean_squared_error(y_tr, xgb_tr_pred)

xgb_te_pred = xgb_best.predict(X_te_stan)
xgb_mse_te = mean_squared_error(y_te, xgb_te_pred)

print(xgb_mse_tr, xgb_mse_te)

In [15]:
# create validation set: hold out 30% of the standardised training rows so the
# native xgboost API can use them for early stopping
# (train_test_split is imported in the notebook's import cell)
X_tr_xgb, X_val_xgb, y_tr_xgb, y_val_xgb = train_test_split(
    X_tr_stan, y_tr, train_size=0.7, random_state=81
)

# wrap the arrays in DMatrix objects for xgb.train / Booster.predict;
# the test matrix carries no label because it is only used for prediction
dtrain = xgb.DMatrix(X_tr_xgb, label=y_tr_xgb)
dvalid = xgb.DMatrix(X_val_xgb, label=y_val_xgb)
dtest = xgb.DMatrix(X_te_stan)

In [25]:
# set parameters
# hyperopt search space for the native xgb.train API. Plain values (objective,
# eta, random_state) are constants passed through unchanged; hp.* entries are
# sampled per trial. This replaces the `param_space` defined for the
# XGBRegressor search earlier in the notebook.
# NOTE(review): the tree-specific settings (gamma, max_depth, subsample, ...) are
# presumably not used when the sampled booster is 'gblinear' -- confirm.
param_space = {'objective': 'reg:squarederror', 
               'booster': hp.choice('booster', ['gblinear', 'dart', 'gbtree']),
               'eta': 0.05,
               'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
               'alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
               'lambda': hp.loguniform('lambda', np.log(1e-6), np.log(10.0)),
               'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(10)),
               'max_depth': hp.quniform('max_depth', 3, 9, 1),
               'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),
               'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
               'random_state': 81}
# upper bound on boosting rounds; training is called with early stopping
num_round = 100000

# define score function
def score(params):
    """Train a booster with early stopping and report its validation MSE.

    Reads the notebook globals ``dtrain``, ``dvalid``, ``num_round``,
    ``watchlist``, ``y_val_xgb`` and appends ``(params, val_score)`` to the
    global ``history`` list, so those must exist before the search starts.

    :param params: one sampled point of ``param_space``; ``max_depth`` is cast
        to int in place
    :type params: dict
    :return: dict with the validation MSE under ``'loss'`` and ``STATUS_OK``
        under ``'status'``
    """
    # quniform yields floats; store the integer depth back in the dict
    params['max_depth'] = int(params['max_depth'])

    booster = xgb.train(
        params,
        dtrain,
        num_round,
        evals=watchlist,
        early_stopping_rounds=50,
        verbose_eval=100,
    )
    valid_pred = booster.predict(dvalid)
    val_score = mean_squared_error(y_val_xgb, valid_pred)

    print(f'params: {params}, MSE: {val_score:.3f}')
    history.append((params, val_score))
    return {'loss': val_score, 'status': STATUS_OK}

In [None]:
# run the hyperopt (TPE) search -- a guided search, not an exhaustive grid
# `watchlist` and `history` are globals read by score(), so they are (re)set here
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
max_evals = 10
trials = Trials()
history = []
# seed the search the same way as the XGBRegressor search earlier in the
# notebook, so that re-running the cell proposes the same trials
rstate = np.random.RandomState(81)
fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals, rstate=rstate)

# rank the evaluated trials by validation MSE; `history` holds the parameter
# dicts exactly as score() received them, so the best one can be reused directly
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f'best params: {best[0]}, score:{best[1]:.3f}')

### refit the best model

In [None]:
# best parameters: constants are restated, tuned values come from the
# best-scoring trial recorded in `history`
fixed_params = {'objective': 'reg:squarederror', 'eta': 0.05, 'random_state': 81}
param_order = ['objective', 'booster', 'eta', 'gamma', 'alpha', 'lambda',
               'min_child_weight', 'max_depth', 'subsample', 'colsample_bytree',
               'random_state']
best_params = {
    key: (fixed_params[key] if key in fixed_params else best[0][key])
    for key in param_order
}
num_round = 100000

# refit with the best parameters, early-stopping on the validation set
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
xgb_best = xgb.train(
    best_params,
    dtrain,
    num_round,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=100,
)

In [None]:
# predict with the refitted Booster (`xgb_best`); `xgb` is the xgboost module
# itself and has no predict()
xgb_va_pred = xgb_best.predict(dvalid)
xgb_te_pred = xgb_best.predict(dtest)

# MSE on the held-out validation split and on the 2020 test set
xgb_val_score = mean_squared_error(y_val_xgb, xgb_va_pred)
xgb_test_score = mean_squared_error(y_te, xgb_te_pred)

# both values use three decimals (the test value previously used ':3f',
# which sets a minimum width instead of a precision)
print(f'Validation MSE: {xgb_val_score:.3f}, test MSE: {xgb_test_score:.3f}.')