# Initial setting

In [1]:
#RUN THIS CELL 
import requests
from IPython.core.display import HTML
# Fetch the course stylesheet and render it in the notebook.
# The styling is cosmetic only, so the request is bounded by a timeout and any
# network/HTTP failure falls back to an empty stylesheet instead of hanging or
# aborting the notebook.
try:
    response = requests.get(
        "https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css",
        timeout=10,
    )
    # Treat 4xx/5xx as a failure so an error page is never rendered as HTML.
    response.raise_for_status()
    styles = response.text
except requests.RequestException as exc:
    print("Could not load cs109.css ({}); continuing without custom styles.".format(exc))
    styles = ""
HTML(styles)

In [2]:
import functools
import inspect
import pickle
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab

import imblearn
from imblearn.over_sampling import SMOTE

from scipy import stats
from math import pi

from sklearn.utils import class_weight
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from xgboost.sklearn import XGBClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

%matplotlib inline

import seaborn as sns
# Plot theme and pandas display limits used throughout the notebook
sns.set(style='whitegrid')
for option_name, option_value in (('display.width', 1500),
                                  ('display.max_columns', 100)):
    pd.set_option(option_name, option_value)

In [3]:
# create a progressbar function
def progressbar(n_step, n_total):
    """Write a self-overwriting, fixed-width progress bar to stdout.

    Each call rewrites the current terminal line (leading carriage return),
    so calling it once per loop iteration animates the bar in place. When the
    final iteration is reached, a "Done..." marker and line breaks are appended.

    :param n_step: zero-based index of the current iteration
    :type n_step: int
    :param n_total: total number of iterations the loop will perform
    :type n_total: int

    .. example::

        for i in range(n_iterations):
            progressbar(i, n_iterations)

    .. source:

        Simplified from:
        https://stackoverflow.com/questions/3160699/python-progress-bar/15860757#15860757
    """
    bar_width = 50
    # Callers pass a zero-based index; display and compare using a 1-based count.
    completed = n_step + 1
    filled = int(round(bar_width * (completed / n_total)))
    bar = "=" * filled + "-" * (bar_width - filled)
    suffix = "Done...\r\n\n" if completed == n_total else ""
    sys.stdout.write("\r [{0}] {1}/{2} {3}".format(bar, completed, n_total, suffix))
    sys.stdout.flush()

In [4]:
# Shared matplotlib defaults (figure size and font sizes) for every plot below
fig_params = {
    'legend.fontsize': 'large',
    'figure.figsize': (10, 6),
    'axes.labelsize': 'large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
}
for rc_key, rc_value in fig_params.items():
    pylab.rcParams[rc_key] = rc_value

# Part B

## Data handling

### create the dataset for Part B

In [5]:
# Read the full player table
df = pd.read_csv('data/df_fifa.csv')

# Test set: 2020 rows restricted to the clubs listed below
select_clubs = [
    'FC Barcelona',
    'FC Bayern München',
    'Real Madrid',
    'Paris Saint-Germain',
    'Juventus',
    'Manchester City',
    'Liverpool',
]
df_b_2020 = df.loc[(df['year'] == 2020) & (df['club'].isin(select_clubs))]

# Training set: every 2019 row, regardless of club
df_b_2019 = df.loc[df['year'] == 2019]

# Stack training rows on top of test rows and remove the 'd_foot_left' column
df_b = pd.concat([df_b_2019, df_b_2020]).drop(columns=['d_foot_left'])
print(df_b_2020.shape, df_b_2019.shape, df_b.shape)

(221, 146) (17770, 146) (17991, 145)


In [7]:
# One-hot encode nationality, work rate and preferred foot, each with its own prefix
df_b_nationality = pd.get_dummies(df_b[['nationality']], prefix='d_nationality')
df_b_workrate = pd.get_dummies(df_b[['work_rate']], prefix='d_workrate')
df_b_foot = pd.get_dummies(df_b[['preferred_foot']], prefix='d_foot')

# Append the indicator columns to df_b.
# NOTE: df_b is reassigned here, so re-running this cell without re-running the
# loading cell appends the same indicator columns a second time.
dummy_frames = [df_b_nationality, df_b_workrate, df_b_foot]
df_b = pd.concat([df_b] + dummy_frames, axis=1)
df_b.shape

(17991, 316)

### drop columns, imputation, and train-test split

In [8]:
# Drop columns that are not used as model features
drop_vars = ['sofifa_id','short_name','dob','nationality','work_rate','body_type','team_position','loaned_from',
            'preferred_foot','joined','contract_valid_until','league_name','team_jersey_number','club']
# DataFrame.drop already returns a new frame, so df_b itself is left untouched.
df_b_all = df_b.drop(columns=drop_vars)


# Collapse the 15 detailed positions into 4 groups (FW / MF / DF / GK)
position15 = ['ST','CF','LW','GK','CAM','CB','CM','CDM','RW','LB','LM','RB','RM','RWB','LWB']
position4 = ['FW','FW','FW','GK','MF','DF','MF','MF','FW','DF','MF','DF','MF','DF','DF']
position_conversion = dict(zip(position15, position4))
# A plain dict lookup is used on purpose: an unexpected position raises KeyError
# instead of silently becoming NaN.
df_b_all['main_position'] = [position_conversion[x] for x in df_b_all['main_position']]
# Keep the fitted encoder so integer labels can be decoded later through
# position_encoder.classes_ / position_encoder.inverse_transform.
position_encoder = LabelEncoder()
df_b_all['main_position'] = position_encoder.fit_transform(df_b_all['main_position'])

# Impute zeros for the field-player abilities, the goalkeeping abilities and the release clause.
# Goalkeeping abilities are not available for field players and vice versa.
impute_vars = ['ab_pace','ab_shooting','ab_passing','ab_dribbling','ab_defending','ab_physic','release_clause_eur']
impute_vars = impute_vars + [x for x in df_b_all.columns if x.startswith('ab_gk')]
df_b_all[impute_vars] = df_b_all[impute_vars].fillna(0)

# Impute the mean value for ab_mentality_composure.
# Composure is a FIFA player attribute describing how calm a player stays and how well
# they control their frustration during matches. (from FIFAplay)
# The result is assigned back explicitly: calling fillna(..., inplace=True) on a selected
# column is chained assignment, which does not update the frame under pandas Copy-on-Write.
# NOTE(review): the mean is taken over train and test rows together, as in the original;
# consider computing it on the 2019 rows only to avoid leaking test information.
composure_mean = df_b_all['ab_mentality_composure'].mean()
df_b_all['ab_mentality_composure'] = df_b_all['ab_mentality_composure'].fillna(composure_mean)

# 2020 rows form the test set and 2019 rows the training set; 'year' is not a feature
df_b_te = df_b_all[df_b_all['year'] == 2020].drop(['year'], axis=1)
df_b_tr = df_b_all[df_b_all['year'] == 2019].drop(['year'], axis=1)
print(df_b_tr.shape, df_b_te.shape)

(17770, 301) (221, 301)


## Classification

In [24]:
# Split features (X) from the encoded position label (y)
X_tr = df_b_tr.drop(['main_position'], axis=1)
X_te = df_b_te.drop(['main_position'], axis=1)
y_tr = df_b_tr['main_position']
y_te = df_b_te['main_position'].values

# Count training observations per class and find the most frequent class
# from the data instead of assuming it is label 1.
counter = y_tr.value_counts().to_dict()
majority_class = max(counter, key=counter.get)
# Target size for every other class: at least 15% of the majority class,
# and never below its current count (SMOTE can only add samples).
minority_floor = int(counter[majority_class] * 0.15)
counter_new = {
    key: max(minority_floor, count)
    for key, count in counter.items()
    if key != majority_class
}

# Upsample with SMOTE; the seed makes the synthetic samples reproducible
oversample = SMOTE(sampling_strategy=counter_new, random_state=81)
X_tr, y_tr = oversample.fit_resample(X_tr.values, y_tr.values)

# Standardize using statistics from the (resampled) training data only.
# X_te is converted to an ndarray so the scaler sees the same input type it was fit on.
scaler = StandardScaler().fit(X_tr)
X_tr_stan = scaler.transform(X_tr)
X_te_stan = scaler.transform(X_te.values)

In [25]:
# Put the label in column 0, followed by the standardized features
b_tr_stan = np.column_stack((y_tr, X_tr_stan))
b_te_stan = np.column_stack((y_te, X_te_stan))

# Write both matrices to CSV without the index column
pd.DataFrame(data=b_tr_stan).to_csv('data/b_tr_stan.csv', index=False)
pd.DataFrame(data=b_te_stan).to_csv('data/b_te_stan.csv', index=False)

#### XGBoost

In [27]:
# Balanced class weights, expanded to one weight per training sample.
# compute_class_weight takes keyword-only arguments in current scikit-learn.
class_weights = list(class_weight.compute_class_weight(class_weight='balanced',
                                                       classes=np.unique(y_tr),
                                                       y=y_tr))
# The labels come from LabelEncoder (0..k-1), so they index class_weights directly.
w_array = np.asarray(class_weights, dtype='float')[y_tr.astype(int)]

# Search space for hyperopt
param_space = {'min_child_weight': hp.loguniform('min_child_weight', np.log(1), np.log(10)),
               'max_depth': hp.quniform('max_depth', 3, 9, 1),
               'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),
               'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
               'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
               'reg_alpha': hp.loguniform('reg_alpha', np.log(1e-8), np.log(1.0)),
               'reg_lambda': hp.loguniform('reg_lambda', np.log(1e-6), np.log(10.0))}

# cross_validate forwards fit-time arguments through `params` in newer scikit-learn
# releases and through `fit_params` in older ones; use whichever this version exposes.
_CV_FIT_KW = ('params' if 'params' in inspect.signature(cross_validate).parameters
              else 'fit_params')


def score(params):
    """Hyperopt objective: negative mean 5-fold CV accuracy of an XGBoost classifier.

    hyperopt's ``fmin`` minimizes the returned value, so the accuracy is negated;
    the best trial is therefore the one with the highest accuracy.

    :param params: one sample drawn from ``param_space``
    :type params: dict
    :return: negative mean test accuracy across the folds
    :rtype: float
    """
    # Named `clf` so the imported `xgb` module is not shadowed.
    clf = XGBClassifier(random_state=81,
                        objective='multi:softmax',
                        min_child_weight=params['min_child_weight'],
                        # hp.quniform yields floats; max_depth must be an int
                        max_depth=int(params['max_depth']),
                        subsample=params['subsample'],
                        colsample_bytree=params['colsample_bytree'],
                        gamma=params['gamma'],
                        reg_alpha=params['reg_alpha'],
                        reg_lambda=params['reg_lambda'])
    # sample_weight is a fit-time argument, not a constructor parameter; passing it
    # through cross_validate lets each fold receive the matching slice of weights.
    scores = cross_validate(clf,
                            X=X_tr_stan,
                            y=y_tr,
                            cv=5,
                            scoring='accuracy',
                            n_jobs=-1,
                            **{_CV_FIT_KW: {'sample_weight': w_array}})
    return -scores['test_score'].mean()



In [28]:
# Search the hyper-parameter space with hyperopt's TPE algorithm (not a grid search)
max_evals = 100
trials = Trials()
history = []
# NOTE(review): some hyperopt releases expect np.random.default_rng(...) for `rstate`
# rather than RandomState — confirm against the installed version.
rstate = np.random.RandomState(81)
best_params = fmin(fn=score,
                   space=param_space,
                   algo=tpe.suggest,
                   max_evals=max_evals,
                   trials=trials,
                   rstate=rstate)

# Refit on the full training set with the selected parameters
tuned_kwargs = {
    name: best_params[name]
    for name in ('min_child_weight', 'subsample', 'colsample_bytree',
                 'gamma', 'reg_alpha', 'reg_lambda')
}
# max_depth is sampled with hp.quniform, so cast it to int before use
tuned_kwargs['max_depth'] = int(best_params['max_depth'])
xgb_best = XGBClassifier(random_state=81, objective='multi:softmax', **tuned_kwargs)
xgb_best.fit(X_tr_stan, y_tr, sample_weight=w_array)

  0%|                                                                          | 0/100 [01:24<?, ?trial/s, best loss=?]


KeyboardInterrupt: 