In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm_notebook

import xgboost as xgb
import lightgbm as lgb
from skopt import BayesSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
warnings.filterwarnings('ignore')
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
games = pd.read_csv('Data/games.csv')
games['Date'] = pd.to_datetime(games['Date'])
gps = pd.read_csv('Data/gps.csv', low_memory=False)
rpe = pd.read_csv('Data/rpe.csv')
wellness = pd.read_csv('Data/wellness.csv')

In [3]:
wellness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5011 entries, 0 to 5010
Data columns (total 19 columns):
Date                   5011 non-null object
PlayerID               5011 non-null int64
Fatigue                5011 non-null int64
Soreness               5011 non-null int64
Desire                 5011 non-null int64
Irritability           5011 non-null int64
BedTime                5011 non-null object
WakeTime               5011 non-null object
SleepHours             5011 non-null float64
SleepQuality           5011 non-null int64
MonitoringScore        5011 non-null int64
Pain                   5011 non-null object
Illness                5011 non-null object
Menstruation           4995 non-null object
Nutrition              4174 non-null object
NutritionAdjustment    4266 non-null object
USGMeasurement         4843 non-null object
USG                    629 non-null float64
TrainingReadiness      5011 non-null object
dtypes: float64(2), int64(7), object(10)
memory usage: 743.9+ K

In [4]:
rpe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8860 entries, 0 to 8859
Data columns (total 14 columns):
Date                 8860 non-null object
PlayerID             8860 non-null int64
Training             8860 non-null object
SessionType          7621 non-null object
Duration             7621 non-null float64
RPE                  7621 non-null float64
SessionLoad          7621 non-null float64
DailyLoad            3149 non-null float64
AcuteLoad            4349 non-null float64
ChronicLoad          4383 non-null float64
AcuteChronicRatio    4349 non-null float64
ObjectiveRating      4724 non-null float64
FocusRating          4751 non-null float64
BestOutOfMyself      3019 non-null object
dtypes: float64(9), int64(1), object(4)
memory usage: 969.1+ KB


In [5]:
def fill_cat_data_gb_player(data, col):
    return data.apply(lambda x: data.groupby('PlayerID')[col].value_counts()[x['PlayerID']].index[0] if not pd.notnull(x['SessionType']) else x['SessionType'], axis=1)

def fill_quant_data_gb_player(data, col):
    return  data[col].fillna(data.groupby(['PlayerID'])[col].transform('median'))

In [6]:
wellness['Menstruation'].fillna(value='No', inplace=True)
wellness['USG'] = wellness.apply(lambda x: wellness.groupby('PlayerID')['USG'].median()[x['PlayerID']], axis=1)
wellness['Nutrition'] = wellness.apply(lambda x: wellness.groupby('PlayerID')['Nutrition'].value_counts()[x['PlayerID']].index[0], axis=1)
wellness['NutritionAdjustment'] = wellness.apply(lambda x: wellness.groupby('PlayerID')['NutritionAdjustment'].value_counts()[x['PlayerID']].index[0], axis=1)
wellness['NutritionAdjustment'] = wellness.apply(lambda x: wellness.groupby('PlayerID')['NutritionAdjustment'].value_counts()[x['PlayerID']].index[0], axis=1)
wellness['USGMeasurement'].fillna(value='Yes', inplace=True)

In [7]:
wellness['WakeTime'] = pd.to_datetime(wellness['WakeTime'])
wellness['BedTime'] = pd.to_datetime(wellness['BedTime'])

wellness['TotalTimeSleptInSecs'] = wellness.apply(lambda x: (x['WakeTime'] - x['BedTime']).seconds, axis=1)
wellness.drop(['WakeTime', 'BedTime'], axis=1, inplace=True)

In [8]:
wellness['TrainingReadiness'] = wellness['TrainingReadiness'].map(lambda x: x.rstrip('%'))
wellness['TrainingReadiness'] = wellness['TrainingReadiness'].astype('int64')

wellness_cat_cols = wellness.columns[wellness.dtypes == 'object']
for col in wellness_cat_cols:
    if wellness[col].isnull().values.any():
        wellness[col] = wellness[col].fillna(wellness.groupby('PlayerID')[col].value_counts()[]

wellness['USG'] = fill_quant_data_gb_player(wellness, 'USG')

In [9]:
rpe['SessionType'] = fill_cat_data_gb_player(rpe, 'SessionType')
rpe['BestOutOfMyself'] = fill_cat_data_gb_player(rpe, 'BestOutOfMyself')
rpe_float_cols = rpe.columns[rpe.dtypes == 'float64']

for col in rpe_float_cols:
    if rpe[col].isnull().values.any():
        rpe[col] = fill_quant_data_gb_player(rpe, col)

In [10]:
temp = rpe

rpe = rpe.groupby(['Date', 'PlayerID', 'SessionType']).max().reset_index().drop(['SessionType', 'Duration', 'SessionLoad', ], axis=1)

In [11]:
count = pd.DataFrame(temp.groupby(['Date', 'PlayerID', 'SessionType']).count().reset_index()['SessionLoad'])
temp = temp.groupby(['Date', 'PlayerID', 'SessionType']).sum().reset_index()
temp = pd.concat([temp, pd.get_dummies(temp['SessionType'])], axis=1).drop('SessionType', axis=1)
temp['SessionCount'] = count
temp['Mobility/Recovery'] = temp.apply(lambda x: x['Mobility/Recovery'] * x['SessionLoad'], axis=1)
temp['Skills'] = temp.apply(lambda x: x['Skills'] * x['SessionLoad'], axis=1)
temp['Strength'] = temp.apply(lambda x: x['Strength'] * x['SessionLoad'], axis=1)
temp['Game'] = temp.apply(lambda x: x['Game'] * x['SessionLoad'], axis=1)
temp['Conditioning'] = temp.apply(lambda x: x['Conditioning'] * x['SessionLoad'], axis=1)
temp['Combat'] = temp.apply(lambda x: x['Combat'] * x['SessionLoad'], axis=1)
temp['Speed'] = temp.apply(lambda x: x['Speed'] * x['SessionLoad'], axis=1)
temp = temp.drop('SessionLoad', axis=1)

"""
'Mobility/Recovery', 'Game', 'Skills', 'Conditioning', 'Strength',
       'Combat', nan, 'Speed'
"""
rpe['SessionLoadTotal_Mobility/Recovery'] = temp['Mobility/Recovery']
rpe['SessionLoadTotal_Skills'] = temp['Skills']
rpe['SessionLoadTotal_Strength'] = temp['Strength']
rpe['SessionLoadTotal_Game'] = temp['Game']
rpe['SessionLoadTotal_Conditioning'] = temp['Conditioning']
rpe['SessionLoadTotal_Combat'] = temp['Combat']
rpe['SessionLoadTotal_Speed'] = temp['Speed']
rpe['SessionCount'] = temp['SessionCount']
rpe['DurationTotal'] = temp['Duration']

In [12]:
def printClassificationErrors(y_test, y_pred):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    print('Accuracy Score: {}'.format(accuracy_score(y_test, y_pred)))

In [13]:
def oof_predictions(model, X, y, n_folds=5, shuffle=True, random_state=42, predict='hard'):
    k_folds = StratifiedKFold(n_splits=n_folds, shuffle=shuffle, random_state=random_state)
    oof_pred_array = np.zeros(len(X))
    #oof_test_array = np.zeros(len(X_test_actual))
    
    for train_index, test_index in tqdm_notebook(k_folds.split(X, y), total=n_folds):
        X_train, X_test = X.loc[train_index, ], X.loc[test_index, ]
        y_train, y_test = y.loc[train_index, ], y.loc[test_index, ]

        #_, X_test_meta = X_test_actual.loc[train_index, ], X_test_actual.loc[test_index]
        
        
        #print(X_train)
        #X_train = X_train.reindex(index=train_index)
        #y_train = y_train.reindex(index=train_index)

        model.fit(X_train, y_train)

        if predict == 'soft':
            y_pred = model.predict_proba(X_test)
            y_pred = np.array([prob[1] for prob in y_pred])
            
            #y_pred_actual = model.predict_proba(X_test_meta)
            #y_pred_actual = np.array([prob[1] for prob in y_pred_actual])
        elif predict == 'hard':
            y_pred = model.predict(X_test)
            #y_pred_actual = model.predict(X_test_meta)
        
        oof_pred_array[test_index] = y_pred
        #oof_test_array[test_index] = y_pred_actual
    
    return oof_pred_array#, oof_test_array

In [14]:
data = rpe.merge(wellness, how='inner', left_on=['PlayerID', 'Date'], right_on=['PlayerID', 'Date'])
data = data.groupby(['Date', 'PlayerID']).max().reset_index()
data['Date'] = pd.to_datetime(data['Date'])
numerical_features = data.columns[data.dtypes != 'object']
categorical_features = data.columns[data.dtypes == 'object']

data = pd.get_dummies(data=data, columns=categorical_features)

In [19]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4085 entries, 0 to 4084
Data columns (total 49 columns):
Date                                  4085 non-null datetime64[ns]
PlayerID                              4085 non-null int64
RPE                                   4085 non-null float64
DailyLoad                             4085 non-null float64
AcuteLoad                             4085 non-null float64
ChronicLoad                           4085 non-null float64
AcuteChronicRatio                     4085 non-null float64
ObjectiveRating                       4085 non-null float64
FocusRating                           4085 non-null float64
SessionLoadTotal_Mobility/Recovery    4085 non-null float64
SessionLoadTotal_Skills               4085 non-null float64
SessionLoadTotal_Strength             4085 non-null float64
SessionLoadTotal_Game                 4085 non-null float64
SessionLoadTotal_Conditioning         4085 non-null float64
SessionLoadTotal_Combat               4085 non-n

In [20]:
X = data.drop(['Fatigue', 'Date'], axis=1)
y = data['Fatigue']

In [21]:
columns = X.columns
mm = MinMaxScaler()
X[columns] = mm.fit_transform(X[columns])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [23]:
n_folds=10

In [24]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

X_train = clean_dataset(X_train)

In [25]:
knn = KNeighborsClassifier(n_neighbors=8, p=1)
knn_oof_predictions = oof_predictions(knn, X, y, n_folds, predict='hard')

A Jupyter Widget




In [26]:
printClassificationErrors(y, knn_oof_predictions)

Confusion Matrix:
[[  35   66   53    7    1    0    0]
 [  28  185  266   73    0    0    0]
 [   6  106  952  392    4    0    0]
 [   2   25  516 1005   13    4    0]
 [   0    5   55  117   29    5    0]
 [   0    0    9   18    8   96    0]
 [   0    0    0    2    1    1    0]]
Classification Report:
             precision    recall  f1-score   support

          1       0.49      0.22      0.30       162
          2       0.48      0.34      0.39       552
          3       0.51      0.65      0.58      1460
          4       0.62      0.64      0.63      1565
          5       0.52      0.14      0.22       211
          6       0.91      0.73      0.81       131
          7       0.00      0.00      0.00         4

avg / total       0.56      0.56      0.55      4085

Accuracy Score: 0.5635250917992656


In [27]:
xgbc = xgb.XGBClassifier(objective='multi:softmax', 
                         colsample_bylevel=0.8, 
                         colsample_bytree=0.8, 
                         gamma=0.01, 
                         learning_rate=0.001, 
                         max_delta_step=20, 
                         max_depth=20, 
                         min_child_weight=1, 
                         n_estimators=100, 
                         reg_alpha=0, 
                         reg_lambda=1e-09, 
                         scale_pos_weight=0.5362970367111267, 
                         subsample=0.9719854748963996)

xgbc_oof_predictions = oof_predictions(xgbc, X, y, n_folds, predict='hard')
printClassificationErrors(y, xgbc_oof_predictions)

A Jupyter Widget


Confusion Matrix:
[[  61   91    9    1    0    0    0]
 [  37  325  180   10    0    0    0]
 [   3  109 1208  137    3    0    0]
 [   1    6  174 1342   39    3    0]
 [   0    0    5   99   94   13    0]
 [   0    0    1    3   27  100    0]
 [   0    0    0    0    1    3    0]]
Classification Report:
             precision    recall  f1-score   support

          1       0.60      0.38      0.46       162
          2       0.61      0.59      0.60       552
          3       0.77      0.83      0.80      1460
          4       0.84      0.86      0.85      1565
          5       0.57      0.45      0.50       211
          6       0.84      0.76      0.80       131
          7       0.00      0.00      0.00         4

avg / total       0.76      0.77      0.76      4085

Accuracy Score: 0.7662178702570379


In [28]:
rfc = RandomForestClassifier(n_estimators=140, 
                             max_features=4, 
                             max_depth=4500, 
                             min_samples_split=16, 
                             random_state=0)

rfc_oof_predictions = oof_predictions(rfc, X, y, n_folds)

A Jupyter Widget




In [29]:
printClassificationErrors(y, rfc_oof_predictions)

Confusion Matrix:
[[  29  103   29    1    0    0    0]
 [  10  279  251   12    0    0    0]
 [   1   84 1098  275    2    0    0]
 [   1    4  236 1304   19    1    0]
 [   0    0    6  139   63    3    0]
 [   0    0    0    8   26   97    0]
 [   0    0    0    0    3    1    0]]
Classification Report:
             precision    recall  f1-score   support

          1       0.71      0.18      0.29       162
          2       0.59      0.51      0.55       552
          3       0.68      0.75      0.71      1460
          4       0.75      0.83      0.79      1565
          5       0.56      0.30      0.39       211
          6       0.95      0.74      0.83       131
          7       0.00      0.00      0.00         4

avg / total       0.70      0.70      0.69      4085

Accuracy Score: 0.7025703794369645


In [30]:
logr = LogisticRegression()

logr_oof_predictions = oof_predictions(logr, X, y, n_folds)
printClassificationErrors(y, logr_oof_predictions)

A Jupyter Widget


Confusion Matrix:
[[   9   91   60    2    0    0    0]
 [   7  106  411   28    0    0    0]
 [   4   42 1076  334    4    0    0]
 [   1    2  314 1231   16    1    0]
 [   0    0    1  182   24    4    0]
 [   0    0    0   31   12   88    0]
 [   0    0    0    1    2    1    0]]
Classification Report:
             precision    recall  f1-score   support

          1       0.43      0.06      0.10       162
          2       0.44      0.19      0.27       552
          3       0.58      0.74      0.65      1460
          4       0.68      0.79      0.73      1565
          5       0.41      0.11      0.18       211
          6       0.94      0.67      0.78       131
          7       0.00      0.00      0.00         4

avg / total       0.60      0.62      0.59      4085

Accuracy Score: 0.6203182374541004


In [31]:
gbc = GradientBoostingClassifier(n_estimators=1000)

gbc_oof_predictions = oof_predictions(gbc, X, y, n_folds)
printClassificationErrors(y, gbc_oof_predictions)

A Jupyter Widget


Confusion Matrix:
[[  87   71    4    0    0    0    0]
 [  39  426   85    1    1    0    0]
 [   3   63 1324   68    2    0    0]
 [   1    1   71 1438   49    4    1]
 [   0    0    0   71  122   18    0]
 [   0    0    0    3   28  100    0]
 [   0    0    0    0    0    4    0]]
Classification Report:
             precision    recall  f1-score   support

          1       0.67      0.54      0.60       162
          2       0.76      0.77      0.77       552
          3       0.89      0.91      0.90      1460
          4       0.91      0.92      0.91      1565
          5       0.60      0.58      0.59       211
          6       0.79      0.76      0.78       131
          7       0.00      0.00      0.00         4

avg / total       0.85      0.86      0.85      4085

Accuracy Score: 0.8560587515299878


In [32]:
svc = SVC(C=1, kernel='rbf', gamma=0.8)

svc_oof_predictions = oof_predictions(gbc, X, y, n_folds)
printClassificationErrors(y, svc_oof_predictions)

A Jupyter Widget


Confusion Matrix:
[[  87   72    3    0    0    0    0]
 [  38  428   84    1    1    0    0]
 [   4   63 1323   67    3    0    0]
 [   1    1   71 1437   50    4    1]
 [   0    0    0   72  122   17    0]
 [   0    0    0    3   28  100    0]
 [   0    0    0    0    0    4    0]]
Classification Report:
             precision    recall  f1-score   support

          1       0.67      0.54      0.60       162
          2       0.76      0.78      0.77       552
          3       0.89      0.91      0.90      1460
          4       0.91      0.92      0.91      1565
          5       0.60      0.58      0.59       211
          6       0.80      0.76      0.78       131
          7       0.00      0.00      0.00         4

avg / total       0.85      0.86      0.85      4085

Accuracy Score: 0.8560587515299878


In [33]:
mlp = MLPClassifier()

mlp_oof_predictions = oof_predictions(mlp, X, y, n_folds)
printClassificationErrors(y, mlp_oof_predictions)

A Jupyter Widget


Confusion Matrix:
[[  52   93   17    0    0    0    0]
 [  34  277  238    3    0    0    0]
 [   8   98 1117  230    7    0    0]
 [   1    3  226 1293   40    2    0]
 [   0    0    1  121   78   11    0]
 [   0    0    0   11   22   98    0]
 [   0    0    0    0    2    2    0]]
Classification Report:
             precision    recall  f1-score   support

          1       0.55      0.32      0.40       162
          2       0.59      0.50      0.54       552
          3       0.70      0.77      0.73      1460
          4       0.78      0.83      0.80      1565
          5       0.52      0.37      0.43       211
          6       0.87      0.75      0.80       131
          7       0.00      0.00      0.00         4

avg / total       0.70      0.71      0.71      4085

Accuracy Score: 0.7135862913096696


In [34]:
oof_predictions_lvl_one = pd.DataFrame(data={'knn': knn_oof_predictions, 
                                             'xgbc': xgbc_oof_predictions, 
                                             'rfc': rfc_oof_predictions, 
                                             'logr': logr_oof_predictions, 
                                             'gbc': gbc_oof_predictions, 
                                             'svc': svc_oof_predictions, 
                                             'mlp': mlp_oof_predictions})

oof_predictions_lvl_one.head()

Unnamed: 0,gbc,knn,logr,mlp,rfc,svc,xgbc
0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
1,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,5.0,5.0,5.0,5.0,5.0,5.0,5.0
3,3.0,4.0,4.0,4.0,3.0,3.0,3.0
4,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [35]:
ITERATIONS = 100
METRIC = 'accuracy'

bayes_cv_tuner = BayesSearchCV(
    estimator = xgb.XGBClassifier(
        n_jobs = 2,
        objective = 'multi:softmax',
        eval_metric = 'mlogloss',
        silent=1,
        tree_method='approx'
    ),
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'min_child_weight': (0, 10),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'min_child_weight': (0, 5),
        'n_estimators': (50, 100),
        'scale_pos_weight': (1e-6, 500, 'log-uniform')
    },    
    scoring = None,
    cv = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 1,
    n_iter = ITERATIONS,
    verbose = 0,
    refit = True,####
    random_state = 42
)

def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest {}: {}\nBest params: {}\n'.format(
        len(all_models),
        METRIC,
        np.round(bayes_cv_tuner.best_score_, 5),
        bayes_cv_tuner.best_params_
    ))

In [36]:
result = bayes_cv_tuner.fit(oof_predictions_lvl_one.values, y.values, callback=status_print)

Model #1
Best accuracy: 0.85826
Best params: {'subsample': 0.13556548021189216, 'max_depth': 21, 'colsample_bylevel': 0.41600291926478072, 'learning_rate': 0.042815319280763466, 'n_estimators': 87, 'scale_pos_weight': 0.060830282487222144, 'reg_lambda': 0.059360706359120489, 'min_child_weight': 2, 'max_delta_step': 13, 'reg_alpha': 5.4975577392897861e-07, 'colsample_bytree': 0.73044848574555188, 'gamma': 0.13031389926541354}

Model #2
Best accuracy: 0.85851
Best params: {'subsample': 0.99237105986371343, 'max_depth': 3, 'colsample_bylevel': 0.83901447199775159, 'learning_rate': 0.79881794627812419, 'n_estimators': 68, 'scale_pos_weight': 0.30164107718431421, 'reg_lambda': 276.54244755742252, 'min_child_weight': 1, 'max_delta_step': 17, 'reg_alpha': 0.00052669830037015467, 'colsample_bytree': 0.88448212460705367, 'gamma': 4.3586846084807948e-07}

Model #3
Best accuracy: 0.85851
Best params: {'subsample': 0.99237105986371343, 'max_depth': 3, 'colsample_bylevel': 0.83901447199775159, 'lea