# Building an XGBoost Classifier

The last classifier was a Random Forest Classifier that seemed to perform poorly with the complex dataset.

We are trying a XGBoost model since it uses more fine-grained weighting of predictions and sensitivity to feature interaction.

In [25]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [26]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [27]:
df = pd.read_csv('cleaned_data.csv')

df.columns



Index(['Team', 'Home/Away', 'Opp', 'MP_G', 'FG_G', 'FGA_G', 'FG%_G', '3P_G',
       '3PA_G', '3P%_G', 'FT_G', 'FTA_G', 'FT%_G', 'ORB_G', 'DRB_G', 'TRB_G',
       'AST_G', 'STL_G', 'BLK_G', 'TOV_G', 'PF_G', 'PTS_G', 'GmSc',
       'Plus/Minus_G', 'W/L_Margin', 'Season', 'Ttl_MP_Sn', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'ValORP',
       'Avg_MP_Sn', 'FG_Sn', 'FGA_Sn', 'FG%_Sn', '3P_Sn', '3PA_Sn', '3P%_Sn',
       '2P_Sn', '2PA_Sn', '2P%_Sn', 'eFG%_Sn', 'FT_Sn', 'FTA_Sn', 'FT%_Sn',
       'ORB_Sn', 'DRB_Sn', 'TRB_Sn', 'AST_Sn', 'STL_Sn', 'BLK_Sn', 'TOV_Sn',
       'PF_Sn', 'PTS_Sn'],
      dtype='object')

In [28]:
# Bin Target
# Creating the function to bin the target variable

def bin_margin(margin):
    if 1 <= margin <= 5:
        return 'Close Victory'
    elif 6 <= margin <= 19:
        return 'Medium Victory'
    elif margin >= 20:
        return 'Blowout Victory'
    elif -5 <= margin <= -1:  
        return 'Tight Loss'
    elif -19 <= margin <= -6: 
        return 'Medium Loss'
    elif margin <= -20:
        return 'Blowout Loss'
    elif margin == 0:  
        return 'Tie'
    else:
        return 'Unknown'
    

df['Target'] = df['W/L_Margin'].apply(bin_margin)
df['Target'].value_counts()
df.drop(columns=['W/L_Margin'], inplace=True)

df.head()

Unnamed: 0,Team,Home/Away,Opp,MP_G,FG_G,FGA_G,FG%_G,3P_G,3PA_G,3P%_G,...,ORB_Sn,DRB_Sn,TRB_Sn,AST_Sn,STL_Sn,BLK_Sn,TOV_Sn,PF_Sn,PTS_Sn,Target
0,DAL,Away,PHO,19,1,3,0.333,1,3,0.333,...,0.3,2.0,2.3,3.2,0.5,0.1,1.2,1.7,9.3,Blowout Loss
1,DAL,Home,MIN,20,3,7,0.429,1,4,0.25,...,0.3,2.0,2.3,3.2,0.5,0.1,1.2,1.7,9.3,Close Victory
2,DAL,Home,CHI,14,2,7,0.286,2,4,0.5,...,0.3,2.0,2.3,3.2,0.5,0.1,1.2,1.7,9.3,Medium Victory
3,DAL,Away,ATL,18,3,9,0.333,0,3,0.0,...,0.3,2.0,2.3,3.2,0.5,0.1,1.2,1.7,9.3,Medium Loss
4,DAL,Away,TOR,29,3,11,0.273,2,4,0.5,...,0.3,2.0,2.3,3.2,0.5,0.1,1.2,1.7,9.3,Medium Loss


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report


# Split the data
X= df.drop(columns=['Target'])
y = df['Target']

label_encoder = LabelEncoder()
y_transformed = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.20, random_state=42)


In [30]:
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#preprocessing features
categorical_features = ['Team','Home/Away', 'Opp', 'Season']
numerical_features = ['MP_G', 'FG_G', 'FGA_G', 'FG%_G', '3P_G',
       '3PA_G', '3P%_G', 'FT_G', 'FTA_G', 'FT%_G', 'ORB_G', 'DRB_G', 'TRB_G',
       'AST_G', 'STL_G', 'BLK_G', 'TOV_G', 'PF_G', 'PTS_G', 'GmSc',
       'Plus/Minus_G', 'Ttl_MP_Sn', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'ValORP',
       'Avg_MP_Sn', 'FG_Sn', 'FGA_Sn', 'FG%_Sn', '3P_Sn', '3PA_Sn', '3P%_Sn',
       '2P_Sn', '2PA_Sn', '2P%_Sn', 'eFG%_Sn', 'FT_Sn', 'FTA_Sn', 'FT%_Sn',
       'ORB_Sn', 'DRB_Sn', 'TRB_Sn', 'AST_Sn', 'STL_Sn', 'BLK_Sn', 'TOV_Sn',
       'PF_Sn', 'PTS_Sn']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

#Basic model before tuning hyperparameters

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier())
])

#Fit the model pipeline
pipeline.fit(X_train, y_train)


In [31]:
# Make predictions

y_pred = pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.3191489361702128
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.17      0.25      0.20         8
           2       0.06      0.11      0.07         9
           3       0.48      0.48      0.48        25
           4       0.45      0.42      0.43        31
           5       0.29      0.12      0.17        16

    accuracy                           0.32        94
   macro avg       0.24      0.23      0.23        94
weighted avg       0.34      0.32      0.32        94



Starting at 32% accuracy

## Tuning the model

In [32]:
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#preprocessing features
categorical_features = ['Team','Home/Away', 'Opp', 'Season']
numerical_features = ['MP_G', 'FG_G', 'FGA_G', 'FG%_G', '3P_G',
       '3PA_G', '3P%_G', 'FT_G', 'FTA_G', 'FT%_G', 'ORB_G', 'DRB_G', 'TRB_G',
       'AST_G', 'STL_G', 'BLK_G', 'TOV_G', 'PF_G', 'PTS_G', 'GmSc',
       'Plus/Minus_G', 'Ttl_MP_Sn', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'ValORP',
       'Avg_MP_Sn', 'FG_Sn', 'FGA_Sn', 'FG%_Sn', '3P_Sn', '3PA_Sn', '3P%_Sn',
       '2P_Sn', '2PA_Sn', '2P%_Sn', 'eFG%_Sn', 'FT_Sn', 'FTA_Sn', 'FT%_Sn',
       'ORB_Sn', 'DRB_Sn', 'TRB_Sn', 'AST_Sn', 'STL_Sn', 'BLK_Sn', 'TOV_Sn',
       'PF_Sn', 'PTS_Sn']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(base_score =.2,
                                  gamma = 6, 
                                  n_estimators=500,
                                  max_depth=7, 
                                  learning_rate=0.05,
                                  min_child_weight=1,
                                  scale_pos_weight=1,
                                  subsample=.8,
                                  colsample_bytree=.8))
])

#Fit the model pipeline
pipeline.fit(X_train, y_train)


Parameters: { "scale_pos_weight" } are not used.



In [33]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.3829787234042553
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.27      0.38      0.32         8
           2       0.00      0.00      0.00         9
           3       0.48      0.48      0.48        25
           4       0.36      0.68      0.47        31
           5       0.00      0.00      0.00        16

    accuracy                           0.38        94
   macro avg       0.19      0.26      0.21        94
weighted avg       0.27      0.38      0.31        94



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


This model is performing worse even with hypertuning, there is not much of a difference. It still cannot recognize the first class.

## Hypertuning with GridSearchCV

In [34]:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessorX_tr', preprocessor),
    ('classifier', XGBClassifier())
])

param_grid = {
    'classifier__n_estimators': [100, 300, 400],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.001, 0.01, 0.1],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy')

In [35]:
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)


KeyboardInterrupt: 

In [18]:
best_model = grid_search.best_estimator_

In [19]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.3829787234042553
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.20      0.25         5
           1       0.17      0.25      0.20         8
           2       0.33      0.11      0.17         9
           3       0.46      0.44      0.45        25
           4       0.39      0.65      0.49        31
           5       1.00      0.06      0.12        16

    accuracy                           0.38        94
   macro avg       0.45      0.28      0.28        94
weighted avg       0.49      0.38      0.35        94



The hypertuned model is performing the same as the previous one. The diffeence is that it can recognize the first class though it is barely there. It is also overfitting on the 6th class. This is not doing any better than the previous model. In fact it is worse.

## Let's try to address Class Imbalance

In [20]:
# stratification in the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_transformed, 
    test_size=0.2, 
    random_state=42,
    stratify=y_transformed)

# get class weights
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))


#Update model with class weights and adjusted parameters
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        max_depth=3,
        learning_rate=0.01,
        n_estimators=400,
        min_child_weight=3,
        class_weights=class_weight_dict  
    ))
])

In [21]:
pipeline.fit(X_train, y_train)

Parameters: { "class_weights" } are not used.



In [22]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.40425531914893614
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.40      0.31         5
           1       0.50      0.55      0.52        11
           2       0.00      0.00      0.00        13
           3       0.38      0.57      0.46        23
           4       0.43      0.52      0.47        29
           5       0.40      0.15      0.22        13

    accuracy                           0.40        94
   macro avg       0.33      0.36      0.33        94
weighted avg       0.35      0.40      0.36        94



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The biggest difference is there is more balanced performance across all the classes. The accuracy is still low and is now at the same level of accuracy as the RandomForest Classifier. The accuracy will likely not be improved with hypertuning because of the inherent difficulty of the problem and the limited data for some classes.

In [36]:
#Do another grid search with the balanced weights
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        max_depth=3,
        learning_rate=0.01,
        n_estimators=400,
        class_weights=class_weight_dict  
    ))
])
#Use stratified kfold

from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    'classifier__min_child_weight': [1, 3, 5],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=cv,
    scoring='balanced_accuracy',  # Changed to balanced accuracy
    n_jobs=-1
)

In [37]:
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)


Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: { "class_weights" } are not used.

Parameters: {

{'classifier__colsample_bytree': 0.8, 'classifier__min_child_weight': 1, 'classifier__subsample': 0.6}
0.34021747741696595


In [38]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.3829787234042553
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.20      0.29         5
           1       0.20      0.25      0.22         8
           2       0.00      0.00      0.00         9
           3       0.44      0.56      0.49        25
           4       0.40      0.61      0.49        31
           5       0.00      0.00      0.00        16

    accuracy                           0.38        94
   macro avg       0.26      0.27      0.25        94
weighted avg       0.29      0.38      0.33        94



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


This grid search did not improve the accuracy nor did it change the classification report.


# Next Steps, Recommendations

1. The inherent difficulty of the problem is a huge limitation. There are a lot of factors that contribute to the outcome of a game that can't necessarily be quantified.
2. To better tackle the problem we need to scrape more data such as players stats for Brunson's teammates and opponents to give the classifier more context on how other players perform since after all the outcome is largely based on team performance.
3. It would also help to try to get more data that balances the classes.