Let's see how we do with xgboost

In [1]:
import pandas as pd

from xgboost import XGBClassifier
# Import RandomForestClassifier and GradientBoostingClassifer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Function for splitting training and test set
from sklearn.model_selection import train_test_split
# Function for creating model pipelines
from sklearn.pipeline import make_pipeline
# For standardization
from sklearn.preprocessing import StandardScaler
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

Let's load in the dataset from the processed data folder.

In [7]:
df = pd.read_csv('../../data/processed/model_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Season,LowID,HighID,Win,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,...,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,0,2003,1411,1421,0,72.8,0.448892,0.321414,0.613745,13.166667,...,0.766142,12.275862,23.172414,13.034483,16.206897,7.068966,3.0,19.103448,240.34375,16.0
1,1,2003,1112,1436,1,85.214286,0.463563,0.35106,0.701154,15.178571,...,0.649708,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,153.125,16.0
2,2,2003,1113,1272,1,75.965517,0.48168,0.328376,0.675667,13.689655,...,0.628299,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621,21.705882,7.0
3,3,2003,1141,1166,1,79.344828,0.506349,0.377481,0.762741,10.586207,...,0.689707,10.878788,23.181818,16.818182,13.363636,8.393939,4.454545,17.272727,20.735294,6.0
4,4,2003,1143,1301,1,74.482759,0.468741,0.375934,0.688632,11.241379,...,0.778148,9.733333,22.033333,14.666667,14.2,7.766667,3.066667,18.666667,50.3125,9.0


In [12]:
df.columns.values

array(['Unnamed: 0', 'Season', 'LowID', 'HighID', 'Win', 'LowScore',
       'LowFGP', 'LowFGP3', 'LowFTP', 'LowOR', 'LowDR', 'LowAst', 'LowTO',
       'LowStl', 'LowBlk', 'LowPF', 'LowRank', 'LowSeed', 'HighScore',
       'HighFGP', 'HighFGP3', 'HighFTP', 'HighOR', 'HighDR', 'HighAst',
       'HighTO', 'HighStl', 'HighBlk', 'HighPF', 'HighRank', 'HighSeed'],
      dtype=object)

In [13]:
df = df.drop(columns = ['Unnamed: 0'])
df.head()

Unnamed: 0,Season,LowID,HighID,Win,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,LowDR,...,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,2003,1411,1421,0,72.8,0.448892,0.321414,0.613745,13.166667,24.8,...,0.766142,12.275862,23.172414,13.034483,16.206897,7.068966,3.0,19.103448,240.34375,16.0
1,2003,1112,1436,1,85.214286,0.463563,0.35106,0.701154,15.178571,27.642857,...,0.649708,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,153.125,16.0
2,2003,1113,1272,1,75.965517,0.48168,0.328376,0.675667,13.689655,23.310345,...,0.628299,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621,21.705882,7.0
3,2003,1141,1166,1,79.344828,0.506349,0.377481,0.762741,10.586207,23.275862,...,0.689707,10.878788,23.181818,16.818182,13.363636,8.393939,4.454545,17.272727,20.735294,6.0
4,2003,1143,1301,1,74.482759,0.468741,0.375934,0.688632,11.241379,24.37931,...,0.778148,9.733333,22.033333,14.666667,14.2,7.766667,3.066667,18.666667,50.3125,9.0


We need to split the data so it's the target and input features.

In [17]:
# Create separate object for target variable
y = df.Win

# Create separate object for input features
X = df.drop('Win', axis=1)

Split into training and test data with 20% of the observations going to the test set. We also give it a random state so we can reproduce the results.

In [18]:
# Split X and y into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1234)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

892 223 892 223


Ok time for pipelines. 

In [54]:
# Pipeline dictionary
pipelines = {
    'xg': make_pipeline(StandardScaler(),XGBClassifier(random_state=123)),
    'gb': make_pipeline(StandardScaler(),GradientBoostingClassifier(random_state=123))
            }

In [55]:
# Boosted Tree hyperparameters
gb_hyperparameters = {'gradientboostingclassifier__n_estimators': [200],
                     'gradientboostingclassifier__learning_rate': [0.05],
                     'gradientboostingclassifier__max_depth': [1]}

In [59]:
# Boosted Tree hyperparameters
xg_hyperparameters = {
'xgbclassifier__max_depth': range (2, 10, 1),
    'xgbclassifier__n_estimators': range(60, 220, 40),
    'xgbclassifier__learning_rate': [0.1, 0.01, 0.05]
}

In [60]:
# Create hyperparameters dictionary
hyperparameters = {
    'xg': xg_hyperparameters,
    'gb': gb_hyperparameters
}

In [61]:
# Create empty dictionary called fitted_models
fitted_models = {}

# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
    # Create cross-validation object from pipeline and hyperparameters
    model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1, scoring='neg_log_loss')
    
    # Fit model on X_train, y_train
    model.fit(X_train,y_train)
    
    # Store model in fitted_models[name] 
    fitted_models[name] = model
    
    # Print '{name} has been fitted'
    print(name, 'has been fitted')

xg has beed fitted
gb has beed fitted


In [63]:
fitted_models['xg']

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,
                                                      colsample_bynode=1,
                                                      colsample_bytree=1,
                                                      gamma=0,
                                                      learning_rate=0.1,
                                                      max_delta_step=0,
                     

In [64]:
model_xg = fitted_models['xg']

In [65]:
y_pred = model_xg.predict(X_test)
predictions = [round(value) for value in y_pred]

In [66]:
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.6905829596412556


If I play around by hand I can get a little bit better accuracy.

In [29]:
model = XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.5, max_delta_step=2, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [30]:
model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.5, max_delta_step=2, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=-1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              silent=True, subsample=1, verbosity=1)

In [31]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [32]:
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.7130044843049327


That seems to be pretty decent. Most of the values that I was playing with gave me somewhere between 66 and 70. Let's check it against the winning gradient boosting model from 3.0-theberling-classifier.ipynb

In [33]:
model2 = GradientBoostingClassifier(n_estimators=200,learning_rate=0.05,max_depth=1)

In [34]:
model2.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.05, loss='deviance', max_depth=1,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [35]:
y_pred2 = model2.predict(X_test)
predictions2 = [round(value) for value in y_pred2]

In [36]:
accuracy2 = accuracy_score(y_test, predictions2)
print(accuracy2)

0.6860986547085202


Let's run it on our submission data set and see what happens.

In [37]:
data = pd.read_csv('../../data/processed/model_dataset2.csv')
data.head()

Unnamed: 0,Season,LowID,HighID,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,LowDR,LowAst,...,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,2021,1101,1104,76.304348,0.459648,0.383192,0.689699,9.608696,22.826087,18.173913,...,0.711088,10.4,25.966667,14.133333,13.833333,8.633333,4.333333,4.933333,6.865385,2.0
1,2021,1101,1111,76.304348,0.459648,0.383192,0.689699,9.608696,22.826087,18.173913,...,0.699853,8.875,22.041667,11.333333,11.125,7.541667,3.166667,8.083333,180.36,16.0
2,2021,1101,1116,76.304348,0.459648,0.383192,0.689699,9.608696,22.826087,18.173913,...,0.743755,10.571429,26.142857,14.928571,12.821429,8.035714,5.142857,4.035714,13.269231,3.0
3,2021,1101,1124,76.304348,0.459648,0.383192,0.689699,9.608696,22.826087,18.173913,...,0.703066,11.333333,22.375,17.041667,12.166667,8.958333,3.75,5.25,3.038462,1.0
4,2021,1101,1140,76.304348,0.459648,0.383192,0.689699,9.608696,22.826087,18.173913,...,0.74271,8.32,28.28,16.16,13.0,4.68,2.84,4.6,20.666667,6.0


In [46]:
predictions = model.predict(data)
predictions

array([0, 0, 0, ..., 1, 0, 0])

In [47]:
probs = model.predict_proba(data)
probs

array([[0.90552395, 0.09447605],
       [0.66393287, 0.33606713],
       [0.91118332, 0.08881668],
       ...,
       [0.40607894, 0.59392106],
       [0.60249355, 0.39750645],
       [0.66988895, 0.33011105]])

In [48]:
prob_of_1 = probs[:,1]
prob_of_1

array([0.09447605, 0.33606713, 0.08881668, ..., 0.59392106, 0.39750645,
       0.33011105])

In [49]:
def make_id(row):
    season = row[0]
    low_id = row[1]
    high_id = row[2]
    ID = str(int(season))+'_'+str(int(low_id))+'_'+str(int(high_id))
    return ID

In [50]:
ID = data.apply(make_id,axis=1)

In [51]:
df_submission = pd.DataFrame()

In [52]:
df_submission['ID'] = ID
df_submission['Pred'] = prob_of_1
df_submission.head()

Unnamed: 0,ID,Pred
0,2021_1101_1104,0.094476
1,2021_1101_1111,0.336067
2,2021_1101_1116,0.088817
3,2021_1101_1124,0.063618
4,2021_1101_1140,0.239142


In [53]:
df_submission.to_csv('../../data/predictions/phase1_submissions1.csv',index=False)