Let's see how we do with xgboost

In [1]:
import pandas as pd

# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline 

from xgboost import XGBClassifier
# Import RandomForestClassifier and GradientBoostingClassifer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Function for splitting training and test set
from sklearn.model_selection import train_test_split
# Function for creating model pipelines
from sklearn.pipeline import make_pipeline
# For standardization
from sklearn.preprocessing import StandardScaler
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

Let's load in the dataset from the processed data folder.

In [2]:
df = pd.read_csv('../input/3-merge-datasets-step-1/model_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Season,LowID,HighID,Win,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,...,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,0,2003,1411,1421,0,72.8,0.448892,0.321414,0.613745,13.166667,...,0.766142,12.275862,23.172414,13.034483,16.206897,7.068966,3.0,19.103448,240.34375,16.0
1,1,2003,1112,1436,1,85.214286,0.463563,0.35106,0.701154,15.178571,...,0.649708,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,153.125,16.0
2,2,2003,1113,1272,1,75.965517,0.48168,0.328376,0.675667,13.689655,...,0.628299,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621,21.705882,7.0
3,3,2003,1141,1166,1,79.344828,0.506349,0.377481,0.762741,10.586207,...,0.689707,10.878788,23.181818,16.818182,13.363636,8.393939,4.454545,17.272727,20.735294,6.0
4,4,2003,1143,1301,1,74.482759,0.468741,0.375934,0.688632,11.241379,...,0.778148,9.733333,22.033333,14.666667,14.2,7.766667,3.066667,18.666667,50.3125,9.0


In [3]:
df.columns.values

array(['Unnamed: 0', 'Season', 'LowID', 'HighID', 'Win', 'LowScore',
       'LowFGP', 'LowFGP3', 'LowFTP', 'LowOR', 'LowDR', 'LowAst', 'LowTO',
       'LowStl', 'LowBlk', 'LowPF', 'LowRank', 'LowSeed', 'HighScore',
       'HighFGP', 'HighFGP3', 'HighFTP', 'HighOR', 'HighDR', 'HighAst',
       'HighTO', 'HighStl', 'HighBlk', 'HighPF', 'HighRank', 'HighSeed'],
      dtype=object)

In [4]:
df = df.drop(columns = ['Unnamed: 0'])
df.head()

Unnamed: 0,Season,LowID,HighID,Win,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,LowDR,...,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,2003,1411,1421,0,72.8,0.448892,0.321414,0.613745,13.166667,24.8,...,0.766142,12.275862,23.172414,13.034483,16.206897,7.068966,3.0,19.103448,240.34375,16.0
1,2003,1112,1436,1,85.214286,0.463563,0.35106,0.701154,15.178571,27.642857,...,0.649708,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,153.125,16.0
2,2003,1113,1272,1,75.965517,0.48168,0.328376,0.675667,13.689655,23.310345,...,0.628299,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621,21.705882,7.0
3,2003,1141,1166,1,79.344828,0.506349,0.377481,0.762741,10.586207,23.275862,...,0.689707,10.878788,23.181818,16.818182,13.363636,8.393939,4.454545,17.272727,20.735294,6.0
4,2003,1143,1301,1,74.482759,0.468741,0.375934,0.688632,11.241379,24.37931,...,0.778148,9.733333,22.033333,14.666667,14.2,7.766667,3.066667,18.666667,50.3125,9.0


We need to split the data so it's the target and input features.

In [5]:
# Create separate object for target variable
y = df.Win

# Create separate object for input features
X = df.drop('Win', axis=1)

Split into training and test data with 20% of the observations going to the test set. We also give it a random state so we can reproduce the results.

In [6]:
# Split X and y into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1234)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

944 237 944 237


Ok time for pipelines. 

In [None]:
# Pipeline dictionary
pipelines = {
    'xg': make_pipeline(StandardScaler(),XGBClassifier(random_state=123)),
    'gb': make_pipeline(StandardScaler(),GradientBoostingClassifier(random_state=123))
            }

In [None]:
# Boosted Tree hyperparameters
gb_hyperparameters = {'gradientboostingclassifier__n_estimators': [200],
                     'gradientboostingclassifier__learning_rate': [0.05],
                     'gradientboostingclassifier__max_depth': [1]}

In [None]:
# Boosted Tree hyperparameters
xg_hyperparameters = {
'xgbclassifier__max_depth': range (2, 10, 1),
    'xgbclassifier__n_estimators': range(60, 220, 40),
    'xgbclassifier__learning_rate': [0.1, 0.01, 0.05]
}

In [None]:
# Create hyperparameters dictionary
hyperparameters = {
    'xg': xg_hyperparameters,
    'gb': gb_hyperparameters
}

In [None]:
# Create empty dictionary called fitted_models
fitted_models = {}

# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
    # Create cross-validation object from pipeline and hyperparameters
    model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1, scoring='neg_log_loss')
    
    # Fit model on X_train, y_train
    model.fit(X_train,y_train)
    
    # Store model in fitted_models[name] 
    fitted_models[name] = model
    
    # Print '{name} has been fitted'
    print(name, 'has been fitted')

In [None]:
fitted_models['xg']

In [None]:
model_xg = fitted_models['xg']

In [None]:
y_pred = model_xg.predict(X_test)
predictions = [round(value) for value in y_pred]

In [None]:
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

If I play around by hand I can get a little bit better accuracy.

In [7]:
model = XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.5, max_delta_step=2, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [8]:
model.fit(X_train,y_train)



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.5, max_delta_step=2,
              max_depth=10, min_child_weight=1, missing=None,
              monotone_constraints='()', n_estimators=300, n_jobs=4, nthread=-1,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              silent=True, subsample=1, tree_method='exact',
              validate_parameters=1, ...)

In [9]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

XGBoostError: [01:40:41] ../src/c_api/c_api_utils.h:161: Invalid missing value: null
Stack trace:
  [bt] (0) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x9733d) [0x7f05ce53433d]
  [bt] (1) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xaf4ec) [0x7f05ce54c4ec]
  [bt] (2) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xb560f) [0x7f05ce55260f]
  [bt] (3) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterPredictFromDense+0x28d) [0x7f05ce53852d]
  [bt] (4) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.8(+0x6a4a) [0x7f0657fe7a4a]
  [bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.8(+0x5fea) [0x7f0657fe6fea]
  [bt] (6) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2f4) [0x7f0657ffd784]
  [bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x10ff8) [0x7f0657ffdff8]
  [bt] (8) /opt/conda/bin/python(_PyObject_FastCallKeywords+0x47b) [0x55aa80aee72b]



In [10]:
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

NameError: name 'predictions' is not defined

That seems to be pretty decent. Most of the values that I was playing with gave me somewhere between 66 and 70. Let's check it against the winning gradient boosting model from 3.0-theberling-classifier.ipynb

In [11]:
model2 = GradientBoostingClassifier(n_estimators=200,learning_rate=0.05,max_depth=1)

In [12]:
model2.fit(X_train,y_train)

GradientBoostingClassifier(learning_rate=0.05, max_depth=1, n_estimators=200)

In [13]:
y_pred2 = model2.predict(X_test)
predictions2 = [round(value) for value in y_pred2]

In [14]:
accuracy2 = accuracy_score(y_test, predictions2)
print(accuracy2)

0.7130801687763713


Let's run it on our submission data set and see what happens.

In [15]:
data = pd.read_csv('../input/4-merge-datasets-step-2/model_dataset2.csv')
data.head()

Unnamed: 0,Season,LowID,HighID,LowScore,LowFGP,LowFGP3,LowFTP,LowOR,LowDR,LowAst,...,HighFTP,HighOR,HighDR,HighAst,HighTO,HighStl,HighBlk,HighPF,HighRank,HighSeed
0,2016,1112,1114,81.212121,0.48596,0.37945,0.725924,11.636364,28.606061,14.515152,...,0.727142,8.709677,24.193548,12.903226,10.225806,6.290323,3.064516,18.258065,55.457627,12.0
1,2016,1112,1122,81.212121,0.48596,0.37945,0.725924,11.636364,28.606061,14.515152,...,0.662456,11.090909,24.787879,12.878788,14.545455,6.909091,2.757576,17.636364,208.293103,16.0
2,2016,1112,1124,81.212121,0.48596,0.37945,0.725924,11.636364,28.606061,14.515152,...,0.721206,13.5625,23.21875,17.09375,12.8125,7.71875,3.96875,19.40625,24.229508,5.0
3,2016,1112,1138,81.212121,0.48596,0.37945,0.725924,11.636364,28.606061,14.515152,...,0.71415,12.090909,26.727273,12.545455,13.818182,6.727273,3.484848,21.272727,122.465517,14.0
4,2016,1112,1139,81.212121,0.48596,0.37945,0.725924,11.636364,28.606061,14.515152,...,0.7263,11.419355,25.0,14.354839,10.225806,6.774194,3.354839,19.903226,34.241379,9.0


In [16]:
predictions = model.predict(data)
predictions

XGBoostError: [01:41:59] ../src/c_api/c_api_utils.h:161: Invalid missing value: null
Stack trace:
  [bt] (0) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x9733d) [0x7f05ce53433d]
  [bt] (1) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xaf4ec) [0x7f05ce54c4ec]
  [bt] (2) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xb560f) [0x7f05ce55260f]
  [bt] (3) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterPredictFromDense+0x28d) [0x7f05ce53852d]
  [bt] (4) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.8(+0x6a4a) [0x7f0657fe7a4a]
  [bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.8(+0x5fea) [0x7f0657fe6fea]
  [bt] (6) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2f4) [0x7f0657ffd784]
  [bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x10ff8) [0x7f0657ffdff8]
  [bt] (8) /opt/conda/bin/python(_PyObject_FastCallKeywords+0x47b) [0x55aa80aee72b]



In [17]:
probs = model.predict_proba(data)
probs

XGBoostError: [01:42:12] ../src/c_api/c_api_utils.h:161: Invalid missing value: null
Stack trace:
  [bt] (0) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x9733d) [0x7f05ce53433d]
  [bt] (1) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xaf4ec) [0x7f05ce54c4ec]
  [bt] (2) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xb560f) [0x7f05ce55260f]
  [bt] (3) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterPredictFromDense+0x28d) [0x7f05ce53852d]
  [bt] (4) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.8(+0x6a4a) [0x7f0657fe7a4a]
  [bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.8(+0x5fea) [0x7f0657fe6fea]
  [bt] (6) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2f4) [0x7f0657ffd784]
  [bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x10ff8) [0x7f0657ffdff8]
  [bt] (8) /opt/conda/bin/python(_PyObject_FastCallKeywords+0x47b) [0x55aa80aee72b]



In [18]:
prob_of_1 = probs[:,1]
prob_of_1

NameError: name 'probs' is not defined

In [19]:
def make_id(row):
    season = row[0]
    low_id = row[1]
    high_id = row[2]
    ID = str(int(season))+'_'+str(int(low_id))+'_'+str(int(high_id))
    return ID

In [None]:
ID = data.apply(make_id,axis=1)

In [None]:
df_submission = pd.DataFrame()

In [None]:
df_submission['ID'] = ID
df_submission['Pred'] = prob_of_1
df_submission.head()

In [None]:
df_submission.to_csv('kaggle/working/phase1_submissions1.csv',index=False)

Out of curiousity let's look at the feature importance.

In [None]:
model.feature_importances_

In [None]:
feature_names = X.columns

In [None]:
plt.barh(feature_names, model.feature_importances_)

What about gradient boost?

In [None]:
model2.feature_importances_

In [None]:
plt.barh(feature_names, model2.feature_importances_)