In [33]:
#import the necessary libraries
import pandas as pd
import os
import seaborn as sns


In [34]:
#take a look at what we have to explore
!ls

[31mAllstarFull.csv[m[m         [31mFieldingOF.csv[m[m          [31mPitchingPost.csv[m[m
[31mAppearances.csv[m[m         [31mFieldingOFsplit.csv[m[m     [31mSalaries.csv[m[m
[31mAwardsManagers.csv[m[m      [31mFieldingPost.csv[m[m        [31mSchools.csv[m[m
[31mAwardsPlayers.csv[m[m       [31mHallOfFame.csv[m[m          [31mSeriesPost.csv[m[m
[31mAwardsShareManagers.csv[m[m [31mHomeGames.csv[m[m           [31mTeams.csv[m[m
[31mAwardsSharePlayers.csv[m[m  [31mManagers.csv[m[m            [31mTeamsFranchises.csv[m[m
[31mBatting.csv[m[m             [31mManagersHalf.csv[m[m        [31mTeamsHalf.csv[m[m
[31mBattingPost.csv[m[m         [31mParks.csv[m[m               Untitled.ipynb
[31mCollegePlaying.csv[m[m      [31mPeople.csv[m[m              [31mreadme2014.txt[m[m
[31mFielding.csv[m[m            [31mPitching.csv[m[m


In [35]:
#import the pitchers data and take a look at it
pitchers = pd.read_csv('Pitching.csv')
pitchers.tail()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
45801,youngch03,2017,1,KCA,AL,0,0,14,2,0,...,2.0,1,1.0,0,148.0,5,27,0.0,0.0,3.0
45802,zastrro01,2017,1,CHN,NL,0,0,4,0,0,...,0.0,0,1.0,0,62.0,0,13,0.0,0.0,3.0
45803,zieglbr01,2017,1,MIA,NL,1,4,53,0,0,...,6.0,1,6.0,0,211.0,21,29,3.0,0.0,10.0
45804,zimmejo02,2017,1,DET,AL,8,13,29,29,0,...,2.0,3,7.0,0,713.0,0,111,3.0,8.0,19.0
45805,zychto01,2017,1,SEA,AL,6,3,45,0,0,...,3.0,1,5.0,1,173.0,7,12,1.0,2.0,5.0


In [23]:
#look at what all the features we have are
pitchers.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'W', 'L', 'G', 'GS',
       'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp',
       'ERA', 'IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP'],
      dtype='object')

In [36]:
#I want to predict wins, so I set my target as the Wins column in our data frame
y = pitchers.W

#I then delete the Wins column because I don't want to train on the data I'm trying to predict
del pitchers['W']
pitchers.fillna(0)

#I want to get one-hot variables for categorical things like League and Team
#but if I leave the PlayerID in there, I will have thousands of one-hot columns that are meaningless
#it will make the data take too long to work with and doesn't add anything (that I can think of)
del pitchers['playerID']
#now that I have deleted
one_hot = pd.get_dummies(pitchers)


In [37]:
#we're going to need to validate our model, so let's use the train_test_split function
#here we import the necessary function
from sklearn.model_selection import train_test_split

#let's take a look at the columns again to make sure we didn't create way too many one-hot dummy variables
one_hot.columns

Index(['yearID', 'stint', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 'IPouts', 'H',
       ...
       'teamID_WS7', 'teamID_WS8', 'teamID_WS9', 'teamID_WSU', 'lgID_AA',
       'lgID_AL', 'lgID_FL', 'lgID_NL', 'lgID_PL', 'lgID_UA'],
      dtype='object', length=181)

In [38]:
#split the dataset 80/20 for training/testing
X_train, X_test, y_train, y_test = train_test_split(one_hot, y, test_size = .2, random_state=42)


In [39]:
#import the necessary library to create our model
from xgboost import XGBRegressor

#define the model. I arbitrarily selected the number of trees to fit and the learning rate
model = XGBRegressor(n_estimators=124, learning_rate=0.05)


In [40]:
#fit the model
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=124,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [42]:
#make predictions about the test set
predictions = model.predict(X_test)

#we want to know how well we predicted number of wins, so we will import a function to measure mean absolute error.
from sklearn.metrics import mean_absolute_error
print("MAE is "+ str(mean_absolute_error(predictions, y_test)))

MAE is 1.1135217059298754


In [45]:
#if we're predicting within about 1 game, that's a pretty good prediction.
#let's see if we can make it better with more trees
#I'll also turn of the silencer so we can see what's happening
model = XGBRegressor(n_estimators=150, learning_rate=0.05, silent=0)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print("MAE is "+ str(mean_absolute_error(predictions, y_test)))

[12:01:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:01:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:01:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:01:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:01:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:01:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:01:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:01:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:01:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[12:02:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:02:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:02:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[12:02:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:02:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:02:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:02:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:02:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:02:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[12:02:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[12:02:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
MAE is 1.093505686360841


In [None]:
#Obviously, there's room for improvement. We improved from 1.113 to 1.093