
# <font color='blue'>Part 1: Setting Up </font> 

First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:

In [14]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os


# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


# <font color='blue'>Part 2: Creating Train and Test Sets </font> 

### 2.1: Creating Data-Fetching Function

In [15]:
import os
import tarfile
from six.moves import urllib
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rnz269/ml_baseball/master/understanding_war/excel_files/"
BASEBALL_PATH = os.path.join("datasets", "baseball")
# BASEBALL_URL = DOWNLOAD_ROOT + "datasets/baseball/"

# Following function automates the data fetching process
# Following function creates a datasets/housing directory, downloads the housing.tgz file, and from this extracts the housing.csv 
def fetch_baseball_data(data_url, baseball_path=BASEBALL_PATH):
    # Allows me to use the fetch_baseball_data function on any dataset "data_url" in my Github library
    # Now:"https://github.com/rnz269/ml_baseball/blob/master/understanding_war/excel_files/"data_url"
    baseball_url = DOWNLOAD_ROOT + data_url
    # Following if function creates a local datasets/baseball directory if it doesn't already exist
    if not os.path.isdir(baseball_path):
        os.makedirs(baseball_path)
    # following defines csv_path as datasets/baseball/catchers.csv
    baseball_csv_path = os.path.join(baseball_path, data_url)
    # Following copies network object "https://raw.githubusercontent.com/rnz269/ml_baseball/master/understanding_war/excel_files/"data_url"
    # and stores in local file baseball_csv_path
    urllib.request.urlretrieve(baseball_url, baseball_csv_path)
    return pd.read_csv(baseball_csv_path)
 
  

### Change the next cell if you want to access different data:

In [16]:
# This won't work if the file is open, as excel won't allow you to rewrite an open file
baseball = fetch_baseball_data("mlb_4_year_samples.csv")
baseball.head()

PermissionError: [Errno 13] Permission denied: 'datasets\\baseball\\mlb_4_year_samples.csv'

### 2.2: Checking out Descriptive Stats on Data

In [17]:
baseball.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1814 entries, 0 to 1813
Data columns (total 80 columns):
y1 Age      1814 non-null int64
y1 G        1814 non-null int64
y1 PA       1814 non-null int64
y1 HR       1814 non-null int64
y1 R        1814 non-null int64
y1 RBI      1814 non-null int64
y1 SB       1814 non-null int64
y1 BB%      1814 non-null float64
y1 K%       1814 non-null float64
y1 ISO      1814 non-null float64
y1 BABIP    1814 non-null float64
y1 AVG      1814 non-null float64
y1 OBP      1814 non-null float64
y1 SLG      1814 non-null float64
y1 WOBA     1814 non-null float64
y1 wRC+     1814 non-null int64
y1 BsR      1814 non-null float64
y1 Off      1814 non-null float64
y1 Def      1814 non-null float64
y1 WAR      1814 non-null float64
y2 Age      1814 non-null int64
y2 G        1814 non-null int64
y2 PA       1814 non-null int64
y2 HR       1814 non-null int64
y2 R        1814 non-null int64
y2 RBI      1814 non-null int64
y2 SB       1814 non-null int64
y2 BB

In [18]:
baseball["Spd"].value_counts()

KeyError: 'Spd'

In [None]:
# Wait a second! y1 Age could be Harper's first year in bigs or his second year, as I split 4 year samples
# Harper had five years in bigs, 2012-2016, so that's two samples
war_features = ["y1 Age", "y1 WAR", "y2 Age", "y2 WAR", "y3 Age", "y3 WAR", "y4 Age", "y4 WAR"]
baseball[war_features].describe()

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
resultant = ["y1 HR", "y1 AVG", "y1 OBP", "y1 SLG", "y1 wOBA", "y1 wRC+", "y1 BsR", "y1 WAR"]
baseball_resultant = baseball[resultant]
baseball_resultant.hist(bins=50, figsize=(20,15))
plt.show()

KeyError: "['y1 wOBA'] not in index"

### 2.3: Ensuring Test Data Never Sees Train Data on Update

In [19]:
# To make this notebook's output identical at every run
np.random.seed(42)

In [20]:
import numpy as np

# Given dataset, test ratio, returns training set, test set (train_indices, test_indices)
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    # From 1:test_set_size
    test_indices = shuffled_indices[:test_set_size]
    # From test_set_size:n
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [21]:
train_set, test_set = split_train_test(baseball, 0.2)
print(len(train_set), "train +", len(test_set), "test")

1452 train + 362 test


In [22]:
import hashlib

# The following function makes sure upon loading this page that the test data is always the same 20%, important for holdout and generalization error calculation
def test_set_check(identifier, test_ratio, hash):
    # Compute hash of each instance's identifier, then look at last byte of hash, and is True if less than 256 * test_ratio
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    # lambda id_ is an anonymous function that returns the true/false value for each in test_set_check(id_, test_ratio, hash)
    # making in_test_set a boolean array
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    # print(~in_test_set)
    # The ~ defines complement, just reverses the true false values to return training set first
    return data.loc[~in_test_set], data.loc[in_test_set]

In [23]:
# Unfortunately, our baseball dataset doesn't have an identifier column, so here:
# the following line adds an index column to our baseball dataset
baseball_with_id = baseball.reset_index()
train_set, test_set = split_train_test_by_id(baseball_with_id, 0.2, "index")
# make sure new data is always appended to end of housing_with_id to ensure the same data is always withheld from testing

### 2.4: Starting to use sklearn

In [24]:
# train_test_split below is pretty much the same as split_train_test above... what about same test data?
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(baseball, test_size=0.2, random_state=42)

In [25]:
test_set.head()

Unnamed: 0,y1 Age,y1 G,y1 PA,y1 HR,y1 R,y1 RBI,y1 SB,y1 BB%,y1 K%,y1 ISO,...,y4 BABIP,y4 AVG,y4 OBP,y4 SLG,y4 WOBA,y4 wRC+,y4 BsR,y4 Off,y4 Def,y4 WAR
1029,34,111,266,5,33,28,2,0.019,0.237,0.159,...,0.278,0.235,0.266,0.348,0.272,71,0.2,-6.3,-5.8,-0.7
1449,29,107,375,12,46,40,10,0.096,0.248,0.179,...,0.326,0.276,0.352,0.498,0.364,126,0.9,8.0,-4.6,1.1
532,26,28,118,1,15,7,7,0.076,0.161,0.131,...,0.304,0.253,0.319,0.314,0.288,74,4.5,-13.6,2.4,0.7
906,30,154,604,23,69,76,12,0.051,0.2,0.195,...,0.28,0.229,0.272,0.321,0.26,62,-1.3,-23.6,16.6,0.9
997,25,139,551,22,72,68,1,0.087,0.152,0.188,...,0.329,0.308,0.339,0.5,0.365,134,-0.6,3.6,1.0,0.9


In [26]:
train_set.describe()

Unnamed: 0,y1 Age,y1 G,y1 PA,y1 HR,y1 R,y1 RBI,y1 SB,y1 BB%,y1 K%,y1 ISO,...,y4 BABIP,y4 AVG,y4 OBP,y4 SLG,y4 WOBA,y4 wRC+,y4 BsR,y4 Off,y4 Def,y4 WAR
count,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,...,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0
mean,27.585114,116.179187,455.875258,13.467953,57.301861,54.559614,8.528601,0.086759,0.180347,0.159779,...,0.29513,0.255396,0.323281,0.405666,0.31857,98.930393,-0.260303,2.43501,-1.496968,1.564645
std,3.578117,37.180097,182.264644,10.25984,28.1628,30.107458,10.885535,0.031454,0.058144,0.059232,...,0.037996,0.035047,0.039191,0.076816,0.042722,28.626585,3.293449,14.902382,8.648585,2.027007
min,19.0,22.0,100.0,0.0,4.0,2.0,0.0,0.015,0.043,0.006,...,0.168,0.142,0.193,0.187,0.19,11.0,-12.6,-32.8,-32.3,-2.9
25%,25.0,88.5,304.0,5.0,34.0,30.0,1.0,0.064,0.137,0.117,...,0.27,0.233,0.298,0.353,0.29,80.0,-2.1,-7.8,-7.15,0.0
50%,27.0,125.0,487.0,11.0,57.0,52.0,4.0,0.083,0.175,0.156,...,0.297,0.257,0.323,0.402,0.319,99.0,-0.3,-0.6,-1.5,1.1
75%,30.0,149.0,619.0,20.0,80.0,77.0,11.5,0.107,0.219,0.2005,...,0.321,0.28,0.348,0.455,0.346,117.0,1.4,9.6,3.7,2.7
max,42.0,163.0,778.0,54.0,143.0,156.0,78.0,0.207,0.38,0.357,...,0.408,0.359,0.474,0.649,0.461,197.0,13.6,77.3,26.5,9.5



# <font color='blue'>Part 3: Discover and visualize the data to gain insights </font> 

In [27]:
# Creates a copy of baseball training set for us to play with without worry
baseball = train_set.copy()

In [28]:
# Plotting LD% against WAR for all training examples
baseball.plot(kind="scatter", x="BB%", y="wRC+")

KeyError: 'BB%'

In [None]:
# Size of circle determined by population, option s
# Color of circle represents median_house_value, c, which is the label
baseball.plot(kind="scatter", x="BB%", y="HR/FB", figsize=(10,7),
    c="WAR", cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False)
plt.legend()

From above graph, clearly see that combination of high power and high patience correlates highly with WAR

### 3.2: Examining Correlations Among All Features

In [None]:
# Creates a correlation matrix: a square, n x n matrix for n variables

attributes = ["K%", "BB%", "ISO", "Spd", "LD%", "HR/FB", "Hard%", "O-Swing%", "O-Contact%", "wRC+", "WAR"]
corr_matrix = baseball[attributes].corr()
print(corr_matrix)

In [None]:
corr_matrix["WAR"].sort_values(ascending=False)

# <font color='blue'>Part 4: Prepare the data for Machine Learning Algorithms </font> 

In [43]:
# drop labels for training set and then make a copy so as not to affect original
# Stopped 
# Predictors
predictors = baseball.iloc[0,0]
print(predictors)
labels = baseball.iloc[0,61:]
#print(labels)
# baseball = train_set.drop(labels, axis=1)
# Labels
# baseball_labels = train_set[labels].copy()

27


In [30]:
baseball.shape

(537, 19)

In [31]:
baseball_labels.shape

(537, 8)

In [32]:
baseball.describe()

Unnamed: 0,PA,K%,BB%,ISO,Spd,LD%,GB%,FB%,IFFB%,HR/FB,Soft%,Med%,Hard%,Swing%,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Def
count,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0,537.0
mean,411.800745,0.205026,0.095071,0.175471,2.675047,0.202818,0.415194,0.381972,0.089466,0.132231,0.165998,0.506324,0.327864,0.457525,0.292927,0.638611,0.62222,0.86441,-6.813222
std,190.189947,0.0652,0.034193,0.058841,1.260053,0.034985,0.064183,0.067453,0.045502,0.064163,0.043171,0.054359,0.057861,0.044424,0.057446,0.051632,0.099704,0.049978,5.977956
min,100.0,0.044,0.018,0.033,0.1,0.106,0.214,0.176,0.0,0.0,0.059,0.372,0.123,0.326,0.131,0.494,0.318,0.712,-29.6
25%,222.0,0.155,0.07,0.131,1.7,0.18,0.373,0.338,0.058,0.084,0.136,0.47,0.291,0.427,0.252,0.603,0.551,0.832,-10.6
50%,419.0,0.2,0.092,0.172,2.6,0.2,0.417,0.381,0.088,0.126,0.163,0.502,0.331,0.456,0.291,0.637,0.629,0.871,-6.0
75%,593.0,0.25,0.117,0.215,3.4,0.224,0.455,0.429,0.117,0.177,0.192,0.54,0.364,0.485,0.333,0.675,0.687,0.899,-2.5
max,726.0,0.372,0.224,0.392,7.1,0.325,0.637,0.586,0.313,0.414,0.313,0.692,0.47,0.646,0.565,0.799,0.896,0.968,13.1


No missing values and no categorical variables to convert to numerical, so we can proceed

### 4.4: Feature Scaling

In [33]:
from sklearn.preprocessing import StandardScaler

In [42]:
# Initialize
scaler = StandardScaler()

def standardize(data):
    baseball_standardized = scaler.fit_transform(baseball)
    return baseball_standardized

In [43]:
print(standardize(baseball))

[[ 1.06939784 -1.91935385 -0.03134406 ...,  1.67431987  1.55392721
  -0.48335415]
 [-0.55154577  0.35268724 -1.43643013 ..., -0.16283121 -1.06965359
   0.28685706]
 [ 1.53252459  0.92069751  0.34920009 ..., -1.36752045 -1.28995427
  -0.78474114]
 ..., 
 [ 1.26938439 -1.7351343  -0.14843457 ...,  1.63416356  1.17340786
   1.55937993]
 [-0.14104707  2.348399   -0.35334295 ..., -0.95591829 -0.6691069
  -1.62192724]
 [ 0.81152045 -0.21532303  1.40301464 ..., -0.2531829   1.33362654
  -0.31591693]]


# <font color='blue'>Part 5: Select and Train a Model </font>

In [39]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(baseball_standardized, baseball_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [44]:
# Let's try the full pipeline on a few training instances, say the first 5
some_data = baseball.iloc[:5]
some_labels = baseball_labels.iloc[:5]
# Since we have previously defined pipelines, the program recalls the following:
some_data_prepared = standardize(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [[  1.42754762e+01   2.76212299e-01   3.47570694e-01 ...,   9.72591383e+01
   -3.67372327e+00   1.00554199e+00]
 [  1.35760806e+01   2.38725610e-01   2.79415395e-01 ...,   8.13444778e+01
   -3.85930872e-01  -4.08632522e-02]
 [  3.75566953e+01   2.89138154e-01   3.70992126e-01 ...,   1.50820292e+02
   -2.07481690e-01   4.20176179e+00]
 ..., 
 [  2.16766900e+01   2.85986625e-01   3.55982300e-01 ...,   1.18763351e+02
   -2.09104360e+00   3.75609016e+00]
 [  1.90961959e+01   2.02198584e-01   2.75528835e-01 ...,   8.21678965e+01
   -1.92877945e+00  -1.42291393e+00]
 [  2.34668360e+01   2.91471795e-01   3.96779981e-01 ...,   1.43809034e+02
   -6.47573028e-01   3.66575761e+00]]


### Let's just focus on WAR:

In [47]:
war_labels = baseball_labels["WAR"]
war_labels.head()

480   -0.1
605   -1.0
61     4.5
145    2.4
353    1.1
Name: WAR, dtype: float64

In [48]:
lin_reg.fit(baseball_standardized, war_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [51]:
from sklearn.metrics import mean_squared_error

baseball_predictions = lin_reg.predict(baseball_standardized)
# Our error function which takes in two arguments, the labels and the predictions
lin_mse = mean_squared_error(war_labels, baseball_predictions)
# Take the square root to rescale absolute differencesg
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

0.839614677261


### Let's try a different algorithm: Decision Trees

In [52]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(baseball_standardized, war_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [54]:
baseball_predictions = tree_reg.predict(baseball_standardized)
tree_mse = mean_squared_error(war_labels, baseball_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

2.2367642033395347e-17

Definitely an overfit here...