
# <font color='blue'>Part 1: Setting Up </font> 

First, let's make sure this notebook works well in both Python 2 and Python 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os


# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under ``IMAGES_PATH``.

    The file is written to ``IMAGES_PATH/<fig_id>.<fig_extension>``. The
    directory is created first if it does not exist yet, so the function
    also works on a fresh checkout.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool, default True
        Call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        File extension, also passed to ``savefig`` as the output format.
    resolution : int, default 300
        Value passed to ``savefig`` as ``dpi``.
    """
    # savefig does not create missing directories, so make sure the target exists.
    # (isdir + makedirs rather than exist_ok=True to stay Python 2 compatible.)
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


# <font color='blue'>Part 2: Creating Train and Test Sets </font> 

### 2.1: Creating Data-Fetching Function

In [10]:
import os
import tarfile
from six.moves import urllib
import pandas as pd

# Base URL of the raw CSV files in the GitHub repository
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rnz269/ml_baseball/master/understanding_war/excel_files/"
# Local directory in which downloaded CSV files are stored
BASEBALL_PATH = os.path.join("datasets", "baseball")


def fetch_baseball_data(data_url, baseball_path=BASEBALL_PATH, force_download=True):
    """Download a CSV file from the repository and load it as a DataFrame.

    The file at ``DOWNLOAD_ROOT + data_url`` is saved to
    ``os.path.join(baseball_path, data_url)`` and then read with
    ``pandas.read_csv``.

    Parameters
    ----------
    data_url : str
        File name (or relative path) of the CSV below ``DOWNLOAD_ROOT``,
        e.g. ``"1b.csv"``.
    baseball_path : str, default BASEBALL_PATH
        Local directory that receives the downloaded file. It is created
        if it does not exist.
    force_download : bool, default True
        When True (the original behaviour) the file is always downloaded
        again and any local copy is overwritten. Pass False to reuse an
        existing local copy and skip the network request.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    baseball_url = DOWNLOAD_ROOT + data_url
    baseball_csv_path = os.path.join(baseball_path, data_url)

    # Create the directory that will actually hold the file. Using the parent
    # of the target path (instead of only baseball_path) also covers a
    # data_url that contains sub-folders; `or "."` covers an empty directory
    # component, which os.makedirs would reject.
    target_dir = os.path.dirname(baseball_csv_path) or "."
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # Copy the remote file to baseball_csv_path unless the caller asked to
    # reuse a local copy that is already present.
    if force_download or not os.path.isfile(baseball_csv_path):
        urllib.request.urlretrieve(baseball_url, baseball_csv_path)
    return pd.read_csv(baseball_csv_path)
 
  

### Change the next cell if you want to access different data:

In [17]:
# NOTE: the fetch downloads the CSV and overwrites the local copy, so this
# cell fails while that file is open in Excel (Excel blocks writes to open files).
baseball = fetch_baseball_data(data_url="1b.csv")
baseball.head()

Unnamed: 0,PA,K%,BB%,ISO,Spd,LD%,GB%,FB%,IFFB%,HR/FB,...,Z-Contact% (pi),Def,HR,AVG,OBP,SLG,wOBA,wRC+,BsR,WAR
0,641,8.40%,16.20%,0.296,2.6,22.40%,40.20%,37.40%,11.70%,20.70%,...,96.10%,-1.0,37,0.357,0.462,0.653,0.459,184,-0.2,8.7
1,348,17.80%,14.40%,0.274,1.9,22.00%,33.10%,44.90%,9.40%,17.00%,...,88.10%,2.7,18,0.345,0.437,0.618,0.448,183,-1.2,4.9
2,700,9.10%,16.40%,0.331,4.1,15.60%,38.70%,45.70%,12.00%,20.10%,...,94.60%,-8.8,47,0.327,0.443,0.658,0.447,180,1.6,8.4
3,432,19.70%,13.40%,0.312,2.8,19.60%,39.20%,41.30%,5.10%,25.40%,...,78.70%,0.4,30,0.32,0.414,0.631,0.445,179,-2.8,5.4
4,475,17.90%,19.80%,0.23,2.3,30.20%,37.80%,32.00%,1.10%,15.10%,...,83.90%,-2.1,14,0.337,0.474,0.567,0.438,178,-1.8,5.7


### 2.2: Checking out Descriptive Stats on Data

In [18]:
# Show each column's dtype and non-null count. In the output below the
# percentage columns (e.g. K%, BB%) come back as `object`, i.e. strings such
# as "8.40%", so they are not yet numeric.
baseball.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672 entries, 0 to 671
Data columns (total 27 columns):
PA                 672 non-null int64
K%                 672 non-null object
BB%                672 non-null object
ISO                672 non-null float64
Spd                672 non-null float64
LD%                672 non-null object
GB%                672 non-null object
FB%                672 non-null object
IFFB%              672 non-null object
HR/FB              672 non-null object
Soft%              672 non-null object
Med%               672 non-null object
Hard%              672 non-null object
Swing% (pi)        672 non-null object
O-Swing% (pi)      672 non-null object
Z-Swing% (pi)      672 non-null object
O-Contact% (pi)    672 non-null object
Z-Contact% (pi)    672 non-null object
Def                672 non-null float64
HR                 672 non-null int64
AVG                672 non-null float64
OBP                672 non-null float64
SLG                672 non-null fl

In [None]:
# Descriptive statistics for the WAR column.
# TODO: the original cell was an empty subscript (a SyntaxError); confirm that
# WAR is the column that was meant to be inspected here.
baseball["WAR"].describe()