
# <font color='blue'>Part 1: Setting Up </font> 

First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os


# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    The figure is written to ``IMAGES_PATH/<fig_id>.<fig_extension>``. The
    destination directory is created first when it does not exist yet, so
    the function also works on a fresh checkout without an ``images`` folder.

    Parameters
    ----------
    fig_id : str
        Base file name of the figure, without extension.
    tight_layout : bool, optional
        When True (default), ``plt.tight_layout()`` is called before saving.
    fig_extension : str, optional
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int, optional
        Passed to ``plt.savefig`` as ``dpi``.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    # plt.savefig does not create missing folders, so make sure the target exists.
    # (isdir + makedirs instead of exist_ok=True keeps this Python 2 compatible.)
    target_dir = os.path.dirname(path)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


# <font color='blue'>Part 2: Creating Train and Test Sets </font> 

### 2.1: Creating Data-Fetching Function

In [10]:
import os
import tarfile
from six.moves import urllib
import pandas as pd

# Base URL of the raw CSV files in the ml_baseball GitHub repository;
# a file name is appended to it to build the download URL.
DOWNLOAD_ROOT = (
    "https://raw.githubusercontent.com/rnz269/ml_baseball/master/"
    "understanding_war/excel_files/"
)
# Local directory (relative to the working directory) that receives the downloads.
BASEBALL_PATH = os.path.join("datasets", "baseball")

def fetch_baseball_data(data_url, baseball_path=BASEBALL_PATH, overwrite=True):
    """Download one CSV file from DOWNLOAD_ROOT and load it as a DataFrame.

    The remote file ``DOWNLOAD_ROOT + data_url`` is stored locally as
    ``<baseball_path>/<data_url>`` and then read with ``pd.read_csv``.

    Parameters
    ----------
    data_url : str
        File name (or relative path) below DOWNLOAD_ROOT, e.g. "catchers.csv".
    baseball_path : str, optional
        Local directory that receives the download.
    overwrite : bool, optional
        When True (default), the file is downloaded on every call and replaces
        any local copy. When False, an existing local copy is reused and no
        download takes place -- useful when offline or when the local file
        cannot be rewritten (e.g. it is open in another program).

    Returns
    -------
    pandas.DataFrame
        The contents of the local CSV file.
    """
    baseball_url = DOWNLOAD_ROOT + data_url
    baseball_csv_path = os.path.join(baseball_path, data_url)

    # Create the folder that will contain the file. Using the file's own parent
    # directory (rather than baseball_path) also covers a data_url with sub-folders.
    # (isdir + makedirs instead of exist_ok=True keeps this Python 2 compatible.)
    target_dir = os.path.dirname(baseball_csv_path)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # Copy the remote file to disk unless the caller asked to reuse a local copy
    if overwrite or not os.path.isfile(baseball_csv_path):
        urllib.request.urlretrieve(baseball_url, baseball_csv_path)
    return pd.read_csv(baseball_csv_path)
 
  

To load a different dataset, change the file name passed to `fetch_baseball_data` in the next cell:

In [12]:
# Download catchers.csv into the local datasets folder and preview its first rows.
# NOTE: the file is re-downloaded and rewritten on every run, so this cell fails
# while the local copy is open in Excel (Excel blocks overwriting an open file).
fetch_baseball_data("catchers.csv").head()

Unnamed: 0,PA,K%,BB%,ISO,Spd,LD%,GB%,FB%,IFFB%,HR/FB,...,Z-Contact% (pi),Def,HR,AVG,OBP,SLG,wOBA,wRC+,BsR,WAR
0,432,19.70%,13.40%,0.312,2.8,19.60%,39.20%,41.30%,5.10%,25.40%,...,78.70%,0.4,30,0.32,0.414,0.631,0.445,179,-2.8,5.4
1,606,10.40%,12.50%,0.222,3.1,22.60%,47.80%,29.50%,1.50%,20.40%,...,90.50%,4.6,28,0.365,0.444,0.587,0.438,170,-2.1,7.6
2,229,24.90%,10.50%,0.358,2.2,16.40%,49.30%,34.20%,14.00%,40.00%,...,83.70%,4.9,20,0.299,0.376,0.657,0.425,171,-1.0,3.2
3,589,16.60%,12.60%,0.206,3.0,22.10%,40.40%,37.50%,4.50%,13.00%,...,92.50%,2.9,20,0.338,0.426,0.543,0.418,157,-7.4,5.6
4,610,15.70%,11.30%,0.213,2.0,24.60%,46.50%,28.90%,3.90%,18.80%,...,89.90%,13.0,24,0.336,0.408,0.549,0.406,164,-4.6,7.7
