# SQL DataBase Formation

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import os
import urllib2
import time

In [2]:
## connect to database

path = "Data/mlb_data.db"
conn = sqlite3.connect(path)
c = conn.cursor()

## New method - from website

After browsing multiple scrapers on github, I chose to try to make my own. I decided to utilize the URL that Mr. Kessler used in his scraper (url below). I tried to make my own small loop scheme to import into a SQL database. I later realized it is similar to Mr. Kessler's. All credit for the link and method go to him and his scraper (namely, link, year/team loop idea, HTTPError catch and wait method).

reference: https://github.com/alanrkessler/savantscraper

In [3]:
## year_list & team_list

year_list = [2015, 2016, 2017, 2018]

team_list = ['SF', 'LAD', 'ARI', 'COL', 'SD',
             'CHC', 'MIL', 'STL', 'CIN', 'PIT',
             'NYM', 'WSH', 'MIA', 'ATL', 'PHI',
             'OAK', 'HOU', 'LAA', 'TEX', 'SEA',
             'MIN', 'CWS', 'KC', 'DET', 'CLE',
             'NYY', 'BOS', 'TB', 'TOR', 'BAL']


## loop for each team and year

for year in year_list:
    
    print(str(year) + ' Starting. Please wait' + 
          ' (5 to 15 minutes, depending on length of season and connection speed)...')

    counter = 1      # if first team of year, replace existing table

    for team in team_list:
        done = False     # if done, stop trying to access link (stays false if error)
                
        while not done:
            try:
                ## non-nan link
                
#                 link = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=' + \
#                     '&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=' + str(year) + \
#                     '%7C&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=' + \
#                     '&game_date_gt=&game_date_lt=&team=' + team + \
#                     '&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=' + \
#                     '0&min_results=0&group_by=name-event&sort_col=pitches&player_event_sort=' + \
#                     'api_p_release_speed&sort_order=desc&min_abs=0&type=details&'
                
                ## nan-included link
        
                link = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR' + \
                    '=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=' + str(year) + \
                    '%7C&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=' + \
                    '&game_date_gt=&game_date_lt=&hfInfield=&team=' + team + \
                    '&position=&hfOutfield=&hfRO=&home_road=&hfFlag=&hfPull=&metric_1=&hfInn=&min_pitches=' + \
                    '0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&' + \
                    'sort_order=desc&min_pas=0&type=details&'
                
                
                
                
                
                temp = pd.read_csv(link)     # import data from link, a download csv link
                exists = ('replace' if counter == 1 else 'append')  # if first team of year, replace existing table
#                 print exists

                temp['spray_angle'] = \
    (np.arctan((temp['hc_x'] - 125.42)/(198.27 - temp['hc_y']))*180/np.pi*.75).apply(lambda x: round(x, 1))
    
    
                temp.to_sql("MLB_" + str(year), conn, if_exists=exists, index = False)  # import to SQL
                
                done = True      # if import and link work, done
                counter = counter + 1     # add to counter for each team completed
                
            except urllib2.HTTPError as e:     # catch an HTTP error if calling website too often
                print(e)
                print(str(year) + ' and ' + team + ' error...')
                time.sleep(60)     # wait a minute before trying again
        
    print(str(year) + ' Finished.')

2015 Starting. Please wait (5 to 15 minutes, depending on length of season and connection speed)...


ParserError: Error tokenizing data. C error: EOF inside string starting at line 4155

## Old method 

I manually downloaded csv files and stored them in a directory, by year. From there, I would import each.

In [None]:
# for year in range(2015, 2019):
    

#     ## current directory of data files

#     cd = "Data/savant/savant_" + str(year) + "/"     # change on your machine


#     ## create empty dataframe

#     data = pd.DataFrame()


#     ## loop through each sheet and append the previous one

#     for file_name in os.listdir(cd):
#         if 'DS' not in file_name:
#             import_data = pd.read_csv(cd + file_name).replace(
#                 'null', np.nan).convert_objects(convert_numeric = True)
#             data = data.append(import_data)


#     ## add spray angle

#     data['spray_angle'] = (np.arctan((data['hc_x'] - 125.42)/(198.27 - data['hc_y']))*180/np.pi*.75).apply(lambda x: round(x, 1))


#     # add dataframe to database

#     data.to_sql("MLB_" + str(year), conn, if_exists="replace", index = False)

## MLB ID key

To have a key to map names to numeric MLB player IDs.

source: http://crunchtimebaseball.com/baseball_map.html

In [None]:
# ## current directory of data files

# cd = "http://crunchtimebaseball.com/master.csv"     # website of linked file


# ## create empty dataframe

# data = pd.read_csv(cd, encoding = 'latin-1').replace('null', np.nan).infer_objects()


# # add dataframe to database

# data.to_sql("ID_Key", conn, if_exists="replace", index = False)

## Database Checks

Check to see the tables listed to confirm their existance, and see the amount of data in each season table.

In [None]:
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(c.fetchall())

In [None]:
for year in [2015, 2016, 2017, 2018]:
    df = pd.read_sql("""SELECT game_date
        FROM MLB_{}
        ;""".format(year), conn)
    print year, len(df)

In [None]:
## close access to database

c.close()