##### Import Packages

In [1]:
import requests
import sqlite3
import pandas as pd
import json

##### Retrieve API Data

In [2]:
# fetch API data
url = "https://statsapi.mlb.com/api/v1/stats?stats=season&group=pitching&playerPool=all&season=2018&teamId=144"
response = requests.get(url)
api_data = response.json()

In [3]:
# process API for dataframe
records = []
for split in api_data['stats'][0]['splits']:
    record = {
        'playerID': str(split['player']['id']),
        'firstName': split['player']['firstName'],
        'lastName': split['player']['lastName'],
        **split['stat']  
    }
    records.append(record)

In [4]:
# convert to dataframe
api_df = pd.DataFrame(records)
api_df = api_df.sort_values(by=['playerID'])

In [5]:
# save raw API data to csv
api_df.to_csv('raw_api_data.csv', index=False)

In [6]:
# create new columns to match main_db
api_df['balls'] = api_df['numberOfPitches'] - api_df['strikes']
api_df['name'] = api_df['lastName'] + ', ' + api_df['firstName']

# select columns to keep
api_df = api_df[['playerID', 'name', 'gamesPlayed', 'battersFaced', 'numberOfPitches', 'hits', 
                'doubles', 'triples', 'homeRuns', 'strikeOuts', 'outs', 'balls','strikes']]

# rename columns to match main_db
api_df = api_df.rename(columns={
    'playerID': 'PitcherID',
    'name': 'PitcherName',
    'gamesPlayed': 'games',
    'battersFaced': 'batters_faced',
    'numberOfPitches': 'pitches',
    'hits': 'hits',
    'doubles': 'doubles',
    'triples': 'triples',
    'homeRuns': 'homeruns',
    'strikeOuts': 'strikeouts',
    'outs': 'outs',
    'strikes': 'strikes',
})

#### Load 'PITCHBYPITCH' From 'main' database

In [7]:
# connect to database
conn = sqlite3.connect('main')

In [8]:
# load tables for dataframe
main_db_df = pd.read_sql_query("SELECT * FROM PITCHBYPITCH", conn)

# close connection
conn.close()

In [9]:
# sort values
main_db_df = main_db_df.sort_values(by=['GameDate', 'INNING', 'PA_OF_INNING', 'PITCH_OF_PA'])

# convert columns to integers
int_columns = ['BALLS', 'STRIKES', 'IS_SINGLE', 'IS_DOUBLE', 'IS_TRIPLE', 'IS_HOMERUN', 
               'IS_HIT', 'IS_OUT', 'IS_STRIKEOUT', 'LAST_PITCH_OF_PA']
main_db_df[int_columns] = main_db_df[int_columns].astype(int)

In [10]:
# group data
main_db_df = (main_db_df
    .groupby(['PitcherID', 'GameKey'])  # first group by pitcher and game
    .agg({
        'PitcherName': 'first', # keep pitcher name
        'PA_OF_INNING': 'nunique', # count unique PAs per game
        'PITCH_OF_PA': 'count', # count total pitches
        'IS_HIT': 'sum', # sum of hits
        'IS_DOUBLE': 'sum', # sum of doubles
        'IS_TRIPLE': 'sum', # sum of triples
        'IS_HOMERUN': 'sum', # sum of homeruns
        'IS_STRIKEOUT': 'sum', # sum of strikeouts
        'IS_OUT': 'sum', # sum of outs
        'BALLS': 'sum', # sum of balls
        'STRIKES': 'sum', # sum of strikes
        'SWING_TAKE': lambda x: [(x == 'swing').sum(), (x == 'take').sum()], # count both swings and takes
    })
    .reset_index()
    .groupby('PitcherID')  # then group by just pitcher to get totals
    .agg({
        'PitcherName': 'first', # keep pitcher name
        'GameKey': 'nunique',  # count of games
        'PA_OF_INNING': 'sum',  # sum of PAs
        'PITCH_OF_PA': 'sum', # sum of total pitches
        'IS_HIT': 'sum', # sum of hits
        'IS_DOUBLE': 'sum', # sum of doubles
        'IS_TRIPLE': 'sum', # sum of triples
        'IS_HOMERUN': 'sum', # sum of homeruns
        'IS_STRIKEOUT': 'sum', # sum of strikeouts
        'IS_OUT': 'sum', # sum of outs
        'BALLS': 'sum', # sum of balls
        'STRIKES': 'sum', # sum of strikes
        'SWING_TAKE': lambda x: [sum(i[0] for i in x), sum(i[1] for i in x)],  # sum of swings and takes
    })
    .reset_index())

# create seperate swing and take columns
main_db_df['swings'] = main_db_df['SWING_TAKE'].apply(lambda x: x[0])
main_db_df['takes'] = main_db_df['SWING_TAKE'].apply(lambda x: x[1])

# drop SWING_TAKE and rename columns
main_db_df = main_db_df.drop('SWING_TAKE', axis=1).rename(columns={
    'GameKey': 'games',
    'PITCH_OF_PA': 'pitches',
    'PA_OF_INNING': 'batters_faced',
    'IS_HIT': 'hits',
    'IS_DOUBLE': 'doubles',
    'IS_TRIPLE': 'triples',
    'IS_HOMERUN': 'homeruns',
    'IS_STRIKEOUT': 'strikeouts',
    'IS_OUT': 'outs',
    'BALLS': 'balls',
    'STRIKES': 'strikes',
})

In [11]:
# cleaned main database
main_db_df

Unnamed: 0,PitcherID,PitcherName,games,batters_faced,pitches,hits,doubles,triples,homeruns,strikeouts,outs,balls,strikes,swings,takes
0,458924,"Venters, Jonny",50,138,527,26,6,0,1,27,99,515,379,217,310
1,592426,"Jackson, Luke",35,148,708,42,10,1,3,46,118,654,625,331,377
2,641438,"Carle, Shane",52,200,942,50,15,0,2,43,176,809,830,477,464


In [12]:
# filter API data for pitchers in main database
api_df = api_df.query('PitcherID in @main_db_df.PitcherID').reset_index(drop=True)
api_df

Unnamed: 0,PitcherID,PitcherName,games,batters_faced,pitches,hits,doubles,triples,homeruns,strikeouts,outs,balls,strikes
0,458924,"Venters, Jonny",28,87,319,15,3,0,0,16,61,134,185
1,592426,"Jackson, Luke",35,184,708,41,10,1,3,46,122,266,442
2,641438,"Carle, Shane",53,259,957,50,15,0,2,43,189,342,615


In [13]:
# save cleaned main and API data to csv
main_db_df.to_csv('cleaned_main_data.csv', index=False)
api_df.to_csv('cleaned_api_data.csv', index=False)