# Notebook Description

I want to scrape live MLB gameday data. I have loaded in the models I created in the Pitches notebook and plan to feed in one given live situation at a time to try and predict the next pitch before it is thrown.

Import packages

In [1]:
import bs4 as bs
import urllib
import pandas as pd
import numpy as np
import time

import pickle
from sklearn.externals import joblib 

Load player names dataset and the models

In [2]:
player_names = pd.read_csv('player_names.csv')
player_names.head()

Unnamed: 0.1,Unnamed: 0,id,first_name,last_name,pitches_fast,pitches_break,pitches_change,pitches_non,pitches_rare,pitches_unknown,pitches_total
0,0,452657,Jon,Lester,0.787254,0.141357,0.067835,0.000237,0.0,0.003317,12663
1,1,425794,Adam,Wainwright,0.681166,0.272713,0.033472,0.002622,0.002468,0.007558,6483
2,2,457435,Phil,Coke,0.690594,0.175743,0.108911,0.024752,0.0,0.0,404
3,3,435400,Jason,Motte,0.949453,0.020323,0.016154,0.011464,0.0,0.002606,1919
4,4,519166,Neil,Ramirez,0.552444,0.43664,0.003322,0.005221,0.0,0.002373,2107


In [3]:
rf_mod = joblib.load('rf_mod.pkl')
gb_mod = joblib.load('gb_mod.pkl')

# Live Prediction

### Game Functions

In [4]:
def get_url(year, month, day, away, home, doubleheader=False):
    doubleheader = str(doubleheader * 1 + 1)
    game_url = 'http://gd2.mlb.com/components/game/mlb/year_' + year + '/month_' + month + '/day_' + day + '/'
    game_url = game_url + 'gid_' + year + '_' + month + '_' + day + '_' + away + '_' + home + '_' + doubleheader + '/'
    game_url = game_url + 'inning/inning_all.xml'
    return(game_url)

def starters(game):
    first = game.inning
    home_starter = first.top.atbat['pitcher']
    away_starter = '-1'
    if first.bottom.atbat:
        away_starter = first.bottom.atbat['pitcher']
    return(home_starter, away_starter)

def cur_inning(game):
    inn = game.find_all('inning')[-1]
    return(inn)

def half_inning(inn):
    if inn.bottom:
        top = 0
        half = inn.bottom
    else:
        top = 1
        half = inn.top
    return(half, top)

def hitter(half):
    cur_hitter = half.find_all('atbat')[-1]
    return(cur_hitter)
#NEED TO CHECK IF THE CURRENT HITTER HAS FINISHED HIS AT BAT. DO WE? MAYBE THE NEXT ONE IS PUT IN RIGHT AWAY

#Pitch number
def pitches_ab(cur_hitter):
    all_pitches = cur_hitter.find_all('pitch')
    cur_pitch = all_pitches[-1]
    return(all_pitches, cur_pitch)

#Score differential
def score(top, cur_hitter):
    if top == 1:
        score_diff = int(cur_hitter['home_team_runs']) - int(cur_hitter['away_team_runs'])
    else:
        score_diff = int(cur_hitter['away_team_runs']) - int(cur_hitter['home_team_runs'])
    return(score_diff)

#Previous pitch info
def prev_pitch_info(prev_pitch):
    #Initialize previous pitch info
    prev_type_BREAK, prev_type_CHANGE, prev_type_FAST, prev_type_NON, prev_type_RARE, prev_type_UNKNOWN = [0] * 6
    prev_outcome_B, prev_outcome_C, prev_outcome_F, prev_outcome_S, prev_outcome_starB = [0] * 5
    
    if prev_pitch != 'first': ######Try without this
        #Previous pitch type
        fast = ['FC', 'FF', 'FS', 'FT', 'SI']
        breaking = ['CU', 'KC', 'SL', 'SC']
        unknown = ['UN', 'AB', 'FA']
        non = ['FO', 'IN', 'PO']

        if prev_pitch['pitch_type'] in fast:
            prev_type_FAST = 1
        elif prev_pitch['pitch_type'] in breaking:
            prev_type_BREAK = 1
        elif prev_pitch['pitch_type'] == 'CH':
            prev_type_CHANGE = 1
        elif prev_pitch['pitch_type'] in unknown:
            prev_type_UNKNOWN = 1
        elif prev_pitch['pitch_type'] in non:
            prev_type_NON = 1
        else:
            prev_type_RARE = 1

        #Previous pitch outcome
        prev_code = prev_pitch['code']
        if prev_code == '*B':
            prev_code = 'starB'
        globals()['prev_outcome_' + prev_code] = 1   
    
    return(prev_type_BREAK, prev_type_CHANGE, prev_type_FAST, prev_type_NON, prev_type_RARE, prev_type_UNKNOWN,
           prev_outcome_B, prev_outcome_C, prev_outcome_F, prev_outcome_S, prev_outcome_starB)

def baserunners(prev_hitter):
    on_1b, on_2b, on_3b = [0] * 3

    if prev_hitter != 'none':
        runners = prev_hitter.find_all('runner')
        for r in runners:
            position = r['end']
            if position == '1B':
                on_1b = 1
            elif position == '2B':
                on_2b = 1
            elif position == '3B':
                on_3b = 1

    return(on_1b, on_2b, on_3b)

def get_pitch_count(inn, pitcher_id):
    count = 0
    get_out = 0

    i = inn
    while i:
        if top == 1:
            half = i.top
        else:
            half = i.bottom
        a = half.find_all('atbat')[-1]
        while a:
            if a['pitcher'] != pitcher_id:
                get_out = 1
                break

            ps = a.find_all('pitch')
            count += len(ps)

            a = a.find_previous_sibling('atbat')

        if get_out:
            break

        i = i.find_previous_sibling('inning')
    return(count)    

def right_left(side):
    if side=='R':
        return(1)
    else:
        return(0)
    
def pitches_to_date(pitcher_id):
    pitches_fast, pitches_break, pitches_change, pitches_non, pitches_unknown, pitches_rare, pitches_total = [0] * 7
    if int(pitcher_id) in list(player_names['id']):
        pitches_fast = player_names.loc[player_names['id'] == int(pitcher_id)]['pitches_fast'].values[0]
        pitches_break = player_names.loc[player_names['id'] == int(pitcher_id)]['pitches_break'].values[0]
        pitches_change = player_names.loc[player_names['id'] == int(pitcher_id)]['pitches_change'].values[0]
        pitches_non = player_names.loc[player_names['id'] == int(pitcher_id)]['pitches_non'].values[0]
        pitches_unknown = player_names.loc[player_names['id'] == int(pitcher_id)]['pitches_unknown'].values[0]
        pitches_rare = player_names.loc[player_names['id'] == int(pitcher_id)]['pitches_rare'].values[0]
        pitches_total = player_names.loc[player_names['id'] == int(pitcher_id)]['pitches_total'].values[0]
    else:
        pitches_fast = np.mean(player_names['pitches_fast'])
        pitches_break = np.mean(player_names['pitches_break'])
        pitches_change = np.mean(player_names['pitches_change'])
        pitches_non = np.mean(player_names['pitches_non'])
        pitches_unknown = np.mean(player_names['pitches_unknown'])
        pitches_rare = np.mean(player_names['pitches_rare'])
    return(pitches_fast, pitches_break, pitches_change, pitches_non, pitches_unknown, pitches_rare, pitches_total)

def pitcher_name(pitcher_id):
    if len(player_names.loc[player_names['id'] == int(pitcher_id)][['first_name', 'last_name']]) == 0:
        display_pitcher = 'Rookie'
    else:
        display_pitcher = player_names.loc[player_names['id'] == int(pitcher_id)][['first_name', 'last_name']].values[0]
        display_pitcher = display_pitcher[0] + ' ' + display_pitcher[1]
    return(display_pitcher)

### Request url and predict

Use all the current game functions to extract features for the model

In [5]:
start=time.time()
#Request Data
#Check to see if the inning is over and re-request
#I also want to see if that atbat is over. Use runner status? See if there is an event?
###If we've re-requested more than 10 times, assume the game is over or delayed and stop
count = 1
outs = 3
while outs == 3:
    url = get_url(year='2019', month='04', day='10', away='clemlb', home='detmlb')
    raw = urllib.request.urlopen(url).read()
    game = bs.BeautifulSoup(raw, 'xml')
    outs = game.find_all('atbat')[-1]['o']
    count += 1
    if count > 10:
        break
    if count > 1:
        time.sleep(2)

#Situation attributes:
inn = cur_inning(game)
half = half_inning(inn)[0]
top = half_inning(inn)[1]
cur_hitter = hitter(half)
pitcher_id = cur_hitter['pitcher']
pitcher_history = pitches_to_date(pitcher_id)

#MODEL FEATURES
#Balls
b_count = int(cur_hitter['b'])
#Strikes
s_count = int(cur_hitter['s'])
#Outs
outs = cur_hitter['o']
#Inning
inning = int(inn['num'])
#Pitcher throwing hand
p_throws = cur_hitter['p_throws']
p_throws = right_left(str(p_throws))
#Batting side
stand = cur_hitter['stand']
stand = right_left(str(stand))
#Score differential
score_diff = score(top, cur_hitter)
#Runners on
prev_hitter = 'none'
if cur_hitter.find_previous_sibling('atbat'):
    prev_hitter = cur_hitter.find_previous_sibling('atbat')
on_1b, on_2b, on_3b = baserunners(prev_hitter)
#Starter
is_starter = (pitcher_id in starters(game)) * 1
#Pitch Count
pitch_count = get_pitch_count(inn, pitcher_id)
#Pitches thrown to date
pitches_fast, pitches_break, pitches_change, pitches_non = pitcher_history[0:4]
pitches_unknown, pitches_rare, pitches_total = pitcher_history[4:7]

#Issues if it's the first pitch of the atbat. Use try/except
try:
    pitches_info = pitches_ab(cur_hitter)
    cur_pitch = pitches_info[1]
    #Pitch Number
    pitch_num = len(pitches_info[0]) + 1
    prev = prev_pitch_info(cur_pitch)
    prev_type_BREAK, prev_type_CHANGE, prev_type_FAST, prev_type_NON, prev_type_RARE, prev_type_UNKNOWN = prev[0:6]
    prev_outcome_B, prev_outcome_C, prev_outcome_F, prev_outcome_S, prev_outcome_starB = prev[6:11]
except:
    pitch_num = 1
    prev_type_BREAK, prev_type_CHANGE, prev_type_FAST, prev_type_NON, prev_type_RARE, prev_type_UNKNOWN = [0] * 6
    prev_outcome_B, prev_outcome_C, prev_outcome_F, prev_outcome_S, prev_outcome_starB = [0] * 5

#Put all the data into an array to predict with the model
test_pitch = [[b_count, on_1b, on_2b, on_3b, outs, pitch_num, s_count, inning, p_throws, stand, top, is_starter, 
             score_diff, pitch_count, pitches_fast, pitches_break, pitches_change, pitches_non, pitches_rare, 
             pitches_unknown, pitches_total, prev_type_BREAK, prev_type_CHANGE, prev_type_FAST, prev_type_NON, 
             prev_type_RARE, prev_type_UNKNOWN, prev_outcome_B, prev_outcome_C, prev_outcome_F, prev_outcome_S, 
             prev_outcome_starB]]


#Get the situation to print out to make sure I'm predicting the right thing (sanity check)
display_pitcher = pitcher_name(pitcher_id)
situation = {'pitcher':display_pitcher, 'inning':inning, 'balls':b_count, 'strikes':s_count, 'outs':outs, 
             'runners':[on_1b, on_2b, on_3b], 'pitch count':pitch_count,  'pitch number':pitch_num,
             'prev_type':[prev_type_BREAK, prev_type_CHANGE, prev_type_FAST], 'score differential':score_diff}
print('Situation:', situation)


#Run the model on the situation and print the output
print('\nRF Prediction:', rf_mod.predict(test_pitch)[0])
print('\nGB Prediction:', gb_mod.predict(test_pitch)[0])

end=time.time()
print('Time Elapsed: ', end-start)

Situation: {'pitcher': 'Shane Greene', 'inning': 9, 'balls': 0, 'strikes': 0, 'outs': '3', 'runners': [0, 0, 0], 'pitch count': 11, 'pitch number': 2, 'prev_type': [0, 0, 1], 'score differential': 3}

RF Prediction: FAST

GB Prediction: FAST
Time Elapsed:  12.720757961273193
