In [None]:
from glob import glob
import pandas as pd
import numpy as np

import os

# Every file in the 2021 Baseball Savant folder is loaded and stacked into one frame.
files = glob("../../datasets/baseball savant/2021/*")

aggregate_set = []

for file in files:
    # The position label is the file-name prefix before the first underscore.
    # os.path.basename works with both '\\' and '/' separators; the previous
    # file.split('\\')[1] raised IndexError on paths without a backslash.
    position = os.path.basename(file).split('_')[0].upper()
    df = pd.read_csv(file)
    df['Position'] = position
    # Name is "<second column> <first column>", left-trimmed.
    # presumably the CSV stores last name first, then first name -- TODO confirm
    df['Name'] = (df[df.columns[1]] + ' ' + df[df.columns[0]]).str.lstrip()
    aggregate_set.append(df)

# reset_index() is intentionally called without drop=True: it leaves the old row
# labels in an 'index' column, which the DraftKings cell below drops by name.
result = pd.concat(aggregate_set).sort_values(by=['brl_pa'], ascending=False).reset_index()
result

In [None]:
# Attach the DraftKings salary to every aggregated player that also appears in
# the DraftKings export (matched on the exact 'Name' string).
draftkings = pd.read_csv('../../datasets/draft kings/DKSalaries.csv')
_l1 = set(draftkings['Name'])
_l2 = set(result['Name'])

names = list(_l1.intersection(_l2))

# One salary per name. If a name occurs more than once in the DraftKings file the
# first row wins; the previous per-name `.item()` lookup raised ValueError there.
salary_by_name = draftkings.drop_duplicates(subset='Name').set_index('Name')['Salary']

# `result` already carries an 'index' column (from its own reset_index()), so this
# reset_index() adds 'level_0'; both bookkeeping columns are dropped.
_d = result[result['Name'].isin(names)].reset_index().drop(['index', 'level_0'], axis=1)

# Vectorised lookup instead of one boolean scan per name; cast to float64 to keep
# the dtype the row-by-row assignment used to produce.
_d['Salary'] = _d['Name'].map(salary_by_name).astype('float64')

_d

In [None]:
import requests
import re
from datetime import datetime

# Maps the abbreviated keys of a DraftKings lobby contest record (left) to the
# descriptive field names used for the parsed contests in this notebook (right).
# parse_mlb_contests() reads contest[<left key>] and stores it under <right name>;
# the 'sd' entry is additionally converted to a formatted date string there.
contest_mapping = {
    'n'         :'contest',
    'nt'        :'current_participants',
    'mec'       :'max_entries_possible',
    'm'         :'max_participants',
    'po'        :'total_prizes',
    'gameType'  :'game_type',
    'fpp'       :'entry_fee',
    'sd'        :'starting_datetime',
    'id'        :'game_id',
    'dg'        :'draft_group'
}

def parse_mlb_contests(mapping, max_entry_fee, min_max_participants):
    """Fetch the DraftKings MLB lobby and return the contests that pass the filters.

    Parameters
    ----------
    mapping : dict
        Raw contest key -> output field name. Every key is read from each contest
        record; the value stored under 'sd' is parsed as a millisecond timestamp
        wrapped in parentheses and rendered as a formatted local-time string.
    max_entry_fee : number
        Contests whose 'fpp' value exceeds this are discarded.
    min_max_participants : number
        Contests whose 'm' value is below this are discarded.

    Returns
    -------
    list of dict
        One dict per remaining contest, keyed by the values of ``mapping``.

    Raises
    ------
    requests.HTTPError
        If the lobby endpoint answers with an error status.
    ValueError
        If an 'sd' value contains no parenthesised timestamp.
    KeyError
        If a contest record lacks one of the mapped keys.
    """
    # strftime pattern for the rendered start time, e.g. '2022-04-15 07:05 PM Friday'.
    # (kept out of the name `format` so the builtin is not shadowed)
    start_time_format = '%Y-%m-%d %I:%M %p %A'

    # gather contest info from the unofficial draftkings api; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely.
    response = requests.get('https://www.draftkings.com/lobby/getcontests?sport=MLB', timeout=30)
    response.raise_for_status()
    payload = response.json()

    # restrict ourselves to the upper limit of entry cost and lower limit of max participants.
    contests = [
        contest for contest in payload['Contests']
        if contest['fpp'] <= max_entry_fee and contest['m'] >= min_max_participants
    ]

    potential_contests = []
    for contest in contests:
        placeholder = {}
        for k, v in mapping.items():
            if k == 'sd':
                # raw string: '\(' in a normal literal is an invalid escape sequence.
                match = re.search(r'\((.*)\)', contest[k])
                if match is None:
                    raise ValueError(f'unexpected start time format: {contest[k]!r}')
                start_time = int(match.group(1))
                # the captured value is divided by 1e3, i.e. treated as milliseconds.
                placeholder[v] = datetime.fromtimestamp(start_time / 1e3).strftime(start_time_format)
            else:
                placeholder[v] = contest[k]

        potential_contests.append(placeholder)

    return potential_contests

def _first_attribute_value(attributes):
    """Return the 'value' of the first attribute dict, or None for an empty list."""
    return attributes[0]['value'] if attributes else None


def get_draft_parameters(game_id):
    """Download the draftable players of one DraftKings draft group.

    Parameters
    ----------
    game_id : int or str
        Identifier interpolated into the ``draftgroups/{game_id}/draftables`` URL.
        In this notebook it is called with a contest's 'draft_group' value.

    Returns
    -------
    list of dict
        One dict per draftable with the keys name, position, salary, status,
        is_disabled, opposing_sp, team, average_pts, potential_sp and
        opposing_team. 'potential_sp' is always present (False when the player
        has no attribute with id 137); previously the key was simply missing for
        such players. 'opposing_sp' / 'average_pts' are None when the matching
        attribute list is empty, and 'opposing_team' is None when it cannot be
        derived from the competition name.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    response = requests.get(
        f'https://api.draftkings.com/draftgroups/v1/draftgroups/{game_id}/draftables',
        timeout=30,
    )
    response.raise_for_status()
    draftables = response.json()['draftables']

    player_list = []
    for player in draftables:
        game_attributes = player['playerGameAttributes']

        placeholder = {
            'name'         :player['displayName'],
            'position'     :player['position'],
            'salary'       :player['salary'],
            'status'       :player['status'],
            'is_disabled'  :player['isDisabled'],
            # first game attribute / first stat attribute, as before
            'opposing_sp'  :_first_attribute_value(game_attributes),
            'team'         :player['teamAbbreviation'],
            'average_pts'  :_first_attribute_value(player['draftStatAttributes']),
            'potential_sp' :False,
        }

        # determine if player is a probable pitcher: the attribute with id 137
        # carries the string 'true' for them. The list is scanned back to front,
        # so the last matching attribute decides.
        for attribute in reversed(game_attributes):
            if attribute.get('id') == 137:
                placeholder['potential_sp'] = attribute.get('value') == 'true'
                break

        # find the opposing team: the first token of the competition name that is
        # neither the player's own team nor the '@' separator.
        matchup_tokens = player['competition']['name'].split(' ')
        placeholder['opposing_team'] = next(
            (token for token in matchup_tokens if token not in (placeholder['team'], '@')),
            None,
        )

        player_list.append(placeholder)

    return player_list

# Smoke run: list the open contests with an entry fee of at most 250 and room
# for at least 25 participants, then pull the player pool of the first one.
current_open_contests = parse_mlb_contests(contest_mapping, 250, 25)

if current_open_contests:
    # the draftables lookup is keyed on the contest's draft group
    test_id = current_open_contests[0]['draft_group']
    player_list = get_draft_parameters(test_id)
    print(player_list[0] if player_list else 'draft group has no draftable players.')
else:
    # previously this case surfaced as an IndexError on current_open_contests[0]
    player_list = []
    print('no open MLB contests matched the entry-fee / size filters.')

In [None]:
# Quick pitching-longevity model: strikeout and walk rates, weighted by how many
# pitches per game a pitcher throws relative to the mean of the data set.

df = pd.read_csv('../../datasets/baseball savant/pitching longevity.csv')

# display name is "<second column> <first column>", left-trimmed
df['name'] = (df[df.columns[1]] + ' ' + df[df.columns[0]]).str.lstrip()

# discard the auto-named "Unnamed..." columns, then the first four remaining columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')].iloc[:, 4:]

# innings per game, expressed as a fraction of nine innings
df['ip/game'] = df['p_formatted_ip'] / df['p_game'] / 9

# percentages -> fractions
percent_cols = ['p_k_percent', 'p_bb_percent','hard_hit_percent', 'whiff_percent']
df[percent_cols] = df[percent_cols] / 100

# workload: pitches per game, absolute and relative to the data-set mean
df['pitch/game'] = df['pitch_count'] / df['p_game']
df['pitch/league_pitch_avg'] = df['pitch/game'] / df['pitch/game'].mean()

# index by player name and shed the raw counters that were only needed above
df = df.set_index('name', drop=True).drop(columns=['p_formatted_ip', 'p_game', 'pitch_count'])

# rating = (2 * K% - 0.6 * BB%) scaled by relative workload
df['rating'] = (2 * df['p_k_percent'] - 0.6 * df['p_bb_percent']) * df['pitch/league_pitch_avg']

df.sort_values('rating', ascending=False)

# dictionary = df.to_dict(orient='index')

# player_dict = dict(zip(df.name, df.values))
# player_dict

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd

endpoints = {
    'probable_pitchers':'https://baseballsavant.mlb.com/probable-pitchers',
    'base'             :'https://baseballsavant.mlb.com'
}


def _to_int_or_none(value):
    """Convert ``value`` to int; return None when it is missing or not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _flatten_roster_player(player, stat_group):
    """Flatten one roster entry of the preview payload into a single-level dict.

    ``stat_group`` selects the ``player['seasonStats']`` sub-dict to copy
    ('batting' for hitters, 'pitching' for pitchers); its entries are stored
    with a ``cs_`` prefix.
    """
    record = {
        'name'          : player['person']['fullName'],
        # a blank/missing jersey number no longer aborts the whole scrape
        'jersey_number' : _to_int_or_none(player.get('jerseyNumber')),
        'position'      : player['position']['abbreviation'],
        'status'        : player['status']['code'],
        # NOTE(review): the key is 'qualified' but it stores the raw
        # 'didNotQualify' flag unchanged -- confirm which polarity is wanted.
        'qualified'     : player['didNotQualify']
    }

    for stat_name, stat_value in player['seasonStats'][stat_group].items():
        record[f'cs_{stat_name}'] = stat_value

    # every top-level key from the 11th onwards is copied verbatim; this relies
    # on the key order of the payload.
    for key in list(player.keys())[10:]:
        record[key] = player[key]

    return record


# via requests, first find a list of all the game references to scrape.

response = requests.get(endpoints['probable_pitchers'], timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')
game_hrefs = soup.select("a[href*=preview]")

# given the list of hrefs, visit each matchup page and scrape the player data.
for href in game_hrefs:
    endpoint = endpoints['base'] + str(href.attrs["href"])
    response = requests.get(endpoint, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # all the data is contained within a script under the class "article-template"
    div = soup.find_all("div", {"class":"article-template"})[0]

    # the script text holds one JSON object: take everything from the first '{'
    # up to (not including) the last ';' and parse it.
    div_str = div.select_one('script').string
    first_index = div_str.find('{')
    last_index = div_str.rfind(';')
    element = div_str[first_index:last_index]
    stats_dict = json.loads(element)

    # currently not used by anything in this cell
    query_params = {
        'hasLineup'  : True,
        'hasProbable': True,
    }

    # TODO: only the home roster is flattened; the away side is not handled yet.
    home_roster = stats_dict['home']['roster']
    roster_dictionary = {
        'pitchers' : [_flatten_roster_player(player, 'pitching') for player in home_roster['pitchers']],
        'position' : [_flatten_roster_player(player, 'batting') for player in home_roster['hitters']]
    }

    df = pd.DataFrame(roster_dictionary['position'])
    # df.to_csv('../datasets/baseball savant/')

    # only the first game is processed for now
    break

In [None]:
import pandas as pd

endpoints = dict(
    probable_pitchers='https://baseballsavant.mlb.com/probable-pitchers',
    base='https://baseballsavant.mlb.com',
)

# Pitcher rating built from the percentile-ranking export: rows with any missing
# value are discarded first.
# NOTE(review): other cells read this data set from '../../datasets/...' -- confirm
# which relative path is the right one for this notebook's location.
df = (
    pd.read_csv('../datasets/baseball savant/general sets/pitcher percentile rankings.csv')
    .dropna()
    .reset_index(drop=True)
)

# everything from the fourth column on is scaled from 0-100 down to 0-1
df[df.columns[3:]] = df[df.columns[3:]] / 100
df = df.drop(columns=['player_id', 'year'])

# add more weight to strikeouts, hardhit, bb%, exit velocity and fastball velocity
df['hard_hit_percent'] *= 1.55
df['k_percent'] *= 1.75
df['bb_percent'] *= 1.5
df['exit_velocity'] *= 1.5
df['fb_velocity'] *= 2

# raw rating: row-wise sum over every column after the first, rounded to 2 dp
df['rating'] = df.iloc[:, 1:].sum(axis=1).round(2)

df = df.sort_values('rating', ascending=False).reset_index(drop=True)

# min-max normalise the rating into [0, 1]
df['rating'] = df['rating'].sub(df['rating'].min()).div(df['rating'].max() - df['rating'].min())

# df.to_csv('../../datasets/baseball savant/models/pitching percentile rankings.csv')

mismatch_tuples, pitchers_found = [], []

def _lookup_rating(df, pitcher):
    """Return the pitcher's rating rounded to 2 dp, or None unless exactly one row matches."""
    matches = df.loc[df['player_name'] == pitcher, 'rating']
    if len(matches) != 1:
        return None
    return round(matches.item(), 2)


def generate_matchups(matchup_list, df):
    """Compare the ratings of the two probable pitchers of every matchup.

    Parameters
    ----------
    matchup_list : iterable of (away_pitcher, home_pitcher)
        Pitcher names, matched exactly against ``df['player_name']``.
    df : pandas.DataFrame
        Must contain the columns 'player_name' and 'rating'.

    Returns
    -------
    (mismatch_df, todays_pitchers)
        ``mismatch_df`` has one row per matchup for which both ratings were
        found, ordered by the rating difference (away minus home, ascending).
        ``todays_pitchers`` holds the rows of ``df`` for every pitcher that was
        found, ordered by rating, best first. Both frames are also displayed.

    Notes
    -----
    Results are collected in local lists, so calling the function again no
    longer appends to (and duplicates) the rows of earlier calls; the
    module-level ``mismatch_tuples`` / ``pitchers_found`` lists are not touched.
    Rows used to be ordered by home pitcher name, which did not match the
    purpose of the table.
    """
    mismatch_rows = []
    found = []

    for matchup in matchup_list:
        away_pitcher = matchup[0]
        home_pitcher = matchup[1]
        away_rating = _lookup_rating(df, away_pitcher)
        home_rating = _lookup_rating(df, home_pitcher)

        for pitcher, rating in ((away_pitcher, away_rating), (home_pitcher, home_rating)):
            if rating is None:
                print('not enough 2021 data on:', pitcher)
            else:
                found.append(pitcher)

        if away_rating is None or home_rating is None:
            print(f'mismatch couldn\'t be calculated for {away_pitcher} vs. {home_pitcher}.')
            continue

        mismatch_rows.append((away_pitcher, home_pitcher, round(away_rating - home_rating, 2)))

    # order by the mismatch value itself (third tuple element)
    sorted_available_matches = sorted(mismatch_rows, key=lambda row: row[2])

    mismatch_df = pd.DataFrame(sorted_available_matches, columns =['away pitcher', 'home pitcher', 'mismatch difference (positive = away favored, negative = home favored.'])

    todays_pitchers = df[df['player_name'].isin(found)]
    todays_pitchers = todays_pitchers.sort_values(by=['rating'], ascending=False).reset_index(drop=True)

    # show both tables in full, without row/column truncation
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(mismatch_df)
        display(todays_pitchers)

    return mismatch_df, todays_pitchers

In [None]:
import requests
from bs4 import BeautifulSoup
# via requests, fetch the probable-pitchers page and collect the pitcher names.

response = requests.get(endpoints['probable_pitchers'], timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

# matchup links that wrap an image (their markup contains 'src=') are skipped
matchup_strings = [i.text for i in soup.find_all("a", {"class": "matchup-link"}) if 'src=' not in str(i)]

# NOTE(review): hard-coded exclusion. Removing a single name shifts every later
# away/home pairing by one -- confirm this is still wanted before re-running.
excluded_pitchers = ['Jose Quintana']
matchup_strings = [pitcher for pitcher in matchup_strings if pitcher not in excluded_pitchers]

# consecutive names form one (away, home) pair; zip() simply ignores a trailing
# unpaired name, where indexing with i+1 used to raise IndexError.
matchup_tuples = list(zip(matchup_strings[::2], matchup_strings[1::2]))
if len(matchup_strings) % 2:
    print('odd number of probable pitchers; ignoring unpaired name:', matchup_strings[-1])

mismatch_df, todays_pitchers = generate_matchups(matchup_tuples, df)

In [None]:
# Batter rating built from the percentile-ranking export; rows with any missing
# value are discarded first.
df = pd.read_csv('../datasets/baseball savant/general sets/batter percentile rankings.csv')
df = df.dropna().reset_index(drop=True)

# everything from the fourth column on is scaled from 0-100 down to 0-1
df[df.columns[3:]] = df[df.columns[3:]].div(100)
df = df.drop(['player_id','year'], axis=1)

# add more weight to strikeouts, hardhit, and bb%
# (k_percent and bb_percent are now scaled from their own columns; they were
# previously overwritten with multiples of hard_hit_percent, a copy-paste slip)
df['hard_hit_percent'] = df['hard_hit_percent'].mul(1.55)
df['k_percent'] = df['k_percent'].mul(1.75)
df['bb_percent'] = df['bb_percent'].mul(1.5)
df['exit_velocity'] = df['exit_velocity'].mul(2)
df['xwoba'] = df['xwoba'].mul(2)

# raw rating: row-wise sum over every column after the first, rounded to 2 dp
df['rating'] = df[df.columns[1:]].sum(axis=1).round(2)

df = df.sort_values(by=['rating'], ascending=False).reset_index(drop=True)

# min-max normalise the rating into [0, 1]
df['rating'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())

df

# NOTE: the disabled export below still targets the pitching model file; pick a
# batter-specific file name before re-enabling it.
# df.to_csv('../../datasets/baseball savant/models/pitching percentile rankings.csv')


In [None]:
def _parse_lineup(ul):
    """Return [(position, player link text, batting order), ...] for one lineup list.

    The first two <li> items of the list are skipped and at most nine batters
    are kept, numbered from 1.
    """
    batters = ul.find_all('li')[2:]
    return [
        (li.find("div", {'class': "lineup__pos"}).text, li.find("a").text, order)
        for order, li in enumerate(batters[:9], start=1)
    ]


def _second_name_token(full_name):
    """Return the second space-separated token of a name (the whole name if there is only one)."""
    parts = full_name.split(' ')
    return parts[1] if len(parts) > 1 else parts[0]


response = requests.get('https://www.rotowire.com/baseball/daily-lineups.php', timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')
team_abbrs = [result.text for result in soup.find_all("div", {"class": "lineup__abbr"})]
away_lineups = []
home_lineups = []

# every "lineup__main" block holds the visiting and the home lineup of one game;
# both sides share the same parsing helper.
lineups = soup.find_all("div", {"class":"lineup__main"})
for lineup_block in lineups:
    away_lineups.extend(
        _parse_lineup(ul) for ul in lineup_block.find_all("ul", {"class":"lineup__list is-visit"})
    )
    home_lineups.extend(
        _parse_lineup(ul) for ul in lineup_block.find_all("ul", {"class":"lineup__list is-home"})
    )

# team abbreviations are paired positionally: even positions with the away
# lineups, odd positions with the home lineups.
away_dict = dict(zip(team_abbrs[::2], away_lineups))
home_dict = dict(zip(team_abbrs[1::2], home_lineups))

# one lookup for all teams. A team that appears twice keeps only its last lineup.
lineup_dict = {**away_dict, **home_dict}
# keys = ['CLE','KC','PIT','STL','MIL','BAL','COL','TEX']
# for k in keys:
#     print(k)
#     print(lineup_dict[k][:5])
#     print()

# second name token of the first seven batters of every lineup; single-token
# names no longer raise IndexError.
player_list = [_second_name_token(player[1]) for k in lineup_dict for player in lineup_dict[k][:7]]


In [None]:
# Percentile rankings of today's probable pitchers, ordered by fastball velocity.
df = pd.read_csv('../../datasets/baseball savant/general sets/pitcher percentile rankings.csv')

velo = (
    df.drop(columns=['player_id', 'year'])
    .sort_values('fb_velocity', ascending=False)
    .reset_index(drop=True)
)

# keep only the pitchers named on the probable-pitchers page
todays_pitcher_velo = velo.loc[velo['player_name'].isin(matchup_strings)]

# show the table in full, without row/column truncation
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(todays_pitcher_velo)

In [None]:
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
# get box scores

def _parse_box_score_stats_table(table, is_batting):
    """Turn one box-score stats table into a DataFrame indexed by player name.

    The row label cell is split on spaces: its first two tokens form the name
    and its last token is treated as the position / row marker. Rows whose last
    token is excluded are dropped: pitchers, pinch hitters, totals and blanks
    for batting tables, totals and blanks for pitching tables. Returns None when
    no player row remains.
    """
    rows = table.find_all('tr')
    if not rows:
        return None

    # dict.fromkeys collapses repeated header texts, like the dict the stats
    # were zipped onto before
    stat_columns = list(dict.fromkeys(header.text for header in rows[0].find_all('th')[1:]))
    excluded = ['P', 'PH', 'Totals', ''] if is_batting else ['Totals', '']

    records = []
    for row in rows[1:]:
        label_cell = row.find('th')
        if label_cell is None:
            # nothing to read a name from
            continue

        player_info = label_cell.text.split(' ')
        last_token = player_info[-1]
        # NOTE(review): the pitching filter used to test the constant 'P' and so
        # never removed anything; it now checks the parsed token like the
        # batting filter does -- confirm against a live page.
        if last_token in excluded:
            continue

        record = dict.fromkeys(stat_columns)
        # cell texts are assigned to the header names positionally
        record.update(zip(stat_columns, (stat.text for stat in row.find_all('td'))))
        record['name'] = " ".join(player_info[:2]).replace(u'\xa0', u'')
        record['position'] = last_token if is_batting else 'P'
        records.append(record)

    if not records:
        return None
    return pd.DataFrame(records).set_index(['name'])


def get_individual_box_scores(href):
    """Download one baseball-reference box score and display its stats tables.

    Parameters
    ----------
    href : mapping
        Link element (or dict) whose 'href' value contains '/boxes/<path>'.

    Returns
    -------
    dict
        Line score as ``{inning or 'R'/'H'/'E': {team: value}}``. The inning
        keys are derived from the number of score cells, so shortened and
        extra-inning games line up with R/H/E (the fixed 1-9 layout shifted
        them). Previously nothing was returned.

    Side effects
    ------------
    Prints a title and displays one DataFrame per batting / pitching table.
    """
    base_url = 'https://www.baseball-reference.com/boxes/'
    new_url = base_url + href['href'].split('/boxes/')[1]
    response = requests.get(new_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # the first table of the page is the line score; rows 1 and 2 are the two teams.
    line_score_rows = soup.find_all("table")[0].find_all('tr')[1:3]

    box_score = {}
    for row in line_score_rows:
        team = row.select("a[href*='/teams/']")[0]['href'].split('/')[2]
        # 'X' (half inning not played) counts as 0
        scores = [0 if td.text == 'X' else int(td.text) for td in row.find_all('td', {'class':'center'})[1:]]
        # the last three cells are R, H, E; everything before them is an inning
        keys = list(range(1, len(scores) - 2)) + ['R', 'H', 'E']
        for key, score in zip(keys, scores):
            box_score.setdefault(key, {})[team] = score

    # the stats tables are only found after the HTML comment markers are stripped
    comments_removed = response.text.replace("<!--","").replace("-->","")
    _soup = BeautifulSoup(comments_removed, 'html.parser')

    tables = _soup.find_all("table", {"class":"sortable stats_table min_width shade_zero"})
    for table in tables:
        table_id = table.get("id", "")
        if 'batting' in table_id:
            kind = 'batting'
        elif 'pitching' in table_id:
            kind = 'pitching'
        else:
            continue

        # the id is "<TeamName><kind>"; split the CamelCase team name into words
        team_words = re.findall('[A-Z][^A-Z]*', table_id.split(kind)[0])
        title = " ".join(team_words) + f' Game {kind.capitalize()} Stats'

        stats_df = _parse_box_score_stats_table(table, kind == 'batting')
        if stats_df is None:
            continue

        print(title)
        display(stats_df)

    return box_score

def get_box_scores(year=2022):
    """Return the links of finished games listed on baseball-reference's box-score page.

    Parameters
    ----------
    year : int or str, default 2022
        Only links whose href contains this text are considered (previously
        hard-coded to '2022').

    Returns
    -------
    list
        The matching link elements whose text contains 'Final'.

    Raises
    ------
    requests.HTTPError
        If the page answers with an error status.
    """
    base_url = 'https://www.baseball-reference.com/boxes/'
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    return [href for href in soup.select(f"a[href*='{year}']") if 'Final' in href.text]

In [None]:
import requests
import re
import json
from IPython.display import display, HTML
import pandas as pd
# get updated baseball savant percentile rankings

endpoints = {
    'batters_percentile':'https://baseballsavant.mlb.com/leaderboard/percentile-rankings?type=batter&team=',
    'pitchers_percentile':'https://baseballsavant.mlb.com/leaderboard/percentile-rankings?type=pitcher&year=2022&team=',
    'batters_custom':"https://baseballsavant.mlb.com/leaderboard/custom?year=2022&type=batter&filter=&sort=4&sortDir=desc&min=10&selections=player_age,b_ab,b_total_pa,b_total_hits,b_single,b_double,b_triple,b_home_run,b_strikeout,b_walk,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_rbi,b_lob,b_total_bases,r_total_caught_stealing,r_total_stolen_base,b_ab_scoring,b_ball,b_called_strike,b_catcher_interf,b_foul,b_foul_tip,b_game,b_gnd_into_dp,b_gnd_into_tp,b_gnd_rule_double,b_hit_by_pitch,b_hit_ground,b_hit_fly,b_hit_into_play,b_hit_line_drive,b_hit_popup,b_out_fly,b_out_ground,b_out_line_drive,b_out_popup,b_intent_ball,b_intent_walk,b_interference,b_pinch_hit,b_pinch_run,b_pitchout,b_played_dh,b_sac_bunt,b_sac_fly,b_swinging_strike,r_caught_stealing_2b,r_caught_stealing_3b,r_caught_stealing_home,r_defensive_indiff,r_interference,r_pickoff_1b,r_pickoff_2b,r_pickoff_3b,r_run,r_stolen_base_2b,r_stolen_base_3b,r_stolen_base_home,b_total_ball,b_total_sacrifices,b_total_strike,b_total_swinging_strike,b_total_pitches,r_stolen_base_pct,r_total_pickoff,b_reached_on_error,b_walkoff,b_reached_on_int,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,bacon,xbacon,xbadiff,xslgdiff,wobadif,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel,barrel_batted_rate,solidcontact_percent,flareburner_percent,poorlyunder_percent,poorlytopped_percent,poorlyweak_percent,hard_hit_percent,z_swing_percent,z_swing_miss_percent,oz_swing_percent,oz_swing_miss_percent,oz_contact_percent,out_zone_swing_miss,out_zone_swing,out_zone_percent,out_zone,meatball_swing_percent,meatball_percent,pitch_count_offspeed,pitch_count_fastball,pitch_count_breaking,pitch_count,iz_contact_percent,in_zone_swing_miss,in_zone_swing,in_zone_percent,in_zone,edge_percent,edge,whiff_percent,swing_percent,pull_percent,straightaway_percent,opposite_percent,batted_ball,f_strike_percent,groundballs_percent,groundballs,flyballs_percent,flyballs,linedrives_percent,linedrives,
popups_percent,popups,pop_2b_sba_count,pop_2b_sba,pop_2b_sb,pop_2b_cs,pop_3b_sba_count,pop_3b_sba,pop_3b_sb,pop_3b_cs,exchange_2b_3b_sba,maxeff_arm_2b_3b_sba,n_outs_above_average,n_fieldout_5stars,n_opp_5stars,n_5star_percent,n_fieldout_4stars,n_opp_4stars,n_4star_percent,n_fieldout_4stars,n_fieldout_3stars,n_opp_3stars,n_3star_percent,n_fieldout_3stars,n_opp_2stars,n_2star_percent,n_fieldout_2stars,n_opp_2stars,n_1star_percent,n_fieldout_1stars,n_opp_1stars,n_1star_percent,rel_league_reaction_distance,rel_league_burst_distance,rel_league_routing_distance,rel_league_bootup_distance,f_bootup_distance,n_bolts,hp_to_1b,sprint_speed,&chart=false&x=xba&y=xba&r=no&chartType=beeswarm",
    'pitchers_custom':"https://baseballsavant.mlb.com/leaderboard/custom?year=2022&type=pitcher&filter=&sort=4&sortDir=asc&min=10&selections=player_age,p_game,p_formatted_ip,p_total_pa,p_ab,p_total_hits,p_single,p_double,p_triple,p_home_run,p_strikeout,p_walk,p_k_percent,p_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,p_earned_run,p_run,p_save,p_blown_save,p_out,p_win,p_loss,p_wild_pitch,p_balk,p_shutout,p_era,p_opp_batting_avg,p_opp_on_base_avg,p_total_stolen_base,p_pickoff_attempt_1b,p_pickoff_attempt_2b,p_pickoff_attempt_3b,p_pickoff_1b,p_pickoff_2b,p_pickoff_3b,p_lob,p_rbi,p_stolen_base_2b,p_stolen_base_3b,p_stolen_base_home,p_quality_start,p_walkoff,p_run_support,p_ab_scoring,p_automatic_ball,p_ball,p_called_strike,p_catcher_interf,p_caught_stealing_2b,p_caught_stealing_3b,p_caught_stealing_home,p_complete_game,p_defensive_indiff,p_foul,p_foul_tip,p_game_finished,p_game_in_relief,p_gnd_into_dp,p_gnd_into_tp,p_gnd_rule_double,p_hit_by_pitch,p_hit_fly,p_hit_ground,p_hit_line_drive,p_hit_into_play,p_hit_scoring,p_hold,p_intent_ball,p_intent_walk,p_missed_bunt,p_out_fly,p_out_ground,p_out_line_drive,p_passed_ball,p_pickoff_error_1b,p_pickoff_error_2b,p_pickoff_error_3b,p_pitchout,p_relief_no_out,p_sac_bunt,p_sac_fly,p_starting_p,p_swinging_strike,p_unearned_run,p_total_ball,p_total_bases,p_total_caught_stealing,p_total_pickoff,p_total_pickoff_attempt,p_total_pickoff_error,p_total_pitches,p_total_sacrifices,p_total_strike,p_total_swinging_strike,p_inh_runner,p_inh_runner_scored,p_beq_runner,p_beq_runner_scored,p_reached_on_error,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,bacon,xbacon,xbadiff,xslgdiff,wobadif,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel,barrel_batted_rate,solidcontact_percent,flareburner_percent,poorlyunder_percent,poorlytopped_percent,poorlyweak_percent,hard_hit_percent,z_swing_percent,z_swing_miss_percent,oz_swing_percent,oz_swing_miss_percent,oz_contact_percent,out_zone_swing_miss,out_zone_s
wing,out_zone_percent,out_zone,meatball_swing_percent,meatball_percent,pitch_count_offspeed,pitch_count_fastball,pitch_count_breaking,pitch_count,iz_contact_percent,in_zone_swing_miss,in_zone_swing,in_zone_percent,in_zone,edge_percent,edge,whiff_percent,swing_percent,pull_percent,straightaway_percent,opposite_percent,batted_ball,f_strike_percent,groundballs_percent,groundballs,flyballs_percent,flyballs,linedrives_percent,linedrives,popups_percent,popups,pitch_hand,n,n_ff_formatted,ff_avg_speed,ff_avg_spin,ff_avg_break_x,ff_avg_break_z,ff_avg_break,ff_range_speed,n_sl_formatted,sl_avg_speed,sl_avg_spin,sl_avg_break_x,sl_avg_break_z,sl_avg_break,sl_range_speed,n_ch_formatted,ch_avg_speed,ch_avg_spin,ch_avg_break_x,ch_avg_break_z,ch_avg_break,ch_range_speed,n_cukc_formatted,cu_avg_speed,cu_avg_spin,cu_avg_break_x,cu_avg_break_z,cu_avg_break,cu_range_speed,n_sift_formatted,si_avg_speed,si_avg_spin,si_avg_break_x,si_avg_break_z,si_avg_break,si_range_speed,n_fc_formatted,fc_avg_speed,fc_avg_spin,fc_avg_break_x,fc_avg_break_z,fc_avg_break,fc_range_speed,n_fs_formatted,fs_avg_speed,fs_avg_spin,fs_avg_break_x,fs_avg_break_z,fs_avg_break,fs_range_speed,n_kn_formatted,kn_avg_speed,kn_avg_spin,kn_avg_break_x,kn_avg_break_z,kn_avg_break,kn_range_speed,n_fastball_formatted,fastball_avg_speed,fastball_avg_spin,fastball_avg_break_x,fastball_avg_break_z,fastball_avg_break,fastball_range_speed,n_breaking_formatted,breaking_avg_speed,breaking_avg_spin,breaking_avg_break_x,breaking_avg_break_z,breaking_avg_break,breaking_range_speed,n_offspeed_formatted,offspeed_avg_speed,offspeed_avg_spin,offspeed_avg_break_x,offspeed_avg_break_z,offspeed_avg_break,offspeed_range_speed,&chart=false&x=xba&y=xba&r=no&chartType=beeswarm"
}

# keep a JSON copy of the endpoint table next to the other housekeeping files
with open('../housekeeping/baseball_savant_endpoints.txt', 'w') as file:
    json.dump(endpoints, file)

# position number (1-11) -> position abbreviation
position_mapping = dict(
    enumerate(
        ['P', 'C', '1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF', 'DH', 'UTIL'],
        start=1,
    )
)

def get_baseball_savant_leaderboard(endpoint):
    """Download a Baseball Savant leaderboard page and return its data as a DataFrame.

    The records are read from the first JSON array embedded in the page's
    ``div.article-template`` element.

    Parameters
    ----------
    endpoint : str
        Full URL of the leaderboard page.

    Returns
    -------
    pandas.DataFrame
        One row per element of the embedded array.

    Raises
    ------
    requests.HTTPError
        If the page answers with an error status.
    ValueError
        If the page has no ``article-template`` div or no parsable JSON array.
    """
    response = requests.get(endpoint, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # find leaderboard data (should only be one element.)
    containers = soup.find_all("div", {"class":"article-template"})
    if not containers:
        raise ValueError(f'no "article-template" element found at {endpoint}')
    script = str(containers[0])

    # Decode the first '[' that starts a valid JSON array. raw_decode stops at
    # the matching closing bracket, so nested arrays inside a record are kept;
    # the old non-greedy regex cut the data at the first ']' it met.
    decoder = json.JSONDecoder()
    for opening in re.finditer(r'\[', script):
        try:
            leaderboard, _ = decoder.raw_decode(script[opening.start():])
        except json.JSONDecodeError:
            continue
        return pd.DataFrame(leaderboard)

    raise ValueError(f'no JSON array found in the leaderboard element at {endpoint}')

# Only the pitcher percentile leaderboard is fetched here; the other three
# leaderboard calls are kept below but disabled.
# batters_percentiles = get_baseball_savant_leaderboard(endpoints['batters_percentile'])
pitchers_percentiles = get_baseball_savant_leaderboard(endpoints['pitchers_percentile'])
# batter_custom_leaderboard  = get_baseball_savant_leaderboard(endpoints['batters_custom'])
# pitcher_custom_leaderboard = get_baseball_savant_leaderboard(endpoints['pitchers_custom'])

In [None]:
# custom leaderboard endpoint

batters_endpoint = "https://baseballsavant.mlb.com/leaderboard/custom?year=2022&type=batter&filter=&sort=4&sortDir=desc&min=10&selections=player_age,b_ab,b_total_pa,b_total_hits,b_single,b_double,b_triple,b_home_run,b_strikeout,b_walk,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_rbi,b_lob,b_total_bases,r_total_caught_stealing,r_total_stolen_base,b_ab_scoring,b_ball,b_called_strike,b_catcher_interf,b_foul,b_foul_tip,b_game,b_gnd_into_dp,b_gnd_into_tp,b_gnd_rule_double,b_hit_by_pitch,b_hit_ground,b_hit_fly,b_hit_into_play,b_hit_line_drive,b_hit_popup,b_out_fly,b_out_ground,b_out_line_drive,b_out_popup,b_intent_ball,b_intent_walk,b_interference,b_pinch_hit,b_pinch_run,b_pitchout,b_played_dh,b_sac_bunt,b_sac_fly,b_swinging_strike,r_caught_stealing_2b,r_caught_stealing_3b,r_caught_stealing_home,r_defensive_indiff,r_interference,r_pickoff_1b,r_pickoff_2b,r_pickoff_3b,r_run,r_stolen_base_2b,r_stolen_base_3b,r_stolen_base_home,b_total_ball,b_total_sacrifices,b_total_strike,b_total_swinging_strike,b_total_pitches,r_stolen_base_pct,r_total_pickoff,b_reached_on_error,b_walkoff,b_reached_on_int,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,bacon,xbacon,xbadiff,xslgdiff,wobadif,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel,barrel_batted_rate,solidcontact_percent,flareburner_percent,poorlyunder_percent,poorlytopped_percent,poorlyweak_percent,hard_hit_percent,z_swing_percent,z_swing_miss_percent,oz_swing_percent,oz_swing_miss_percent,oz_contact_percent,out_zone_swing_miss,out_zone_swing,out_zone_percent,out_zone,meatball_swing_percent,meatball_percent,pitch_count_offspeed,pitch_count_fastball,pitch_count_breaking,pitch_count,iz_contact_percent,in_zone_swing_miss,in_zone_swing,in_zone_percent,in_zone,edge_percent,edge,whiff_percent,swing_percent,pull_percent,straightaway_percent,opposite_percent,batted_ball,f_strike_percent,groundballs_percent,groundballs,flyballs_percent,flyballs,linedrives_percent,linedrives,po
pups_percent,popups,pop_2b_sba_count,pop_2b_sba,pop_2b_sb,pop_2b_cs,pop_3b_sba_count,pop_3b_sba,pop_3b_sb,pop_3b_cs,exchange_2b_3b_sba,maxeff_arm_2b_3b_sba,n_outs_above_average,n_fieldout_5stars,n_opp_5stars,n_5star_percent,n_fieldout_4stars,n_opp_4stars,n_4star_percent,n_fieldout_4stars,n_fieldout_3stars,n_opp_3stars,n_3star_percent,n_fieldout_3stars,n_opp_2stars,n_2star_percent,n_fieldout_2stars,n_opp_2stars,n_1star_percent,n_fieldout_1stars,n_opp_1stars,n_1star_percent,rel_league_reaction_distance,rel_league_burst_distance,rel_league_routing_distance,rel_league_bootup_distance,f_bootup_distance,n_bolts,hp_to_1b,sprint_speed,&chart=false&x=xba&y=xba&r=no&chartType=beeswarm"
pitcher_endpoint = "https://baseballsavant.mlb.com/leaderboard/custom?year=2022&type=pitcher&filter=&sort=4&sortDir=asc&min=10&selections=player_age,p_game,p_formatted_ip,p_total_pa,p_ab,p_total_hits,p_single,p_double,p_triple,p_home_run,p_strikeout,p_walk,p_k_percent,p_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,p_earned_run,p_run,p_save,p_blown_save,p_out,p_win,p_loss,p_wild_pitch,p_balk,p_shutout,p_era,p_opp_batting_avg,p_opp_on_base_avg,p_total_stolen_base,p_pickoff_attempt_1b,p_pickoff_attempt_2b,p_pickoff_attempt_3b,p_pickoff_1b,p_pickoff_2b,p_pickoff_3b,p_lob,p_rbi,p_stolen_base_2b,p_stolen_base_3b,p_stolen_base_home,p_quality_start,p_walkoff,p_run_support,p_ab_scoring,p_automatic_ball,p_ball,p_called_strike,p_catcher_interf,p_caught_stealing_2b,p_caught_stealing_3b,p_caught_stealing_home,p_complete_game,p_defensive_indiff,p_foul,p_foul_tip,p_game_finished,p_game_in_relief,p_gnd_into_dp,p_gnd_into_tp,p_gnd_rule_double,p_hit_by_pitch,p_hit_fly,p_hit_ground,p_hit_line_drive,p_hit_into_play,p_hit_scoring,p_hold,p_intent_ball,p_intent_walk,p_missed_bunt,p_out_fly,p_out_ground,p_out_line_drive,p_passed_ball,p_pickoff_error_1b,p_pickoff_error_2b,p_pickoff_error_3b,p_pitchout,p_relief_no_out,p_sac_bunt,p_sac_fly,p_starting_p,p_swinging_strike,p_unearned_run,p_total_ball,p_total_bases,p_total_caught_stealing,p_total_pickoff,p_total_pickoff_attempt,p_total_pickoff_error,p_total_pitches,p_total_sacrifices,p_total_strike,p_total_swinging_strike,p_inh_runner,p_inh_runner_scored,p_beq_runner,p_beq_runner_scored,p_reached_on_error,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,bacon,xbacon,xbadiff,xslgdiff,wobadif,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel,barrel_batted_rate,solidcontact_percent,flareburner_percent,poorlyunder_percent,poorlytopped_percent,poorlyweak_percent,hard_hit_percent,z_swing_percent,z_swing_miss_percent,oz_swing_percent,oz_swing_miss_percent,oz_contact_percent,out_zone_swing_miss,out_zone_swin
g,out_zone_percent,out_zone,meatball_swing_percent,meatball_percent,pitch_count_offspeed,pitch_count_fastball,pitch_count_breaking,pitch_count,iz_contact_percent,in_zone_swing_miss,in_zone_swing,in_zone_percent,in_zone,edge_percent,edge,whiff_percent,swing_percent,pull_percent,straightaway_percent,opposite_percent,batted_ball,f_strike_percent,groundballs_percent,groundballs,flyballs_percent,flyballs,linedrives_percent,linedrives,popups_percent,popups,pitch_hand,n,n_ff_formatted,ff_avg_speed,ff_avg_spin,ff_avg_break_x,ff_avg_break_z,ff_avg_break,ff_range_speed,n_sl_formatted,sl_avg_speed,sl_avg_spin,sl_avg_break_x,sl_avg_break_z,sl_avg_break,sl_range_speed,n_ch_formatted,ch_avg_speed,ch_avg_spin,ch_avg_break_x,ch_avg_break_z,ch_avg_break,ch_range_speed,n_cukc_formatted,cu_avg_speed,cu_avg_spin,cu_avg_break_x,cu_avg_break_z,cu_avg_break,cu_range_speed,n_sift_formatted,si_avg_speed,si_avg_spin,si_avg_break_x,si_avg_break_z,si_avg_break,si_range_speed,n_fc_formatted,fc_avg_speed,fc_avg_spin,fc_avg_break_x,fc_avg_break_z,fc_avg_break,fc_range_speed,n_fs_formatted,fs_avg_speed,fs_avg_spin,fs_avg_break_x,fs_avg_break_z,fs_avg_break,fs_range_speed,n_kn_formatted,kn_avg_speed,kn_avg_spin,kn_avg_break_x,kn_avg_break_z,kn_avg_break,kn_range_speed,n_fastball_formatted,fastball_avg_speed,fastball_avg_spin,fastball_avg_break_x,fastball_avg_break_z,fastball_avg_break,fastball_range_speed,n_breaking_formatted,breaking_avg_speed,breaking_avg_spin,breaking_avg_break_x,breaking_avg_break_z,breaking_avg_break,breaking_range_speed,n_offspeed_formatted,offspeed_avg_speed,offspeed_avg_spin,offspeed_avg_break_x,offspeed_avg_break_z,offspeed_avg_break,offspeed_range_speed,&chart=false&x=xba&y=xba&r=no&chartType=beeswarm"

def get_custom_leaderboard(endpoint):
    """Download a Baseball Savant custom leaderboard page and return it as a DataFrame.

    The markup of the page's first ``div.article-template`` element is searched
    for the first ``[...]`` span, which is re-wrapped in brackets and parsed as
    a JSON list of records.

    Args:
        endpoint: Full URL of the custom leaderboard page.

    Returns:
        pandas.DataFrame built from the parsed JSON list.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        ValueError: no ``article-template`` div, or no bracketed span inside it,
            was found (``json.JSONDecodeError`` from a malformed payload is
            also a ``ValueError``).
    """
    response = requests.get(endpoint)
    # Fail loudly on an error page instead of trying to parse it.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # find leaderboard data (should only be one element.)
    containers = soup.find_all("div", {"class": "article-template"})
    if not containers:
        raise ValueError('no "article-template" div found on the leaderboard page')
    script = str(containers[0])

    # Raw string: '\[' inside a normal literal is an invalid escape sequence.
    # The match is non-greedy, so only the first [...] span is captured.
    bracketed_spans = re.findall(r'\[(.*?)\]', script)
    if not bracketed_spans:
        raise ValueError('no bracketed JSON span found inside the "article-template" div')

    # put the captured text back within brackets to create a pseudo-list,
    # then load this as json.
    leaderboard = json.loads('[' + bracketed_spans[0] + ']')

    return pd.DataFrame(leaderboard)

# Fetch the batter and pitcher custom leaderboards (one call per endpoint URL).
batter_custom_leaderboard  = get_custom_leaderboard(batters_endpoint)
pitcher_custom_leaderboard = get_custom_leaderboard(pitcher_endpoint)

In [None]:
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display, HTML

# team based stats

def get_team_stats():
    """Scrape ESPN's MLB team batting and pitching stat pages.

    Each page's markup is searched for the JSON that sits between
    ``"teamStats":`` and ``,"dictionary":``. Every team becomes one row holding
    its abbreviation ('abbr'), display name ('name') and one column per stat.

    Returns:
        dict with two DataFrames, both re-indexed from 0 after sorting:
            'batting'  -- sorted by 'OPS', ascending.
            'pitching' -- sorted by 'WHIP', descending.

    Raises:
        requests.HTTPError: a page answered with a 4xx/5xx status.
        ValueError: the teamStats payload could not be located in a page.
    """
    # view name -> (url, column to sort by, ascending?)
    views = {
        'batting': ("https://www.espn.com/mlb/stats/team", 'OPS', True),
        'pitching': ("https://www.espn.com/mlb/stats/team/_/view/pitching", 'WHIP', False),
    }

    return_dfs = {}

    for view, (url, sort_column, ascending) in views.items():
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        payloads = re.findall(r'"teamStats":(.*?),"dictionary":', str(soup))
        if not payloads:
            raise ValueError(f'could not find the teamStats payload for {view} at {url}')
        team_stats = json.loads(payloads[0])

        stat_list = []

        for team in team_stats:
            team_dict = {
                'abbr': team['team']['abbrev'],
                'name': team['team']['displayName'],
            }

            for stat in team['stats']:
                value = stat['value']
                try:
                    team_dict[stat['name']] = float(value)
                except (TypeError, ValueError):
                    # Keep values that are not numeric (including None, which
                    # raises TypeError rather than ValueError) unchanged.
                    team_dict[stat['name']] = value

            stat_list.append(team_dict)

        return_dfs[view] = (
            pd.DataFrame(stat_list)
            .sort_values(by=[sort_column], ascending=ascending)
            .reset_index(drop=True)
        )

    return return_dfs

# Run the scraper once; the result is bound to `_` and not used again in this cell.
_ = get_team_stats()

In [2]:
import requests
from bs4 import BeautifulSoup
# via requests, first find a list of all the game references to scrape.

endpoints = {
    'probable_pitchers':'https://baseballsavant.mlb.com/probable-pitchers',
    'base'             :'https://baseballsavant.mlb.com',
    'batting':"https://www.espn.com/mlb/stats/team",
    'pitching':"https://www.espn.com/mlb/stats/team/_/view/pitching",
    'batters_percentile':'https://baseballsavant.mlb.com/leaderboard/percentile-rankings?type=batter&team=',
    'pitchers_percentile':'https://baseballsavant.mlb.com/leaderboard/percentile-rankings?type=pitcher&year=2022&team=',
    'batters_custom':"https://baseballsavant.mlb.com/leaderboard/custom?year=2022&type=batter&filter=&sort=4&sortDir=desc&min=10&selections=player_age,b_ab,b_total_pa,b_total_hits,b_single,b_double,b_triple,b_home_run,b_strikeout,b_walk,b_k_percent,b_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,b_rbi,b_lob,b_total_bases,r_total_caught_stealing,r_total_stolen_base,b_ab_scoring,b_ball,b_called_strike,b_catcher_interf,b_foul,b_foul_tip,b_game,b_gnd_into_dp,b_gnd_into_tp,b_gnd_rule_double,b_hit_by_pitch,b_hit_ground,b_hit_fly,b_hit_into_play,b_hit_line_drive,b_hit_popup,b_out_fly,b_out_ground,b_out_line_drive,b_out_popup,b_intent_ball,b_intent_walk,b_interference,b_pinch_hit,b_pinch_run,b_pitchout,b_played_dh,b_sac_bunt,b_sac_fly,b_swinging_strike,r_caught_stealing_2b,r_caught_stealing_3b,r_caught_stealing_home,r_defensive_indiff,r_interference,r_pickoff_1b,r_pickoff_2b,r_pickoff_3b,r_run,r_stolen_base_2b,r_stolen_base_3b,r_stolen_base_home,b_total_ball,b_total_sacrifices,b_total_strike,b_total_swinging_strike,b_total_pitches,r_stolen_base_pct,r_total_pickoff,b_reached_on_error,b_walkoff,b_reached_on_int,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,bacon,xbacon,xbadiff,xslgdiff,wobadif,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel,barrel_batted_rate,solidcontact_percent,flareburner_percent,poorlyunder_percent,poorlytopped_percent,poorlyweak_percent,hard_hit_percent,z_swing_percent,z_swing_miss_percent,oz_swing_percent,oz_swing_miss_percent,oz_contact_percent,out_zone_swing_miss,out_zone_swing,out_zone_percent,out_zone,meatball_swing_percent,meatball_percent,pitch_count_offspeed,pitch_count_fastball,pitch_count_breaking,pitch_count,iz_contact_percent,in_zone_swing_miss,in_zone_swing,in_zone_percent,in_zone,edge_percent,edge,whiff_percent,swing_percent,pull_percent,straightaway_percent,opposite_percent,batted_ball,f_strike_percent,groundballs_percent,groundballs,flyballs_percent,flyballs,linedrives_percent,linedrives,
popups_percent,popups,pop_2b_sba_count,pop_2b_sba,pop_2b_sb,pop_2b_cs,pop_3b_sba_count,pop_3b_sba,pop_3b_sb,pop_3b_cs,exchange_2b_3b_sba,maxeff_arm_2b_3b_sba,n_outs_above_average,n_fieldout_5stars,n_opp_5stars,n_5star_percent,n_fieldout_4stars,n_opp_4stars,n_4star_percent,n_fieldout_4stars,n_fieldout_3stars,n_opp_3stars,n_3star_percent,n_fieldout_3stars,n_opp_2stars,n_2star_percent,n_fieldout_2stars,n_opp_2stars,n_1star_percent,n_fieldout_1stars,n_opp_1stars,n_1star_percent,rel_league_reaction_distance,rel_league_burst_distance,rel_league_routing_distance,rel_league_bootup_distance,f_bootup_distance,n_bolts,hp_to_1b,sprint_speed,&chart=false&x=xba&y=xba&r=no&chartType=beeswarm",
    'pitchers_custom':"https://baseballsavant.mlb.com/leaderboard/custom?year=2022&type=pitcher&filter=&sort=4&sortDir=asc&min=10&selections=player_age,p_game,p_formatted_ip,p_total_pa,p_ab,p_total_hits,p_single,p_double,p_triple,p_home_run,p_strikeout,p_walk,p_k_percent,p_bb_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,isolated_power,p_earned_run,p_run,p_save,p_blown_save,p_out,p_win,p_loss,p_wild_pitch,p_balk,p_shutout,p_era,p_opp_batting_avg,p_opp_on_base_avg,p_total_stolen_base,p_pickoff_attempt_1b,p_pickoff_attempt_2b,p_pickoff_attempt_3b,p_pickoff_1b,p_pickoff_2b,p_pickoff_3b,p_lob,p_rbi,p_stolen_base_2b,p_stolen_base_3b,p_stolen_base_home,p_quality_start,p_walkoff,p_run_support,p_ab_scoring,p_automatic_ball,p_ball,p_called_strike,p_catcher_interf,p_caught_stealing_2b,p_caught_stealing_3b,p_caught_stealing_home,p_complete_game,p_defensive_indiff,p_foul,p_foul_tip,p_game_finished,p_game_in_relief,p_gnd_into_dp,p_gnd_into_tp,p_gnd_rule_double,p_hit_by_pitch,p_hit_fly,p_hit_ground,p_hit_line_drive,p_hit_into_play,p_hit_scoring,p_hold,p_intent_ball,p_intent_walk,p_missed_bunt,p_out_fly,p_out_ground,p_out_line_drive,p_passed_ball,p_pickoff_error_1b,p_pickoff_error_2b,p_pickoff_error_3b,p_pitchout,p_relief_no_out,p_sac_bunt,p_sac_fly,p_starting_p,p_swinging_strike,p_unearned_run,p_total_ball,p_total_bases,p_total_caught_stealing,p_total_pickoff,p_total_pickoff_attempt,p_total_pickoff_error,p_total_pitches,p_total_sacrifices,p_total_strike,p_total_swinging_strike,p_inh_runner,p_inh_runner_scored,p_beq_runner,p_beq_runner_scored,p_reached_on_error,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,bacon,xbacon,xbadiff,xslgdiff,wobadif,exit_velocity_avg,launch_angle_avg,sweet_spot_percent,barrel,barrel_batted_rate,solidcontact_percent,flareburner_percent,poorlyunder_percent,poorlytopped_percent,poorlyweak_percent,hard_hit_percent,z_swing_percent,z_swing_miss_percent,oz_swing_percent,oz_swing_miss_percent,oz_contact_percent,out_zone_swing_miss,out_zone_s
wing,out_zone_percent,out_zone,meatball_swing_percent,meatball_percent,pitch_count_offspeed,pitch_count_fastball,pitch_count_breaking,pitch_count,iz_contact_percent,in_zone_swing_miss,in_zone_swing,in_zone_percent,in_zone,edge_percent,edge,whiff_percent,swing_percent,pull_percent,straightaway_percent,opposite_percent,batted_ball,f_strike_percent,groundballs_percent,groundballs,flyballs_percent,flyballs,linedrives_percent,linedrives,popups_percent,popups,pitch_hand,n,n_ff_formatted,ff_avg_speed,ff_avg_spin,ff_avg_break_x,ff_avg_break_z,ff_avg_break,ff_range_speed,n_sl_formatted,sl_avg_speed,sl_avg_spin,sl_avg_break_x,sl_avg_break_z,sl_avg_break,sl_range_speed,n_ch_formatted,ch_avg_speed,ch_avg_spin,ch_avg_break_x,ch_avg_break_z,ch_avg_break,ch_range_speed,n_cukc_formatted,cu_avg_speed,cu_avg_spin,cu_avg_break_x,cu_avg_break_z,cu_avg_break,cu_range_speed,n_sift_formatted,si_avg_speed,si_avg_spin,si_avg_break_x,si_avg_break_z,si_avg_break,si_range_speed,n_fc_formatted,fc_avg_speed,fc_avg_spin,fc_avg_break_x,fc_avg_break_z,fc_avg_break,fc_range_speed,n_fs_formatted,fs_avg_speed,fs_avg_spin,fs_avg_break_x,fs_avg_break_z,fs_avg_break,fs_range_speed,n_kn_formatted,kn_avg_speed,kn_avg_spin,kn_avg_break_x,kn_avg_break_z,kn_avg_break,kn_range_speed,n_fastball_formatted,fastball_avg_speed,fastball_avg_spin,fastball_avg_break_x,fastball_avg_break_z,fastball_avg_break,fastball_range_speed,n_breaking_formatted,breaking_avg_speed,breaking_avg_spin,breaking_avg_break_x,breaking_avg_break_z,breaking_avg_break,breaking_range_speed,n_offspeed_formatted,offspeed_avg_speed,offspeed_avg_spin,offspeed_avg_break_x,offspeed_avg_break_z,offspeed_avg_break,offspeed_range_speed,&chart=false&x=xba&y=xba&r=no&chartType=beeswarm"
}

def _pair_probable_pitchers(names):
    """Group a flat [away, home, away, home, ...] list into (away, home) tuples."""
    if len(names) % 2:
        # An unpaired trailing name used to raise IndexError; report it instead.
        print(f'odd number of probable pitchers ({len(names)}); ignoring unpaired entry: {names[-1]}')
    return list(zip(names[0::2], names[1::2]))


def _rate_pitchers(percentiles):
    """Turn the percentile leaderboard into a frame with a z-scored 'rating' column."""
    rated = percentiles.drop(['year', 'player_type', 'player_id', 'pos',
                              'percent_speed_order', 'percent_rank_pop_2b', 'percent_rank_oaa',
                              'percent_rank_framing', 'percent_rank_jump',
                              'percent_rank_fastball_velo', 'percent_rank_fastball_spin',
                              'percent_rank_cu_spin', 'href'], axis=1)

    # Every column from the third one on is divided by 100.
    rated[rated.columns[2:]] = rated[rated.columns[2:]].div(100)

    # .copy() so the assignments below write to an independent frame rather
    # than to a slice (avoids SettingWithCopyWarning).
    rated = rated[['player_name', 'team_name', 'percent_rank_barrel',
                   'percent_rank_barrel_batted_rate', 'percent_rank_exit_velocity_avg',
                   'percent_rank_exit_velocity_max',
                   'percent_rank_xba', 'percent_rank_xslg',
                   'percent_rank_hard_hit_percent',
                   'percent_rank_xwobacon', 'percent_rank_k_percent', 'percent_rank_bb_percent',
                   'percent_rank_whiff_percent', 'percent_rank_chase_percent',
                   'percent_rank_ba', 'percent_rank_xbacon',
                   'percent_rank_babip', 'percent_rank_obp', 'percent_rank_slg',
                   'percent_rank_xobp', 'percent_rank_xiso',
                   'percent_rank_xera']].copy()

    # add more weight to strikeouts, hardhit, and bb%
    weights = {
        'percent_rank_hard_hit_percent': 1.55,
        'percent_rank_k_percent': 1.75,
        'percent_rank_whiff_percent': 1.75,
        'percent_rank_bb_percent': 2,
        'percent_rank_exit_velocity_avg': 1.5,
        'percent_rank_xwobacon': 1.5,
    }
    for column, weight in weights.items():
        rated[column] = rated[column].mul(weight)

    # Raw rating = sum of the (weighted) columns, then standardised to a z-score.
    rated['rating'] = rated[rated.columns[2:]].sum(axis=1).round(2)
    rated = rated.sort_values(by=['rating'], ascending=False).reset_index(drop=True)
    rated['rating'] = (rated['rating'] - rated['rating'].mean()) / rated['rating'].std(ddof=0)
    return rated


def _lookup_rating(ratings, pitcher):
    """Return the pitcher's rating rounded to 2 decimals, or None unless exactly one row matches."""
    matches = ratings.loc[ratings['player_name'] == pitcher, 'rating']
    if len(matches) != 1:
        return None
    return round(matches.item(), 2)


def generate_pitching_matchups(endpoints, excluded_pitchers=('Cole Irvin',)):
    """Rate today's probable pitchers and rank the matchups by rating gap.

    Pitcher names are read from the ``a.matchup-link`` elements of the
    probable-pitchers page and paired in page order as (away, home). Ratings
    come from ``get_baseball_savant_leaderboard``, which must already be
    defined in the kernel (it is not defined in this cell).

    Args:
        endpoints: Mapping with at least the keys 'probable_pitchers' and
            'pitchers_percentile' (URLs).
        excluded_pitchers: Names removed from the scraped list before pairing.
            Defaults to the single name the original code hard-coded.
            NOTE(review): removing one name shifts every later pairing by one;
            confirm the excluded entry really is an extra link on the page.

    Returns:
        (mismatch_df, todays_pitchers):
            mismatch_df     -- one row per matchup where both pitchers were
                               rated, sorted by away-minus-home rating,
                               descending.
            todays_pitchers -- the rated rows of today's pitchers, sorted by
                               rating descending, with 'rating' moved to the
                               third column.
        Both frames are also displayed.

    Raises:
        requests.HTTPError: the probable-pitchers page answered 4xx/5xx.
    """
    response = requests.get(endpoints['probable_pitchers'])
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Links whose markup contains 'src=' are skipped (presumably image links).
    matchup_strings = [
        link.text
        for link in soup.find_all("a", {"class": "matchup-link"})
        if 'src=' not in str(link)
    ]
    matchup_strings = [pitcher for pitcher in matchup_strings if pitcher not in excluded_pitchers]
    matchup_tuples = _pair_probable_pitchers(matchup_strings)

    df = _rate_pitchers(get_baseball_savant_leaderboard(endpoints['pitchers_percentile']))

    mismatch_tuples = []
    pitchers_found = []

    for away_pitcher, home_pitcher in matchup_tuples:
        away_rating = _lookup_rating(df, away_pitcher)
        home_rating = _lookup_rating(df, home_pitcher)

        for pitcher, rating in ((away_pitcher, away_rating), (home_pitcher, home_rating)):
            if rating is None:
                # Message no longer names a year: the season is set by the endpoint URL.
                print('not enough data on:', pitcher)
            else:
                pitchers_found.append(pitcher)

        if away_rating is None or home_rating is None:
            print(f"mismatch couldn't be calculated for {away_pitcher} vs. {home_pitcher}.")
            continue

        mismatch_tuples.append((away_pitcher, home_pitcher, round(away_rating - home_rating, 2)))

    # De-duplicate while keeping page order (a set would discard the order).
    unique_matchups = list(dict.fromkeys(mismatch_tuples))

    # Column label kept byte-for-byte: callers may index the frame by it.
    gap_column = 'mismatch difference (positive = away favored, negative = home favored.'
    mismatch_df = pd.DataFrame(
        unique_matchups, columns=['away pitcher', 'home pitcher', gap_column]
    ).sort_values(by=[gap_column], ascending=False)

    todays_pitchers = (
        df[df['player_name'].isin(pitchers_found)]
        .sort_values(by=['rating'], ascending=False)
        .reset_index(drop=True)
    )

    # Move the trailing 'rating' column right after the two identity columns.
    cols = todays_pitchers.columns.to_list()
    todays_pitchers = todays_pitchers[cols[0:2] + cols[-1:] + cols[2:-1]]

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(mismatch_df)
        display(todays_pitchers)

    return mismatch_df, todays_pitchers

# Build today's matchup table and pitcher ratings from the URLs in `endpoints`.
mismatch_df, todays_pitchers = generate_pitching_matchups(endpoints)

NameError: name 'get_baseball_savant_leaderboard' is not defined

In [None]:
# team matchup ratings from normalized OPS and WHIP gaps
import numpy as np

def _rows_for_team(ranks, team_name):
    """Return the rows of `ranks` whose 'name' equals team_name (possibly empty)."""
    return ranks.loc[ranks['name'] == team_name]


def generate_matchup_ratings():
    """Display a per-game mismatch rating built from team OPS and WHIP.

    Team batting/pitching frames come from ``get_team_stats()``. OPS and WHIP
    are z-scored, then every "<away> @ <home>" ``<h2>`` heading on the Baseball
    Savant probable-pitchers page becomes one row holding:

        batting_rank_mismatch / pitching_rank_mismatch
            away row index minus home row index in the sorted team frames;
        norm_ops_gap
            away minus home normalized OPS;
        norm_whip_gap
            home minus away normalized WHIP (sign flipped via the -1 factor);
        mismatch_rating
            norm_ops_gap + norm_whip_gap;
        ML
            the away team when mismatch_rating > 0, otherwise the home team.

    Headings that are not of the form "<away> @ <home>", and games whose team
    names are missing from the ESPN frames, are skipped with a message instead
    of aborting the whole run with an IndexError.

    Returns:
        None. The team frames and the mismatch table are displayed.
    """
    # load daily rankings
    daily_team_rankings = get_team_stats()
    batting_ranks = daily_team_rankings['batting']
    pitching_ranks = daily_team_rankings['pitching']

    # normalize ops and whip (population standard deviation, ddof=0)
    batting_ranks['norm_OPS'] = (batting_ranks['OPS'] - batting_ranks['OPS'].mean()) / batting_ranks['OPS'].std(ddof=0)
    pitching_ranks['norm_WHIP'] = (pitching_ranks['WHIP'] - pitching_ranks['WHIP'].mean()) / pitching_ranks['WHIP'].std(ddof=0)

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(batting_ranks)
        display(pitching_ranks)

    response = requests.get("https://baseballsavant.mlb.com/probable-pitchers")
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # create list for sorting mismatches
    mismatch_list = []

    for heading in soup.find_all("h2"):
        teams = [part.strip() for part in heading.text.split(' @ ')]
        if len(teams) != 2:
            # Not an "<away> @ <home>" heading.
            continue
        away_team, home_team = teams

        away_batting = _rows_for_team(batting_ranks, away_team)
        home_batting = _rows_for_team(batting_ranks, home_team)
        away_pitching = _rows_for_team(pitching_ranks, away_team)
        home_pitching = _rows_for_team(pitching_ranks, home_team)

        if any(rows.empty for rows in (away_batting, home_batting, away_pitching, home_pitching)):
            print(f'no team stats found for {away_team} @ {home_team}; skipping.')
            continue

        norm_ops_gap = round(away_batting['norm_OPS'].values[0] - home_batting['norm_OPS'].values[0], 3)
        # Lower WHIP is better for the pitching side, hence the sign flip.
        norm_whip_gap = -1 * round(away_pitching['norm_WHIP'].values[0] - home_pitching['norm_WHIP'].values[0], 3)

        mismatch_list.append({
            'away_team': away_team,
            'home_team': home_team,
            'batting_rank_mismatch': away_batting.index[0] - home_batting.index[0],
            'pitching_rank_mismatch': away_pitching.index[0] - home_pitching.index[0],
            'norm_ops_gap': norm_ops_gap,
            'norm_whip_gap': norm_whip_gap,
        })

    if not mismatch_list:
        # An empty frame has no gap columns; bail out before indexing them.
        print('no matchups found.')
        return

    df = pd.DataFrame(mismatch_list)

    df['mismatch_rating'] = df['norm_ops_gap'] + df['norm_whip_gap']
    df['abs_mismatch'] = df['mismatch_rating'].abs()
    df = df.sort_values('abs_mismatch', ascending=False)

    df['ML'] = np.where(df['mismatch_rating'] > 0, df['away_team'], df['home_team'])

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(df)
    

# Scrape, rate and display today's team matchups.
generate_matchup_ratings()

In [None]:
prices = [7, 1, 5, 3, 6, 4]

# For every day except the last, print the gain from buying that day and
# selling at the highest later price.
for n, buy_price in enumerate(prices[:-1]):
    print(max(prices[n + 1:]) - buy_price)

In [None]:
# get (general) bullpen stats

def get_general_bullpen_stats():
    """Scrape the covers.com 2022 team bullpen table into a DataFrame.

    Column names are taken from the ``<th>`` cells of the first ``<tr>`` on the
    page. Each later row that has ``<td>`` cells contributes the team name (last
    ``<span>`` inside the first link of the first cell) followed by the
    remaining cells converted to float. Rows already collected are not added
    again.

    Returns:
        DataFrame sorted by 'WHIP' ascending, re-indexed from 0, with the
        'WHIP' column moved to the end. The frame is also displayed.
    """
    page = requests.get('https://www.covers.com/sport/baseball/mlb/statistics/team-bullpenera/2022')
    table_rows = BeautifulSoup(page.text, 'html.parser').find_all('tr')

    # extract column names from first element
    header_row, body_rows = table_rows[0], table_rows[1:]
    column_names = [cell.text for cell in header_row.find_all('th')]

    records = []
    for table_row in body_rows:
        cells = table_row.find_all('td')
        if not cells:
            continue

        team_name = cells[0].find_all('a')[0].find_all('span')[-1].text
        record = [team_name] + [float(cell.text) for cell in cells[1:]]

        # webpage structured that data is repeated twice, so check.
        if record not in records:
            records.append(record)

    bullpen_df = (
        pd.DataFrame(records, columns=column_names)
        .sort_values(by=['WHIP'], ascending=True)
        .reset_index(drop=True)
    )
    # pop + re-insert moves WHIP to the last column.
    bullpen_df['WHIP'] = bullpen_df.pop('WHIP')

    display(bullpen_df)
    return bullpen_df

# Team bullpen table, sorted by WHIP ascending (the function also displays it).
bullpen = get_general_bullpen_stats()

In [None]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display

# # generate the box scores.
# Batting-table columns that are discarded.
_BATTING_DROPPED_COLUMNS = ['WPA+', 'WPA-', 'cWPA', 'acLI', 'Details', 'PO', 'A']
# Batting-table columns converted to nullable integers / floats.
_BATTING_INT_COLUMNS = ['AB', 'R', 'H', 'RBI', 'BB', 'SO', 'PA', 'Pit', 'Str']
_BATTING_FLOAT_COLUMNS = ['BA', 'OBP', 'SLG', 'OPS', 'WPA', 'aLI', 'RE24']


def _parse_linescore(table):
    """Read the two team rows of the linescore table into {column: {team: value}}."""
    # Keys 1-9 and R/H/E always exist, as in the original initialisation.
    box_score = {key: {} for key in [1, 2, 3, 4, 5, 6, 7, 8, 9, 'R', 'H', 'E']}

    for row in table.find_all('tr')[1:3]:
        team = row.select("a[href*='/teams/']")[0]['href'].split('/')[2]
        # 'X' (presumably a half-inning that was not played) counts as 0.
        scores = [0 if td.text == 'X' else int(td.text)
                  for td in row.find_all('td', {'class': 'center'})[1:]]

        # Label by position from the end so games that are not exactly nine
        # innings stay aligned.
        # NOTE(review): assumes each row ends with R, H, E -- confirm on the page.
        innings = max(len(scores) - 3, 0)
        keys = list(range(1, innings + 1)) + ['R', 'H', 'E']
        for key, score in zip(keys, scores):
            box_score.setdefault(key, {})[team] = score

    return box_score


def _parse_player_table(table, is_batting):
    """Convert one batting or pitching stats table into a DataFrame of strings."""
    rows = table.find_all('tr')
    if not rows:
        return pd.DataFrame()

    # The first header cell labels the player column and is not a stat.
    headers = [header.text for header in rows[0].find_all('th')[1:]]

    # Batting: drop pitchers, pinch hitters, the totals row and blank rows.
    # Pitching: drop the totals row and blank rows. The original filtered on a
    # position that was always 'P', so it never removed anything.
    skipped_last_tokens = ['P', 'PH', 'Totals', ''] if is_batting else ['Totals', '']

    records = []
    for row in rows[1:]:
        label_cell = row.find('th')
        if label_cell is None:
            continue

        label = label_cell.text.replace(u'\xa0', u'').strip()
        tokens = label.split(' ')
        if tokens[-1] in skipped_last_tokens:
            continue

        if is_batting:
            # Last token is the position; everything before it is the name
            # (names longer than two words are no longer truncated).
            name, position = ' '.join(tokens[:-1]), tokens[-1]
        else:
            # NOTE(review): assumes any decision text follows a comma,
            # e.g. "Name, W (1-0)" -- confirm against the page.
            name, position = label.split(',')[0], 'P'

        record = dict.fromkeys(headers)
        record.update(zip(headers, (cell.text for cell in row.find_all('td'))))
        record['name'] = name
        record['position'] = position
        records.append(record)

    df = pd.DataFrame(records)
    if df.empty:
        return df

    # rearrange columns: name and position first
    leading = ['name', 'position']
    return df[leading + [column for column in df.columns if column not in leading]]


def _clean_batting_table(df):
    """Drop unused columns, convert stats to numbers and keep players with an at-bat."""
    df = df.drop(columns=_BATTING_DROPPED_COLUMNS, errors='ignore').dropna(how='all')
    df = df.astype({'name': 'string', 'position': 'string'})

    # to_numeric(errors='coerce') turns unparsable cells into NaN. The previous
    # astype(..., errors='ignore') could silently leave string columns behind.
    for column in _BATTING_INT_COLUMNS + _BATTING_FLOAT_COLUMNS:
        if column in df.columns:
            df[column] = pd.to_numeric(df[column], errors='coerce')
    for column in _BATTING_INT_COLUMNS:
        if column in df.columns:
            df[column] = df[column].astype('Int32')

    # only keep players who registered an at-bat (missing AB counts as none)
    return df[df['AB'].gt(0).fillna(False)]


def get_individual_box_scores(href):
    """Scrape one baseball-reference box score page and display its stats tables.

    Args:
        href: Link element (indexable with 'href') whose target contains
            '/boxes/'; the part after it is appended to the boxes base URL.

    Returns:
        dict with:
            'box_score' -- {inning number or 'R'/'H'/'E': {team code: value}}
                           read from the first table on the page.
            'tables'    -- {"<Team> Batting" / "<Team> Pitching": DataFrame}.
        The original returned None, so callers that ignore the result are
        unaffected.

    Raises:
        requests.HTTPError: the page answered with a 4xx/5xx status.
    """
    base_url = 'https://www.baseball-reference.com/boxes/'
    new_url = base_url + href['href'].split('/boxes/')[1]
    response = requests.get(new_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the box score.
    box_score = _parse_linescore(soup.find_all("table")[0])

    # Strip the comment markers and re-parse; presumably the stats tables are
    # wrapped in HTML comments in the raw page.
    comments_removed = response.text.replace("<!--", "").replace("-->", "")
    _soup = BeautifulSoup(comments_removed, 'html.parser')

    tables = {}
    for table in _soup.find_all("table", {"class": "sortable stats_table min_width shade_zero"}):
        table_id = table["id"]
        # Any table whose id does not contain 'batting' is treated as pitching.
        is_batting = 'batting' in table_id

        # Ids look like "<TeamName>batting"; split the CamelCase team part.
        team_unparsed = table_id.split('batting' if is_batting else 'pitching')[0]
        team_separated = re.findall('[A-Z][^A-Z]*', team_unparsed)
        title = " ".join(team_separated) + (' Batting' if is_batting else ' Pitching')

        df = _parse_player_table(table, is_batting)
        if df.empty:
            continue
        if is_batting:
            df = _clean_batting_table(df)

        tables[title] = df
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            display(df)

    return {'box_score': box_score, 'tables': tables}

def get_box_scores(limit=1, year='2022'):
    """Find finished games on the baseball-reference boxes page and scrape them.

    Args:
        limit: Maximum number of games passed to
            ``get_individual_box_scores``. The default of 1 keeps the original
            behaviour, where an unconditional ``break`` stopped after the first
            game. Pass None to scrape every game found.
        year: Substring that a link's href must contain. Defaults to the
            previously hard-coded '2022'.

    Returns:
        List of all matching link elements whose text contains 'Final',
        regardless of ``limit``.

    Raises:
        requests.HTTPError: the listing page answered with a 4xx/5xx status.
    """
    base_url = 'https://www.baseball-reference.com/boxes/'
    response = requests.get(base_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    game_hrefs = [href for href in soup.select(f"a[href*='{year}']") if 'Final' in href.text]

    games_to_scrape = game_hrefs if limit is None else game_hrefs[:limit]
    for href in games_to_scrape:
        get_individual_box_scores(href)

    return game_hrefs

# Run the box-score scraper; the returned list of game links is discarded.
_ = get_box_scores()

In [None]:
from random import randint

from lcu_driver import Connector

# Connector instance that the handlers below register on via
# @connector.ready and @connector.close.
connector = Connector()


async def set_random_icon(connection):
    """Pick a random profile icon id in [50, 78] and PUT it to the client.

    Args:
        connection: Object exposing an awaitable
            ``request(method, path, data=...)`` whose result has a ``status``
            attribute.

    Prints a success line when the response status is 201, otherwise a
    generic failure line. Returns None.
    """
    # random number of a chinese icon
    icon_id = randint(50, 78)

    # make the request to set the icon
    response = await connection.request(
        'put',
        '/lol-summoner/v1/current-summoner/icon',
        data={'profileIconId': icon_id},
    )

    # anything other than HTTP 201 is reported as a failure
    if response.status != 201:
        print('Unknown problem, the icon was not set.')
        return

    print(f'Chinese icon number {icon_id} was set correctly.')


# fired when LCU API is ready to be used
@connector.ready
async def connect(connection):
    """Handler registered with ``connector.ready``: set a random icon if logged in.

    Requests the current summoner; a 200 response is taken to mean a user is
    logged in, in which case ``set_random_icon`` is awaited. Any other status
    prints a prompt to log in and restart the script.

    Args:
        connection: Object passed to the handler, exposing an awaitable
            ``request(method, path)`` whose result has a ``status`` attribute.
    """
    print('LCU API is ready to be used.')

    # check if the user is already logged into his account
    # The path previously ended in a literal summoner name containing spaces,
    # which does not match the 'current-summoner' resource set_random_icon
    # writes to, so the check presumably never returned 200.
    summoner = await connection.request('get', '/lol-summoner/v1/current-summoner')
    if summoner.status != 200:
        print('Please login into your account to change your icon and restart the script...')
    else:
        print('Setting new icon...')
        await set_random_icon(connection)


# fired when League Client is closed (or disconnected from websocket)
@connector.close
async def disconnect(_):
    """Handler registered with ``connector.close``: print a notice that the client closed.

    The argument passed to the handler is ignored.
    """
    print('The client have been closed!')

# Start the connector. Presumably this is what lets the handlers registered
# above fire -- see the lcu_driver documentation.
connector.start()