### Compute Similarity Scores between MLB Players
https://www.baseball-reference.com/about/similarity.shtml

In [2]:
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')

In [57]:
from pybaseball import playerid_lookup
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

In [20]:
def get_player_id(full_name):
    """Return the Baseball-Reference ID for a player name, or None if nothing matches.

    The first whitespace-separated token of ``full_name`` is treated as the first
    name and everything after it as the last name; both are lower-cased before
    being passed to ``playerid_lookup``.

    When several players match, the one who played most recently
    (``mlb_played_last``, descending) is chosen; ties are broken by
    ``name_last`` in A-Z order.

    Args:
        full_name: Player name such as ``'Aaron Judge'``.

    Returns:
        The ``key_bbref`` value of the best match, or None when the name is
        blank or the lookup returns no rows.
    """
    # split() with no argument collapses repeated/leading/trailing whitespace,
    # so stray spaces do not end up inside the last name.
    names = full_name.lower().split()
    if not names:
        return None
    first_name, last_name = names[0], ' '.join(names[1:])

    lookup = playerid_lookup(last_name, first_name)
    if lookup.empty:  # No IDs found
        return None

    # A single multi-column sort is required here: chaining two sort_values()
    # calls does not keep the first ordering as a tie-breaker, because the
    # default single-column sort algorithm is not stable.
    ranked = lookup.sort_values(
        ['mlb_played_last', 'name_last'],
        ascending=[False, True],
    )
    return ranked.iloc[0]['key_bbref']

In [27]:
def get_player_url(player_id):
    # Builds the BaseballReference.com player-page URL for a player ID.
    # Player pages are grouped in a directory named after the ID's first character.

    directory = player_id[0]
    pieces = ('https://www.baseball-reference.com/players/', directory, '/', player_id, '.shtml')
    return ''.join(pieces)

In [127]:
def get_data(url, as_rows=False):
    """Download a web page and return it parsed, or its table rows as text.

    Args:
        url: Address of the page to fetch.
        as_rows: When False (the default) the parsed BeautifulSoup document is
            returned. When True, every ``<tr>`` after the first one on the page
            is flattened into a list of cell texts (``<th>`` cells first, then
            ``<td>`` cells) and the list of those rows is returned.

    Returns:
        A BeautifulSoup object, or a list of lists of strings when
        ``as_rows`` is True.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features='lxml')

    if not as_rows:
        return soup

    # The first <tr> on the page is skipped, as in the original row parsing.
    return [
        [th.getText() for th in row.findAll('th')]
        + [td.getText() for td in row.findAll('td')]
        for row in soup.findAll('tr')[1:]
    ]

In [69]:
def _rows_from_soup(soup):
    # Flattens every <tr> after the first into a list of cell texts (<th> cells first, then <td> cells).
    return [
        [th.getText() for th in row.findAll('th')]
        + [td.getText() for td in row.findAll('td')]
        for row in soup.findAll('tr')[1:]
    ]


def get_season_data_of_player(player_name, season):
    """Return the AL/NL batting rows of one player for one season.

    Args:
        player_name: Full player name, e.g. ``'Aaron Judge'``.
        season: Season year as an integer, e.g. ``2024``.

    Returns:
        A DataFrame holding the rows whose ``Lg`` is ``'AL'`` or ``'NL'`` and
        whose ``Year`` equals ``str(season)`` (empty if there is no such row),
        or None when no player ID is found or the player page cannot be
        fetched.

    Raises:
        AssertionError: If ``season`` is not an integer.
    """
    from urllib.error import URLError

    assert isinstance(season, int), 'Enter the season parameter as an integer.'

    player_id = get_player_id(player_name)
    if player_id is None:
        # Without this guard, building the URL from None raises TypeError.
        return None

    try:
        page = get_data(get_player_url(player_id))
    except URLError:  # also covers HTTPError (e.g. 404 for an unknown page)
        return None

    # get_data may hand back either parsed rows or the whole parsed document;
    # accept both so this function works regardless of which it returns.
    rows = page if isinstance(page, list) else _rows_from_soup(page)

    columns = [
        'Year', 'Age', 'Tm', 'Lg', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR',
        'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB',
        'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Pos', 'Awards',
    ]
    # The first six parsed rows are dropped before the season table is built.
    player_data_df = pd.DataFrame(rows[6:], columns=columns)

    in_al_or_nl = player_data_df['Lg'].isin(['AL', 'NL'])
    in_season = player_data_df['Year'] == str(season)
    return player_data_df[in_al_or_nl & in_season]

In [78]:
# Players to compare and the season whose stats are pulled for them.
player1, player2 = 'Aaron Judge', 'Bobby Witt'
season = 2024

In [79]:
# Display the season row for player1 (left as a bare expression so the notebook renders it).
get_season_data_of_player(player1, season)

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
12,2024,32,NYY,AL,140,626,499,109,160,33,...,1.152,219,348,20,9,0,2,18,*8D/97,AS


In [80]:
# Display the season row for player2 (left as a bare expression so the notebook renders it).
get_season_data_of_player(player2, season)

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos,Awards
4,2024,24,KCR,AL,143,629,568,118,191,40,...,0.993,172,343,4,7,0,7,7,*6/D,AS


In [128]:
# Fetch Aaron Judge's fielding page; the cell output below is the parsed HTML document.
get_data('https://www.baseball-reference.com/players/j/judgeaa01-field.shtml')

<!DOCTYPE html>
<html class="no-js" data-root="/home/br/build" data-version="klecko-" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport"/>
<link href="https://cdn.ssref.net/req/202409051" rel="dns-prefetch"/>
<script>
/* https://docs.osano.com/hc/en-us/articles/22469433444372-Google-Consent-Mode-v2  */
  window.dataLayer = window.dataLayer ||[];
      function gtag(){dataLayer.push(arguments);}
      gtag('consent','default',{
        'ad_storage':'denied',
        'analytics_storage':'denied',
        'ad_user_data':'denied',
        'ad_personalization':'denied',
        'personalization_storage':'denied',
        'functionality_storage':'granted',
        'security_storage':'granted',
        'wait_for_update': 500
      });
      gtag("set", "ads_data_redaction", true);
</script>
<script src="https://cmp.osano.com/16CGnCU8UtNhM14sg/12669873-8cf8-41e

In [141]:
# Print the markup of every <table> on the fielding page that is not nested inside another <table>.
field_page = get_data('https://www.baseball-reference.com/players/j/judgeaa01-field.shtml')
tables = field_page.findAll("table")
for table in tables:
    if table.findParent("table") is not None:
        continue  # nested table: already printed as part of its parent
    print(str(table))

<table class="sortable stats_table" data-cols-to-freeze=",1" id="last5">
<caption>Last 5 Games Table</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr>
<th aria-label="Date" class="poptip center" data-stat="date" scope="col">Date</th>
<th aria-label="Tm" class="poptip center" data-stat="team_name_abbr" scope="col">Tm</th>
<th aria-label=" " class="poptip center" data-stat="game_location" scope="col"></th>
<th aria-label="Opp" class="poptip center" data-stat="opp_name_abbr" scope="col">Opp</th>
<th aria-label="Result" class="poptip show_partial_when_sorting center" data-stat="game_result" data-tip="&lt;strong&gt;Game Result for Team&lt;/strong&gt;&lt;br&gt;W - Win, L - Loss, T - Tie (for a suspended game)" scope="col">Result</th>
<th aria-label="Pos" class="poptip sort_default_asc left" data-stat="pos_game" data-tip