In [1]:
import csv
import json
import re
import unicodedata
from datetime import datetime
from time import sleep
from urllib.error import HTTPError
from urllib.parse import urlparse, parse_qs
from urllib.request import urlopen

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
from tqdm.auto import tqdm

In [2]:
POSITION_MAPPING = {**{value: i + 1 for i, value in enumerate(['Pitcher', 'Catcher', 'First Baseman', 'Second Baseman', 'Third Baseman', 'Shortstop', 'Left Fielder', 'Center Fielder', 'Right Fielder', 'Designated Hitter', 'Pinch Hitter', 'Pinch Runner'])}, 'Infielder': 'I', 'Outfielder': 'O', 'Two-Way Player': 'Y'}

STATE_DICT = {'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'District of Columbia', 'AS': 'American Samoa', 'GU': 'Guam', 'MP': 'Northern Mariana Islands', 'PR': 'Puerto Rico', 'UM': 'United States Minor Outlying Islands', 'VI': 'U.S. Virgin Islands', 'NSW': 'New South Wales', 'QLD': 'Queensland', 'AB': 'Alberta', 'BC': 'British Columbia', 'MB': 'Manitoba', 'NB': 'New Brunswick', 'ON': 'Ontario', 'PE': 'Prince Edward Island', 'PQ': 'Quebec', 'QC': 'Quebec', 'SK': 'Saskatchewan', 'SL': 'San Luis Potosí', 'SO': 'Sonora', 'VE': 'Veracruz'}

COUNTRY_DICT = {'USA': 'United States', 'VEN': 'Venezuela', 'DOM': 'Dominican Republic', 'MEX': 'Mexico', 'NIR': 'Northern Ireland', 'USSR': 'Soviet Union', 'NCA': 'Nicaragua', 'CUB': 'Cuba', 'PUR': 'Puerto Rico'}

MISSING_MLBAMS = {1000511: 110535, 1000769: 110789, 1000777: 110797, 1000778: 110798, 1001791: 111832, 1003245: 113308, 1003504: 113570, 1004075: 114157, 1004972: 115064, 1005919: 116046, 1006675: 116818, 1007769: 117936, 1007906: 118075, 1008462: 118644, 1009200: 119402, 1009235: 119428, 1009111: 119301, 1009203: 119396, 1009694: 119902, 1009698: 119902, 1010104: 120330, 1010128: 120356, 1010610: 120846, 1010739: 120968, 1010882: 121130, 1011053: 121323, 1012086: 122446, 1012886: 123135, 1013857: 124192, 1014429: 124756, 1014468: 124756}

BREF_NO_MLBAM = {1009181: 'morgapi01', 1010606: 'quinnjo01', 1002190: 'childsa01', 1014137: 'winklge01', 1007482: 'lehanja01', 1014470: 'gormato04', 1011344: 'ryanjo03', 1014471: 'mccaf01', 1009686: "o'brida01", 1014345: 'yinglch01', 1012879: 'tayloja02', 1014469: 'glaseno01'}

def parse_date(date: str) -> datetime.date:
    return (datetime.strptime(date[:-3], '%Y-%m') if date[-2:] == '00' else datetime.strptime(date, '%Y-%m-%d')).date()


def strip_accents(text):
    return str(unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('utf-8'))


def fangraphs_player_scrape() -> pd.DataFrame:
    url = 'https://www.fangraphs.com/leaders/major-league?pos=all&stats={stats}&lg=nl%2Cal%2Caa%2Cua%2Cpl%2Cfl&type=0&season={season}&month=0&season1={season}&ind=0&qual=0&pagenum=1&pageitems=2000000000{split}'
    df = []
    stats = {'hitter': 'bat', 'pitcher': 'pit', 'fielder': 'fld'}
    with tqdm(total=len(stats), dynamic_ncols=True) as outer_pbar:
        for key, value in stats.items():
            outer_pbar.set_description(f'Retrieving {key} stats')
            with tqdm(range(1876, 2024), leave=False, dynamic_ncols=True) as inner_pbar:
                for season in inner_pbar:
                    inner_pbar.set_description(f'Processing {season}')
                    # print(url.format(stats=value, season=season, split=''))
                    try:
                        df.append(pd.DataFrame(json.loads(BeautifulSoup(requests.get(url.format(stats=value, season=season, split='')).content, 'html.parser', from_encoding='utf_8').find('script', type='application/json').text)['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']).replace('- - -', np.nan).drop(columns=['Name', 'PlayerNameRoute', 'Team', 'TeamName', 'TeamNameAbb', 'teamid', 'AgeR', 'SeasonMin', 'SeasonMax'], errors='ignore').rename(columns={'xMLBAMID': 'mlbamid', 'playerid': 'fangraphsid'}).astype({'mlbamid': 'Int64'})[['mlbamid', 'fangraphsid']])
                    except AttributeError:
                        pass
                    inner_pbar.update()
            outer_pbar.update()

    return pd.concat(df).drop_duplicates()


def scrape_baseball_reference(player_keys: dict[int, str]) -> pd.DataFrame:
    records = []
    with tqdm(player_keys.items()) as pbar:
        for key, value in pbar:
            pbar.set_postfix({'bref_id': value})
            record = {'fangraphsid': key, 'mlbamid': None, 'fullname': None, 'firstname': None, 'lastname': None, 'birthdate': None, 'birthcity': None, 'birthstateprovince': None, 'birthcountry': None, 'active': False, 'primaryposition': None, 'rosterfirstname': None, 'rosterlastname': None, 'middlename': None, 'isverified': False, 'deathdate': None, 'deathcity': None, 'deathstateprovince': None, 'deathcountry': None, 'mlbdebutdate': None, 'batside': None, 'pitchhand': None, 'heightinches': None, 'weight': None, 'strikezonetop': None, 'strikezonebottom': None, 'lastplayeddate': None, 'draftyear': None, 'namesuffix': None, 'primarynumber': None}
            
            sleep(1.0)
            with requests.get(f'https://www.baseball-reference.com/players/{value[0]}/{value}.shtml') as fp:
                soup = BeautifulSoup(fp.content, 'html.parser', from_encoding='utf_8')
    
            primary_number = soup.find_all('span', id='bling-alt-text')
            try:
                primary_number = [u for u in primary_number[0].string.split('\n\n') if u.strip()]
                primary_number = [re.compile(r'[0-9]+[-0-9]*[0-9]+').findall(a) for a in primary_number[primary_number.index('Uniforms:') + 1].split('\n')]
                primary_number = pd.DataFrame(primary_number, columns=['Number', 'Years'])
                primary_number['Years'] = primary_number['Years'].apply(lambda x: 1 - eval(x))
                primary_number = int(primary_number.groupby('Number')['Years'].sum().idxmax())
                record['primarynumber'] = primary_number
            except IndexError:
                pass
            
            player_data = json.loads(list(soup.find('script', type='application/ld+json').stripped_strings)[0])
            player_header = soup.find('div', id='meta').h1.parent
            
            name = player_data['name'].split()
            roster_first_name, roster_last_name = name[0], name[1:]
            
            if roster_last_name[-1] in ['Jr.', 'Sr.', 'II', 'III', 'IV']:
                roster_last_name, name_suffix = roster_last_name[:-1], roster_last_name[-1]
            else:
                name_suffix = None
            
            full_name = str(player_header.find(lambda tag: tag is not None and 'Full Name' in str(tag.string)).next_sibling).strip().split()
            first_name = full_name[0]
            middle_name = [name for name in full_name[1:] if name not in roster_last_name]
            middle_name = ' '.join(middle_name) if middle_name else None
            name, full_name, roster_last_name, middle_name = ' '.join(name), ' '.join(full_name), ' '.join(roster_last_name),  ' '.join(middle_name) if middle_name else None
            last_name = roster_last_name
            record['name'],  = name
            
            birth_date = player_header.find_all(lambda tag: tag is not None and tag.has_attr('data-birth'))
            if birth_date:
                birth_date = parse_date(birth_date[0]['data-birth'])
                # birth_date = datetime.strptime(birth_date[0]['data-birth'], '%Y-%m-%d').date()
                birth_place_str_list = list(player_header.find('a', string='Born:').parent.parent.stripped_strings)
                birth_city, birth_state_province, birth_country = [bp.replace('in ', '').strip(',')  if bp.upper() != 'US' else 'USA' for bp in birth_place_str_list[next(i for i, value in enumerate(birth_place_str_list) if value.isnumeric()) + 1:]]
                record['birthdate'] = birth_date
                record['birthcity'], record['birthstateprovince'], record['birthcountry'] = birth_city, birth_state_province, birth_country
            
            death_date = player_header.find_all(lambda tag: tag is not None and tag.has_attr('data-death'))
            if death_date:
                death_date = parse_date(death_date[0]['data-death'])
                # death_date = datetime.strptime(death_date[0]['data-death'], '%Y-%m-%d').date()
                death_place_str_list = list(player_header.find('a', string='Died:').parent.parent.stripped_strings)
                death_place_str_list = death_place_str_list[next(i for i, value in enumerate(death_place_str_list) if value.isnumeric()) + 1:]
                death_city, death_state_province, death_country = [bp.replace('in\xa0', '').strip(',') for bp in death_place_str_list]
                record['deathdate'] = death_date
                record['deathcity'], record['deathstateprovince'], record['deathcountry'] = death_city, death_state_province, death_country
            
            mlb_debut_date = player_header.find_all(lambda tag: tag is not None and 'Debut' in str(tag.string))
            if mlb_debut_date:
                mlb_debut_date = datetime.strptime(str(mlb_debut_date[0].parent.next_sibling.next_sibling.string).strip(), '%B %d, %Y').date()
                record['mlbdebutdate'] = mlb_debut_date
            
            last_played_date = player_header.find_all(lambda tag: tag is not None and 'Last Game' in str(tag.string))
            if last_played_date:
                last_played_date = datetime.strptime(str(last_played_date[0].parent.next_sibling.next_sibling.string).strip(), '%B %d, %Y').date()
                record['lastplayeddate'] = last_played_date
            
            bat_throw = player_header.find_all(lambda tag: tag is not None and 'Bats' in str(tag.string))
            if bat_throw:
                side_dict = {'Left': 'L', 'Right': 'R', 'Both': 'S'}
                bat_throw = [re.match(re.compile(r'[A-Z]+', re.IGNORECASE), x)[0] for x in list(bat_throw[0].parent.stripped_strings)]
                bat_side = side_dict[bat_throw[bat_throw.index('Bats') + 1]]
                pitch_hand = side_dict[bat_throw[bat_throw.index('Throws') + 1]]
                record['batside'], record['pitchhand'] = bat_side, pitch_hand
            
            height_inches = player_header.find_all(lambda tag: tag is not None and re.match(r'[0-9]+-[0-9]+', str(tag.string)))
            if height_inches:
                height_inches = np.array(height_inches[0].string.split('-'), dtype=int) @ np.array([12, 1])
                record['heightinches'] = height_inches
            
            weight = player_header.find_all(lambda tag: tag is not None and re.match(r'[0-9]+lb', str(tag.string)))
            if weight:
                weight = int(str(weight[0].string).strip().replace('lb', ''))
                record['weight'] = weight
            
            draft_year = player_header.find_all(lambda tag: tag is not None and 'Draft' in str(tag.string))
            if draft_year:
                draft_year = parse_qs(urlparse(draft_year)[0].parent.find('a').get('href').query)['year_ID'][0]
                record['draftyear'] = draft_year
            
            primary_position = player_header.find_all(lambda tag: tag is not None and 'Positions' in str(tag.string))
            if primary_position:
                primary_position = POSITION_MAPPING[list(primary_position[0].next_sibling.stripped_strings)[0].replace(' and ', ', ').split(', ')[0]]
                record['primaryposition'] = primary_position
            
            records.append(pd.Series(record).to_frame().T)
    
    return pd.concat(records).astype({'fangraphsid': int, 'mlbamid': 'Int64', 'active': bool, 'isverified': bool, 'primarynumber': 'Int64', 'heightinches': 'Int64', 'weight': 'Int64', 'birthdate': 'datetime64[ns]', 'deathdate': 'datetime64[ns]', 'mlbdebutdate': 'datetime64[ns]', 'lastplayeddate': 'datetime64[ns]', 'draftyear': 'Int64'})

In [6]:
df = fangraphs_player_scrape()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/148 [00:00<?, ?it/s]

  0%|          | 0/148 [00:00<?, ?it/s]

  0%|          | 0/148 [00:00<?, ?it/s]

In [13]:
df = pd.read_csv('playerids.csv').astype({'mlbamid': 'Int64'})

Unnamed: 0,fangraphsid,mlbamid
0,1,407792
1,2,110236
2,3,110293
3,4,277397
4,5,456499


In [19]:
df = df.merge(pd.Series(MISSING_MLBAMS, name='mlbamid2'), left_on='fangraphsid', right_index=True, how='left').fillna({'mlbamid2': df['mlbamid']}).drop('mlbamid', axis=1).rename({'mlbamid2': 'mlbamid'}, axis=1).astype({'mlbamid': 'Int64'})
# df.loc[df['fangraphsid'] == 1014471, 'fullname'] = 'Bill McCaffrey'

In [21]:
records = []
bad_records = []

with tqdm(
    iterable={key: value for key, value in dict(zip(df['fangraphsid'], df['mlbamid'])).items() if key not in BREF_NO_MLBAM}.items(),
    total=df.shape[0] - len(BREF_NO_MLBAM),
    unit='players'
) as pbar:
    for fangraphsid, mlbamid in pbar:
        try:
            pbar.set_postfix_str(f'FanGraphsID: {fangraphsid}, MLBAMID: {mlbamid}')
            with urlopen(f'https://statsapi.mlb.com/api/v1/people/{mlbamid}/') as url:
                data = (pd.Series({'fangraphsid': fangraphsid, **{key: (value if not isinstance(value, dict) else value['code'].strip())
                                                                  for key, value in json.load(url)['people'][0].items()}})
                        .drop({'link', 'currentAge', 'boxscoreName', 'gender', 'isPlayer', 'nickName', 'nameSlug', 'firstLastName', 
                               'lastInitName', 'initLastName', 'fullLFMName', 'lastFirstName', 'nameTitle', 'nameMatrilineal',
                               'pronunciation', 'namePrefix'}, errors='ignore'))
                data = data.rename({**{col: col.lower() for col in data.index}, 'id': 'mlbamid', 'useName': 'rosterfirstname', 
                                    'useLastName': 'rosterlastname', 'height': 'heightinches'})
                if 'heightinches' in data.index:
                    data['heightinches'] = np.array([int(h.strip('"')) for h in data['heightinches'].split('\' ')]) @ np.array([12, 1])
                records.append(data)
        except HTTPError:
            bad_records.append(fangraphsid)

records = pd.DataFrame.from_records(records).astype({'fangraphsid': int, 'mlbamid': 'Int64', 'active': bool, 'isverified': bool, 'primarynumber': 'Int64', 'heightinches': 'Int64', 'weight': 'Int64', 'birthdate': 'datetime64[ns]', 'deathdate': 'datetime64[ns]', 'mlbdebutdate': 'datetime64[ns]', 'lastplayeddate': 'datetime64[ns]', 'draftyear': 'Int64'}).set_index('fangraphsid')

for col in list(records.select_dtypes(include='object')):
    records[col] = records[col].str.strip()
    records[col] = records[col].str.replace(r'\s+', ' ', regex=True)

# records.loc[4920, 'rosterlastname'] = 'Colón'

records['birthstateprovince'] = records['birthstateprovince'].map(STATE_DICT)
records['birthcountry'] = records['birthcountry'].map(COUNTRY_DICT)

records.to_csv('player-data.csv')

  0%|          | 0/20453 [00:00<?, ?players/s]

In [74]:
df = pd.read_csv('player-data.csv').astype({'fangraphsid': int, 'mlbamid': 'Int64', 'active': bool, 'isverified': bool, 'primarynumber': 'Int64', 'heightinches': 'Int64', 'weight': 'Int64', 'birthdate': 'datetime64[ns]', 'deathdate': 'datetime64[ns]', 'mlbdebutdate': 'datetime64[ns]', 'lastplayeddate': 'datetime64[ns]', 'draftyear': 'Int64'}).set_index('fangraphsid')
df.loc[[10472, 15118], 'rosterfirstname'] = ['Kiké', 'Jimmie']
df.loc[[4920, 5669, 1], 'rosterlastname'] = ['Colón', 'Álvarez III', 'Amézaga']
df.loc[[2495, 4243, 4772, 10171, 5669, 1000206, 1000207, 1642, 1003150, 1], 'fullfmlname'] = ['Pedro Manuel Álvarez', 'Jose Manuel Lobatón', 'Félix Abraham Graham Hernández', 'José Altagracia Ramírez', 'Henderson Javier Álvarez', 'Rubén Amaro', 'Rubén Amaro', 'Francisco José Rodríguez', 'Iván De Jesús', 'Alfredo Amézaga']
df.loc[[1000206, 1000207, 18401, 1296, 5669], 'fullname'] = ['Rubén Amaro', 'Rubén Amaro Jr.', 'Ronald Acuña Jr.', 'José Cruz Jr.', 'Henderson Álvarez III']
df.loc[[11530, 10472, 13398, 3312, 19959, 13755, 15104, 1013826], ['middlename', 'fullfmlname']] = [['Delfín', 'José Delfín Fernández'],
                                                                                                    ['José', 'Enrique José Hernández'],
                                                                                                    ['Francis', 'Steven Francis Rodríguez'],
                                                                                                    ['Manuel', 'Martín Manuel Prado'],
                                                                                                    ['Guillermo', 'Jesús Guillermo Luzardo'],
                                                                                                    ['Enrique', 'Tomás Enrique Nido'],
                                                                                                    ['Rayshad', 'Jonathan Rayshad Davis'],
                                                                                                    ['Davis', 'McKinley Davis Wheat']]
df.loc[[7048, 1013084, 1827], 'middlename'] = np.nan
df['fullfmlname'] = df['fullfmlname'].str.replace(r'\s+(Jr.|Sr.|II|III|IV)$', '', regex=True)
df['asciifullfmlname'] = df['fullfmlname'].apply(strip_accents)
df['casciifullfmlname'] = df[['firstname', 'middlename', 'lastname']].agg(lambda x: ' '.join([y for y in x.to_numpy() if pd.notna(y)]), axis=1).apply(strip_accents)
df.loc[df['asciifullfmlname'] != df['casciifullfmlname'], 'fullfmlname'] = df.loc[df['asciifullfmlname'] != df['casciifullfmlname'], 'casciifullfmlname']
df['asciifullfmlname'] = df['casciifullfmlname']
df = df.drop('casciifullfmlname', axis=1)

name_df = df.sort_values('birthdate', ascending=False)[['fullname', 'firstname', 'middlename', 'lastname', 'namesuffix', 'rosterfirstname', 'rosterlastname', 'namefirstlast', 'fullfmlname', 'asciifullfmlname']]
t = name_df[['firstname', 'middlename', 'lastname']].map(lambda x: len(x) if pd.notna(x) else 0)
t['second'] = name_df['middlename'].notna().astype(int)
t['first'] = 1
t['slices'] = t[['firstname', 'first', 'middlename', 'second']].cumsum(axis=1)[['first', 'second']].to_numpy().tolist()
name_df = name_df.merge(t[['slices']], left_index=True, right_index=True)

df[['firstname', 'middlename', 'lastname']] = pd.DataFrame(name_df[['fullfmlname', 'slices']].apply(lambda x: [x['fullfmlname'][i:j].strip() for i, j in zip([0] + x['slices'], x['slices'] + [None])], axis=1).tolist(), index=name_df.index, columns=['firstname', 'middlename', 'lastname'])
df = df.replace('', np.nan)
df.loc[[3397, 5365, 7448, 14140, 14239, 17581, 18316, 20040, 21023], 'rosterlastname'] = df.loc[[3397, 5365, 7448, 14140, 14239, 17581, 18316, 20040, 21023], 'lastname']
df.loc[(df['firstname'] != df['rosterfirstname']) & (df['firstname'].apply(strip_accents) == df['rosterfirstname']), 'rosterfirstname'] = df.loc[(df['firstname'] != df['rosterfirstname']) & (df['firstname'].apply(strip_accents) == df['rosterfirstname']), 'firstname']
df['rosterfullname'] = df['rosterfirstname'] + ' ' + df['rosterlastname']
df.loc[df['namesuffix'].notna() & (df['fullname'] != df['rosterfullname']), 'rosterfullname'] = df.loc[df['namesuffix'].notna() & (df['fullname'] != df['rosterfullname']), 'fullname']
df.loc[df['fullname'] != df['rosterfullname'], 'fullname'] = df.loc[df['fullname'] != df['rosterfullname'], 'rosterfullname']
# df = df.drop('rosterfullname', axis=1)
df['rosterfirstname'] = df.apply(lambda x: x['fullname'].replace(x['rosterlastname'], '').strip(), axis=1)

In [77]:
df

Unnamed: 0_level_0,mlbamid,fullname,firstname,lastname,primarynumber,birthdate,birthcity,birthstateprovince,birthcountry,heightinches,...,strikezonetop,strikezonebottom,middlename,deathstateprovince,namesuffix,deathdate,deathcity,deathcountry,asciifullfmlname,rosterfullname
fangraphsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,407792,Alfredo Amézaga,Alfredo,Amézaga,4,1978-01-16,Obregon,,,71,...,3.319,1.513,,,,NaT,,,Alfredo Amezaga,Alfredo Amézaga
2,110236,Garret Anderson,Garret,Anderson,16,1972-06-30,Los Angeles,California,United States,75,...,3.490,1.601,Joseph,,,NaT,,,Garret Joseph Anderson,Garret Anderson
3,110293,Kevin Appier,Robert,Appier,55,1967-12-06,Lancaster,California,United States,74,...,3.467,1.589,Kevin,,,NaT,,,Robert Kevin Appier,Kevin Appier
4,277397,Larry Barnes,Larry,Barnes,,1974-07-23,Bakersfield,California,United States,73,...,3.411,1.565,Richard,,,NaT,,,Larry Richard Barnes,Larry Barnes
5,456499,Scott Patterson,Scott,Patterson,40,1979-06-20,Pittsburgh,Pennsylvania,United States,79,...,3.756,1.746,Robert,,,NaT,,,Scott Robert Patterson,Scott Patterson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014464,124788,Frank Zupo,Frank,Zupo,42,1939-08-29,San Francisco,California,United States,71,...,3.319,1.513,Joseph,CA,,2005-03-25,Burlingame,USA,Frank Joseph Zupo,Frank Zupo
1014465,124789,Paul Zuvella,Paul,Zuvella,17,1958-10-31,San Mateo,California,United States,72,...,3.371,1.535,,,,NaT,,,Paul Zuvella,Paul Zuvella
1014466,124790,George Zuverink,George,Zuverink,35,1924-08-20,Holland,Michigan,United States,76,...,3.549,1.627,,AZ,,2014-09-08,Tempe,USA,George Zuverink,George Zuverink
1014467,124791,Dutch Zwilling,Edward,Zwilling,17,1888-11-02,St.Louis,Missouri,United States,67,...,3.179,1.474,Harrison,CA,,1978-03-27,LaCrescenta,USA,Edward Harrison Zwilling,Dutch Zwilling


In [132]:
t = name_df[name_df['middlename'].notna() & name_df['namesuffix'].isna() & (name_df['asciifullfmlname'] != name_df['firstname'] + ' ' + name_df['middlename'] + ' ' + name_df['rosterlastname'].apply(strip_accents))]
t['namefirstlast'] = t.apply(lambda x: ' '.join([x['firstname'], x['rosterlastname']]), axis=1)
t['fullfmlname'] = t.apply(lambda x: ' '.join([x['firstname'], x['middlename'], x['rosterlastname']]), axis=1)
df.loc[t.index, ['namefirstlast', 'fullfmlname']] = t[['namefirstlast', 'fullfmlname']]
t = name_df[name_df['middlename'].notna() & name_df['namesuffix'].isna() & (name_df['asciifullfmlname'] != name_df['firstname'] + ' ' + name_df['middlename'] + ' ' + name_df['rosterlastname'].apply(strip_accents))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['namefirstlast'] = t.apply(lambda x: ' '.join([x['firstname'], x['rosterlastname']]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['fullfmlname'] = t.apply(lambda x: ' '.join([x['firstname'], x['middlename'], x['rosterlastname']]), axis=1)


In [133]:
t

Unnamed: 0_level_0,fullname,firstname,middlename,lastname,namesuffix,rosterfirstname,rosterlastname,namefirstlast,fullfmlname,asciifullfmlname,slices
fangraphsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
11530,Jose Fernandez,Jose,Delfín,Fernandez,,Jose,Fernández,Jose Fernandez,José Delfín Fernández,Jose Delfin Fernandez,"[5, 12]"
10472,Enrique Hernandez,Enrique,José,Hernandez,,Enrique,Hernández,Enrique Hernandez,Enrique José Hernández,Enrique Jose Hernandez,"[8, 13]"
1642,Francisco Rodriguez,Francisco,José,Rodriguez,,Francisco,Rodríguez,Francisco Rodriguez,Francisco José Rodríguez,Francisco Jose Rodriguez,"[10, 15]"


In [134]:
df[df['middlename'].notna()][df[df['middlename'].notna()].apply(lambda x: x['middlename'] not in x['fullfmlname'], axis=1)]

Unnamed: 0_level_0,mlbamid,fullname,firstname,lastname,primarynumber,birthdate,birthcity,birthstateprovince,birthcountry,heightinches,...,strikezonetop,strikezonebottom,middlename,deathstateprovince,namesuffix,deathdate,deathcity,deathcountry,asciifullfmlname,rosterfullname
fangraphsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [85]:
df['middlename'].isna().sum()

698

In [33]:
df[df['fullname'].str.replace(r'^\S+\s+', '', regex=True)!= df['rosterlastname'].str.replace(r'\s+(Jr.|Sr.|II|III|IV)$', '', regex=True).apply(strip_accents)]

Unnamed: 0,fangraphsid,mlbamid,fullname,firstname,lastname,birthdate,birthcity,birthstateprovince,birthcountry,active,...,batside,pitchhand,heightinches,weight,strikezonetop,strikezonebottom,lastplayeddate,draftyear,namesuffix,primarynumber
75,82,407794,Jung Keun Bong,Jung,Bong,1980-07-15,Seoul,,South Korea,False,...,L,L,75,215,3.490,1.601,2004-06-20,,,
124,140,408202,Luis C. Garcia,Luis,Garcia,1975-09-22,Hermosillo,,Mexico,False,...,S,R,75,215,3.490,1.601,2002-05-30,1994,,
190,221,110184,Sandy Alomar Jr.,Santos,Alomar,1966-06-18,Salinas,,Puerto Rico,False,...,R,R,75,235,3.490,1.601,2007-09-30,,Jr.,15
279,327,115135,Ken Griffey Jr.,George,Griffey,1969-11-21,Donora,PA,USA,False,...,L,L,74,230,3.467,1.589,2010-05-31,1987,Jr.,24
291,344,276377,Wily Mo Pena,Wily,Peña,1982-01-23,Laguna Salada,,Dominican Republic,False,...,R,R,75,260,3.490,1.601,2011-09-25,,,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18690,1013599,123907,Paul Louis Wanninger,Paul,Wanninger,1902-12-12,Birmingham,AL,USA,False,...,L,R,67,150,3.179,1.474,1927-06-27,,,
18741,1013657,123964,La Rue Washington,La,Washington,1953-09-07,Long Beach,CA,USA,False,...,R,R,72,170,3.371,1.535,1979-09-30,1975,,15
18743,1013659,123967,U L Washington,U,Washington,1953-10-27,Stringtown,OK,USA,False,...,S,R,71,175,3.319,1.513,1987-10-04,,,30
18991,1013922,124236,Wild Bill Widner,William,Widner,1867-06-03,Cincinnati,OH,USA,False,...,R,R,72,180,3.371,1.535,NaT,,,


In [48]:
df['dummylastname'] = df['lastname'].apply(strip_accents)
df['fullname'] = df.apply(lambda x: re.sub(x['dummylastname'], x['lastname'], x['fullname']), axis=1)

0        Alfredo Amezaga
1        Garret Anderson
2           Kevin Appier
3           Larry Barnes
4        Scott Patterson
              ...       
19476         Frank Zupo
19477       Paul Zuvella
19478    George Zuverink
19479     Dutch Zwilling
19480        Matt Zeiser
Length: 19481, dtype: object

In [95]:
t = df[(df['birthcountry'] == 'USA') & df['birthstateprovince'].notna()].value_counts('birthstateprovince').reset_index()
t['birthstateprovince'] = t['birthstateprovince'].map(STATE_DICT)
t

Unnamed: 0,birthstateprovince,count
0,California,2309
1,Pennsylvania,1357
2,New York,1118
3,Illinois,1041
4,Ohio,1006
5,Texas,966
6,Massachusetts,645
7,Missouri,600
8,Florida,593
9,Michigan,421


In [4]:
with requests.get('https://www.baseball-reference.com/players/m/morgapi01.shtml') as fp:
    soup = BeautifulSoup(fp.content, 'html.parser', from_encoding='utf_8')

player_header = soup.find('div', id='meta').h1.parent
print(player_header.prettify())

<div class="nothumb">
 <h1>
  <span>
   Pidgey Morgan
  </span>
 </h1>
 <p>
  <strong>
   Positions:
  </strong>
  Outfielder, Third Baseman and Pitcher
 </p>
 <p>
  <strong>
   Bats:
  </strong>
  Unknown
         •
  <strong>
   Throws:
  </strong>
  Right
 </p>
 <p>
  <strong>
   <a href="/bio/">
    Born:
   </a>
  </strong>
  <span data-birth="1853-05-00" id="necro-birth">
   May
   <a href="/leagues/majors/1853-births.shtml">
    1853
   </a>
  </span>
  <span>
   in
   <a href="/bio/MO_born.shtml">
    MO
   </a>
  </span>
  <span class="f-i f-us" style="">
   us
  </span>
 </p>
 <p>
  <strong>
   <a href="/bio/">
    Died:
   </a>
  </strong>
  <span data-death="1910-01-30" id="necro-death">
   <a href="/friv/dateofdeath.cgi?month=1&amp;day=30">
    January 30
   </a>
   ,
   <a href="/leagues/majors/1910-deaths.shtml">
    1910
   </a>
  </span>
  <span>
   in St. Louis,
   <a href="/bio/MO_died.shtml">
    MO
   </a>
  </span>
 </p>
 <p>
  <strong>
   <a href="/bio/#all_locat

In [5]:
player_data = json.loads(list(soup.find('script', type='application/ld+json').stripped_strings)[0])

In [8]:
name = player_data['name'].split()
roster_first_name, roster_last_name = name[0], name[1:]

if roster_last_name[-1] in ['Jr.', 'Sr.', 'II', 'III', 'IV']:
    roster_last_name, name_suffix = roster_last_name[:-1], roster_last_name[-1]
else:
    name_suffix = None

full_name = str(player_header.find(lambda tag: tag is not None and 'Full Name' in str(tag.string)).next_sibling).strip().split()
first_name = full_name[0]
middle_name = [name for name in full_name[1:] if name not in roster_last_name]
middle_name = ' '.join(middle_name) if middle_name else None
name, full_name, roster_last_name, middle_name = ' '.join(name), ' '.join(full_name), ' '.join(roster_last_name),  ' '.join(middle_name) if middle_name else None
last_name = roster_last_name

print(f'{name=}, {roster_first_name=}, {roster_last_name=}, {name_suffix=}, {full_name=}, {first_name=}, {middle_name=}, {last_name=}')

name='Pidgey Morgan', roster_first_name='Pidgey', roster_last_name='Morgan', name_suffix=None, full_name='Daniel Morgan', first_name='Daniel', middle_name=None, last_name='Morgan'


In [11]:
df['namesuffix'].unique()

array([nan, 'Jr.', 'III', 'II', 'Sr.', 'IV'], dtype=object)

In [6]:
birth_date = player_header.find_all(lambda tag: tag is not None and tag.has_attr('data-birth'))
if birth_date:
    birth_date = parse_date(birth_date[0]['data-birth'])
    # birth_date = datetime.strptime(birth_date[0]['data-birth'], '%Y-%m-%d').date()
    birth_place_str_list = list(player_header.find('a', string='Born:').parent.parent.stripped_strings)
    # birth_state_province = 
    # birth_city, birth_state_province = [bp.replace('in ', '').strip(',')  if bp.upper() != 'US' else 'USA' for bp in birth_place_str_list[next(i for i, value in enumerate(birth_place_str_list) if value.isnumeric()) + 1:-1]]

birth_place_str_list

['Born:', 'May', '1853', 'in', 'MO', 'us']

In [7]:
death_date = player_header.find_all(lambda tag: tag is not None and tag.has_attr('data-death'))
if death_date:
    death_date = parse_date(death_date[0]['data-death'])
    # death_date = datetime.strptime(death_date[0]['data-death'], '%Y-%m-%d').date()
    death_place_str_list = list(player_header.find('a', string='Died:').parent.parent.stripped_strings)
    # death_place_str_list = death_place_str_list[next(i for i, value in enumerate(death_place_str_list) if value.isnumeric()) + 1:]

death_place_str_list

['Died:', 'January 30', ',', '1910', 'in\xa0St. Louis,', 'MO']

In [12]:
b = birth_place_str_list[next(i for i, value in enumerate(birth_place_str_list) if value.startswith('in')) + 1:]
[value for value in b if value in STATE_DICT ]

['MO', 'us']

In [21]:
df['birthcountry'].isna().sum()

48

In [None]:
{'USA': 'United States', 'VEN': 'Venezuela', 'DOM': 'Dominican Republic', 'MEX': 'Mexico', 'NIR': 'Northern Ireland', 'USSR': 'Soviet Union', 'NCA': 'Nicaragua', 'CUB': 'Cuba', 'PUR': 'Puerto Rico'}

In [22]:
df.loc[(df['birthcountry'] != 'USA') & df['birthcountry'].str.isupper(), 'birthcountry'].value_counts().index.tolist()

['VEN', 'DOM', 'MEX', 'NIR', 'USSR', 'NCA', 'CUB', 'PUR']

In [20]:
df[(df['birthcountry'] != 'USA') & df['birthcountry'].str.isupper()]

Unnamed: 0,fangraphsid,mlbamid,fullname,firstname,lastname,birthdate,birthcity,birthstateprovince,birthcountry,active,...,batside,pitchhand,heightinches,weight,strikezonetop,strikezonebottom,lastplayeddate,draftyear,namesuffix,primarynumber
613,745,120074,David Ortiz,David,Ortiz,1975-11-18,Santo Domingo,,DOM,False,...,L,L,75,230,3.49,1.601,2016-10-02,,,34.0
1997,2882,501593,Luis Avilan,Luis,Avilan,1989-07-19,Caracas,,VEN,False,...,L,L,74,235,3.467,1.589,2021-04-15,,,43.0
2116,3284,434671,Anibal Sanchez,Anibal,Sanchez,1984-02-27,Maracay,,VEN,True,...,R,R,72,207,3.371,1.535,NaT,,,19.0
2783,5702,448855,Junior Guerra,Junior,Guerra,1985-01-16,San Felix,,VEN,True,...,R,R,72,235,3.371,1.535,NaT,,,41.0
3419,8553,467827,Gerardo Parra,Gerardo,Parra,1987-05-06,Santa Barbara del Zulia,,VEN,False,...,L,L,71,209,3.17,1.52,2021-10-03,,,8.0
3910,11338,506703,Adrian Sanchez,Adrian,Sanchez,1990-08-16,Maracaibo,,VEN,True,...,R,R,72,197,3.35,1.59,NaT,,,14.0
4077,12179,596748,Maikel Franco,Maikel,Franco,1992-08-26,Azua,,DOM,True,...,R,R,73,225,3.24,1.51,NaT,,,7.0
4490,14366,608841,Joey Meneses,Joey,Meneses,1992-05-06,Culiacan,Sinaloa,MEX,True,...,R,R,75,240,3.65,1.79,NaT,,,45.0
4498,14391,622795,Yohander Mendez,Yohander,Mendez,1995-01-17,Valencia,,VEN,True,...,L,L,76,230,3.549,1.627,NaT,,,74.0
4874,16411,620446,Richard Urena,Richard,Urena,1996-02-26,San Francisco de Macoris,,DOM,True,...,S,R,72,195,3.371,1.535,NaT,,,47.0


In [24]:
df[df['birthcountry'] == 'Russian Federation']

Unnamed: 0,fangraphsid,mlbamid,fullname,firstname,lastname,birthdate,birthcity,birthstateprovince,birthcountry,active,...,batside,pitchhand,heightinches,weight,strikezonetop,strikezonebottom,lastplayeddate,draftyear,namesuffix,primarynumber
6350,1000092,110096,Eddie Ainsmith,Edward,Ainsmith,1890-02-04,,,Russian Federation,False,...,R,R,71,180,3.319,1.513,1924-07-21,,,


In [60]:
t = [p.strip() for p in birth_place_str_list[next(i for i, value in enumerate(birth_place_str_list) if value.isnumeric()) + 1:] if p.strip() != 'in']
if t[0] in STATE_DICT:
    birth_country = 'USA'
    

['MO', 'us']

In [46]:
b'\xa0'.decode('utf-8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 0: invalid start byte

In [67]:
position_mapping = {**{value: i + 1 for i, value in enumerate(['Pitcher', 'Catcher', 'First Baseman', 'Second Baseman', 'Third Baseman', 'Shortstop', 'Left Fielder', 'Center Fielder', 'Right Fielder', 'Designated Hitter', 'Pinch Hitter', 'Pinch Runner'])}, 'Infielder': 'I', 'Outfielder': 'O', 'Two-Way Player': 'Y'}

position_mapping[list(player_header.find_all(lambda tag: tag is not None and 'Positions' in str(tag.string))[0].next_sibling.stripped_strings)[0].replace(' and ', ', ').split(', ')[0]]

5

In [55]:
full_name = str(player_header.find(lambda tag: tag is not None and 'Full Name' in str(tag.string)).next_sibling).strip().split()
first_name, last_name = full_name[0], full_name[-1]
if len(full_name) == 4:
    middle_name, last_name, name_suffix = full_name[1:]
elif len(full_name) == 3:
    if full_name[-1] in ['Jr.', 'Sr.', 'II', 'III', 'IV']:
        last_name, name_suffix = full_name[1:]
    else:
        middle_name = full_name[1]
    
full_name = str(player_header.find('h1').string).strip()
roster_first_name, roster_last_name = full_name.split()

birth_date = player_header.find_all(lambda tag: tag is not None and tag.has_attr('data-birth'))
if birth_date:
    birth_date = datetime.strptime(birth_date['data-birth'], '%Y-%m-%d').date()
    birth_place_str_list = list(player_header.find('a', string='Born:').parent.parent.stripped_strings)
    birth_city, birth_state_province, birth_country = [bp.replace('in ', '').strip(',')  if bp.upper() != 'US' else 'USA' for bp in birth_place_str_list[next(i for i, value in enumerate(birth_place_str_list) if value.isnumeric()) + 1:]]

death_date = player_header.find_all(lambda tag: tag is not None and tag.has_attr('data-death'))
if death_date:
    death_date = datetime.strptime(death_date[0]['data-death'], '%Y-%m-%d').date()
    death_place_str_list = list(player_header.find('a', string='Died:').parent.parent.stripped_strings)
    death_place_str_list = death_place_str_list[next(i for i, value in enumerate(death_place_str_list) if value.isnumeric()) + 1:]
    death_city, death_state_province, death_country = [bp.replace('in ', '').strip(',')  if bp.upper() != 'US' else 'USA' for bp in death_place_str_list]

debut_date = player_header.find_all(lambda tag: tag is not None and 'Debut' in str(tag.string))
if debut_date:
    debut_date = datetime.strptime(str(debut_date[0].parent.next_sibling.next_sibling.string).strip(), '%B %d, %Y').date()
    
last_played_date = player_header.find_all(lambda tag: tag is not None and 'Last Game' in str(tag.string))
if last_played_date:
    last_played_date = datetime.strptime(str(last_played_date[0].parent.next_sibling.next_sibling.string).strip(), '%B %d, %Y').date()

bat_throw = player_header.find_all(lambda tag: tag is not None and 'Bats' in str(tag.string))
if bat_throw:
    side_dict = {'Left': 'L', 'Right': 'R', 'Both': 'S'}
    bat_throw = [re.match(re.compile(r'[A-Z]+', re.IGNORECASE), x)[0] for x in list(bat_throw[0].parent.stripped_strings)]
    bat_side = side_dict[bat_throw[bat_throw.index('Bats') + 1]]
    pitch_hand = side_dict[bat_throw[bat_throw.index('Throws') + 1]]

height_inches = player_header.find_all(lambda tag: tag is not None and re.match(r'[0-9]+-[0-9]+', str(tag.string)))
if height_inches:
    height_inches = np.array(height_inches[0].string.split('-'), dtype=int) @ np.array([12, 1])

weight = player_header.find_all(lambda tag: tag is not None and re.match(r'[0-9]+lb', str(tag.string)))
if weight:
    weight = int(str(weight[0].string).strip().replace('lb', ''))

draft_year = player_header.find_all(lambda tag: tag is not None and 'Draft' in str(tag.string))
if draft_year:
    draft_year = parse_qs(urlparse(draft_year)[0].parent.find('a').get('href').query)['year_ID'][0]

primary_position = player_header.find_all(lambda tag: tag is not None and 'Positions' in str(tag.string))
if primary_position:
    primary_position = POSITION_MAPPING[list(primary_position[0].next_sibling.stripped_strings)[0].replace(' and ', ', ').split(', ')[0]]

primary_number = soup.find_all('span', id='bling-alt-text')
if primary_number:
    primary_number = [l for l in primary_number[0].string.split('\n\n') if l]
    primary_number = pd.DataFrame([re.compile(r'[0-9]+[-0-9]*[0-9]+').findall(a) for a in primary_number[primary_number.index('Uniforms:') + 1].split('\n')], columns=['Number', 'Years'])
    primary_number['Years'] = primary_number['Years'].apply(lambda x: 1 - eval(x))
    primary_number = int(primary_number.groupby('Number')['Years'].sum().idxmax())

['Norman Glaser',
 'Position:',
 'Pitcher',
 'Bats:',
 'Right\n        \xa0•',
 'Throws:',
 'Right',
 '5-11',
 ',',
 '186lb',
 '(180cm,\xa084kg)',
 'Born:',
 'August 31',
 ',',
 '1894',
 'in Cleveland,',
 'OH',
 'us',
 'Died:',
 'May 27',
 ',',
 '1979',
 'in\xa0Parma,',
 'OH',
 'Buried',
 ':',
 'Sunset Memorial Park, North Olmsted, OH',
 'Debut:',
 'September 21, 1920',
 '(Age 26-021d,\n    5,116th in major league history)',
 'vs. WSH',
 '2.1 IP, 7 H, 0 SO, 0 BB, 4 ER',
 'Last Game:',
 'September 21, 1920',
 '(Age 26-021d)',
 'vs. WSH',
 '2.1 IP, 7 H, 0 SO, 0 BB, 4 ER',
 'Full Name:',
 'Norman Matthew Glaser',
 'View Player Info',
 'from the',
 'B-R Bullpen',
 'More bio, uniform, draft, salary info']

In [45]:
BeautifulSoup(requests.get('https://www.baseball-reference.com/players/g/glaseno01.shtml').content, 'html.parser', from_encoding='utf_8').find('div', id='meta').contents

['\n',
 <div class="media-item"><img alt="Photo of Norman Glaser" class="" src="https://www.baseball-reference.com/req/202311010/images/headshots/5/5db5c652_davis.jpg"/>
 </div>,
 ' div.media-item ',
 <div>
 <h1>
 <span>Norman Glaser</span>
 </h1>
 <p>
 <strong>Position:</strong>
     Pitcher
   
 </p>
 <p>
 <strong>Bats: </strong>Right
          • 
 	<strong>Throws: </strong>Right
     
 </p>
 <p><span>5-11</span>, <span>186lb</span> (180cm, 84kg) </p>
 <p>
 <strong><a href="/bio/">Born:</a></strong>
 <span data-birth="1894-08-31" id="necro-birth">
 <a href="/friv/birthdays.cgi?month=8&amp;day=31">August 31</a>, <a href="/leagues/majors/1894-births.shtml">1894</a>
 </span>
 <span>
     
       in Cleveland, <a href="/bio/OH_born.shtml">OH</a>
 </span>
 <span class="f-i f-us" style="">us</span>
 </p>
 <p><strong><a href="/bio/">Died:</a></strong>
 <span data-death="1979-05-27" id="necro-death"><a href="/friv/dateofdeath.cgi?month=5&amp;day=27">May 27</a>, <a href="/leagues/majors/1979-

In [31]:
list(BeautifulSoup(requests.get('https://www.baseball-reference.com/players/g/glaseno01.shtml').content, 'html.parser', from_encoding='utf_8').find('div', id='meta').strings)

['\n',
 '\n',
 '\n',
 '\n',
 'Norman Glaser',
 '\n',
 '\n',
 '\n',
 'Position:',
 '\n    Pitcher\n  \n',
 '\n',
 '\n',
 'Bats: ',
 'Right\n        \xa0•\xa0\n\t',
 'Throws: ',
 'Right\n    \n',
 '\n',
 '5-11',
 ',\xa0',
 '186lb',
 '\xa0(180cm,\xa084kg) ',
 '\n',
 '\n',
 'Born:',
 '\n',
 '\n',
 'August 31',
 ', ',
 '1894',
 '\n',
 '\n',
 '\n    \n      in Cleveland, ',
 'OH',
 '\n',
 '\n',
 'us',
 '\n',
 '\n',
 'Died:',
 '\n',
 'May 27',
 ', ',
 '1979',
 '\n',
 '\n',
 '\n    in\xa0Parma,\xa0',
 'OH',
 '\n',
 '\n',
 '\n',
 'Buried',
 ':',
 '\n    \n        \n        Sunset Memorial Park, North Olmsted, OH\n    \n',
 '\n',
 'Debut:',
 '\n',
 '\n    September 21, 1920\n    ',
 '\n    (Age 26-021d,\n    5,116th in major league history)\n    ',
 '\xa0\xa0\xa0',
 'vs. WSH',
 ' 2.1 IP, 7 H, 0 SO, 0 BB, 4 ER\n    \n  ',
 '\n',
 'Last Game:',
 '\n',
 'September 21, 1920',
 '\n    (Age 26-021d)\n    ',
 '\xa0\xa0\xa0',
 'vs. WSH',
 ' 2.1 IP, 7 H, 0 SO, 0 BB, 4 ER\n  ',
 '\n',
 '\n',
 'Full Name:'

In [None]:
BeautifulSoup(requests.get('https://www.fangraphs.com/leaders/major-league?pos=all&stats=bat&lg=aa&lg=nl&type=8&season=2023&month=0&season1=1871&ind=0&team=0&rost=0&players=0&qual=0&pagenum=1&pageitems=2000000000').content, 'html.parser', from_encoding='utf_8').find('script', type='application/json').text

In [23]:
browser = webdriver.Chrome(options=webdriver.ChromeOptions().add_argument('headless'))
browser.get('https://www.fangraphs.com/leaders/minor-league?pos=all&level=1&lg=2,4,5,6,7,8,9,10,11,14,12,13,15,16,17,18,30,32,33&stats=bat&qual=0&type=0&team=&season=2023&seasonEnd=2023&org=&ind=0&splitTeam=false&players=&sort=23,1')
soup = BeautifulSoup(browser.page_source, "html.parser")

In [None]:
stats = soup.find('a', class_='data-export')['href'].strip('data:application/csv;charset=utf-8,')
stats.replace('%2C', ',').replace('%22', '"').replace('%20', ' ').replace('%0A', '\n')

In [None]:
statsreader = csv.reader(stats, delimiter='', lineterminator='%22%0A%22')

In [None]:
pd.read_csv(stats.replace('%2C', ',').replace('%22', '"').replace('%20', ' ').replace('%0A', '\n'))

In [None]:
pd.read_csv(soup.find('a', class_='data-export')['href'])