# Player id scraping
---
Getting data for all players on KBO and NPB teams who have also played in MLB

## Obtaining team codes
The first step involves getting the links to every team's lineup for every season

In [1]:
import asyncio
import time

import bs4
import numpy as np
import pandas as pd
from requests_html import AsyncHTMLSession

In [2]:
# (label, code) pairs: the label is used in progress messages and stored with
# each scraped link; the code is substituted into `league_url`.
league_codes = [('KBO', 'KBO'), ('NPB_central', 'JPCL'), ('NPB_pacific', 'JPPL')]

# League-history page template; `{league}` is filled with a code from above.
league_url = (
    'https://www.baseball-reference.com/register/league.cgi'
    '?code={league}&class=Fgn'
)


In [3]:
# Open session and generate blank list
session = AsyncHTMLSession()
link_list = []
# Loop for each league and gather all team links
for league_tup in league_codes:
    print('Beginning', league_tup[0], 'scrape...')
    # Render the page's JS
    res = await session.get(league_url.format(league=league_tup[1]))
    await res.html.arender()
    print(league_tup[0], 'rendered.')
    # Make some soup and take a look at the lg_history table
    league_history_soup = bs4.BeautifulSoup(res.html.html, 'lxml')
    league_history_rows = league_history_soup.select('#lg_history tbody tr')

    count = 0
    i = 0
    # Loop over each row and take each individual team's links
    while count < 20:  # take the last 20 seasons of players
        if league_history_rows[i].get('class') == 'thead':
            # This is a catch for blank rows, so as not to have a blank season
            i += 1
        else:
            for link in league_history_rows[i].select('td a'):
                # append the league it belongs to, and the link's reference
                link_list.append((league_tup[0], link.get('href')))
            i += 1
            count += 1
    print(len(link_list), 'total links after completion of', league_tup[0])
    print()
    time.sleep(5)
print('Completed.')

Beginning KBO scrape...
KBO rendered.
174 total links after completion of KBO

Beginning NPB_central scrape...
NPB_central rendered.
294 total links after completion of NPB_central

Beginning NPB_pacific scrape...
NPB_pacific rendered.
414 total links after completion of NPB_pacific

Completed.


## Obtaining only the MLB players' ids
The second step involves chasing each of these team links, and identifying the player_ids that are designated as former MLB players

One thing to keep in mind is that these also identify HOF'ers, so each player's data has to be checked to make sure that they were in fact an MLB player

In [4]:
# Open session and generate blank list
session = AsyncHTMLSession()
id_list = []
curr_league = ''
failed_list = []
# Loop for each of the team links from the previous section
for i, link_tup in enumerate(link_list):
    print('Next iteration begins')
    # wait before calling
    time.sleep(float(np.random.rand(1)) * 5 + 7.5)  # random call times b/w 7.5 and 12.5 seconds 
    # Better follow progress with some updates
    if link_tup[0] != curr_league:
        curr_league = link_tup[0]
        print('Scraping', curr_league, '...')
    curr_id_length = len(id_list)
    # Connect into Baseball-Reference
    print('Sleep finished. Getting session')
    res = await session.get('https://www.baseball-reference.com' + link_tup[1])
    print('Got session, attempting render.')
    # Check for time-out errors when rendering JS
    try:
        await res.html.arender(timeout = 30.0)
    except TimeoutError:
        print('Page', i + 1, 'failed due to timeout.')
        failed_list.append((link_tup, 'TimeOut'))
        continue
    except TypeError:
        print('Page', i + 1, 'failed due to typeerror.')
        failed_list.append((link_tup, 'TypeError'))
        continue

    print('Page', i + 1, 'rendered.')
    # Make soup and cut print(lento MLB pitchers and batters
    team_soup = bs4.BeautifulSoup(res.html.html)
    mlb_batters = team_soup.select('#team_batting tbody tr td strong a')
    mlb_pitchers = team_soup.select('#team_pitching tbody tr td strong a')
    # Loop over each of these and add to the id_list (only if not in the list already) 
    for bttr in mlb_batters:
        bttr_id = bttr.get('href').split('id=')[1]
        if bttr_id not in id_list:
            id_list.append(bttr_id)
    for ptchr in mlb_pitchers:
        ptchr_id = ptchr.get('href').split('id=')[1]
        if ptchr_id not in id_list:
            id_list.append(ptchr_id)
    print('Page', i + 1, 'scraped.', len(id_list) - curr_id_length, 'new players added.')
    print(len(id_list), 'total players.')
    print()

print('Completed.')

Next iteration begins
Scraping KBO ...
Sleep finished. Getting session
Got session, attempting render.
Page 1 rendered.
Page 1 scraped. 3 new players added.
3 total players.

Next iteration begins
Sleep finished. Getting session
Got session, attempting render.
Page 2 rendered.
Page 2 scraped. 4 new players added.
7 total players.

Next iteration begins
Sleep finished. Getting session
Got session, attempting render.
Page 3 rendered.
Page 3 scraped. 3 new players added.
10 total players.

Next iteration begins
Sleep finished. Getting session
Got session, attempting render.
Page 4 rendered.
Page 4 scraped. 5 new players added.
15 total players.

Next iteration begins
Sleep finished. Getting session
Got session, attempting render.
Page 5 rendered.
Page 5 scraped. 3 new players added.
18 total players.

Next iteration begins
Sleep finished. Getting session
Got session, attempting render.
Page 6 rendered.
Page 6 scraped. 3 new players added.
21 total players.

Next iteration begins
Sleep fin

TimeoutError: Navigation Timeout Exceeded: 30000 ms exceeded.

In [7]:
# Show the collected ids as a one-column table
pd.DataFrame(
    data=id_list,
    columns=['player_id'],
)

Unnamed: 0,player_id
0,fernan018jos
1,alcant001rau
2,flexen000chr
3,barnes001bra
4,hoying001jar
...,...
76,hacker001eri
77,rogers001esm
78,adlema001tim
79,bonill001lis
