# Player data scraping
---
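Scrapes a player's standard batting table from their Baseball-Reference register page. The tables on that page are rendered by JavaScript, so the page is rendered before it is parsed.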

In [1]:
from io import StringIO
from time import time

import bs4
import numpy as np
import pandas as pd
from requests_html import AsyncHTMLSession

In [2]:
# Baseball-Reference register id of the player to scrape.
# Another id that can be swapped in: 'guerre002vla'
player_id = "fernan018jos"

In [3]:
# set-up an Asynchronous session to work with Jupyter
session = AsyncHTMLSession()
res = await session.get('https://www.baseball-reference.com/register/player.fcgi?id={player_id}'.format(player_id=player_id))
# render the javascript on the page to get a fuller html file
await res.html.arender()

In [4]:
# Parse the rendered HTML and look up the standard batting table by its element id.
table_search = bs4.BeautifulSoup(res.html.html, 'lxml').select('#standard_batting')
if not table_search:
    # Same exception type pandas would raise for "no tables found", but with a
    # message that points at the actual cause.
    raise ValueError(
        "No '#standard_batting' table in the rendered page for player_id={!r}".format(player_id)
    )
# Read only the matched <table> element into a DataFrame. The markup is wrapped
# in StringIO because passing literal HTML strings to read_html is deprecated
# in pandas >= 2.1; file-like input works on older versions as well.
df = pd.read_html(StringIO(str(table_search[0])))[0]

In [5]:
# Cut to unaggregated data: keep only the rows that come before the first
# repeated header row (a row whose Year cell is the literal text 'Year').
header_rows = (df['Year'] == 'Year').to_numpy()
# Positional offset of that first header row. When the table has no repeated
# header, keep every row instead of failing with an IndexError.
agg_location = int(header_rows.argmax()) if header_rows.any() else len(df)
# Remove rows that are entirely N/A, and take an explicit copy so that later
# column assignments act on an independent frame rather than on a slice of df
# (avoids SettingWithCopyWarning in the type-conversion cell).
df_out = df.iloc[:agg_location].dropna(how='all').copy()

In [6]:
# Target data type for every column that should be numeric
type_change_dict = {'Year': int,
                    'Age': int,
                    'G': int,
                    'PA': int,
                    'AB': int,
                    'R': int,
                    'H': int,
                    '2B': int,
                    '3B': int,
                    'HR': int,
                    'RBI': int,
                    'SB': int,
                    'CS': int,
                    'BB': int,
                    'SO': int,
                    'BA': float,
                    'OBP': float,
                    'SLG': float,
                    'OPS': float,
                    'TB': int,
                    'GDP': int,
                    'HBP': int,
                    'SH': int,
                    'SF': int,
                    'IBB': int}
# Work on an independent copy so the assignments below never write into a
# slice of the originally parsed frame.
df_out = df_out.copy()
# Make sure that the Year is only 4 long. Going through str() makes this work
# whether Year currently holds text or numbers, so the cell can be re-run after
# Year has already been converted; missing values are left as missing.
df_out['Year'] = df_out['Year'].map(lambda year: str(year)[:4], na_action='ignore')
# Convert each listed column to its target type
for column, target_type in type_change_dict.items():
    numeric = pd.to_numeric(df_out[column])
    if target_type is int and numeric.isna().any():
        # Plain NumPy integers cannot represent missing values, so a blank stat
        # would make astype(int) raise; fall back to pandas' nullable integer
        # dtype for columns that actually contain gaps.
        df_out[column] = numeric.astype('Int64')
    else:
        df_out[column] = numeric.astype(target_type)
# In the output dataframe, add the playerid as a column
df_out['player_id'] = player_id

In [7]:
# Display the cleaned batting rows, including the added player_id column
df_out

Unnamed: 0,Year,Age,AgeDif,Tm,Lg,Lev,Aff,G,PA,AB,...,OBP,SLG,OPS,TB,GDP,HBP,SH,SF,IBB,player_id
0,2007,19,-5.7,Matanzas,CNS,Fgn,,79,345,309,...,0.343,0.35,0.693,108,10,5,10,1,0,fernan018jos
1,2008,20,-5.3,Matanzas,CNS,Fgn,,81,375,335,...,0.372,0.379,0.751,127,21,10,7,3,1,fernan018jos
2,2009,21,-4.3,Matanzas,CNS,Fgn,,90,419,382,...,0.391,0.469,0.86,179,10,11,5,2,0,fernan018jos
3,2010,22,-3.8,Matanzas,CNS,Fgn,,87,360,319,...,0.34,0.37,0.71,118,16,5,7,3,5,fernan018jos
4,2011,23,-3.2,Matanzas,CNS,Fgn,,86,338,265,...,0.444,0.453,0.897,120,8,8,5,4,7,fernan018jos
5,2012,24,-2.2,Matanzas,CNS,Fgn,,87,364,301,...,0.456,0.492,0.947,148,7,8,4,2,13,fernan018jos
6,2013,25,,2 Teams,2 Lgs,Fgn-FgW,,87,329,248,...,0.48,0.444,0.924,110,7,8,2,1,21,fernan018jos
7,2013,25,-1.7,Matanzas,CNS,Fgn,,83,314,239,...,0.482,0.456,0.938,109,7,8,1,1,21,fernan018jos
8,2013,25,-4.4,Cuba,CARS,FgW,,4,15,9,...,0.429,0.111,0.54,1,0,0,1,0,0,fernan018jos
9,2014,26,-0.7,Matanzas,CNS,Fgn,,15,65,54,...,0.415,0.426,0.841,23,2,2,0,1,0,fernan018jos
