# Best way to scrape tables from sref is using pyquery

In [1]:
import requests
from pyquery import PyQuery as pq
import pandas as pd

In [2]:
url = "http://www.basketball-reference.com/players/w/walljo01.html"

In [3]:
response = requests.get(url)

# looks like this helps fix any issues with any scraping library
# can now just use pandas to scrape all dfs on player page
html = response.text
html = html.replace('<!--', '').replace('-->', '')

In [4]:
d = pq(html)
rows = d("#totals > tbody > tr")
headers = d("#totals > thead > tr > th")

In [5]:
rows

[<tr#totals.2011.full_table>, <tr#totals.2012.full_table>, <tr#totals.2013.full_table>, <tr#totals.2014.full_table>, <tr#totals.2015.full_table>, <tr#totals.2016.full_table>, <tr#totals.2017.full_table>]

In [6]:
# data from each row
data = [[td.text_content() for td in row.iterchildren()] for row in rows]

In [7]:
headers

[<th..poptip.sort_default_asc.center>, <th..poptip.sort_default_asc.center>, <th..poptip.sort_default_asc.center>, <th..poptip.sort_default_asc.center>, <th..poptip.sort_default_asc.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>, <th..poptip.center>]

In [8]:
[i.text_content() for i in headers]

['Season',
 'Age',
 'Tm',
 'Lg',
 'Pos',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P',
 '2P',
 '2PA',
 '2P',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [9]:
cols = [i.text_content() for i in headers]

In [10]:
df = pd.DataFrame(data=data, columns=cols)

In [11]:
df.head()

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2010-11,20,WAS,NBA,PG,69,64,2606,398,972,...,0.766,35,282,317,574,121,35,261,175,1131
1,2011-12,21,WAS,NBA,PG,66,66,2386,378,894,...,0.789,48,252,300,530,95,57,255,137,1076
2,2012-13,22,WAS,NBA,PG,49,42,1602,324,735,...,0.804,35,161,196,373,65,37,157,117,906
3,2013-14,23,WAS,NBA,PG,82,82,2980,579,1337,...,0.805,38,295,333,721,149,40,295,219,1583
4,2014-15,24,WAS,NBA,PG,79,79,2837,519,1166,...,0.785,36,330,366,792,138,45,304,180,1387


## Try pandas

In [12]:
dfs = pd.read_html(html)

In [13]:
len(dfs)

46

In [14]:
dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 30 columns):
Season    8 non-null object
Age       7 non-null float64
Tm        7 non-null object
Lg        8 non-null object
Pos       7 non-null object
G         8 non-null int64
GS        8 non-null int64
MP        8 non-null float64
FG        8 non-null float64
FGA       8 non-null float64
FG%       8 non-null float64
3P        8 non-null float64
3PA       8 non-null float64
3P.1      8 non-null float64
2P        8 non-null float64
2PA       8 non-null float64
2P.1      8 non-null float64
eFG%      8 non-null float64
FT        8 non-null float64
FTA       8 non-null float64
FT%       8 non-null float64
ORB       8 non-null float64
DRB       8 non-null float64
TRB       8 non-null float64
AST       8 non-null float64
STL       8 non-null float64
BLK       8 non-null float64
TOV       8 non-null float64
PF        8 non-null float64
PTS       8 non-null float64
dtypes: float64(24), int64(2), object

In [15]:
dfs[1].head()

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2010-11,20.0,WAS,NBA,PG,69,64,2606,398,972,...,0.766,35,282,317,574,121,35,261,175,1131
1,2011-12,21.0,WAS,NBA,PG,66,66,2386,378,894,...,0.789,48,252,300,530,95,57,255,137,1076
2,2012-13,22.0,WAS,NBA,PG,49,42,1602,324,735,...,0.804,35,161,196,373,65,37,157,117,906
3,2013-14,23.0,WAS,NBA,PG,82,82,2980,579,1337,...,0.805,38,295,333,721,149,40,295,219,1583
4,2014-15,24.0,WAS,NBA,PG,79,79,2837,519,1166,...,0.785,36,330,366,792,138,45,304,180,1387


In [16]:
dfs[2].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 29 columns):
Season    8 non-null object
Age       7 non-null float64
Tm        7 non-null object
Lg        8 non-null object
Pos       7 non-null object
G         8 non-null int64
GS        8 non-null int64
MP        8 non-null int64
FG        8 non-null float64
FGA       8 non-null float64
FG%       8 non-null float64
3P        8 non-null float64
3PA       8 non-null float64
3P.1      8 non-null float64
2P        8 non-null float64
2PA       8 non-null float64
2P.1      8 non-null float64
FT        8 non-null float64
FTA       8 non-null float64
FT%       8 non-null float64
ORB       8 non-null float64
DRB       8 non-null float64
TRB       8 non-null float64
AST       8 non-null float64
STL       8 non-null float64
BLK       8 non-null float64
TOV       8 non-null float64
PF        8 non-null float64
PTS       8 non-null float64
dtypes: float64(22), int64(3), object(4)
memory usage: 1.9+ KB


In [17]:
dfs[3].head()

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,TRB,AST,STL,BLK,TOV,PF,PTS,Unnamed: 29,ORtg,DRtg
0,2010-11,20.0,WAS,NBA,PG,69,64,2606,7.8,19.1,...,6.2,11.3,2.4,0.7,5.1,3.4,22.2,,100,110
1,2011-12,21.0,WAS,NBA,PG,66,66,2386,8.2,19.4,...,6.5,11.5,2.1,1.2,5.5,3.0,23.4,,100,107
2,2012-13,22.0,WAS,NBA,PG,49,42,1602,10.5,23.9,...,6.4,12.1,2.1,1.2,5.1,3.8,29.5,,105,103
3,2013-14,23.0,WAS,NBA,PG,82,82,2980,10.0,23.1,...,5.8,12.5,2.6,0.7,5.1,3.8,27.4,,106,104
4,2014-15,24.0,WAS,NBA,PG,79,79,2837,9.4,21.1,...,6.6,14.3,2.5,0.8,5.5,3.3,25.1,,105,102


In [18]:
dfs[5].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 29 columns):
(Unnamed: 0_level_0, Unnamed: 0_level_1, Season)      8 non-null object
(2-Pt Field Goals, % of FGA by Distance, Age)         7 non-null float64
(3-Pt Field Goals, FG% by Distance, Tm)               7 non-null object
(Unnamed: 3_level_0, Unnamed: 3_level_1, Lg)          8 non-null object
(Unnamed: 4_level_0, Dunks, Pos)                      7 non-null object
(Unnamed: 5_level_0, Unnamed: 5_level_1, G)           8 non-null int64
(Unnamed: 6_level_0, Corner, MP)                      8 non-null int64
(Unnamed: 7_level_0, Heaves, FG%)                     8 non-null float64
(Unnamed: 8_level_0, Unnamed: 8_level_1, Dist.)       8 non-null float64
(Unnamed: 9_level_0, Unnamed: 9_level_1, 2P)          8 non-null float64
(Unnamed: 10_level_0, Unnamed: 10_level_1, 0-3)       8 non-null float64
(Unnamed: 11_level_0, Unnamed: 11_level_1, 3-10)      8 non-null float64
(Unnamed: 12_level_0, Unnamed: 

In [19]:
dfs[5].head()

Unnamed: 0_level_0,Unnamed: 0_level_0,2-Pt Field Goals,3-Pt Field Goals,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,...,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0
Unnamed: 0_level_1,Unnamed: 0_level_1,% of FGA by Distance,FG% by Distance,Unnamed: 3_level_1,Dunks,Unnamed: 5_level_1,Corner,Heaves,Unnamed: 8_level_1,Unnamed: 9_level_1,...,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
Unnamed: 0_level_2,Season,Age,Tm,Lg,Pos,G,MP,FG%,Dist.,2P,...,16 <3,3P,%Ast'd,%FGA,Md.,%Ast'd,%3PA,3P%,Att.,Md.
0,2010-11,20.0,WAS,NBA,PG,69,2606,0.409,11.0,0.882,...,0.316,0.296,0.223,0.028,26,0.765,0.113,0.385,6,0
1,2011-12,21.0,WAS,NBA,PG,66,2386,0.423,9.6,0.953,...,0.302,0.071,0.213,0.055,47,1.0,0.357,0.067,2,0
2,2012-13,22.0,WAS,NBA,PG,49,1602,0.441,11.3,0.939,...,0.372,0.267,0.25,0.024,17,0.75,0.267,0.5,3,0
3,2013-14,23.0,WAS,NBA,PG,82,2980,0.433,14.4,0.77,...,0.367,0.351,0.176,0.032,43,0.657,0.133,0.293,1,0
4,2014-15,24.0,WAS,NBA,PG,79,2837,0.445,13.5,0.814,...,0.395,0.3,0.2,0.032,37,0.646,0.198,0.256,7,0


In [20]:
dfs[17].head() # college stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Totals,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Per Game,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,...,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0
Unnamed: 0_level_1,Season,Age,College,G,MP,FG,FGA,3P,3PA,FT,...,TOV,PF,PTS,FG%,3P,FT%,MP,PTS,TRB,AST
0,2009-10,19.0,KENTUCKY,37,1288,202,438,37,114,175,...,149,72,616,0.461,0.325,0.754,34.8,16.6,4.3,6.5
1,Career,,,37,1288,202,438,37,114,175,...,149,72,616,0.461,0.325,0.754,34.8,16.6,4.3,6.5


In [23]:
dfs[19]

Unnamed: 0,0
0,"Mar 17, 2013 Player of the Week"
1,"Nov 25, 2013 Player of the Week"
2,"Mar 3, 2014 Player of the Week"
3,"Dec 15, 2014 Player of the Week"
4,"Jan 18, 2016 Player of the Week"
5,"Jan 2, 2017 Player of the Week"
6,"Mar 13, 2017 Player of the Week"


In [62]:
dfs[-1] # contract

Unnamed: 0,0,1,2
0,Team,2017-18,2018-19
1,Washington Wizards,"$18,063,850","$19,169,800"


# Scraping college link

In [24]:
college_link = d("#inner_nav > ul > li:nth-child(8) > div > ul:nth-child(4) > li > a")[0]

In [25]:
college_link.attrib["href"]

'http://www.sports-reference.com/cbb/players/john-wall-1.html'

In [26]:
college_link.text_content()

'College Basketball at Sports-Reference.com'

### Below is a better way as it doesn't rely on extracting the nth child of lists since that can change depending on the player page

### Instead we rely on extracting all links in the player's inpage nav menu and extracting the one associated to their college basketball stat page

In [27]:
inner_nav_links = d("#inner_nav > ul > li.condensed.hasmore > div > ul > li > a")

In [28]:
inner_nav_links

[<a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>]

In [29]:
[i.text_content() for i in inner_nav_links]

['2010-11',
 '2011-12',
 '2012-13',
 '2013-14',
 '2014-15',
 '2015-16',
 '2016-17',
 '2010-11',
 '2011-12',
 '2012-13',
 '2013-14',
 '2014-15',
 '2015-16',
 '2016-17',
 'Career',
 '2010-11',
 '2011-12',
 '2012-13',
 '2013-14',
 '2014-15',
 '2015-16',
 '2016-17',
 '2010-11',
 '2011-12',
 '2012-13',
 '2013-14',
 '2014-15',
 '2015-16',
 '2016-17',
 '2010-11',
 '2011-12',
 '2012-13',
 '2013-14',
 '2014-15',
 '2015-16',
 '2016-17',
 'Game Finder',
 'Streak Finder',
 'Shot Finder',
 'Event Finder',
 'Lineup Finder',
 'Plus/Minus Finder',
 'Compare John Wall',
 'College Basketball at Sports-Reference.com']

In [30]:
list_text = [i.text_content() for i in inner_nav_links]

In [31]:
# get the index of the sublist that contains the substring about college stats
college_text = "College Basketball at Sports-Reference.com"
idx = [i for i, s in enumerate(list_text) if college_text in s]

In [32]:
idx

[43]

In [33]:
college_li = inner_nav_links[idx[0]]

In [34]:
college_li.text_content()

'College Basketball at Sports-Reference.com'

In [35]:
college_li.attrib["href"]

'http://www.sports-reference.com/cbb/players/john-wall-1.html'

# Try scraping links to European stats

In [36]:
jennings_url = "http://www.basketball-reference.com/players/j/jennibr01.html"

In [37]:
response2= requests.get(jennings_url)
html2 = response2.text
html2 = html2.replace('<!--', '').replace('-->', '')

In [38]:
d2 = pq(html2)

In [39]:
inner_nav_links2 = d2("#inner_nav > ul > li.condensed.hasmore > div > ul > li > a")

In [40]:
euro_text = "Euro Stats at Basketball-Reference.com"

In [41]:
list_text2 = [i.text_content() for i in inner_nav_links2]
idx2 = [i for i, s in enumerate(list_text2) if euro_text in s][0]

In [42]:
idx2

48

In [43]:
d2("#inner_nav > ul > li.condensed.hasmore > div > ul > li > a")

[<a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>, <a>]

In [44]:
euro_li = inner_nav_links2[idx2]
euro_li.attrib["href"]

'http://www.basketball-reference.com/euro/players/brandon-jennings-1.html'

# Scraping college stats

Doesn't seem to work except for the very first per game table.

Not sure why, maybe broken html???

Also it's nbd since the bref pages contain the players college stats, plus I found how to access them via the season finder tool.

In [45]:
duncan_url = "http://www.sports-reference.com/cbb/players/tim-duncan-1.html"
response3 = requests.get(duncan_url)
# html3 = response3.text#.replace('<--', '').replace('-->', '')
html3 = response3.content
duncan_pq = pq(html3)

In [46]:
def get_table(pq_obj, table_id_selector):
    rows = pq_obj("{} > tbody > tr".format(table_id_selector))
    headers = pq_obj("{} > thead > tr > th".format(table_id_selector))
    # data from each row
    data = [[td.text_content() for td in row.iterchildren()] for row in rows]
    cols = [i.text_content() for i in headers]
    df = pd.DataFrame(data=data, columns=cols)
    return df

In [47]:
duncan_totals = get_table(duncan_pq, "#players_totals")

In [48]:
duncan_totals

In [49]:
duncan_pq("#players_per_game")

[<table#players_per_game.row_summable.sortable.stats_table>]

In [50]:
duncan_pq("table")

[<table#players_per_game.row_summable.sortable.stats_table>]

In [51]:
from bs4 import BeautifulSoup

In [52]:
soup = BeautifulSoup(html3, "lxml")

In [53]:
soup.select("#players_per_game > tbody > tr")[0]

<tr id="players_per_game.1994"><th class="left " data-stat="season" scope="row"><a href="/cbb/seasons/1994.html">1993-94</a></th><td class="left " data-stat="school_name"><a href="/cbb/schools/wake-forest/1994.html">Wake Forest</a></td><td class="left " data-stat="conf_abbr"><a href="/cbb/conferences/acc/1994.html">ACC</a></td><td class="right " data-stat="g">33</td><td class="right " data-stat="mp_per_g">30.2</td><td class="right " data-stat="fg_per_g">3.6</td><td class="right " data-stat="fga_per_g">6.7</td><td class="right " data-stat="fg_pct">.545</td><td class="right " data-stat="fg2_per_g">3.6</td><td class="right " data-stat="fg2a_per_g">6.6</td><td class="right " data-stat="fg2_pct">.543</td><td class="right " data-stat="fg3_per_g">0.0</td><td class="right " data-stat="fg3a_per_g">0.0</td><td class="right " data-stat="fg3_pct">1.000</td><td class="right " data-stat="ft_per_g">2.5</td><td class="right " data-stat="fta_per_g">3.3</td><td class="right " data-stat="ft_pct">.745</td

In [54]:
soup.select("#players_totals")

[]

In [55]:
soup2 = BeautifulSoup(html3, "html5lib")

In [56]:
soup2.select("#players_per_game > tbody > tr")[0]

<tr id="players_per_game.1994"><th class="left " data-stat="season" scope="row"><a href="/cbb/seasons/1994.html">1993-94</a></th><td class="left " data-stat="school_name"><a href="/cbb/schools/wake-forest/1994.html">Wake Forest</a></td><td class="left " data-stat="conf_abbr"><a href="/cbb/conferences/acc/1994.html">ACC</a></td><td class="right " data-stat="g">33</td><td class="right " data-stat="mp_per_g">30.2</td><td class="right " data-stat="fg_per_g">3.6</td><td class="right " data-stat="fga_per_g">6.7</td><td class="right " data-stat="fg_pct">.545</td><td class="right " data-stat="fg2_per_g">3.6</td><td class="right " data-stat="fg2a_per_g">6.6</td><td class="right " data-stat="fg2_pct">.543</td><td class="right " data-stat="fg3_per_g">0.0</td><td class="right " data-stat="fg3a_per_g">0.0</td><td class="right " data-stat="fg3_pct">1.000</td><td class="right " data-stat="ft_per_g">2.5</td><td class="right " data-stat="fta_per_g">3.3</td><td class="right " data-stat="ft_pct">.745</td

In [57]:
soup2.select("#players_per_game")

[<table class="row_summable sortable stats_table" data-cols-to-freeze="1" id="players_per_game"><caption>Per Game Table</caption>
    <colgroup><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col></colgroup>
    <thead>      
       <tr>
          <th aria-label="Season" class=" poptip sort_default_asc center" data-stat="season" scope="col">Season</th>
          <th aria-label="* = NCAA Tournament appearance" class=" poptip sort_default_asc center" data-stat="school_name" data-tip="* = NCAA Tournament appearance" scope="col">School</th>
          <th aria-label="Conference" class=" poptip sort_default_asc center" data-stat="conf_abbr" data-tip="Conference" scope="col">Conf</th>
          <th aria-label="Games" class=" poptip center" data-stat="g" data-tip=

In [58]:
soup3 = BeautifulSoup(html3, "html.parser")

In [59]:
len(soup3.find_all("tr"))

6

In [60]:
len(soup.find_all("table"))

1

In [61]:
d("#totals > tbody > tr")

[<tr#totals.2011.full_table>, <tr#totals.2012.full_table>, <tr#totals.2013.full_table>, <tr#totals.2014.full_table>, <tr#totals.2015.full_table>, <tr#totals.2016.full_table>, <tr#totals.2017.full_table>]