In [55]:
#Imports json, re (regex), requests, and scrapy
import json
import re

import requests
import scrapy

In [56]:
#headers will be a variable that stores our information when we make a request to goheels
headers = {'User-Agent': 'UNC Journo Class'}

In [57]:
#storing the URL we want to mine
base_url = 'http://goheels.com'
url = base_url + '/roster.aspx?path=mbball'

In [58]:
#storing the response, requested info from http://goheels.com/roster.aspx?path=mbball
resp = requests.get(url, headers=headers)

In [59]:
body_str = resp.content.decode('utf-8')

In [60]:
#defining a selector - focuses on 'body_str' tag on the css
sel = scrapy.Selector(text=body_str)

In [61]:
#selecting the first table
table = sel.css('table')[0]

In [62]:
#checking the table selection
table

<Selector xpath='descendant-or-self::table' data='<table class="sidearm-table sidearm-tabl'>

In [63]:
#selecting the column headings by the 'th' tag in css
cols = table.css('th').xpath('string()').extract()

In [64]:
#testing cols selection
cols

['No.', 'Name', 'Pos.', 'Ht.', 'Wt.', 'Yr.', 'Hometown / High School']

In [65]:
#rows are defined as index 1 and onwards (ignore first row)
#selected using 'tr' tag in css
rows = table.css('tr')[1:]

In [66]:
#where things get interesting!
#making a players list
#looping through rows, extracting and storing info from each column heading and corresponding string data
players = []
for r in rows:
    data = {}
    for i, d in enumerate(r.css('td')):
        a = d.css('a')
        if a:
            t = a.xpath('text()').extract()[0]
            data['href'] = a.xpath('@href').extract()[0]
        else:
            t = d.xpath('text()').extract()[0]
        data[cols[i]] = t
    players.append(data)

In [67]:
#testing to see that info was stored right in the players list
players

[{'Hometown / High School': 'Columbia, S.C. / Hammond School',
  'Ht.': '6-1',
  'Name': 'Seventh Woods',
  'No.': '0',
  'Pos.': 'G',
  'Wt.': '185',
  'Yr.': 'So.',
  'href': '/roster.aspx?rp_id=13521'},
 {'Hometown / High School': 'Greensboro, N.C. / Wesleyan Christian Academy',
  'Ht.': '6-6',
  'Name': 'Theo Pinson',
  'No.': '1',
  'Pos.': 'F/G',
  'Wt.': '220',
  'Yr.': 'Sr.',
  'href': '/roster.aspx?rp_id=13515'},
 {'Hometown / High School': 'Apopka, Fla. / Lake Highland Preparatory',
  'Ht.': '6-0',
  'Name': 'Joel Berry II',
  'No.': '2',
  'Pos.': 'G',
  'Wt.': '195',
  'Yr.': 'Sr.',
  'href': '/roster.aspx?rp_id=13508'},
 {'Hometown / High School': 'Guilderland, N.Y. / Northfield Mount Hermon School (Mass.)',
  'Ht.': '6-3',
  'Name': 'Andrew Platek',
  'No.': '3',
  'Pos.': 'G',
  'Wt.': '195',
  'Yr.': 'Fr.',
  'href': '/roster.aspx?rp_id=13528'},
 {'Hometown / High School': 'Douglasville, Ga. / Douglas County',
  'Ht.': '6-5',
  'Name': 'Brandon Robinson',
  'No.': '4',


In [68]:
#making a method that gets second-level information by entering a player page and then into the table with bios
def fetch_bio(player):
    player_url = base_url + player['href']
    print('Fetch bio', player_url)
    resp = requests.get(player_url, headers=headers)
    player_txt = resp.content.decode('utf-8')
    sel = scrapy.Selector(text=player_txt)
    player['sel'] = sel
    #extracting and storing bio and image
    player['bio'] = sel.css('#sidearm-roster-player-bio').xpath('string()').extract()[0]
    player['img'] = sel.css('.sidearm-roster-player-image img').xpath('@src').extract()[0]

In [69]:
#goes back into outer level?
js_obj_rx = re.compile(r'.*?responsive-roster-bio\.ashx.*?(?P<obj>{.*?})')

In [70]:
#making a new method to fetch stats, also triggering/"clicking" into the stats tab. 
def fetch_stats(player):
    text = player['sel'].xpath('string()').extract()[0]
    parts = text.split('$.getJSON("/services/')[1:]
    captured = js_obj_rx.findall(''.join(parts))
    clean_objs = []
    for obj_str in captured:
        # We only want the stats object...
        if 'stats' not in obj_str:
            continue
        #stripping curly braces, quote marks in each obj's extracted string
        obj_str = obj_str.replace('{', '').replace('}', '')
        obj_str = obj_str.replace("'", '').replace('"', '')
        #splitting data in obj_pairs by commas
        obj_pairs = obj_str.split(',')
        #splitting data in obj_pairs by colons
        obj_pairs = [x.split(":") for x in obj_pairs]
        clean_pairs = []
        #making a list to store clean data pairs
        #looping through every item in obj_pairs 
        for pair in obj_pairs:
            clean_pairs.append(['"{}"'.format(p.strip()) for p in pair])
        #joining back together the colon-separated data
        colonized = [":".join(p) for p in clean_pairs] 
        #joining back together the comma-separated data
        commas = ','.join(colonized)
        #making a variable that stores the data as a json string
        #will look just like (and be) json!
        json_str = "{" + commas + "}"
        #appends the clean_objs list with this json data
        clean_objs.append(json.loads(json_str))
    #storing everyone's unique URL for stats into the player list
    player['stats_url'] = stats_url = (
        "http://goheels.com/services/responsive-roster-bio.ashx?"
        "type={type}&rp_id={rp_id}&path={path}&year={year}"
        "&player_id={player_id}"
    ).format(**clean_objs[0])
    #once stats are fetched, print a statement saying 'Fetch stats' and the URL
    print('Fetch stats', stats_url)
    resp = requests.get(stats_url, headers=headers)
    json_stats = json.loads(resp.content.decode("utf-8"))
    #adding the stats into player list
    player['raw_stats'] = json_stats

In [71]:
#using fetch_bio and fetch_stats methods for every item in players
for p in players:
    fetch_bio(p)
    fetch_stats(p)

Fetch bio http://goheels.com/roster.aspx?rp_id=13521
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13521&path=mbball&year=2017&player_id=4736
Fetch bio http://goheels.com/roster.aspx?rp_id=13515
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13515&path=mbball&year=2017&player_id=4636
Fetch bio http://goheels.com/roster.aspx?rp_id=13508
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13508&path=mbball&year=2017&player_id=4632
Fetch bio http://goheels.com/roster.aspx?rp_id=13528
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13528&path=mbball&year=2017&player_id=5171
Fetch bio http://goheels.com/roster.aspx?rp_id=13516
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13516&path=mbball&year=2017&player_id=4733
Fetch bio http://goheels.com/roster.aspx?rp_id=13518
Fetch stats http://goheels.com/services/respons

In [72]:
#testing first player to see if it worked. it is indeed seventh woods and his data!
players[0]

{'Hometown / High School': 'Columbia, S.C. / Hammond School',
 'Ht.': '6-1',
 'Name': 'Seventh Woods',
 'No.': '0',
 'Pos.': 'G',
 'Wt.': '185',
 'Yr.': 'So.',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            FRESHMAN SEASON (2016-17)\r\n\r\nTied a school record by appearing in all 40 games • Averaged 7.7 minutes per game in the primary relief role behind Joel Berry II at the point • Had 49 assists, 42 turnovers and 21 steals • Had 25 assists and 13 turnovers in ACC regular-season play • Made two steals five times • Was on the floor for five minutes vs. Louisville as UNC out-scored the Cards, 10-2 • Played his bestgame in conference play at Duke when he scored four points and had a team-high four assists in eight minutes • Played a seven and a half minute stretch in the first half as UNC cut Duke’s lead from three to one • Was the first time he led UNC in assists since Long Beach State on 11/15 • Had t

In [73]:
#now we will test theo pinson's data for parsing the fetched bio/stats
p = [p for p in players if p['Name'] == 'Theo Pinson'][0]

In [74]:
#selecting these nodes from the data
txt = p['raw_stats']['career_stats']

In [75]:
sel = scrapy.Selector(text=txt)

In [76]:
sel.css('section')

[]

In [77]:

#[x.pop('stats') for x in players]

In [78]:
# new method to parse stats
def parse_stats(player):
    # new dictionary for stats
    stats = {}
    #defines the keys of the dictionary
    for raw_key, raw_val in player['raw_stats'].items():
        txt = player['raw_stats'][raw_key]
        if not txt:
            print('Skipping {} for {}'.format(raw_key, player['Name']))
            continue
        sel = scrapy.Selector(text=txt)
        # Get all the tables
        for section in sel.css('section'):
            #grab title of table
            title = section.css('h5').xpath('string()').extract()[0]
            #grab headings on the table
            cols = section.css('tr')[0].css('th').xpath('string()').extract()
            #printing table title and table headings
            print('NEW SECTION', title)
            print('COLS', cols)
            #making a new list for stats, will be modified below in local/temporary loop 
            #accessible after looping
            these_stats = []
            #printing rows
            print('TRS', section.css('tr'))
            #looping through every row
            for r in section.css('tr')[0:]:
                print('row', r.xpath('string()').extract()[0].replace('\r', '').replace('\n', '').strip())
                # new dictionary s that will be made every time we loop through
                # will store year and stats for each item
                s = {}
                #using enumeration for every cell in the row, starting from index 1
                #tried multiple ways, usually get mismatch error.
                # when I get the matching right, it will give an index out of range error
                #thus i use exception handling for IndexError
                for i, d in enumerate(r.css('td'), 1):
                    try:
                        s[cols[i].lower()] = d.xpath('string()').extract_first()
                    except IndexError:
                        continue
                yr = r.css('th').xpath('string()')
                #is supposed to get the year, but again because of one-off error, grabs the opponent
                if yr:
                    yr = yr.extract()[0]
                    if yr.lower() in ('total', 'season'):
                        print('SKIPPING...')
                        continue
                    print('THE YR IS', yr)
                    #adds the year to the temporary loop-only dictionary
                    s['year'] = yr
                #appending the overall these_stats item with everyone's stats after every time looping through
                these_stats.append(s)
                #prints the stats
                print('THE STATS ARE', these_stats)
            #stores existing stats and places them into the previous/overarching stats object
            #uses temporary object to help
            existing = stats.get(raw_key, {})
            existing[title] = these_stats
            stats[raw_key] = existing
    #finally amends each player's 'stats' field with the stats dictionary object that has key-value pairs! incredible!
    player['stats'] = stats
    print('ADDING STATS', player['stats'])

In [79]:
#numbering mismatch might be happening here, but I've experimented...
# - changing index to 0 or back to 1
# - adjusting index of cols[i] to i-1, i+1...
# - .extract()[0] vs .extract_first()
# finally did exception handling for IndexError
# AFTER various combinations of above, still have mismatch for year and opponent

In [80]:
#testing parsing on Theo
#everything in "THE STATS ARE" is printed out correctly
#mismatched year/opponent
p = [p for p in players if p['Name'] == 'Theo Pinson'][0]
parse_stats(p)

NEW SECTION Game-By-Game Statistics
COLS ['Date', 'Opponent', 'GS', 'MIN', 'FGM/A', '%', '3FG/A', '%', 'FTM/A', '%', 'OFF', 'DEF', 'TOT', 'AVG', 'PF', 'AST', 'T/O', 'BLK', 'STL', 'PTS', 'AVG']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <th scope='>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>       

row Assists                        11                        vs. Miami
THE STATS ARE [{'year': 'Statistic'}, {'value': 'Points', 'opponent': '25'}, {'value': 'Minutes', 'opponent': '39'}, {'value': 'Field Goals Made', 'opponent': '10'}, {'value': 'Field Goal Attempts', 'opponent': '16'}, {'value': '3-Point Field Goals Made', 'opponent': '2'}, {'value': '3-Point Field Goal Attempts', 'opponent': '6'}, {'value': 'Free Throws Made', 'opponent': '10'}, {'value': 'Free Throw Attempts', 'opponent': '10'}, {'value': 'Rebounds', 'opponent': '15'}, {'value': 'Assists', 'opponent': '11'}]
row Blocks                        3                        vs. Clemson
THE STATS ARE [{'year': 'Statistic'}, {'value': 'Points', 'opponent': '25'}, {'value': 'Minutes', 'opponent': '39'}, {'value': 'Field Goals Made', 'opponent': '10'}, {'value': 'Field Goal Attempts', 'opponent': '16'}, {'value': '3-Point Field Goals Made', 'opponent': '2'}, {'value': '3-Point Field Goal Attempts', 'opponent': '6'}, {'value': 

In [81]:
#time to parse everyone!
for p in players:
    parse_stats(p)

NEW SECTION Game-By-Game Statistics
COLS ['Date', 'Opponent', 'GS', 'MIN', 'FGM/A', '%', '3FG/A', '%', 'FTM/A', '%', 'OFF', 'DEF', 'TOT', 'AVG', 'PF', 'AST', 'T/O', 'BLK', 'STL', 'PTS', 'AVG']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <th scope='>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>       

row 11/20/17                            at STANFORD                            *                            29                            4-8                            .500                            0-1                            .000                            1-2                            .500                            1                            4                            5                            4.3                            1                            2                            2                            0                            0                            9                            10.3
THE YR IS at STANFORD
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '*', 'min': '26', 'fgm/a': '1-6', '%': '.000', '3fg/a': '1-6', 'ftm/a': '0-0', 'off': '0', 'def': '3', 'tot': '3', 'avg': '3.0', 'pf': '1', 'ast': '5', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '3', 'year': ' Northern Iowa'}, {'opponent': '11/15/17', 'gs': '*', 'min': '33', 'fgm/a': '5-12

THE YR IS at Duke
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '*', 'min': '26', 'fgm/a': '1-6', '%': '.000', '3fg/a': '1-6', 'ftm/a': '0-0', 'off': '0', 'def': '3', 'tot': '3', 'avg': '3.0', 'pf': '1', 'ast': '5', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '3', 'year': ' Northern Iowa'}, {'opponent': '11/15/17', 'gs': '*', 'min': '33', 'fgm/a': '5-12', '%': '.900', '3fg/a': '0-3', 'ftm/a': '9-10', 'off': '0', 'def': '5', 'tot': '5', 'avg': '11.0', 'pf': '3', 'ast': '6', 't/o': '2', 'blk': '0', 'stl': '1', 'pts': '19', 'year': ' Bucknell'}, {'opponent': '11/20/17', 'gs': '*', 'min': '29', 'fgm/a': '4-8', '%': '.500', '3fg/a': '0-1', 'ftm/a': '1-2', 'off': '1', 'def': '4', 'tot': '5', 'avg': '10.3', 'pf': '1', 'ast': '2', 't/o': '2', 'blk': '0', 'stl': '0', 'pts': '9', 'year': 'at STANFORD'}, {'opponent': '11/23/17', 'gs': '*', 'min': '25', 'fgm/a': '4-7', '%': '.500', '3fg/a': '1-3', 'ftm/a': '2-4', 'off': '2', 'def': '7', 'tot': '9', 'avg': '10.5', 'pf': '1', 'ast':

THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '*', 'min': '26', 'fgm/a': '1-6', '%': '.000', '3fg/a': '1-6', 'ftm/a': '0-0', 'off': '0', 'def': '3', 'tot': '3', 'avg': '3.0', 'pf': '1', 'ast': '5', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '3', 'year': ' Northern Iowa'}, {'opponent': '11/15/17', 'gs': '*', 'min': '33', 'fgm/a': '5-12', '%': '.900', '3fg/a': '0-3', 'ftm/a': '9-10', 'off': '0', 'def': '5', 'tot': '5', 'avg': '11.0', 'pf': '3', 'ast': '6', 't/o': '2', 'blk': '0', 'stl': '1', 'pts': '19', 'year': ' Bucknell'}, {'opponent': '11/20/17', 'gs': '*', 'min': '29', 'fgm/a': '4-8', '%': '.500', '3fg/a': '0-1', 'ftm/a': '1-2', 'off': '1', 'def': '4', 'tot': '5', 'avg': '10.3', 'pf': '1', 'ast': '2', 't/o': '2', 'blk': '0', 'stl': '0', 'pts': '9', 'year': 'at STANFORD'}, {'opponent': '11/23/17', 'gs': '*', 'min': '25', 'fgm/a': '4-7', '%': '.500', '3fg/a': '1-3', 'ftm/a': '2-4', 'off': '2', 'def': '7', 'tot': '9', 'avg': '10.5', 'pf': '1', 'ast': '7', 't/o': '3', 

NEW SECTION Season Highs
COLS ['Statistic', 'Value', 'Opponent']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                    <th class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Points'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Minute'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Field '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Field '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>3-Poin'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>3-Poin'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Free T'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Free T'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Reboun'>, 

THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '', 'min': '10', 'fgm/a': '1-1', '%': '1.000', '3fg/a': '0-0', 'ftm/a': '2-2', 'off': '0', 'def': '0', 'tot': '0', 'avg': '4.0', 'pf': '0', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '4', 'year': ' Northern Iowa'}, {'opponent': '11/15/17', 'gs': '', 'min': '14', 'fgm/a': '0-1', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-2', 'off': '1', 'def': '2', 'tot': '3', 'avg': '2.0', 'pf': '0', 'ast': '1', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '0', 'year': ' Bucknell'}, {'opponent': '11/20/17', 'gs': '', 'min': '14', 'fgm/a': '1-3', '%': '.000', '3fg/a': '0-1', 'ftm/a': '0-0', 'off': '1', 'def': '0', 'tot': '1', 'avg': '2.0', 'pf': '1', 'ast': '0', 't/o': '1', 'blk': '1', 'stl': '1', 'pts': '2', 'year': 'at STANFORD'}, {'opponent': '11/23/17', 'gs': '', 'min': '10', 'fgm/a': '4-5', '%': '1.000', '3fg/a': '2-2', 'ftm/a': '2-2', 'off': '1', 'def': '1', 'tot': '2', 'avg': '4.5', 'pf': '0', 'ast': '1', 't/o': '0', 'blk': '

THE YR IS at Notre Dame
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '', 'min': '16', 'fgm/a': '0-2', '%': '1.000', '3fg/a': '0-1', 'ftm/a': '2-2', 'off': '0', 'def': '2', 'tot': '2', 'avg': '2.0', 'pf': '1', 'ast': '3', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '2', 'year': ' Northern Iowa'}, {'opponent': '11/23/17', 'gs': '', 'min': '12', 'fgm/a': '1-4', '%': '.000', '3fg/a': '1-2', 'ftm/a': '0-0', 'off': '1', 'def': '2', 'tot': '3', 'avg': '2.5', 'pf': '2', 'ast': '2', 't/o': '1', 'blk': '1', 'stl': '1', 'pts': '3', 'year': 'vs Portland'}, {'opponent': '11/24/17', 'gs': '', 'min': '15', 'fgm/a': '2-4', '%': '1.000', '3fg/a': '0-0', 'ftm/a': '1-1', 'off': '2', 'def': '4', 'tot': '6', 'avg': '3.3', 'pf': '2', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '5', 'year': 'vs Arkansas'}, {'opponent': '11/26/17', 'gs': '', 'min': '10', 'fgm/a': '0-1', '%': '.000', '3fg/a': '0-1', 'ftm/a': '0-0', 'off': '0', 'def': '0', 'tot': '0', 'avg': '2.5', 'pf': '0', 'ast':

THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '', 'min': '4', 'fgm/a': '0-2', '%': '.000', '3fg/a': '0-1', 'ftm/a': '0-0', 'off': '0', 'def': '0', 'tot': '0', 'avg': '0.0', 'pf': '0', 'ast': '0', 't/o': '0', 'blk': '1', 'stl': '0', 'pts': '0', 'year': ' Northern Iowa'}, {'opponent': '11/20/17', 'gs': '', 'min': '2', 'fgm/a': '0-0', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '0', 'def': '1', 'tot': '1', 'avg': '0.0', 'pf': '0', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '0', 'year': 'at STANFORD'}, {'opponent': '11/23/17', 'gs': '', 'min': '2', 'fgm/a': '1-2', '%': '.000', '3fg/a': '1-1', 'ftm/a': '0-0', 'off': '0', 'def': '0', 'tot': '0', 'avg': '1.0', 'pf': '1', 'ast': '0', 't/o': '1', 'blk': '0', 'stl': '0', 'pts': '3', 'year': 'vs Portland'}, {'opponent': '11/26/17', 'gs': '', 'min': '1', 'fgm/a': '0-0', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '0', 'def': '0', 'tot': '0', 'avg': '0.8', 'pf': '0', 'ast': '0', 't/o': '0', 'blk': '0', 

row 03/08/18                            vs Miami                            *                            31                            3-8                            .375                            1-4                            .250                            6-7                            .857                            5                            2                            7                            4.7                            1                            5                            0                            1                            1                            13                            13.2
THE YR IS vs Miami
THE STATS ARE [{'year': 'Date'}, {'opponent': '12/20/17', 'gs': '', 'min': '17', 'fgm/a': '1-5', '%': '.875', '3fg/a': '1-5', 'ftm/a': '7-8', 'off': '1', 'def': '2', 'tot': '3', 'avg': '10.0', 'pf': '0', 'ast': '1', 't/o': '0', 'blk': '0', 'stl': '1', 'pts': '10', 'year': ' Wofford'}, {'opponent': '12/23/17', 'gs': '', 'min': '23', 'fgm/a': '4-7', '%': '1.0

row 12/06/17                             Western Carolina                            *                            20                            4-6                            .667                            0-0                            .000                            1-4                            .250                            1                            6                            7                            4.4                            2                            2                            1                            1                            0                            9                            6.2
THE YR IS  Western Carolina
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '*', 'min': '18', 'fgm/a': '5-6', '%': '1.000', '3fg/a': '0-0', 'ftm/a': '4-4', 'off': '3', 'def': '3', 'tot': '6', 'avg': '14.0', 'pf': '2', 'ast': '2', 't/o': '3', 'blk': '0', 'stl': '0', 'pts': '14', 'year': ' Northern Iowa'}, {'opponent': '11/15/17', 'gs': '*', 'min': '11', 

row 01/03/18                            at Florida State                                                        15                            2-3                            .667                            0-0                            .000                            2-2                            1.000                            4                            1                            5                            4.9                            0                            1                            0                            0                            0                            6                            6.0
THE YR IS at Florida State
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '', 'min': '14', 'fgm/a': '3-4', '%': '.600', '3fg/a': '0-0', 'ftm/a': '3-5', 'off': '3', 'def': '5', 'tot': '8', 'avg': '9.0', 'pf': '0', 'ast': '1', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '9', 'year': ' Northern Iowa'}, {'opponent': '11/15/17', 'gs': '', 'min': '17', 'fgm/a'

row 11/26/17                            vs Michigan State                                                        1                            0-0                            .000                            0-0                            .000                            0-0                            .000                            0                            1                            1                            0.5                            1                            0                            1                            0                            0                            0                            0.0
THE YR IS vs Michigan State
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '', 'min': '5', 'fgm/a': '0-2', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '0', 'def': '0', 'tot': '0', 'avg': '0.0', 'pf': '2', 'ast': '1', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '0', 'year': ' Northern Iowa'}, {'opponent': '11/20/17', 'gs': '', 'min': '2', 'fgm/a': 

row 01/06/18                            at Virginia                            *                            32                            4-9                            .444                            3-6                            .500                            0-0                            .000                            1                            3                            4                            3.6                            0                            3                            0                            0                            1                            11                            12.7
THE YR IS at Virginia
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '*', 'min': '28', 'fgm/a': '3-8', '%': '1.000', '3fg/a': '2-4', 'ftm/a': '2-2', 'off': '0', 'def': '1', 'tot': '1', 'avg': '10.0', 'pf': '2', 'ast': '5', 't/o': '1', 'blk': '0', 'stl': '3', 'pts': '10', 'year': ' Northern Iowa'}, {'opponent': '11/15/17', 'gs': '*', 'min': '29', 'fgm/a': '

THE YR IS  Clemson
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '', 'min': '2', 'fgm/a': '0-0', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '0', 'def': '2', 'tot': '2', 'avg': '0.0', 'pf': '0', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '0', 'year': ' Northern Iowa'}, {'opponent': '11/20/17', 'gs': '', 'min': '2', 'fgm/a': '1-1', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '0', 'def': '0', 'tot': '0', 'avg': '1.0', 'pf': '0', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '2', 'year': 'at STANFORD'}, {'opponent': '11/23/17', 'gs': '', 'min': '2', 'fgm/a': '0-0', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '0', 'def': '1', 'tot': '1', 'avg': '0.7', 'pf': '0', 'ast': '1', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '0', 'year': 'vs Portland'}, {'opponent': '11/26/17', 'gs': '', 'min': '2', 'fgm/a': '0-0', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '1', 'def': '1', 'tot': '2', 'avg': '0.5', 'pf': '0', 'ast': '0', 't/o'

NEW SECTION Season Highs
COLS ['Statistic', 'Value', 'Opponent']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                    <th class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Points'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Minute'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Field '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Field '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>3-Poin'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>3-Poin'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Free T'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Free T'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Reboun'>, 

THE YR IS at Duke
THE STATS ARE [{'year': 'Date'}, {'opponent': '11/10/17', 'gs': '', 'min': '9', 'fgm/a': '2-4', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '0', 'def': '1', 'tot': '1', 'avg': '4.0', 'pf': '2', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '4', 'year': ' Northern Iowa'}, {'opponent': '11/15/17', 'gs': '', 'min': '4', 'fgm/a': '2-2', '%': '.000', '3fg/a': '0-0', 'ftm/a': '0-0', 'off': '1', 'def': '1', 'tot': '2', 'avg': '4.0', 'pf': '1', 'ast': '0', 't/o': '0', 'blk': '1', 'stl': '0', 'pts': '4', 'year': ' Bucknell'}, {'opponent': '11/20/17', 'gs': '', 'min': '9', 'fgm/a': '2-2', '%': '1.000', '3fg/a': '0-0', 'ftm/a': '4-4', 'off': '1', 'def': '4', 'tot': '5', 'avg': '5.3', 'pf': '4', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '0', 'pts': '8', 'year': 'at STANFORD'}, {'opponent': '11/23/17', 'gs': '', 'min': '9', 'fgm/a': '2-4', '%': '.500', '3fg/a': '0-0', 'ftm/a': '1-2', 'off': '3', 'def': '0', 'tot': '3', 'avg': '5.3', 'pf': '1', 'ast': '0', 't/o': 

In [82]:
#printing theo's stats
[p for p in players if p['Name'] == 'Theo Pinson'][0]['stats']

{'current_stats': {'Game-By-Game Statistics': [{'year': 'Date'},
   {'%': '.000',
    '3fg/a': '1-6',
    'ast': '5',
    'avg': '3.0',
    'blk': '0',
    'def': '3',
    'fgm/a': '1-6',
    'ftm/a': '0-0',
    'gs': '*',
    'min': '26',
    'off': '0',
    'opponent': '11/10/17',
    'pf': '1',
    'pts': '3',
    'stl': '0',
    't/o': '0',
    'tot': '3',
    'year': ' Northern Iowa'},
   {'%': '.900',
    '3fg/a': '0-3',
    'ast': '6',
    'avg': '11.0',
    'blk': '0',
    'def': '5',
    'fgm/a': '5-12',
    'ftm/a': '9-10',
    'gs': '*',
    'min': '33',
    'off': '0',
    'opponent': '11/15/17',
    'pf': '3',
    'pts': '19',
    'stl': '1',
    't/o': '2',
    'tot': '5',
    'year': ' Bucknell'},
   {'%': '.500',
    '3fg/a': '0-1',
    'ast': '2',
    'avg': '10.3',
    'blk': '0',
    'def': '4',
    'fgm/a': '4-8',
    'ftm/a': '1-2',
    'gs': '*',
    'min': '29',
    'off': '1',
    'opponent': '11/20/17',
    'pf': '1',
    'pts': '9',
    'stl': '0',
    't/o': 

In [83]:
#exporting json for the data we just stripped and stored
#copies each player as "p" (~creating dupe of players list called to_dump)
to_dump = [p.copy() for p in players]
#loops through each player
#"pops" - grabbing each item/data from top
for p in to_dump:
    p.pop('sel')
    for k in list(p.keys()):
        if 'raw' in k:
            p.pop(k)
#makes/opens file scraped_players.json and dumps data into it
with open('scraped_players.json', 'w') as f:
    json.dump(to_dump, f)
    
#open and preview the first bit of the json with cat below

In [84]:
cat scraped_players.json | cut -c 1-100


[{"No.": "0", "href": "/roster.aspx?rp_id=13521", "Name": "Seventh Woods", "Pos.": "G", "Ht.": "6-1"


In [85]:
#view all of the first player's data
to_dump[0]

{'Hometown / High School': 'Columbia, S.C. / Hammond School',
 'Ht.': '6-1',
 'Name': 'Seventh Woods',
 'No.': '0',
 'Pos.': 'G',
 'Wt.': '185',
 'Yr.': 'So.',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            FRESHMAN SEASON (2016-17)\r\n\r\nTied a school record by appearing in all 40 games • Averaged 7.7 minutes per game in the primary relief role behind Joel Berry II at the point • Had 49 assists, 42 turnovers and 21 steals • Had 25 assists and 13 turnovers in ACC regular-season play • Made two steals five times • Was on the floor for five minutes vs. Louisville as UNC out-scored the Cards, 10-2 • Played his bestgame in conference play at Duke when he scored four points and had a team-high four assists in eight minutes • Played a seven and a half minute stretch in the first half as UNC cut Duke’s lead from three to one • Was the first time he led UNC in assists since Long Beach State on 11/15 • Had t