In [26]:
import json
import re
from urllib.parse import urlencode

import requests
import scrapy

In [27]:
# Page we will scrape. Note the rp_id query parameter: the same id shows up
# again in the JavaScript calls embedded in the page.
url = "http://goheels.com/roster.aspx?rp_id=9450"

In [28]:
# Request headers reused for every request in this notebook, so the site
# sees a descriptive User-Agent string
headers = {'User-Agent': 'UNC Journo Class'}

In [29]:
# Fetch the roster page. requests has no default timeout, so without one a
# stalled connection would hang the kernel indefinitely. raise_for_status()
# stops the notebook here on a 4xx/5xx response instead of letting the later
# cells try to parse an error page.
resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()

In [30]:
# Raw response body, still as bytes (not yet decoded to text)
body_bytes = resp.content

In [31]:
# Body is in bytes... turn it into a string
# (assumes the page is UTF-8 encoded -- decode() raises UnicodeDecodeError otherwise)
body_str = body_bytes.decode('utf-8')

In [32]:
# Build a regular expression that finds the JavaScript object literal passed
# to the roster-bio service call.
#
# The page contains JS like
# $.getJSON("/services/responsive-roster-bio.ashx", { type: 'stats', rp_id: 9450, path: 'baseball', year: 2017, player_id: 3759 }
# and the part we are after is the object:
# { type: 'stats', rp_id: 9450, path: 'baseball', year: 2017, player_id: 3759 }
#
# The pattern is written in three pieces (adjacent string literals are
# concatenated, so the compiled pattern is one string):
js_obj_rx = re.compile(
    r'.*?responsive-roster-bio\.ashx'  # lazily skip ahead to the .ashx service URL
    r'.*?'                             # then lazily skip to the first opening brace
    r'({.*?})'                         # capture the shortest {...} span that follows
)

In [33]:
# Now we know what leads up to the content we're looking for, so
# we split the text content on a token that precedes our
# regex match to reduce the text size. The [1:] drops everything
# that comes before the first occurrence of the token.
parts = body_str.split('$.getJSON("/services/')[1:]


In [34]:
# Now we run our regular expression on the joined
# parts (put them back into a single string)
# and capture the JS objects. Because the pattern has exactly one
# capture group, findall() returns just that group -- the {...} text --
# for every match.
captured = js_obj_rx.findall(''.join(parts))

In [35]:
# Note how similar the JS is to JSON... but the keys are unquoted and the
# strings use single quotes, so json.loads() would reject it as-is.
captured

["{ type: 'stats', rp_id: 9450, path: 'baseball', year: 2017, player_id: 3759 }",
 "{ type: 'related', rp_id: 9450, player_id: 3759 }"]

In [36]:
# Turn each captured JS object literal into a Python dictionary.
def _js_object_to_dict(obj_str):
    """Parse a flat JS object literal into a dict of strings.

    Example: "{ type: 'stats', rp_id: 9450 }" -> {'type': 'stats', 'rp_id': '9450'}

    Keys and values are returned as stripped strings; numbers are NOT
    converted. Only flat objects are supported, and a comma inside a quoted
    value is not handled (the objects captured from this page contain neither).
    """
    # Drop surrounding whitespace and the enclosing braces
    inner = obj_str.strip().strip('{}')
    parsed = {}
    # Now split apart on commas: [" type: 'stats'", " rp_id: 9450", ...]
    for pair in inner.split(','):
        # Split on the FIRST colon only, so a value may itself contain colons
        key, colon, value = pair.partition(':')
        if not colon:
            # Empty fragment (e.g. "{}" or a trailing comma) -- nothing to record
            continue
        # Trim whitespace, then any single/double quotes wrapping the token
        parsed[key.strip().strip('\'"')] = value.strip().strip('\'"')
    return parsed


clean_objs = []
for obj_str in captured:
    obj = _js_object_to_dict(obj_str)
    # We only want the stats object... check the parsed "type" field rather
    # than searching the raw text, so 'stats' appearing in some other value
    # cannot let the wrong object through.
    if obj.get('type') != 'stats':
        continue
    clean_objs.append(obj)

In [37]:
# Inspect the result: one dictionary, and every value is a string
clean_objs

[{'path': 'baseball',
  'player_id': '3759',
  'rp_id': '9450',
  'type': 'stats',
  'year': '2017'}]

In [38]:
# That was a lot of work! But now we can go get stats directly.
# Build the query string with urlencode() so every value is percent-escaped
# instead of being pasted into the URL as-is. The keys are listed explicitly
# to keep the parameters in the same order the page's own JS call uses.
stats_url = "http://goheels.com/services/responsive-roster-bio.ashx?" + urlencode(
    [(key, clean_objs[0][key]) for key in ("type", "rp_id", "path", "year", "player_id")]
)

In [39]:
# Check the assembled URL
stats_url

'http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=9450&path=baseball&year=2017&player_id=3759'

In [40]:
# Fetch the stats service response (this rebinds `resp`, replacing the roster
# page response). requests has no default timeout, so set one to avoid hanging
# the kernel, and fail here on a 4xx/5xx rather than parsing an error body below.
resp = requests.get(stats_url, headers=headers, timeout=30)
resp.raise_for_status()

In [41]:
# First attempt: hand the decoded response body straight to a Selector
sel = scrapy.Selector(text=resp.content.decode('utf-8'))

In [42]:
# Inspect what the Selector parsed
sel

<Selector xpath=None data='<html><body><p>{"current_stats":"\\r\\n\\t\\'>

In [43]:
# That looks weird... look closely at the data: html > body > p > JSON!
# The response body is JSON, not HTML, so the Selector wrapped the raw text in <html><body><p>.

In [44]:
# Look at the raw bytes: the payload is a JSON object whose "current_stats"
# value is a string of escaped HTML
resp.content

b'{"current_stats":"\\r\\n\\t\\t\\t<section>\\r\\n\\t\\t\\t\\t<h5>Hitting Statistics</h5>\\r\\n\\t\\t\\t\\t<div class=\\"sidearm-table-overflow-on-x-large\\">\\r\\n\\t\\t\\t\\t\\t<table class=\\"sidearm-table highlight-column-hover\\">\\r\\n\\t\\t\\t\\t\\t\\t<caption class=\\"hide\\">Greenfield, Aaron - Hitting Statistics</caption>\\r\\n\\t\\t\\t\\t\\t\\t<thead>\\r\\n\\t\\t\\t\\t\\t\\t\\t<tr>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\" class=\\"text-left\\">Date</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\" class=\\"text-left\\">Opponent</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\">W/L</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\">GS</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\">AB</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\">R</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\">H</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\">RBI</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=\\"col\\">2B</th>\\r\\n\\t\\t\\t\\t\\t\\t\\t\\t<th scope=

In [45]:
# The response is JSON, and the HTML fragment we want is the string stored
# under the "current_stats" key -- so parse the JSON and pull that value out
html = json.loads(resp.content.decode("utf-8"))["current_stats"]

In [46]:
# Build the Selector from the HTML fragment itself this time
sel = scrapy.Selector(text=html)

In [47]:
# Inspect again: the parsed data now starts at the <section> markup
sel

<Selector xpath=None data='<html><body><section>\r\n\t\t\t\t<h5>Hitting S'>

In [48]:
# That looks better!