In [None]:
import requests
from lxml import html, etree
from io import StringIO

In [None]:
# XPath that selects the href attribute of every <a> directly inside an <li>
# of the <ul class="ogn-childpages"> list.
xpath = "//ul[@class = 'ogn-childpages']/li/a/@href"

In [None]:
# Fetch the fey monster listing page. The timeout keeps this cell from hanging
# on a stalled connection, and raise_for_status() stops here on an HTTP error
# instead of silently parsing an error page into an empty URL list.
r = requests.get('https://www.d20pfsrd.com/bestiary/monster-listings/fey', timeout=30)
r.raise_for_status()
page = html.parse(StringIO(r.text))
# One href string per link matched by the `xpath` expression.
urls_to_scrape = page.xpath(xpath)

# Scraping the URLs

Now that I have a list of URLs to scrape, I'm going to request those pages, do some *light* parsing, and store them in MongoDB. After that, I'll do a second pass to extract the information I really want.

In [None]:
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
import time


# connect to the MongoDB instance listening on localhost:27017
client = MongoClient('localhost', 27017)
# `pathfinder` database; the `raw` collection holds the unparsed page content
db = client.pathfinder
raw_pages = db.raw


In [None]:
def get_page(url, retries = 5, timeout = 30):
    """Download ``url`` and return the response body as text.

    The request is attempted once and then retried up to ``retries`` more
    times, pausing one second before each retry. HTTP error statuses are not
    treated as failures: the body of whatever response arrives is returned.

    Parameters
    ----------
    url : str
        Address of the page to download.
    retries : int
        Number of additional attempts after the first one fails.
    timeout : float
        Seconds to wait for the server before an attempt counts as failed.

    Returns
    -------
    str or None
        The response text, or ``None`` when every attempt failed.
    """
    remaining = retries
    while True:
        try:
            return requests.get(url, timeout=timeout).text
        except requests.RequestException:
            # Only request-level failures are retried; KeyboardInterrupt and
            # programming errors propagate instead of being swallowed.
            if remaining <= 0:
                print("Could not get page. Continuing to next url")
                return None
            print("Error on request, retrying after 1 second.")
            time.sleep(1)
            remaining -= 1

In [None]:
# Unique index on "url": inserting a second document with the same url raises
# DuplicateKeyError, so each page is stored at most once.
raw_pages.create_index("url", unique=True)

In [None]:
def add_page_to_mongo(url, collection):
    """Fetch ``url`` and store its raw content in ``collection``.

    The document has the shape ``{"url": url, "content": <page text>}``.
    Failures are reported with ``print`` rather than raised, so a loop over
    many urls keeps going.

    Parameters
    ----------
    url : str
        Page to download with ``get_page``.
    collection
        Object exposing ``insert_one(document)``, e.g. a pymongo collection.

    Returns
    -------
    None
    """
    page_content = get_page(url)
    if page_content is None:
        # get_page returns None when the page could not be fetched. Storing
        # that would fill the url's slot with empty content and make a later
        # re-run report a duplicate instead of fetching the page again.
        print("No content for url {}, skipping".format(url))
        return
    try:
        collection.insert_one({"url": url, "content": page_content})
    except DuplicateKeyError:
        print("Duplicate key, moving on")
    except Exception as exc:
        print("Unexpected error for url {}: {}".format(url, exc))

In [None]:
# Try the fetch-and-store step on the first URL only before running the full loop.
add_page_to_mongo(urls_to_scrape[0], raw_pages)

In [None]:
# Fetch and store every listed URL. URLs already present in raw_pages are
# reported as duplicates by add_page_to_mongo and skipped.
for url in urls_to_scrape:
    add_page_to_mongo(url, raw_pages)

# Parsing the Pages

In [None]:
# Grab a single stored document to develop the parsing steps against.
page = raw_pages.find_one()

In [None]:
# Show which URL the sample document came from.
page['url']

In [None]:
# Parse the sample document's stored HTML text into an lxml tree.
page_tree = html.parse(StringIO(page["content"]))


In [None]:
# First <p class="title"> directly under <div class="statblock">; its text is
# used as the monster name.
e = page_tree.xpath("//div[@class='statblock']/p[@class='title']")[0]
e.text

In [None]:
# Start from the element right after the first statblock node whose text
# contains 'Senses', then walk the following siblings, dumping each one's
# text and tail followed by a blank line.
e = page_tree.xpath("//div[@class='statblock']//*[contains(text(),'Senses')]")[0].getnext()
while e is not None:
    print(e.text, e.tail, "", sep="\n")
    e = e.getnext()


In [None]:
def monster_parser(url):
    """Parse the stored raw page for ``url`` into a monster record.

    The page is looked up in ``raw_pages`` by its url and its stored
    ``content`` is parsed, so each url yields data from its own page.

    Parameters
    ----------
    url : str
        The url the raw page was stored under.

    Returns
    -------
    dict
        ``{"url": url, "name": <statblock title text>, "senses": {...}}``.
        ``senses`` maps the text of each sibling element following the
        'Senses' label to that element's tail text.

    Raises
    ------
    ValueError
        If ``raw_pages`` holds no document for ``url``.
    IndexError
        If the page has no statblock title or no 'Senses' label.
    """
    page = raw_pages.find_one({"url": url})
    if page is None:
        raise ValueError("No raw page stored for url {}".format(url))
    tree = html.parse(StringIO(page["content"]))

    # name
    name = tree.xpath("//div[@class='statblock']/p[@class='title']")[0].text

    # senses
    senses = {}
    e = tree.xpath("//div[@class='statblock']//*[contains(text(),'Senses')]")[0].getnext()
    while e is not None:
        # MongoDB document keys must be strings, so siblings without text
        # cannot be used as keys and are skipped.
        if e.text is not None:
            senses[e.text] = e.tail
        e = e.getnext()

    return {"url": url, "name": name, "senses": senses}


In [None]:
# Run the parser on one known URL and display the returned record.
monster_parser('https://www.d20pfsrd.com/bestiary/monster-listings/fey/alp/')

In [None]:
# Collection for parsed monster records, indexed on "url" (the field the
# upserts below filter on).
monsters = db.monsters
monsters.create_index('url')

In [None]:
# Upsert one parsed record: replace the document matching this url, or insert
# it when no such document exists yet.
url = 'https://www.d20pfsrd.com/bestiary/monster-listings/fey/alp/'
monsters.replace_one({"url": url}, monster_parser(url), upsert = True)

In [None]:
# Parse every listed URL and upsert the result into `monsters`, keyed by url.
for url in urls_to_scrape:
    monsters.replace_one({"url": url}, monster_parser(url), upsert = True)

In [None]:
# Display one stored monster document as a spot check.
monsters.find_one()