In [4]:
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

In [8]:
def get_stats(meta):
    '''
    returns a list of  
    language, published, status, date status, words, chapters, comments, kudos, bookmarks, hits
    '''
    categories = ['language', 'published', 'status', 'words', 'chapters', 'comments', 'kudos', 'bookmarks', 'hits'] 

    stats = list(map(lambda category: meta.find("dd", class_=category), categories))

    if not stats[2]:
        stats[2] = stats[1] #no explicit completed field -- one shot
    try:		
        stats = [unidecode(stat.text) for stat in stats]
    except AttributeError as e: #for some reason, AO3 sometimes miss stat tags (like hits)
        new_stats = []
        for stat in stats:
            if stat: new_stats.append(unidecode(stat.text))
            else: new_stats.append('null')
        stats = new_stats

    stats[0] = stats[0].rstrip().lstrip() #language has weird whitespace characters
    #add a custom completed/updated field
    status  = meta.find("dt", class_="status")
    if not status: status = 'Completed' 
    else: status = status.text.strip(':')
    stats.insert(2, status)

    return stats      

def get_tag_info(category, meta):
    '''
    given a category and a 'work meta group, returns a list of tags (eg, 'rating' -> 'explicit')
    '''
    try:
        tag_list = meta.find("dd", class_=str(category) + ' tags').find_all(class_="tag")
    except AttributeError as e:
        return []
    return [unidecode(result.text) for result in tag_list] 

def get_tags(meta):
    '''
    returns a list of lists, of
    rating, category, fandom, pairing, characters, additional_tags
    '''
    tags = ['rating', 'category', 'fandom', 'relationship', 'character', 'freeform']
    return list(map(lambda tag: get_tag_info(tag, meta), tags))

# get kudos
def get_kudos(meta):
    if (meta):
        users = []
        ## hunt for kudos' contents
        kudos = meta.contents

        # extract user names
        for kudo in kudos:
            if kudo.name == 'a':
                if 'more users' not in kudo.contents[0] and '(collapse)' not in kudo.contents[0]:
                    users.append(kudo.contents[0])
        
        return users
    return []

# get author(s)
def get_authors(meta):
    tags = meta.contents
    authors = []

    for tag in tags:
        if tag.name == 'a':
            authors.append(tag.contents[0])

    return authors

def access_denied(soup):
    if (soup.find(class_="flash error")):
        return True
    if (not soup.find(class_="work meta group")):
        return True
    return False

In [97]:
def scrapefic(fic_id):
    url = 'http://archiveofourown.org/works/'+str(fic_id)+'?view_adult=true&amp;view_full_work=true'
    print(url)
    req = requests.get(url)

    src = req.text
    soup = BeautifulSoup(src, 'html.parser')
    if (access_denied(soup)):
        print('Access Denied')
    else:
        meta = soup.find("dl", class_="work meta group")
        author = get_authors(soup.find("h3", class_="byline heading"))
        tags = get_tags(meta)
        stats = get_stats(meta)
        title = unidecode(soup.find("h2", class_="title heading").string).strip()

    #get the fic itself
    content = soup.find_all("div", class_ = "chapter")
    if not content:
        content = soup.find("div", id = "chapters")
    return content

In [101]:
mist = scrapefic(234222)
eviv = scrapefic(20049589)
dude = scrapefic(10057010)

http://archiveofourown.org/works/234222?view_adult=true&amp;view_full_work=true
http://archiveofourown.org/works/20049589?view_adult=true&amp;view_full_work=true
http://archiveofourown.org/works/10057010?view_adult=true&amp;view_full_work=true


In [106]:
mist.find("div", class_ = "userstuff").find_all("p")

[<p> </p>,
 <p>His cloak was soaked within minutes. Three Impervius Charms, several Drying Spells and one Conjured umbrella later, even his pants were wet.</p>,
 <p>Draco tugged at his hood to pull it as low as it would go so that at least his face would be spared from the merciless rain. He half-expected the cloud to fly downward, peek beneath the hood and spew rain into his eyes. At this point, it would not have surprised him. When Draco had Conjured the umbrella on his way to the hospital wing, the cloud had engulfed his head, filling his nose and mouth with fog cold as ghost, and Draco had panicked so fiercely he had Vanished the umbrella with a mere thought.</p>,
 <p>"At least there's no thunder," Goyle said bracingly, perched on one of the empty beds that was on a safe distance away from Draco and his rain-spitting cloud.</p>,
 <p>Draco groaned. He wished Goyle hadn't said that. It had sounded more prophetic than comforting. Not to mention the cloud seemed suspiciously sentient. 

In [104]:
eviv[0].find("div", class_ = "userstuff").find_all("p")

[<p>In the summer before his fifth year at Hogwarts, Harry found himself staring at a door that Hermione had said wouldn’t open for anyone.</p>,
 <p>After the Dementor fiasco, Harry arrived at Grimmauld Place, and Hermione and Ron filled Harry in on their summer. Harry hadn’t been happy with either of them; he’d much rather have been cleaning doxies and dark artifacts with his friends than weeding the Dursleys’ garden or washing the Dursleys’ floors for the third time in one week.</p>,
 <p>Even more irritating was the fact that their lack of communication with him had apparently been by Dumbledore’s orders. Dumbledore, who hadn’t even bothered to look him in the eye during his trial, had effectively commanded his friends to abandon him. Even more hurtful, they’d listened.</p>,
 <p>He’d made a half-assed attempt at hiding his irritation before his trial. After the trial, however, Dumbledore’s completely apathetic attitude towards Harry made him not bother holding it in any longer.</p>,


In [107]:
dude[0].find("div", class_ = "userstuff").find_all("p")

[<p>Saturday 7th August, 1971</p>,
 <p>He woke up in the dark. It was too hot in the little room they’d put him in, being early August. Though he supposed that could be the fever. He always had a high temperature, the morning after. They used to put him in a room with a window, but a few months ago he’d been able to smash one of them, and if it hadn’t had bars anyway then he’d have escaped. He’d heard them talking about restraining him as he got older. He tried not to think about it. </p>,
 <p>He remembered the feeling of hunger, so intense it transformed into rage. He remembered howling and keening for hours, circling the cell over and over again. Perhaps they’d let him off lessons today, and he could sleep. It was the summer holiday’s anyway, and not fair that he had to do lessons when all the other boys were allowed to spend all day dossing about, playing football or watching telly. Sitting up, he stretched carefully, paying attention to every ache and pop of his joints. There was a