In [1]:
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
def access_denied(soup):
    '''
    Tell whether a fetched page failed to deliver a readable work.

    The page counts as denied when it contains an element with class
    "flash error", or when it has no element with class "work meta group".
    Returns True when denied, False otherwise.
    '''
    # `or` short-circuits, so the second lookup only runs when no error
    # banner was found -- same evaluation order as two sequential checks.
    return bool(soup.find(class_="flash error")) or not soup.find(class_="work meta group")

In [3]:
def get_stats(meta):
    '''
    Extract the statistics listed in a work's meta block.

    meta: the parsed 'work meta group' element; anything exposing a
          BeautifulSoup-style find(name, class_=...) works.

    returns a list of strings, in this order:
    language, published, status, date status, words, chapters, comments,
    kudos, bookmarks, hits

    A stat whose <dd> tag is absent is reported as 'null'. When the page has
    no status <dd>, the published date doubles as the status date, and when
    it has no status <dt>, the status label defaults to 'Completed'.
    '''
    categories = ['language', 'published', 'status', 'words', 'chapters', 'comments', 'kudos', 'bookmarks', 'hits']

    tags = [meta.find("dd", class_=category) for category in categories]

    if not tags[2]:
        tags[2] = tags[1]  # no explicit completed field -- one shot

    # AO3 sometimes omits stat tags (like hits); handle that in a single pass
    # instead of catching AttributeError and converting everything twice.
    stats = ['null' if tag is None else unidecode(tag.text) for tag in tags]

    stats[0] = stats[0].strip()  # language has weird whitespace characters

    # add a custom completed/updated field
    status = meta.find("dt", class_="status")
    if not status:
        status = 'Completed'
    else:
        # Trim surrounding whitespace first: strip(':') on its own leaves the
        # colon in place whenever the label text is padded.
        status = status.text.strip().strip(':').strip()
    stats.insert(2, status)

    return stats

def get_tag_info(category, meta):
    '''
    List the tag texts of one tag category inside a 'work meta group'
    element (eg, 'rating' -> ['Explicit']).

    The tags are read from the <dd> whose class is "<category> tags".
    An empty list is returned when that lookup fails with AttributeError,
    e.g. because the <dd> is not present.
    '''
    dd_class = "{!s} tags".format(category)
    try:
        container = meta.find("dd", class_=dd_class)
        tag_nodes = container.find_all(class_="tag")
    except AttributeError:
        return []

    names = []
    for node in tag_nodes:
        names.append(unidecode(node.text))
    return names

def get_tags(meta):
    '''
    Gather every tag group of a work from its meta block.

    returns a list of six lists, in this order:
    rating, category, fandom, relationship (pairing), character,
    freeform (additional tags)
    '''
    tag_groups = ('rating', 'category', 'fandom', 'relationship', 'character', 'freeform')
    return [get_tag_info(group, meta) for group in tag_groups]

# get kudos
def get_kudos(meta):
    '''
    Collect the users listed in a kudos element.

    meta: the element whose direct children are scanned; a falsy value
          yields an empty list.

    returns the first child of every direct <a> child, skipping anchors
    whose first child contains 'more users' or '(collapse)'.
    '''
    if not meta:
        return []

    # anchors that expand/collapse the list rather than name a user
    control_markers = ('more users', '(collapse)')

    users = []
    for node in meta.contents:
        if node.name != 'a':
            continue
        label = node.contents[0]
        if any(marker in label for marker in control_markers):
            continue
        users.append(label)
    return users

# get author(s)
def get_authors(meta):
    '''
    Collect the author names linked from a byline element.

    meta: the element whose direct children are scanned. A falsy value
          (e.g. None when the element was not found) yields an empty list,
          matching get_kudos, instead of raising AttributeError.

    returns the first child of every direct <a> child that has content.
    '''
    if not meta:
        return []

    authors = []
    for node in meta.contents:
        # an anchor with no children has nothing to report; indexing it
        # would raise IndexError
        if node.name == 'a' and node.contents:
            authors.append(node.contents[0])

    return authors

In [4]:
def scrapefic(fic_id, timeout=30):
    '''
    Download one AO3 work page and return it parsed.

    fic_id:  numeric id of the work (anything str() can render).
    timeout: seconds to wait for the server before requests gives up;
             without it a stalled connection would hang the cell forever.

    The URL is printed before the request. 'Access Denied' is printed when
    access_denied() flags the page, but the parsed page is returned either
    way, so callers should expect lookups on it to come back empty.

    returns the BeautifulSoup document for the page.
    '''
    # Query parameters are joined with a plain '&'. The HTML entity '&amp;'
    # belongs in markup, not in a URL: a server that splits the query only
    # on '&' would see a parameter named 'amp;view_full_work' and ignore it.
    url = 'http://archiveofourown.org/works/' + str(fic_id) + '?view_adult=true&view_full_work=true'
    print(url)
    req = requests.get(url, timeout=timeout)

    soup = BeautifulSoup(req.text, 'html.parser')
    if access_denied(soup):
        print('Access Denied')

    return soup

In [5]:
# Fetch six works by id. Each call prints the URL it requests and returns the
# parsed page (see scrapefic), so this cell performs six network requests.
# Of these names, only `this` is read by a later cell in the visible notebook.
mist = scrapefic(234222)
eviv = scrapefic(20049589)
dude = scrapefic(10057010)
date = scrapefic(57465262)
soul = scrapefic(35109247)
this = scrapefic(31150925)

http://archiveofourown.org/works/234222?view_adult=true&amp;view_full_work=true
http://archiveofourown.org/works/20049589?view_adult=true&amp;view_full_work=true
http://archiveofourown.org/works/10057010?view_adult=true&amp;view_full_work=true
http://archiveofourown.org/works/57465262?view_adult=true&amp;view_full_work=true
http://archiveofourown.org/works/35109247?view_adult=true&amp;view_full_work=true
http://archiveofourown.org/works/31150925?view_adult=true&amp;view_full_work=true


In [6]:
# Select every <div> whose id attribute starts with "chapter-".
chapters = this.select("div[id^=chapter-]")

In [10]:
# Fetch one work, locate its <dl class="work meta group"> element and extract
# its stats.
# NOTE(review): find() returns None when that element is missing (the same
# condition that makes scrapefic print 'Access Denied'); get_stats(meta) would
# then fail on meta.find -- confirm whether a guard is wanted here.
first = scrapefic(30996401)
meta = first.find("dl", class_="work meta group")
stats = get_stats(meta)

http://archiveofourown.org/works/30996401?view_adult=true&amp;view_full_work=true


In [14]:
# Print each stat next to its position in the list returned by get_stats.
for i, stat in enumerate(stats):
    print(i, stat)

0 English
1 2021-04-30
2 Completed
3 2021-04-30
4 3,591
5 1/1
6 2
7 347
8 null
9 1,219


In [19]:
# Same fetch -> meta -> stats -> print sequence for another work.
# Rebinds the notebook-level names `meta` and `stats`; `test` is read by the
# chapter cells further down.
test = scrapefic(42027426)
meta = test.find("dl", class_="work meta group")
stats = get_stats(meta)
for i, stat in enumerate(stats):
    print(i, stat)

http://archiveofourown.org/works/42027426?view_adult=true&amp;view_full_work=true
0 English
1 2022-09-30
2 Updated
3 2024-03-25
4 947,233
5 38/?
6 69
7 263
8 71
9 8,743


In [20]:
# Rebind `chapters` to the <div>s of `test` whose id starts with "chapter-".
chapters = test.select("div[id^=chapter-]")

In [24]:
# Walk the chapter <div>s of `test`, pulling title, summary, notes, end notes
# and body text for each one, and print the end notes per chapter.
for i, chapter in enumerate(chapters):
    title = chapter.select_one(".title")
    if title: title = title.text

    # first chapter has extra info in different place
    if i == 0:
        summary = test.select_one(".summary.module")
        # NOTE(review): ".notes.module" also matches end-notes modules (they
        # carry both classes); if the page has no leading notes block this
        # picks up whatever comes first -- confirm this is acceptable.
        notes = test.select_one(".notes.module")
    else:
        summary = chapter.select_one("div[id=summary]")
        notes = chapter.select_one("div[id=notes]")
    if summary: summary = summary.text
    if notes: notes = notes.text

    # End notes are always looked up inside the chapter itself. Searching the
    # whole document for chapter 0 returned the first end-notes block found
    # anywhere on the page, i.e. possibly another chapter's notes.
    endnotes = chapter.select_one(".end.notes.module")
    if endnotes: endnotes = endnotes.text

    body = chapter.select_one(".userstuff.module")
    # a chapter without a body block yields empty text instead of raising
    lines = body.select("p") if body else []
    text = "\n".join(unidecode(line.text) for line in lines)

    print(i, endnotes)


0 
Notes:

This story is so massive I have to post it in multipole chapters!


1 None
2 None
3 None
4 None
5 None
6 None
7 None
8 None
9 None
10 None
11 None
12 None
13 None
14 None
15 None
16 None
17 None
18 None
19 None
20 None
21 None
22 None
23 None
24 None
25 None
26 
Notes:

This story is so massive I have to post it in multipole chapters!


27 None
28 None
29 None
30 None
31 None
32 
Notes:

AN:
Edit: 31 October 2019
I should be sleeping. I should be working on school work. But no. No. I just have to be on A Song of Ice & Fire kick. I know my muse is giggling their ass off at my expense, surrounded by friggin' plot-bunnies that she loves to fling at me unexpectedly when she thinks I have too much focus on one thing
I hope you're all happy.
Some Notes of Things Mentioned in the Chapter:
1: Aikido
Aikido... I made Ana Marino know this for a reason. The poor woman has to have a fighting fucking chance in terms of self-defense in Westeros, and if Cersei Lannister couldn't bitch and 