In [1]:
from bs4 import BeautifulSoup
import requests
from time import sleep

In [12]:
# returns the ID of the comment; the parsed fields are only printed for now (DB write is still a TODO)
def get_single_comment(ficid, comment, parentID):
    """Parse one comment element, print its fields, and return its id.

    Args:
        ficid: Identifier of the work; copied unchanged into the printed record.
        comment: Parsed element for one comment (a BeautifulSoup tag). Its
            ``id`` attribute must contain an underscore; the piece after the
            first underscore is used as the comment id.
        parentID: Id of the comment this one replies to; copied unchanged
            into the printed record.

    Returns:
        The comment id as a string. It is returned for header-less
        (probably deleted) comments as well, without printing a record.
    """
    print("Single comment")

    # Get comment id: second "_"-separated piece of the element id
    commentid = comment['id'].split('_')[1]

    # if no header, probably a deleted comment
    byline = comment.find('h4', class_='heading byline')
    if byline is None:
        print("Deleted comment, passing...")
        return commentid

    # Find username: the linked name when the byline has an <a>,
    # otherwise the text of the byline's first <span>
    userlink = byline.find('a')
    if userlink:
        username = userlink.contents[0]
    else:
        username = byline.find("span").text

    # get chapter number
    # each comment has header "username on Chapter x" so take the last whitespace-separated token
    # (split() with no argument also tolerates trailing spaces/newlines in the header text)
    # if single chap fic, no header, so default to 1
    header = comment.find("span", class_="parent")
    if header:
        chapternumber = int(header.text.split()[-1])
    else:
        chapternumber = 1

    # Get datetime: every child tag of the "posted datetime" span holds one
    # field, stored under the tag's first CSS class
    posted = byline.find('span', class_="posted datetime")
    dateObj = {}
    for item in posted.contents:
        # Text nodes are str subclasses in BeautifulSoup; skip all of them,
        # whatever whitespace they contain, and keep only real tags
        if isinstance(item, str):
            continue
        dateObj[item['class'][0]] = item.contents[0]

    print(f'{dateObj["year"]}-{dateObj["month"]}-{dateObj["date"]}')

    # Get direct comment text: one line per <p>, with <br/> turned into newlines
    paragraphs = [
        str(p).replace("<br/>", "\n").replace("<p>", "").replace("</p>", "")
        for p in comment.find_all("p")
    ]
    text = "\n".join(paragraphs)

    # print out comment data
    # TODO: enter data in SQL DB

    commentData = (ficid, chapternumber, username, dateObj, commentid, parentID, text)
    print(commentData)
    return commentid

In [13]:
# recursively loops over thread of comments
def get_comment_thread(ficid, thread, parentID):
    """Walk one comment thread, handling each entry and recursing into replies.

    Each direct ``<li>`` child of ``thread`` is classified by its attributes:

    * attrs exactly ``{'class': ['comment']}``: a collapsed thread. The first
      link inside it is fetched, parsed, and the ``<ol class="thread">`` of
      the fetched page is walked recursively.
    * any other non-empty attrs: a single comment, passed to
      ``get_single_comment`` with ``parentID`` as its parent.
    * no attrs: a wrapper whose ``<ol>`` holds replies to the most recent
      single comment; it is walked recursively.

    Args:
        ficid: Identifier of the work; passed through to the helpers.
        thread: Parsed ``<ol>`` element to walk, or ``None`` when the caller
            found no such element (the call is then a no-op).
        parentID: Id to record as the parent of the single comments found
            directly in this thread.

    Returns:
        None. If fetching a collapsed thread answers with a status of 400 or
        above, an error is printed and the function returns early, so the
        remaining entries of this thread are not processed.
    """
    # The recursive calls below pass whatever find()/findChild() returned,
    # which is None when the expected <ol> is missing; nothing to walk then
    if thread is None:
        print("Empty thread, skipping...")
        return

    print("comment thread")

    # get individual comments
    comments = thread.findChildren("li", recursive=False)

    # track what id the parent comment should be
    newParentID = parentID
    for comment in comments:
        # if only attr is class=comment, it's a collapsed thread we need to open
        if comment.attrs == {'class': ['comment']}:
            url = "http://archiveofourown.org" + comment.find("a")["href"]
            # retry for as long as the server answers 429, waiting a minute between tries
            status = 429
            while 429 == status:
                req = requests.get(url)
                status = req.status_code
                if 429 == status:
                    print("Request answered with Status-Code 429")
                    print("Trying again in 1 minute...")
                    sleep(60)
            # for other errors, halt scraping
            if 400 <= status:
                print("Error:", status, ", halting scraping on fic", ficid)
                return
            soup = BeautifulSoup(req.text, 'html.parser')
            get_comment_thread(ficid, soup.find("ol", class_="thread"), newParentID)

        # if comments has attrs, it's a single comment
        elif comment.attrs != {}:
            # if we encounter a thread, that thread's parent will always be the most recent single comment
            # so maintain the ID of most recent single comment as parent ID
            newParentID = get_single_comment(ficid, comment, parentID)

        # if no attrs, it's a thread -- meaning it is a child of the previous comment
        else:
            get_comment_thread(ficid, comment.findChild("ol"), newParentID)

In [14]:
def get_comment_page(ficid, page_num):
    """Fetch one page of a work's comments and walk its comment thread.

    Args:
        ficid: Identifier of the work; inserted into the request URL.
        page_num: Comment page number, sent as the ``page`` query parameter.

    Returns:
        None. Prints a message and returns early when the request answers
        with a status of 400 or above, or when the page contains no
        ``<ol class="thread">`` element.
    """
    # Query parameters are joined with a plain "&"; an HTML-escaped "&amp;"
    # would send "amp;view_full_work" instead of "view_full_work"
    url = ('http://archiveofourown.org/works/' + str(ficid)
           + '?view_adult=true&view_full_work=true&show_comments=true&page='
           + str(page_num))
    print("URL:", url)

    # retry for as long as the server answers 429, waiting a minute between tries
    status = 429
    while 429 == status:
        req = requests.get(url)
        status = req.status_code
        if 429 == status:
            print("Request answered with Status-Code 429")
            print("Trying again in 1 minute...")
            sleep(60)
    # for other errors, report and skip this page
    if 400 <= status:
        print("Error:", status, ", halting scraping on page", page_num)
        return

    soup = BeautifulSoup(req.text, 'html.parser')

    thread = soup.find('ol', class_='thread')
    if not thread:
        print(f"Fic {ficid} has no comments on page {page_num}")
        return

    # top-level comments have no parent, recorded as parent id 0
    get_comment_thread(ficid, thread, 0)

In [15]:
def get_all_comments(ficid):
    """Scrape every comment page of a work.

    Fetches the work's comment view once to find out how many comment pages
    exist, then calls ``get_comment_page`` for each page number from 1 up.

    Args:
        ficid: Identifier of the work; inserted into the request URL.

    Returns:
        None. Prints an error and returns early when the initial request
        answers with a status of 400 or above.
    """
    # Query parameters are joined with a plain "&"; an HTML-escaped "&amp;"
    # would send "amp;view_full_work" instead of "view_full_work"
    url = ('http://archiveofourown.org/works/' + str(ficid)
           + '?view_adult=true&view_full_work=true&show_comments=true')

    # retry for as long as the server answers 429, waiting a minute between tries
    status = 429
    while 429 == status:
        req = requests.get(url)
        status = req.status_code
        if 429 == status:
            print("Request answered with Status-Code 429")
            print("Trying again in 1 minute...")
            sleep(60)
    # for other errors, report and stop
    if 400 <= status:
        print("Error:", status, ", halting scraping on fic", ficid)
        return

    soup = BeautifulSoup(req.text, 'html.parser')

    # check to see if enough comments for multiple pages
    pagination = soup.find('ol', class_='pagination actions')
    if pagination:
        # get max page num from the second-to-last pagination item
        # NOTE(review): presumably the last item is a "next" link -- confirm against the page markup
        numpages = int(pagination.findChildren("li", recursive=False)[-2].text)
        # get comments for each page
        for page_num in range(1, numpages + 1):
            get_comment_page(ficid, page_num)

    # if only one page of comments
    else:
        get_comment_page(ficid, 1)

In [16]:
# Scrape all comment pages of work 10057010; each parsed comment is printed as it is found
get_all_comments(10057010)

URL: http://archiveofourown.org/works/10057010?view_adult=true&amp;view_full_work=true&show_comments=true&page=1
comment thread
Single comment
2017-Mar-03
(10057010, 1, 'lou', {'day': 'Fri', 'date': '03', 'month': 'Mar', 'year': '2017', 'time': '12:36AM', 'timezone': 'UTC'}, '97071584', 0, "i really like your writing, and the story! i'm excited to read the rest of it :)")
comment thread
Single comment
2023-Jul-18
(10057010, 1, 'lololol', {'day': 'Tue', 'date': '18', 'month': 'Jul', 'year': '2023', 'time': '09:52PM', 'timezone': 'UTC'}, '671875867', '97071584', 'Honestly just here today to ask you how does it feel to be the first to comment on a fic that, as far as I know, turned out to be the one with the most hits on the whole ao3')
comment thread
Single comment
2023-Jul-21
(10057010, 1, 'noxx', {'day': 'Fri', 'date': '21', 'month': 'Jul', 'year': '2023', 'time': '06:34AM', 'timezone': 'UTC'}, '672704722', '671875867', 'ha')
Single comment
2023-Sep-04
(10057010, 1, 'Gabs', {'day': 'Mo

KeyboardInterrupt: 

In [6]:
# Fetch and parse the first comment page of work 10057010 for interactive inspection.
# Query parameters are joined with a plain "&"; an HTML-escaped "&amp;" would send
# "amp;view_full_work" instead of "view_full_work".
url = "http://archiveofourown.org/works/10057010?view_adult=true&view_full_work=true&show_comments=true&page=1"
req = requests.get(url)
src = req.text
soup = BeautifulSoup(src, "html.parser")

In [7]:
# Take the first direct <li> child of the page's comment thread as a sample comment
comment = soup.find("ol", class_="thread").findChildren("li", recursive=False)[0]

In [11]:
# Text of the first <span> inside the byline heading; for the sample comment this is the username
comment.find("h4", class_="heading byline").find("span").text

'lou'

In [8]:
# Find username
# NOTE: the else branch takes the byline's first child node. For the sample
# comment that node is a newline text node, so this cell evaluates to '\n'
# rather than the name; get_single_comment reads the first <span> instead.
if comment.find('h4', class_='heading byline').find('a'):
    username = comment.find('h4', class_='heading byline').find('a').contents[0]
else:
    username = comment.find('h4', class_='heading byline').contents[0]
    
username

'\n'