In [1]:
from bs4 import BeautifulSoup
import requests
from time import sleep

In [2]:
# returns ID of the comment and saves it to database
def get_single_comment(ficid, comment, parentID):
    print("Single comment")
    # Find username
    if comment.find('h4', class_='heading byline').find('a'):
        username = comment.find('h4', class_='heading byline').find('a').contents[0]
    else:
        username = comment.find('h4', class_='heading byline').contents[0]

    # get chapter number
    # each comment has header "username on Chapter x" so get that text and split on spaces to isolate number
    # if single chap fic, no header, so default to 1
    header = comment.find("span", class_="parent")
    if header:
        chapternumber = int(header.text.split(" ")[-1])
    else:
        chapternumber = 1
        
    # Get datetime
    datetime = comment.find('h4', class_='heading byline').find('span', class_="posted datetime").contents
    remove = ['\n', ' ']
    dateObj = {}

    for item in datetime:
        if item not in remove:
            itemClass = item['class'][0]
            itemValue = item.contents[0]
            dateObj[itemClass] = itemValue

    # Get comment id
    commentid = comment['id'].split('_')[1]

    # Get direct comment text
    text = comment.findAll("p")
    text = [str(p).replace("<br/>", "\n").replace("<p>", "").replace("</p>", "") for p in text]
    text = "\n".join(text)

    # print out comment data
    # TODO: enter data in SQL DB
    
    commentData = (ficid, chapternumber, username, dateObj, commentid, parentID, text)
    print(commentData)
    return commentid

In [3]:
# recursively loops over thread of comments
def get_comment_thread(ficid, thread, parentID):
    print("comment thread")
    
    # get individual comments
    comments = thread.findChildren("li", recursive=False)

    # track what id the parent comment should be
    newParentID = parentID
    for c, comment in enumerate(comments):
        # if only attr is class=commant, it's a collapsed thread we need to open
        if comment.attrs = {'class': ['comment']}:
            url = "http://archiveofourown.org" + comment.find("a")["href"]
            status = 429
            while 429 == status:
                req = requests.get(url)
                status = req.status_code
                if 429 == status:
                    print("Request answered with Status-Code 429")
                    print("Trying again in 1 minute...")
                    sleep(60)
            # for other errors, halt scraping
            if 400 <= status:
                print("Error:", status, ", halting scraping on fic", ficid)
                return
            thread = soup.find("ol", class_="thread")
            get_comment_thread(ficid, thread, newParentID)
            
        # if comments has attrs, it's a single comment
        else if comment.attrs != {}:
            # if we encounter a thread, that thread's parent will always be the most recent single comment
            # so maintain the ID of most recent single comment as parent ID
            newParentID = get_single_comment(ficid, comment, parentID)
            
        # if no attrs, it's a thread -- meaning it is a child of the previous comment
        else:
            thread = comment.findChild("ol")
            get_comment_thread(ficid, thread, newParentID)

In [4]:
def get_comment_page(ficid, page_num):
    url = 'http://archiveofourown.org/works/'+str(ficid)+'?view_adult=true&amp;view_full_work=true&show_comments=true&page='\
    + str(page_num)
    print("URL:", url)
    
    status = 429
    while 429 == status:
        req = requests.get(url)
        status = req.status_code
        if 429 == status:
            print("Request answered with Status-Code 429")
            print("Trying again in 1 minute...")
            sleep(60)
    # for other errors, write out to csv and pass
    if 400 <= status:
        print("Error:", status, ", halting scraping on page", page_num)
        return
    
    src = req.text
    soup = BeautifulSoup(src, 'html.parser')
    
    thread = soup.find('ol', class_ = 'thread')
    if not thread:
        print(f"Fic {ficid} has no comments on page {pagenum}")
        return
    
    get_comment_thread(ficid, thread, 0)

In [7]:
def get_all_comments(ficid):
    url = 'http://archiveofourown.org/works/'+str(ficid)+'?view_adult=true&amp;view_full_work=true&show_comments=true'
    
    status = 429
    while 429 == status:
        req = requests.get(url)
        status = req.status_code
        if 429 == status:
            print("Request answered with Status-Code 429")
            print("Trying again in 1 minute...")
            sleep(60)
    # for other errors, write out to csv and pass
    if 400 <= status:
        print("Error:", status, ", halting scraping on fic", ficid)
        return
    
    src = req.text
    soup = BeautifulSoup(src, 'html.parser')
    
    # check to see if enough comments for multiple pages
    if (soup.find('ol', class_='pagination actions')):
        # get max page num
        numpages = int(soup.find('ol', class_='pagination actions').findChildren("li", recursive=False)[-2].text)
        # get comments for each page
        for i in range(numpages):
            get_comment_page(ficid, i+1)
            
    # if only one page of comments
    else:
        get_comment_page(ficid, 1)

In [10]:
get_comment_page(10057010, 1)

URL: http://archiveofourown.org/works/10057010?view_adult=true&amp;view_full_work=true&show_comments=true&page=1
comment thread
Single comment
(10057010, 1, '\n', {'day': 'Fri', 'date': '03', 'month': 'Mar', 'year': '2017', 'time': '12:36AM', 'timezone': 'UTC'}, '97071584', 0, "i really like your writing, and the story! i'm excited to read the rest of it :)")
comment thread
Single comment
(10057010, 1, '\n', {'day': 'Tue', 'date': '18', 'month': 'Jul', 'year': '2023', 'time': '09:52PM', 'timezone': 'UTC'}, '671875867', '97071584', 'Honestly just here today to ask you how does it feel to be the first to comment on a fic that, as far as I know, turned out to be the one with the most hits on the whole ao3')
comment thread
Single comment
(10057010, 1, '\n', {'day': 'Fri', 'date': '21', 'month': 'Jul', 'year': '2023', 'time': '06:34AM', 'timezone': 'UTC'}, '672704722', '671875867', 'ha')
Single comment
(10057010, 1, '\n', {'day': 'Mon', 'date': '04', 'month': 'Sep', 'year': '2023', 'time': 

AttributeError: 'NoneType' object has no attribute 'find'

In [8]:
get_all_comments(10057010)

Request answered with Status-Code 429
Trying again in 1 minute...
URL: http://archiveofourown.org/works/10057010?view_adult=true&amp;view_full_work=true&show_comments=true&page=1
comment thread
Single comment
(10057010, 1, '\n', {'day': 'Fri', 'date': '03', 'month': 'Mar', 'year': '2017', 'time': '12:36AM', 'timezone': 'UTC'}, '97071584', 0, "i really like your writing, and the story! i'm excited to read the rest of it :)")
comment thread
Single comment
(10057010, 1, '\n', {'day': 'Tue', 'date': '18', 'month': 'Jul', 'year': '2023', 'time': '09:52PM', 'timezone': 'UTC'}, '671875867', '97071584', 'Honestly just here today to ask you how does it feel to be the first to comment on a fic that, as far as I know, turned out to be the one with the most hits on the whole ao3')
comment thread
Single comment
(10057010, 1, '\n', {'day': 'Fri', 'date': '21', 'month': 'Jul', 'year': '2023', 'time': '06:34AM', 'timezone': 'UTC'}, '672704722', '671875867', 'ha')
Single comment
(10057010, 1, '\n', {'d

AttributeError: 'NoneType' object has no attribute 'find'

In [14]:
fic_id = 10057010
url = 'http://archiveofourown.org/works/'+str(fic_id)+'?view_adult=true&amp;view_full_work=true&show_comments=true&page=1'
req = requests.get(url, "")
src = req.text
soup = BeautifulSoup(src, 'html.parser')

print("URL:", url)

URL: http://archiveofourown.org/works/10057010?view_adult=true&amp;view_full_work=true&show_comments=true&page=1


In [33]:
def get_thread(comment):
    return comment.find("ol", class_="thread").findChildren("li", recursive=False)

In [44]:
comments = get_thread(soup)
comments[1]

<li>
<ol class="thread">
<li class="even guest comment group" id="comment_671875867" role="article">
<h4 class="heading byline">
<span>lololol</span><span class="role"> (Guest)</span>
<span class="parent">
on Chapter 1
</span>
<span class="posted datetime">
<abbr class="day" title="Tuesday">Tue</abbr> <span class="date">18</span>
<abbr class="month" title="July">Jul</abbr> <span class="year">2023</span>
<span class="time">09:52PM</span> <abbr class="timezone" title="UTC">UTC</abbr>
</span>
</h4>
<div class="icon">
<span class="visitor icon"></span>
</div>
<blockquote class="userstuff">
<p>Honestly just here today to ask you how does it feel to be the first to comment on a fic that, as far as I know, turned out to be the one with the most hits on the whole ao3</p>
</blockquote>
<h5 class="landmark heading">Comment Actions</h5>
<ul class="actions" id="navigation_for_comment_671875867">
<li id="add_comment_reply_link_671875867"><a data-remote="true" href="/comments/add_comment_reply?chapt

In [45]:
comments = get_thread(comments[1])
comments[0]

<li class="even guest comment group" id="comment_671875867" role="article">
<h4 class="heading byline">
<span>lololol</span><span class="role"> (Guest)</span>
<span class="parent">
on Chapter 1
</span>
<span class="posted datetime">
<abbr class="day" title="Tuesday">Tue</abbr> <span class="date">18</span>
<abbr class="month" title="July">Jul</abbr> <span class="year">2023</span>
<span class="time">09:52PM</span> <abbr class="timezone" title="UTC">UTC</abbr>
</span>
</h4>
<div class="icon">
<span class="visitor icon"></span>
</div>
<blockquote class="userstuff">
<p>Honestly just here today to ask you how does it feel to be the first to comment on a fic that, as far as I know, turned out to be the one with the most hits on the whole ao3</p>
</blockquote>
<h5 class="landmark heading">Comment Actions</h5>
<ul class="actions" id="navigation_for_comment_671875867">
<li id="add_comment_reply_link_671875867"><a data-remote="true" href="/comments/add_comment_reply?chapter_id=22409387&amp;id=671

In [46]:
comments = get_thread(comments[1])
comments[4]

<li>
<ol class="thread">
<li class="odd comment group user-4407885" id="comment_705507193" role="article">
<h4 class="heading byline">
<a href="/users/avatarazutara/pseuds/avatarazutara">avatarazutara</a>
<span class="parent">
on Chapter 1
</span>
<span class="posted datetime">
<abbr class="day" title="Saturday">Sat</abbr> <span class="date">28</span>
<abbr class="month" title="October">Oct</abbr> <span class="year">2023</span>
<span class="time">03:55AM</span> <abbr class="timezone" title="UTC">UTC</abbr>
</span>
</h4>
<div class="icon">
<a href="/users/avatarazutara/pseuds/avatarazutara"><img alt="" class="icon" src="https://s3.amazonaws.com/otw-ao3-icons/icons/4737675/standard.jpeg?1638696085"/></a>
</div>
<blockquote class="userstuff">
<p>omfg 😂</p>
</blockquote>
<h5 class="landmark heading">Comment Actions</h5>
<ul class="actions" id="navigation_for_comment_705507193">
<li id="add_comment_reply_link_705507193"><a data-remote="true" href="/comments/add_comment_reply?chapter_id=2240

In [47]:
comments = get_thread(comments[4])
comments[2]

<li class="even guest comment group" id="comment_719663482" role="article">
<h4 class="heading byline">
<span>stella</span><span class="role"> (Guest)</span>
<span class="parent">
on Chapter 1
</span>
<span class="posted datetime">
<abbr class="day" title="Monday">Mon</abbr> <span class="date">11</span>
<abbr class="month" title="December">Dec</abbr> <span class="year">2023</span>
<span class="time">11:26PM</span> <abbr class="timezone" title="UTC">UTC</abbr>
</span>
</h4>
<div class="icon">
<span class="visitor icon"></span>
</div>
<blockquote class="userstuff">
<p>wait wdym?</p>
</blockquote>
<h5 class="landmark heading">Comment Actions</h5>
<ul class="actions" id="navigation_for_comment_719663482">
<li id="add_comment_reply_link_719663482"><a data-remote="true" href="/comments/add_comment_reply?chapter_id=22409387&amp;id=719663482&amp;page=1&amp;view_full_work=true">Reply</a></li>
<li><a href="/comments/719663482">Thread</a></li>
<li>
<a href="/comments/97071584">Parent Thread</a>
<

In [48]:
comments = get_thread(comments[2])
comments[1]

<li class="odd guest comment group" id="comment_740899906" role="article">
<h4 class="heading byline">
<span>holly</span><span class="role"> (Guest)</span>
<span class="parent">
on Chapter 1
</span>
<span class="posted datetime">
<abbr class="day" title="Monday">Mon</abbr> <span class="date">12</span>
<abbr class="month" title="February">Feb</abbr> <span class="year">2024</span>
<span class="time">03:35PM</span> <abbr class="timezone" title="UTC">UTC</abbr>
</span>
</h4>
<div class="icon">
<span class="visitor icon"></span>
</div>
<blockquote class="userstuff">
<p>some people think that mskingbean89 is taylor swift</p>
</blockquote>
<h5 class="landmark heading">Comment Actions</h5>
<ul class="actions" id="navigation_for_comment_740899906">
<li id="add_comment_reply_link_740899906"><a data-remote="true" href="/comments/add_comment_reply?chapter_id=22409387&amp;id=740899906&amp;page=1&amp;view_full_work=true">Reply</a></li>
<li><a href="/comments/740899906">Thread</a></li>
<li>
<a href="

In [49]:
comments = get_thread(comments[1])
comments[0]

<li class="even guest comment group" id="comment_741277471" role="article">
<h4 class="heading byline">
<span>anna</span><span class="role"> (Guest)</span>
<span class="parent">
on Chapter 1
</span>
<span class="posted datetime">
<abbr class="day" title="Tuesday">Tue</abbr> <span class="date">13</span>
<abbr class="month" title="February">Feb</abbr> <span class="year">2024</span>
<span class="time">04:36PM</span> <abbr class="timezone" title="UTC">UTC</abbr>
</span>
</h4>
<div class="icon">
<span class="visitor icon"></span>
</div>
<blockquote class="userstuff">
<p>nobody actually thinks that LMFAO it’s just an inside joke</p>
</blockquote>
<h5 class="landmark heading">Comment Actions</h5>
<ul class="actions" id="navigation_for_comment_741277471">
<li id="add_comment_reply_link_741277471"><a data-remote="true" href="/comments/add_comment_reply?chapter_id=22409387&amp;id=741277471&amp;page=1&amp;view_full_work=true">Reply</a></li>
<li><a href="/comments/741277471">Thread</a></li>
<li>
<

In [61]:
comments[1].attrs == {'class': ['comment']}

True

In [55]:
comments[1].find("a")["href"]

'/comments/741277471'

In [62]:
url = "http://archiveofourown.org" + comments[1].find("a")["href"]
req = requests.get(url, "")
src = req.text
soup = BeautifulSoup(src, 'html.parser')
thread = soup.find("ol", class_="thread")

In [63]:
get_comment_thread(777, thread, 999)

comment thread
Single comment
(777, 1, '\n', {'day': 'Tue', 'date': '13', 'month': 'Feb', 'year': '2024', 'time': '04:36PM', 'timezone': 'UTC'}, '741277471', 999, 'nobody actually thinks that LMFAO it’s just an inside joke')
comment thread
Single comment
(777, 1, 'ImComplicatedAF23', {'day': 'Mon', 'date': '19', 'month': 'Feb', 'year': '2024', 'time': '08:10AM', 'timezone': 'UTC'}, '743364499', '741277471', 'Yes we actually think that')
Single comment
(777, 1, '\n', {'day': 'Thu', 'date': '18', 'month': 'Apr', 'year': '2024', 'time': '03:14PM', 'timezone': 'UTC'}, '764696182', '741277471', 'bffr yes we do')
