In [36]:
from bs4 import BeautifulSoup
import re
import time
import datetime
import requests
import argparse
import os
import csv
import sys
from unidecode import unidecode

import pandas as pd, numpy as np

# Parse args

In [2]:
page_empty = False
username = "incessantbeat"
url = f'https://archiveofourown.org/users/{username}/bookmarks'
page = 1
num_requested_fic = 0
num_recorded_fic = 0
csv_name = ""
multichap_only = False
tags = []

# keep track of all processed ids to avoid repeats:
# this is separate from the temporary batch of ids
# that are written to the csv and then forgotten
seen_ids = []

In [3]:
seen_ids

[]

# Main user's bookmarked ids

In [4]:
def get_ids():
    """ Get work ids of bookmarked fics """
    global page_empty
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")

    fics = soup.select("li.bookmark.blurb.group")
    # see if we've gone too far and run out of fic:
    if len(fics) == 0:
        page_empty = True
        print("No more fics to fetch!")
        return

    # process list for new fic ids
    ids = []
    for idx, f in enumerate(fics):
        try:
            header = f.find('h4', class_='heading').find(href=True)
            t = header['href'].split('/')[-1]
            n = header.text
            if t not in seen_ids:
                ids.append(t)
                seen_ids.append(t)
        except:
            continue
    return ids

def update_url_to_next_page():
    global url
    global page
    key = "page="
    start = url.find(key)

    # there is already a page indicator in the url
    if start != -1:
        # find where in the url the page indicator starts and ends
        page_start_index = start + len(key)
        page_end_index = url.find("&", page_start_index)
        # if it's in the middle of the url
        if page_end_index != -1:
            page = int(url[page_start_index:page_end_index]) + 1
            url = url[:page_start_index] + str(page) + url[page_end_index:]
        # if it's at the end of the url
        else:
            page = int(url[page_start_index:]) + 1
            url = url[:page_start_index] + str(page)

    # there is no page indicator, so we are on page 1
    else:
        # there are other modifiers
        if url.find("?") != -1:
            url = url + "&page=2"
        # there are no modifiers yet
        else:
            url = url + "?page=2"
        page = 2


def retrieve_ids():
    while not page_empty:
        time.sleep(5)
        print(f"Processing page {page}...")
        ids = get_ids()
        update_url_to_next_page()

In [5]:
retrieve_ids()
print(seen_ids)

Processing page 1...
Processing page 2...
No more fics to fetch!
['28429821', '33658459', '33494707', '29839875', '21332461', '25333126', '26027836', '29618487', '29288772', '25839787', '23586082', '24363613', '25681186', '23492518', '24260716', '12830118', '12520952']


NOTE: do I need to write bookmarked ids to a text file?

# Metadata of bookmarked ids

In [67]:
# get author(s)
def get_authors(meta):
    tags = meta.contents
    authors = []
    for tag in tags:
        if tag.name == 'a':
            authors.append(tag.contents[0])
    return authors

def get_tag_info(category, meta):
    '''
    given a category and a 'work meta group, returns a list of tags (eg, 'rating' -> 'explicit')
    '''
    try:
        tag_list = meta.find("dd", class_=str(category) + ' tags').find_all(class_="tag")
    except AttributeError as e:
        return []
    return [unidecode(result.text).rstrip().lstrip().lower() for result in tag_list] 

def get_tags(meta):
    '''
    returns a list of lists, of
    rating, category, fandom, pairing, characters, additional_tags
    '''
    tags = ['rating', 'category', 'fandom', 'relationship', 'character', 'freeform']
    info_list = list(map(lambda tag: get_tag_info(tag, meta), tags))
    res = {}
    for tag, info in zip(tags, info_list):
        res[tag] = info
    return res

def get_stats(meta):
    categories = ['language', 'published', 'status', 'words', 'chapters', 'comments', 'kudos', 'bookmarks', 'hits']
    stats = list(map(lambda category: meta.find("dd", class_=category), categories))
    res = {}
    for cat, stat in zip(categories, stats):
        if stat:
            res[cat] = unidecode(stat.text).rstrip().lstrip().lower()
        else:
            res[cat] = np.nan
    return res

def get_kudos(meta):
    if (meta):
        users = []
        ## hunt for kudos' contents
        kudos = meta.contents

        # extract user names
        for kudo in kudos:
            if kudo.name == 'a':
                if 'more users' not in kudo.contents[0] and '(collapse)' not in kudo.contents[0]:
                    users.append(kudo.contents[0])

        return users
    return []

# get bookmarks by page
def get_bookmarks(url):
    bookmarks = []

    req = requests.get(url)
    src = req.text

    time.sleep(5)
    soup = BeautifulSoup(src, 'html.parser')

    print('scraping bookmarks ')

    # find all pages
    if (soup.find('ol', class_='pagination actions')):
        pages = soup.find('ol', class_='pagination actions').findChildren("li" , recursive=False)
        max_pages = int(pages[-2].contents[0].contents[0])
        count = 1

        while count <= max_pages:
            # extract each bookmark per user
            tags = soup.findAll('h5', class_='byline heading')
            bookmarks += get_other_users(tags)

            # next page
            count+=1
            req = requests.get(url+'?page='+str(count))
            src = req.text
            soup = BeautifulSoup(src, 'html.parser')
            time.sleep(5)
    else:
        tags = soup.findAll('h5', class_='byline heading')
        bookmarks += get_other_users(tags)
    return bookmarks

# get users form bookmarks, excluding yourself
def get_other_users (meta):
    users = []
    for tag in meta:
            user = tag.findChildren("a" , recursive=False)[0].contents[0]
            users.append(user)
    return [u for u in users if u != username]

def access_denied(soup):
    if soup.find(class_="flash error"):
        return True
    if not soup.find(class_="work meta group"):
        return True
    return False

def get_metadata(fic_id):
    print(f"Scraping {fic_id}...")
    url = f'http://archiveofourown.org/works/{fic_id}?view_adult=true'
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    if access_denied(soup):
        print('Access Denied')
        return
    else:
        meta = soup.find("dl", class_="work meta group")
        meta_dict = {}
        meta_dict["url"] = url
        meta_dict["author"] = get_authors(soup.find("h3", class_="byline heading"))
        # unpack tags and stats sub-tags
        for key, value in {**get_tags(meta), **get_stats(meta)}.items():
            if len(value) > 1:
                meta_dict[key] = value
            else:
                meta_dict[key] = value[0]
        meta_dict["title"] = unidecode(soup.find("h2", class_="title heading").string).strip()
        visible_kudos = get_kudos(soup.find('p', class_='kudos'))
        hidden_kudos = get_kudos(soup.find('span', class_='kudos_expanded hidden'))
        meta_dict["all_kudos"] = visible_kudos + hidden_kudos
        
        #get bookmarks
        bookmark_url = f'http://archiveofourown.org/works/{fic_id}/bookmarks'
        meta_dict["all_bookmarks"] = get_bookmarks(bookmark_url)
        return meta_dict

In [77]:
url = f'http://archiveofourown.org/works/28429821?view_adult=true'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
meta = soup.find("dl", class_="work meta group")
dest = {**get_tags(meta), **get_stats(meta)} 
dest.items()

dict_items([('rating', ['mature']), ('category', ['m/m']), ('fandom', ['nct (band)']), ('relationship', ['lee donghyuck | haechan/mark lee']), ('character', ['mark lee (nct)', 'lee donghyuck | haechan', 'na jaemin']), ('freeform', ['alternate universe - college/university', 'friends with benefits', 'making out', 'alcohol and weed', 'angst', 'unrequited crush', 'pining', 'drunken confessions', 'drunken kissing', 'markhyuck week 2021', 'day 5: touch | passion', 'sexual tension', 'unresolved romantic tension']), ('language', 'english'), ('published', '2020-12-30'), ('status', nan), ('words', '4199'), ('chapters', '1/1'), ('comments', '32'), ('kudos', '369'), ('bookmarks', '42'), ('hits', '4669')])

In [68]:
d = get_metadata(28429821)

Scraping 28429821...
scraping bookmarks 


In [51]:
d

{'url': 'http://archiveofourown.org/works/28429821?view_adult=true',
 'author': ['tinymark (lumoon33)'],
 'tags': {'rating': ['mature'],
  'category': ['m/m'],
  'fandom': ['nct (band)'],
  'relationship': ['lee donghyuck | haechan/mark lee'],
  'character': ['mark lee (nct)', 'lee donghyuck | haechan', 'na jaemin'],
  'freeform': ['alternate universe - college/university',
   'friends with benefits',
   'making out',
   'alcohol and weed',
   'angst',
   'unrequited crush',
   'pining',
   'drunken confessions',
   'drunken kissing',
   'markhyuck week 2021',
   'day 5: touch | passion',
   'sexual tension',
   'unresolved romantic tension']},
 'stats': {'language': 'english',
  'published': '2020-12-30',
  'status': nan,
  'words': '4199',
  'chapters': '1/1',
  'comments': '32',
  'kudos': '369',
  'bookmarks': '42',
  'hits': '4669'},
 'title': 'falling to the bathroom floor',
 'all_kudos': ['otterseoul',
  'ari_thereyet',
  'Slut_Seokjinnie',
  'skymoonlight',
  'nomdeguerre',
  '

In [69]:
df = pd.DataFrame([d]).reset_index()

In [70]:
df

Unnamed: 0,index,url,author,rating,category,fandom,relationship,character,freeform,language,...,status,words,chapters,comments,kudos,bookmarks,hits,title,all_kudos,all_bookmarks
0,0,http://archiveofourown.org/works/28429821?view...,[tinymark (lumoon33)],[mature],[m/m],[nct (band)],[lee donghyuck | haechan/mark lee],"[mark lee (nct), lee donghyuck | haechan, na j...","[alternate universe - college/university, frie...",english,...,,4199,1/1,32,369,42,4669,falling to the bathroom floor,"[otterseoul, ari_thereyet, Slut_Seokjinnie, sk...","[chenlecentric, leehaechanace, beom00, lovedia..."
