In [1]:
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# Builds the listing url we scrape from (assumes the caller knows the site's sort/rating codes)
def get_fanfic_url(genre, series, sort, rating):
    """Return the fanfiction.net listing URL for a series, minus the page number.

    The returned string ends in ``p=`` so that a page number can be appended
    directly to it.

    Args:
        genre: Path segment placed right after the site root.
        series: Path segment placed after ``genre``.
        sort: Value interpolated into the ``srt`` query flag.
        rating: Value interpolated into the ``r`` query flag.

    Returns:
        str: The URL with an empty trailing ``p=`` flag.
    """
    query_suffix = f'?&srt={sort}&r={rating}&p='
    return f'https://www.fanfiction.net/{genre}/{series}/{query_suffix}'

In [3]:
# Yields one record per story listed at a given fanfic url (assumes the title, author and
# stats elements each appear once per story, in the same order, so they can be zipped)
def fanfic_gen(page_url, pages_wanted=None, timeout=30):
    """Generate story records scraped from successive listing pages.

    Args:
        page_url: Listing URL to which the page number is appended directly
            (e.g. one ending in ``p=``, as returned by ``get_fanfic_url``).
        pages_wanted: Maximum number of pages to scrape. ``None`` (the default)
            scrapes every page reported by ``get_max_pages``; ``0`` or a
            negative number scrapes nothing.
        timeout: Seconds ``requests`` waits on each listing page before
            raising, so a stalled connection cannot hang the generator.

    Yields:
        list: ``[title text, title href, author text]`` followed by the pieces
        of the story's stats text split on ``' - '``.
    """
    num_pages = get_max_pages(page_url)
    # Compare against None explicitly: a plain truthiness test would treat
    # pages_wanted=0 as "no limit" and scrape every page.
    if pages_wanted is not None:
        num_pages = min(num_pages, pages_wanted)

    for page in range(1, num_pages + 1):
        page_text = requests.get(page_url + str(page), timeout=timeout).text
        soup = BeautifulSoup(page_text, 'lxml')

        titles = soup.find_all(class_='stitle')
        authors = soup.find_all('a', href=re.compile(r'/u/'))
        stats_blocks = soup.find_all(class_='z-padtop2 xgray')

        # zip stops at the shortest of the three lists, so a page where the
        # counts differ yields only as many records as the shortest list.
        for title, author, stats in zip(titles, authors, stats_blocks):
            yield [title.text, title['href'], author.text] + stats.text.split(' - ')

In [4]:
# Returns the maximum page number for a given fanfic url (falls back to the numbered
# page links, then to 1, when the listing is too short to have a "Last" link)
def get_max_pages(page_url, timeout=30):
    """Return how many listing pages exist for ``page_url``.

    Args:
        page_url: Listing URL to fetch.
        timeout: Seconds ``requests`` waits for the response before raising,
            so a stalled connection cannot hang the caller.

    Returns:
        int: The page number taken from the "Last" pagination link when there
        is one; otherwise the highest ``p=N`` value found among the page's
        links; otherwise ``1``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(page_url, timeout=timeout)
    # Fail loudly on an HTTP error page; without this the fallback below would
    # quietly report a single page for a blocked or failed request.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    last_link = soup.find('a', string=re.compile('Last'))
    if last_link is not None:
        # The page number is whatever follows the final '=' in the link target.
        return int(last_link['href'].split('=')[-1])

    # No "Last" link (previously a TypeError on None). Presumably a short
    # listing that only shows numbered/"Next" links or no pagination at all --
    # TODO confirm against the live markup.
    page_param = re.compile(r'[?&]p=(\d+)$')
    linked_pages = []
    for anchor in soup.find_all('a', href=True):
        match = page_param.search(anchor['href'])
        if match:
            linked_pages.append(int(match.group(1)))
    return max(linked_pages, default=1)