In [1]:
from bs4 import BeautifulSoup
from collections import OrderedDict
import requests
import re

In [2]:
# Creates the url we are scraping from (Assumes user knowledge of sorting/rating vals)
def get_story_url(genre, series, sort, rating):
    """Build the listing URL prefix for a fanfiction.net category.

    The values are interpolated as-is; the caller is expected to pass the
    site's own codes for ``sort`` and ``rating``.

    The returned string ends with ``&p=`` so that a page number can be
    appended directly to select a page.
    """
    query = f'?&srt={sort}&r={rating}&p='
    return f'https://www.fanfiction.net/{genre}/{series}/{query}'

In [3]:
# Yields an iterator for a given story url (Assumes html indexers are unique identifiers)
def story_gen(page_url, pages_wanted=None):
    """Yield one parsed story record per listing entry, page by page.

    ``page_url`` is a listing URL prefix to which the page number is appended.
    ``pages_wanted`` caps the number of pages fetched; any falsy value
    (``None`` or ``0``) means "all pages reported by ``get_last_page``".

    Titles, author links and stats blocks are paired positionally with
    ``zip``, so the three lookups are assumed to return the same number of
    elements in the same order; extra elements in any of them are dropped.
    """
    last_page = get_last_page(page_url)
    if pages_wanted:
        page_count = min(last_page, pages_wanted)
    else:
        page_count = last_page

    for page_no in range(1, page_count + 1):
        print(f'Scraping page {page_no}/{page_count}...')
        response = requests.get(page_url + str(page_no))
        soup = BeautifulSoup(response.text, 'lxml')

        title_links = soup.find_all(class_='stitle')
        author_links = soup.find_all('a', href=re.compile(r'/u/'))
        stat_blocks = soup.find_all(class_='z-padtop2 xgray')

        # Tokenize every entry on the page before yielding any of them.
        rows = []
        for title, author, stats in zip(title_links, author_links, stat_blocks):
            rows.append([title.text, title['href'], author.text] + stats.text.split(' - '))

        for row in rows:
            yield story_lexer(row)

In [4]:
# Gives types for a tokenized story list
def story_lexer(story):
    """Convert a tokenized story listing into an ordered mapping of typed fields.

    ``story`` is a sequence of strings in this order: title, relative link,
    author, ``'Rated: ...'``, language, optional genre, ``'Chapters: N'``,
    ``'Words: N'``, optional ``'Reviews: N'``, ``'Favs: N'``, ``'Follows: N'``
    and ``'Updated: ...'``, then ``'Published: ...'``, optional characters and
    an optional trailing ``'Complete'`` marker.

    The input sequence is left untouched; parsing happens on a private copy.

    Returns:
        OrderedDict with keys ``name``, ``link``, ``author``, ``rating``,
        ``language``, ``genre``, ``chapters``, ``words``, ``reviews``,
        ``favs``, ``follows``, ``updated``, ``published``, ``characters``
        and ``complete``. Absent optional fields are ``None``; counts are
        ``int`` with thousands separators removed.

    Raises:
        IndexError: if the tokens run out before a required field is read.
        ValueError: if a count token does not hold an integer.
    """
    # Work on a copy: popping tokens directly off the argument would leave the
    # caller's list emptied after the call.
    tokens = list(story)

    def take(skip=0):
        """Consume the next token, dropping its first ``skip`` characters."""
        return tokens.pop(0)[skip:]

    def take_if(prefix):
        """Consume the next token only if it starts with ``prefix``, else None."""
        if tokens and tokens[0].startswith(prefix):
            return take(len(prefix))
        return None

    def as_count(text):
        """Parse a count such as '1,234'; pass None through for absent fields."""
        return None if text is None else int(text.replace(',', ''))

    fields = OrderedDict()
    fields['name'] = take()
    fields['link'] = f'https://www.fanfiction.net{take()}'
    fields['author'] = take()
    fields['rating'] = take(len('Rated: '))
    fields['language'] = take()
    # Genre is optional: when it is missing the next token is already the
    # chapter count.
    fields['genre'] = None if tokens[0].startswith('Chapters: ') else take()
    fields['chapters'] = as_count(take(len('Chapters: ')))
    fields['words'] = as_count(take(len('Words: ')))
    fields['reviews'] = as_count(take_if('Reviews: '))
    fields['favs'] = as_count(take_if('Favs: '))
    fields['follows'] = as_count(take_if('Follows: '))
    fields['updated'] = take_if('Updated: ')
    fields['published'] = take(len('Published: '))
    fields['characters'] = take() if tokens and tokens[0] != 'Complete' else None
    # Anything still left at this point is treated as the 'Complete' marker.
    fields['complete'] = bool(tokens)
    return fields

In [5]:
# Returns the last page number for a given story url (falls back to 1 when no "Last" link exists)
def get_last_page(page_url):
    """Return the number of the last listing page reachable from ``page_url``.

    The page is fetched and searched for an anchor whose text contains
    ``'Last'``; the page number is taken from the text after the final ``=``
    in that anchor's ``href``.

    When no such anchor is found, 1 is returned instead of failing on a
    ``None`` lookup.
    NOTE(review): this presumes the site only omits the "Last" link for
    single-page listings. If it also omits it for short multi-page listings,
    the count will be too low; confirm against the live markup.

    Raises:
        ValueError: if the text after the final ``=`` is not an integer.
    """
    page_text = requests.get(page_url).text
    soup = BeautifulSoup(page_text, 'lxml')
    last_link = soup.find('a', text=re.compile('Last'))
    if last_link is None:
        # find() returns None when nothing matches; subscripting it would
        # raise an opaque TypeError.
        return 1
    return int(last_link['href'].split('=')[-1])