In [261]:
import pandas as pd
import numpy as np
import time, re, requests
import cloudscraper
scraper = cloudscraper.create_scraper()

In [263]:
class fanficScraper:
    """Fetch a story from fanfiction.net and reduce it to plain text.

    Pages are downloaded through an HTTP client exposing ``get(url)`` whose
    return value has a ``.text`` attribute. By default this is the
    module-level ``scraper`` object; a different client can be injected via
    the ``session`` argument.

    Attributes:
        base_url: Site root that story URLs are built from.
        rate_limit: Seconds to sleep before each chapter request.
        session: Injected HTTP client, or ``None`` to use the module-level
            ``scraper``.
    """

    # Matches HTML tags and character entities. html_cleaner deletes every
    # match outright (entities are removed, not decoded).
    _MARKUP_RE = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

    def __init__(self, session=None):
        """Create a scraper.

        Args:
            session: Optional HTTP client with a ``get(url)`` method returning
                an object with a ``.text`` attribute. When omitted, the
                module-level ``scraper`` is looked up at request time.
        """
        self.base_url = 'http://fanfiction.net'
        self.rate_limit = 1
        self.session = session

    #####################
    # Tidying Functions #
    #####################

    def select_story_text(self, text):
        """Return the slice of a chapter page that holds the story body.

        The slice begins just after the opening tag carrying
        ``id='storytext'`` and ends before the next
        ``<SELECT id=chap_select`` marker. If that end marker is absent, the
        remainder of the page is returned. If the start marker is absent, an
        empty string is returned.

        Args:
            text: Raw HTML of a chapter page.

        Returns:
            The HTML fragment between the two markers (may be empty).
        """
        marker_pos = text.find("id='storytext'")
        if marker_pos == -1:
            return ''

        # Skip the rest of the opening tag so the attribute text itself does
        # not end up in the story once tags are stripped.
        tag_close = text.find('>', marker_pos)
        body_start = tag_close + 1 if tag_close != -1 else marker_pos

        # str.find returns -1 when the marker is missing; using that as a
        # slice bound would silently drop the last character instead.
        body_end = text.find("<SELECT id=chap_select", body_start)
        if body_end == -1:
            return text[body_start:]
        return text[body_start:body_end]

    def html_cleaner(self, text):
        """Delete HTML tags and character entities from ``text``.

        Args:
            text: HTML fragment.

        Returns:
            ``text`` with every tag and entity removed.
        """
        return self._MARKUP_RE.sub('', text)

    ######################
    # Pipeline functions #
    ######################

    def _fetch(self, url):
        """Download ``url`` once and return the response body as text."""
        client = getattr(self, 'session', None)
        if client is None:
            # Fall back to the module-level client created next to the imports.
            client = scraper
        return client.get(url).text

    def get_metadata(self, fic_id):
        """Return the raw HTML of the story's landing page.

        Args:
            fic_id: Story identifier used in the ``/s/<id>`` URL.

        Returns:
            The page body as a string.
        """
        url = '{0}/s/{1}'.format(self.base_url, fic_id)
        return self._fetch(url)

    def get_chapter(self, fic_id, chapter_id):
        """Download one chapter and return it as cleaned plain text.

        Args:
            fic_id: Story identifier used in the URL.
            chapter_id: Chapter number used in the URL.

        Returns:
            The chapter's story text with tags and entities removed.
        """
        url = '{0}/s/{1}/{2}'.format(self.base_url, fic_id, chapter_id)
        raw = self._fetch(url)
        story_text = self.select_story_text(raw)
        return self.html_cleaner(story_text)

    def get_fic(self, fic_id):
        """Download every chapter of a story and join them into one text.

        Args:
            fic_id: Story identifier used in the URL.

        Returns:
            A dict with ``'Title'`` (the title reduced to ASCII letters,
            digits and spaces) and ``'Text'`` (all chapters joined by a
            single space).

        Raises:
            ValueError: If the landing page contains no ``var title = ...;``
                assignment, e.g. because an error or challenge page was
                served instead of the story.
        """
        raw = self.get_metadata(fic_id)

        title_match = re.search(r"var title = (.*);", raw)
        if title_match is None:
            raise ValueError(
                'Could not find a title for fic {0}; the page may be an '
                'error or challenge page.'.format(fic_id)
            )
        title = re.sub(r'[^A-Za-z0-9 ]+', '', title_match.group(1))

        # NOTE(review): a page without a "Chapters:" field is presumably a
        # single-chapter story, so fall back to 1 - confirm against the site.
        chapters_match = re.search(r"Chapters: ([0-9][0-9,]*) - Words", raw)
        if chapters_match is None:
            num_chapters = 1
        else:
            num_chapters = int(chapters_match.group(1).replace(',', ''))

        chapters = []
        for chapter in range(1, num_chapters + 1):
            # Pause before every chapter request to respect the rate limit.
            time.sleep(self.rate_limit)
            chapters.append(self.get_chapter(fic_id, chapter))

        return {'Title': title, 'Text': ' '.join(chapters)}
    

# Scraping

In [268]:

# Story ids to download; each one becomes '<Title>.txt' in the working directory.
fic_list = ['13594003', '3157478', '13018460', '5412010', '13784511', '13780235', '3883938', '6466185', '12658932']

# The scraper holds only configuration, so one instance serves every story.
scrape_fics = fanficScraper()

for fic_id in fic_list:
    output_fic = scrape_fics.get_fic(fic_id)
    text_title = '{0}.txt'.format(output_fic['Title'])
    # Write UTF-8 explicitly: with the platform default encoding,
    # errors="ignore" silently drops every character that encoding lacks.
    with open(text_title, 'w', encoding='utf-8', errors="ignore") as f:
        f.write(output_fic['Text'])