In [31]:
import mechanicalsoup
import time
import json
from timeit import default_timer as timer
import re
from unidecode import unidecode

SITE = "https://wolnelektury.pl/"  # Default site to work on, tested.


class Scrapper:
    def __init__(self, site):
        """
        Webscrapper that extracts text data from fanfiction site.
        parameters
        ----------
        site : string
        the site from which the data are taken
        attributes
        ----------
        linksOnPage : list
        links to the fanfics
        downloaded : list
        list of all downloaded fanfics
        """
        self.site = site
        self.__browser = mechanicalsoup.StatefulBrowser(raise_on_404=True)
        self.response = self.__browser.open(self.site)
        self.linksOnPage = list()
        self.linksToDownload = list()


    def open_site(self, site):
        """ Opens the site """
        self.__browser.open(site)
        self.site = self.__browser.get_url()
        return site
        

    def follow_link(self, follow_link):
        """ Follows the link passed as an input """
        self.__browser.follow_link(follow_link)
        return (self.__browser.get_url())

    def filter_catalog(self,Filters = [],epoch = 'pozytywizm',genre = 'wiersz',author = 'adam-asnyk'):
        """ Filters the fanficts looking for only the desired settings """
        URL = str(self.__browser.get_url())
        print(URL)
        self.author = author
        try:
            FilterSet = ''
            for i in range(len(Filters)):
                FilterSet += f'{Filters[i]}/{FilterValues[i]}'
                newURL = f'{str(URL)}{FilterSet}'
            return newURL
        except:
            FilterSet = ''
            Filters = ['epoka', 'gatunek','autor']  # list of desired filters
            FilterValues = [epoch,genre,author]  # list of values for filters
            FilterSet = ''
            for i in range(len(Filters)):
                FilterSet += f'{Filters[i]}/{FilterValues[i]}/'
            newURL = f'{str(URL)}katalog/{FilterSet}'
            return newURL
        
    def preparing_links(self):
        """This module is tasked with finding all links to stories on a search
         page of fanfiction.net. It takes a ResultSet of URLs, extracted from
         StatefulBrowser, and runs a regular expression.
        To reduce calcultions, the module searches only for links to the last
         chapters of stories. Then it reconstructs links to the firsts.
         Yeah, it's faster that way."""

        all_links = self.__browser.links()
        all_links = all_links[1::2]
        name = self.author.split('-')[1]
        self.linksOnPage = re.findall(r'katalog/lektura/{}.+\"'.format(name), str(all_links))
#         self.linksOnPage = all_links 
        return self.linksOnPage


if __name__ == '__main__':
    scrap = Scrapper(SITE)
    print('Scrapper object created!')
    scrap.open_site(scrap.filter_catalog())
    print(scrap.site)


Scrapper object created!
https://wolnelektury.pl/
https://wolnelektury.pl/katalog/epoka/pozytywizm/gatunek/wiersz/autor/adam-asnyk/


In [32]:

scrap.preparing_links()

['katalog/lektura/asnyk-ze-sceny-swiata/"><img alt="Cover" class="cover" src="/media/book/cover_thumb/asnyk-ze-sceny-swiata_gkaOJkV.jpg"/></a>, <a href="/katalog/lektura/asnyk-ze-sceny-swiata/">Ze sceny świata</a>, <a href="/katalog/rodzaj/liryka/">Liryka</a>, <a href="/info/wymagajace-uwspolczesnienia/">Ten utwór wymaga uwspółcześnienia</a>, <a href="/media/book/epub/asnyk-ze-sceny-swiata.epub">EPUB</a>, <a class="read-more-show hide" href="#">więcej</a>, <a class="read-more-hide hide" href="#">mniej</a>, <a href="/katalog/lektura/asnyk-nad-glebiami/"><img alt="Cover" class="cover" src="/media/book/cover_thumb/asnyk-nad-glebiami_9sWSTIg.jpg"/></a>, <a href="/katalog/lektura/asnyk-nad-glebiami/">Nad głębiami</a>, <a href="/katalog/rodzaj/liryka/">Liryka</a>, <a href="/info/wymagajace-uwspolczesnienia/">Ten utwór wymaga uwspółcześnienia</a>, <a href="/media/book/epub/asnyk-nad-glebiami.epub">EPUB</a>, <a class="read-more-show hide" href="#">więcej</a>, <a class="read-more-hide hide" hre

'https://wolnelektury.pl/katalog/epoka/pozytywizm/gatunek/wiersz/autor/adam-asnyk/'