In [44]:
import mechanicalsoup
import time
import json
from timeit import default_timer as timer
import re
from unidecode import unidecode

SITE = "https://wolnelektury.pl/"  # Default site to work on, tested.


class Scrapper:
    """
    Small wrapper around ``mechanicalsoup.StatefulBrowser`` that opens a site,
    builds filtered catalog URLs and collects the links of the current page.

    attributes
    ----------
    site : string
        set from the constructor argument; replaced by the browser's current
        URL every time ``open_site`` is called
    response :
        value returned by the browser's ``open`` call made in ``__init__``
    linksOnPage : list
        links gathered by the most recent ``preparing_links`` call
    linksToDownload : list
        starts empty; nothing in this class fills it
    """

    # Filters used by ``filter_catalog`` when the caller supplies none.
    _DEFAULT_FILTERS = ('epoka', 'gatunek', 'autor')
    _DEFAULT_FILTER_VALUES = ('pozytywizm', 'wiersz', 'adam-asnyk')

    def __init__(self, site):
        """
        Creates the browser and immediately opens ``site`` (network access).

        parameters
        ----------
        site : string
            the site from which the data are taken
        """
        self.site = site
        # raise_on_404=True is passed so a missing page is reported by the
        # browser instead of being handed back as an ordinary response.
        self.__browser = mechanicalsoup.StatefulBrowser(raise_on_404=True)
        self.response = self.__browser.open(self.site)
        self.linksOnPage = list()
        self.linksToDownload = list()

    def open_site(self, site):
        """
        Opens ``site`` in the browser and stores the browser's resulting URL
        in ``self.site``.

        Returns the ``site`` argument exactly as it was passed in (not the
        browser's URL after the open).
        """
        self.__browser.open(site)
        self.site = self.__browser.get_url()
        return site

    def follow_link(self, follow_link):
        """
        Passes ``follow_link`` to the browser's ``follow_link`` and returns
        the browser's URL afterwards. ``self.site`` is not updated here.
        """
        self.__browser.follow_link(follow_link)
        return self.__browser.get_url()

    def filter_catalog(self, Filters=None, FilterValues=None):
        """
        Builds a catalog URL of the form
        ``<current url>/katalog/<filter>/<value>/<filter>/<value>/...``.

        parameters
        ----------
        Filters : sequence of strings, optional
            filter names, e.g. ``['epoka', 'gatunek']``
        FilterValues : sequence of strings, optional
            one value per entry of ``Filters``, in the same order

        When ``Filters`` is empty or omitted the built-in defaults
        (epoka/pozytywizm, gatunek/wiersz, autor/adam-asnyk) are used and
        ``FilterValues`` is ignored.

        returns
        -------
        string
            the URL; nothing is opened by this method

        raises
        ------
        ValueError
            if ``Filters`` is given and ``FilterValues`` does not have the
            same number of entries
        """
        URL = str(self.__browser.get_url())
        print(URL)

        if not Filters:
            Filters = self._DEFAULT_FILTERS
            FilterValues = self._DEFAULT_FILTER_VALUES
        elif FilterValues is None or len(Filters) != len(FilterValues):
            # Previously a mismatch silently produced the default URL.
            raise ValueError(
                'Filters and FilterValues must have the same number of entries'
            )

        # Make sure exactly one "/" separates the current URL from what is
        # appended, and do not repeat "katalog/" if the URL already ends there.
        base = URL if URL.endswith('/') else URL + '/'
        if not base.endswith('/katalog/'):
            base += 'katalog/'

        FilterSet = ''.join(
            f'{name}/{value}/' for name, value in zip(Filters, FilterValues)
        )
        return f'{base}{FilterSet}'

    def preparing_links(self):
        """
        Reads the links of the page currently open in the browser, keeps every
        second one (indices 1, 3, 5, ...), stores that list in
        ``self.linksOnPage`` and returns it.
        """
        all_links = self.__browser.links()
        # NOTE(review): the stride of 2 presumably drops duplicated anchors on
        # the page -- TODO confirm. An earlier '/autor/' regex pass was
        # computed and then overwritten, so it never affected the result and
        # has been removed.
        self.linksOnPage = all_links[1::2]
        return self.linksOnPage


if __name__ == '__main__':
    # Constructing the scrapper already opens SITE in its browser.
    scrap = Scrapper(SITE)
    print('Scrapper object created!')

    # Build the catalog URL with the default filters, then navigate to it.
    catalog_url = scrap.filter_catalog()
    scrap.open_site(catalog_url)

    # Show the URL the browser ended up on.
    print(scrap.site)
    

Scrapper object created!
https://wolnelektury.pl/
https://wolnelektury.pl/katalog/epoka/pozytywizm/gatunek/wiersz/autor/adam-asnyk/


In [45]:

# Collect every second link from the page the browser currently has open;
# the returned list is this cell's displayed output. Relies on `scrap`
# created in the previous cell.
scrap.preparing_links()

[<a class="action" href="/towarzystwo/">
           Tak, chcę pomóc Wolnym Lekturom!
         </a>,
 <a href="/towarzystwo/"><div class="annoy-banner-inner"><div class="text"><p>Wspieraj Wolne Lektury</p></div></div></a>,
 <a href="/katalog/lektury/">5569</a>,
 <a href="/uzytkownik/login/?next=/katalog/epoka/pozytywizm/gatunek/wiersz/autor/adam-asnyk/" id="login">Zaloguj się</a>,
 <a href="/katalog/lektury/">Literatura</a>,
 <a href="/katalog/audiobooki/">Audiobooki</a>,
 <a href="/katalog/">Wszystkie utwory</a>,
 <a href="/katalog/gatunek/wiersz/">Wiersz</a>,
 <a href="/katalog/epoka/pozytywizm/gatunek/wiersz/">X</a>,
 <a href="/katalog/autor/ajschylos/">Ajschylos</a>,
 <a href="/katalog/autor/edmondo-de-amicis/">Edmondo de Amicis</a>,
 <a href="/katalog/autor/wladyslaw-anczyc/">Władysław Anczyc</a>,
 <a href="/katalog/autor/jerzy-andrzejewski/">Jerzy Andrzejewski</a>,
 <a href="/katalog/autor/guillaume-apollinaire/">Guillaume Apollinaire</a>,
 <a href="/katalog/autor/arystofanes/">Ar