In [53]:
import mechanicalsoup
import time
import json
from timeit import default_timer as timer
import re
from unidecode import unidecode

SITE = "https://wolnelektury.pl/"  # Base URL to scrape (the only site this was tested on); keep the trailing "/" because relative links are appended to it directly.


class Scrapper:
    """Web scraper that extracts plain-text books from the Wolne Lektury site.

    Parameters
    ----------
    site : str
        Base URL of the site to scrape. Relative links are appended to it
        directly, so it has to end with a slash.

    Attributes
    ----------
    site : str
        URL the browser reported after the most recent ``open_site`` call
        (initially the ``site`` argument).
    response : object
        Whatever the browser's ``open`` returned for the most recent page.
    linksOnPage : list of str
        Relative book-page links found by ``preparing_links``.
    linksToDownload : list
        Initialised empty; no method of this class fills it.
    author : str
        Set by ``filter_catalog`` to its ``author`` argument.
    """

    # Everything after the last occurrence of this marker is dropped by
    # clean_txt (presumably the site's footer appended to every txt file).
    _FOOTER_SEPARATOR = '-----'

    def __init__(self, site):
        """Create the browser and open ``site`` straight away."""
        self.__original_site = site
        self.site = site
        self.__browser = mechanicalsoup.StatefulBrowser(raise_on_404=True)
        self.response = self.__browser.open(self.site)
        self.linksOnPage = list()
        self.linksToDownload = list()

    def open_site(self, site):
        """Open ``site`` in the browser and record where it ended up.

        ``self.site`` is set to the URL reported by the browser afterwards and
        ``self.response`` to the value returned by the browser's ``open``.

        Returns
        -------
        The ``site`` argument, unchanged.
        """
        self.response = self.__browser.open(site)
        self.site = self.__browser.get_url()
        return site

    def follow_link(self, follow_link):
        """Follow ``follow_link`` in the browser and return the resulting URL."""
        self.__browser.follow_link(follow_link)
        return self.__browser.get_url()

    def filter_catalog(self, Filters=['gatunek'], FilterValues=['wiersz'],
                       epoch='pozytywizm', genre='wiersz', author='adam-asnyk'):
        """Build a catalog URL restricted by the requested filters.

        The URL is built on top of the browser's current URL, so call this
        while the browser is on the site's root page.

        Parameters
        ----------
        Filters, FilterValues : sequences of str
            Parallel sequences of filter names and their values; pair ``i`` is
            rendered as ``<name>/<value>`` and the pairs are joined with "/".
            The default lists are only read, never mutated.
        epoch, genre, author : str
            Used only by the fallback: when ``Filters`` is empty or ``None``,
            or ``FilterValues`` does not provide a value for every filter, the
            URL is built from ``epoka/<epoch>/gatunek/<genre>/autor/<author>/``.

        Returns
        -------
        str
            The filtered catalog URL. ``self.author`` is set to ``author`` as
            a side effect, and the URLs involved are printed.
        """
        URL = str(self.__browser.get_url())
        print(URL)
        self.author = author

        try:
            pairs = [f'{Filters[i]}/{FilterValues[i]}'
                     for i in range(len(Filters))]
        except (TypeError, IndexError, KeyError):
            # Unusable filter specification -> use the epoch/genre/author fallback.
            pairs = []

        if pairs:
            # Join the pairs with "/" so that several filters do not run
            # together; a single filter yields no trailing slash.
            FilterSet = 'katalog/' + '/'.join(pairs)
            newURL = f'{URL}{FilterSet}'
            print(newURL)
            return newURL

        fallback = (('epoka', epoch), ('gatunek', genre), ('autor', author))
        FilterSet = ''.join(f'{name}/{value}/' for name, value in fallback)
        return f'{URL}katalog/{FilterSet}'

    def preparing_links(self):
        """Collect the relative book-page links present on the current page.

        Every link exposed by the browser is rendered to text and scanned for
        ``katalog/lektura/<slug>/`` where the slug consists of lowercase
        letters and hyphens. The matches (duplicates included, in page order)
        are stored in ``self.linksOnPage`` and returned.
        """
        all_links = self.__browser.links()
        self.linksOnPage = re.findall(r'katalog/lektura/[a-z-]+/', str(all_links))
        return self.linksOnPage

    def detect_txt(self, link):
        """Open the book page ``link`` and return the link to its txt version.

        ``link`` is relative to the site given at construction time. The
        browser navigates to that page as a side effect; ``self.linksOnPage``
        is left untouched.

        Returns
        -------
        str or None
            The first ``media/book/txt/<slug>.txt`` link found on the page, or
            ``None`` (after printing a notice) when there is none.
        """
        self.open_site(self.__original_site + link)
        # The dot is escaped so that only a literal ".txt" suffix matches.
        txt_links = re.findall(r'media/book/txt/[a-z-]+\.txt',
                               str(self.__browser.links()))
        if not txt_links:
            print('No txt version available')
            return None
        return txt_links[0]

    @staticmethod
    def clean_txt(txt):
        """Remove carriage returns and the part after the last ``-----`` marker.

        When the marker is present, the chunks before its last occurrence are
        concatenated (earlier markers are dropped as well). When it is absent,
        the text is returned with only the carriage returns removed instead of
        being discarded.
        """
        text = txt.replace('\r', '')
        chunks = text.split(Scrapper._FOOTER_SEPARATOR)
        if len(chunks) == 1:
            # No separator at all: keep the whole text.
            return text
        return ''.join(chunks[:-1])

    def download_txt(self, link):
        """Fetch ``link`` (relative to the original site) and return its cleaned text."""
        text = self.__browser.get(self.__original_site + link).text
        return self.clean_txt(text)
    


if __name__ == '__main__':
    # Upper bound on the number of catalog entries to try in one run.
    MAX_BOOKS = 100

    scrap = Scrapper(SITE)
    print('Scrapper object created!')
    scrap.open_site(scrap.filter_catalog())
    print(scrap.site)
    links = scrap.preparing_links()
    texts = list()
    # Slicing copes with catalogs shorter than MAX_BOOKS without relying on
    # an IndexError being swallowed.
    for book_link in links[:MAX_BOOKS]:
        try:
            link = scrap.detect_txt(book_link)
            if link is None:
                # detect_txt already reported the missing txt version.
                continue
            texts.append(scrap.download_txt(link))
        except Exception as err:
            # Best effort: report the failure and carry on with the next book.
            # KeyboardInterrupt is deliberately not caught, so the loop can
            # still be interrupted.
            print(f'Skipping {book_link}: {err!r}')


Scrapper object created!
https://wolnelektury.pl/
https://wolnelektury.pl/katalog/gatunek/wiersz
https://wolnelektury.pl/katalog/gatunek/wiersz/
No txt version available


In [11]:
# Drop carriage returns from `text` (in place), then display every chunk that
# precedes the last '-----' separator.
text = text.replace('\r', '')
text.split('-----')[0:-1]

['Edward Leszczyński\n\nNiańka\n\n\n\n— Niańko moja ty stara,\ndaj mi nową zabawkę,\nzbrzydła mi biała Ara,\nco ma z kółka huśtawkę,\ni lalki, co tak samo\nwołają ciągle: mamo.\n\n— Cóż ja dam mojej malutkiej?\nchyba krosna tęsknoty,\nhaftuj na nich swe smutki,\nmaluj na nich sen złoty,\njasnych maluj rycerzy,\naż twe serce uwierzy.\n\n— Krosna twoje niezdarne,\npalce na nich pokłułam,\nsame nici mam czarne,\njasną nić już wysnułam;\nrycerz złotem dzierzgany\nnie zszedł z koniem ze ściany.\n\n— Cóż ja dam mojej dziewczynce?\nchyba dla jej igraszki\nw rubinowej dam skrzynce\nserduszka z złocistej blaszki;\npod rubinową tęczą\nte serduszka wciąż dźwięczą.\n\n— Coś straszy w każdym serduszku,\nwszystkie dziwnie łopocą\nw rubinach przy mym łóżku,\nspać nie dadzą mi nocą;\ndość mam twojej grzechotki,\nwróć mi, wróć mi sen słodki.\n\n— Cóż ja dam mojej królewnie?\nchyba trumienkę białą —\nw sen utulę ją pewnie,\nby jej dobrze się spało,\ni położę pod główkę\nzeschłą w słońcu makówkę.\n\n— Pó

In [50]:
# Number of book links collected from the filtered catalog page by preparing_links().
len(links)

939

In [29]:
# Download and clean the txt file behind the `link` left over from the scraping loop.
# NOTE(review): the AttributeError recorded below looks stale — the clean_txt defined
# above only calls .split on a str; re-run after Restart & Run All to confirm.
scrap.download_txt(link)

AttributeError: 'list' object has no attribute 'split'