In [31]:
import requests
import time
import csv

from bs4 import BeautifulSoup
from urllib.parse import urljoin

class CrawledArticle:
    """Plain record holding the fields scraped from one article card.

    Attributes:
        title: Headline text of the article.
        emoji: Emoji shown next to the headline.
        content: Body text of the article.
        image: URL of the article's image.
    """

    def __init__(self, title, emoji, content, image):
        # Store the four values unchanged; no validation or conversion.
        self.title, self.emoji = title, emoji
        self.content, self.image = content, image

class ArticleFetcher():
    """Lazily crawl a paginated article listing, starting at one URL.

    Pages are downloaded one at a time, only when the caller consumes the
    generators returned by ``fetch__all__pages``.
    """

    def __init__(self, mainPageUrl):
        """Remember the start URL; no network access happens here.

        Args:
            mainPageUrl: URL of the first listing page to crawl.
        """
        self.__mainPageUrl = mainPageUrl
        self.__pages = {}      # parsed documents, keyed by page URL
        self.__articles = []

    def __extract_doc(self, pgurl):
        """Download ``pgurl``, parse it as HTML and return the parsed document.

        The parsed document is also stored in ``self.__pages`` under its URL.
        """
        # Without a timeout, requests waits indefinitely on a stalled server.
        r = requests.get(pgurl, timeout=30)
        doc = BeautifulSoup(r.text, "html.parser")

        self.__pages[pgurl] = doc
        return doc

    def __get_next_page_url(self, pgurl, doc):
        """Return the absolute URL of the following page, or None.

        A link is followed only when ``.navigation a`` matches exactly one
        element; any other number of matches ends the crawl.
        """
        nav = doc.select(".navigation a")
        if len(nav) == 1:
            return urljoin(pgurl, nav[0].attrs["href"])
        return None

    def __fetch(self, doc, pgurl):
        """Yield one CrawledArticle per ``.card`` element found in ``doc``.

        Args:
            doc: Parsed page to extract the cards from.
            pgurl: URL of that page, used to resolve relative image paths.
        """
        for card in doc.select(".card"):
            emoji = card.select_one(".emoji").text
            content = card.select_one(".card-text").text
            # The title is taken from the second <span> inside .card-title.
            title = card.select(".card-title span")[1].text
            img = urljoin(pgurl, card.select_one("img").attrs["src"])

            yield CrawledArticle(title, emoji, content, img)

    def fetch__all__pages(self):
        """Yield, for every page of the listing, a generator of its articles.

        Each page is downloaded when the generator is advanced to it. The URL
        is printed and the crawl pauses for one second after every download.
        """
        url = self.__mainPageUrl
        while url is not None:
            doc = self.__extract_doc(url)
            print(url)
            time.sleep(1)
            yield self.__fetch(doc, url)
            url = self.__get_next_page_url(url, doc)

    def write_to_file(self, fileName, numArticles):
        """Write at most ``numArticles`` crawled articles to a CSV file.

        The file is ';'-delimited, UTF-8 encoded and starts with the header
        row ``emoji;title;content;image``.

        Args:
            fileName: Path of the CSV file to create or overwrite.
            numArticles: Maximum number of article rows to write.
        """
        with open(fileName, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(["emoji", "title", "content", "image"])

            if numArticles <= 0:
                return

            written = 0
            # Crawl with this instance rather than a module-level global, and
            # stop right after the last wanted row so no further article (and
            # therefore no further page) is requested.
            for page in self.fetch__all__pages():
                for a in page:
                    writer.writerow([a.emoji, a.title, a.content, a.image])
                    written += 1
                    if written >= numArticles:
                        return
                

# print(r.status_code)
# print(r.headers)
# print(r.text)

# Crawl the demo site. fetch__all__pages() returns a generator that yields
# one generator of articles per page, so nothing is downloaded until the
# loop below starts consuming it.
fetcher = ArticleFetcher("http://python.beispiel.programmierenlernen.io/")
pages = fetcher.fetch__all__pages()

# Flatten the per-page generators lazily. The eager alternative would be the
# list comprehension [art for pg in pages for art in pg], which crawls every
# page before the first article is printed.
articles = (item for page in pages for item in page)

# Print the first seven articles; the loop ends as soon as the eighth
# article has been pulled from the generator.
counter = 0
for counter, art in enumerate(articles, start=1):
    if counter == 8:
        break
    print(art.emoji, art.title, sep=": ")

http://python.beispiel.programmierenlernen.io/
😩: Polarised modular conglomeration
😐: Cross-group contextually-based middleware
😌: De-engineered encompassing structure
😚: Fully-configurable multi-tasking interface
😠: Versatile eco-centric core
😮: Optional maximized utilisation
😢: Open-architected secondary product
http://python.beispiel.programmierenlernen.io/index.php?page=2


In [34]:
# Export the first five crawled articles to a ';'-delimited CSV file.
fetcher.write_to_file(fileName='crawler_output_generator.csv', numArticles=5)

http://python.beispiel.programmierenlernen.io/


In [35]:
 # Read the exported CSV back and print each row with its fields comma-separated.
 with open('crawler_output_generator.csv', 'r', newline='', encoding='utf-8') as csv_file:
    for fields in csv.reader(csv_file, delimiter=';', quotechar='"'):
        print(', '.join(fields))

emoji, title, content, image
😩, Polarised modular conglomeration, Optio numquam ut accusantium laborum unde assumenda. Ea et totam asperiores fugiat voluptatem vitae. Et provident nam et mollitia., http://python.beispiel.programmierenlernen.io/img/1.jpg
😐, Cross-group contextually-based middleware, Deleniti atque autem et commodi cupiditate cupiditate. Fuga illum quas aliquam velit. Labore dolor fugit quia id odio nam., http://python.beispiel.programmierenlernen.io/img/2.jpg
😌, De-engineered encompassing structure, Assumenda tempora inventore harum cumque voluptatibus sit et. Et omnis et dolore quod voluptas sit a., http://python.beispiel.programmierenlernen.io/img/3.jpg
😚, Fully-configurable multi-tasking interface, Cumque unde officia autem quia at fugit. Sint iure veritatis culpa aut provident aliquam in. Eos eum accusantium quia vel dignissimos nesciunt expedita. Rem aut accusantium et tempore., http://python.beispiel.programmierenlernen.io/img/4.jpg
😠, Versatile eco-centric core, 