In [5]:
!pip install requests beautifulsoup4 tiktoken huggingface_hub
!pip install datasets



Importing necessary packages

In [6]:
import requests
from bs4 import BeautifulSoup
import re
import tiktoken
from tqdm import tqdm
import pandas as pd

HTML Cleaning

In [45]:
"""
def clean_text(html_text):
    ""Odstraní HTML tagy a jiné nadbytečné znaky z textu.""
    soup = BeautifulSoup(html_text, "html.parser")
    text = soup.get_text()
    text = re.sub(r'\s+', ' ', text)  # Odstranění přebytečných bílých znaků
    return text.strip()
    
"""    
import re
from bs4 import BeautifulSoup
import html

def clean_text(html_text):
    """Return the visible text of an HTML fragment as one clean line.

    The markup is parsed with BeautifulSoup's ``html.parser``; ``<script>``
    and ``<style>`` elements are removed, the remaining text nodes are
    joined with spaces, HTML entities are decoded, and every run of
    whitespace is collapsed into a single space.

    Args:
        html_text: HTML markup to clean.

    Returns:
        The extracted text with leading and trailing whitespace stripped.
    """
    document = BeautifulSoup(html_text, "html.parser")

    # Drop elements whose contents are code/styling rather than readable text.
    for node in document.find_all(["script", "style"]):
        node.decompose()

    # The space separator keeps words from adjacent elements apart.
    visible = html.unescape(document.get_text(separator=" "))

    return re.sub(r'\s+', ' ', visible).strip()



Functions for web scraping

In [43]:
def scrape_article_content(page_content):
    """Extract the cleaned body text of an article page.

    Only paragraphs (``p``) and headings (``h1``-``h6``) that are direct
    children of the element with id ``main-content`` are kept.

    Args:
        page_content: HTML of the article page (anything BeautifulSoup
            accepts, e.g. the bytes of an HTTP response body).

    Returns:
        The article text as produced by ``clean_text``; an empty string
        when no admissible element is found.
    """
    soup = BeautifulSoup(page_content, "html.parser")

    admissible_tags = ("p", "h1", "h2", "h3", "h4", "h5", "h6")
    selector = ", ".join(f"#main-content > {tag}" for tag in admissible_tags)
    admissible_elements = soup.select(selector)

    # Hand clean_text the elements' markup, not already-extracted text.
    # clean_text parses its input as HTML, so feeding it plain text would
    # parse the content twice and silently drop any literal "<...>" that
    # appears in the article text (it would be mistaken for a tag).
    raw_markup = " ".join(str(element) for element in admissible_elements)

    return clean_text(raw_markup)

In [53]:
def scrape_article(article_url, timeout=30):
    """Download one article and return its cleaned text.

    This is best effort: every failure is reported with ``print`` and
    results in an empty string instead of an exception.

    Args:
        article_url: URL of the article page.
        timeout: Timeout in seconds passed to ``requests.get``. Without a
            timeout a stalled connection would block the scraping run
            indefinitely.

    Returns:
        The article text, or ``''`` when the response status is not 200 or
        any exception (including a timeout) occurs.
    """
    try:
        response = requests.get(article_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve {article_url}")
            return ''

        return scrape_article_content(response.content)
    except Exception as e:
        # Deliberately broad: one bad article must not abort the whole scrape.
        print(f"Error retrieving {article_url}: {e}")
        return ''

In [39]:
def get_article_links(page_content):
    """Collect article URLs from a listing page.

    Every element carrying the CSS class ``text-black`` is considered a
    candidate link.

    Args:
        page_content: HTML of the listing page.

    Returns:
        List of ``href`` values in document order. Elements without an
        ``href`` (or with an empty one) are skipped so callers never
        receive ``None`` or ``''`` as a URL.
    """
    soup = BeautifulSoup(page_content, "html.parser")
    candidates = soup.find_all(class_="text-black")

    # The class filter is not restricted to <a> tags, so an href may be absent.
    return [link.get("href") for link in candidates if link.get("href")]

In [58]:
def scrape_paginated_list(base_list_url, num_pages=10, timeout=30):
    """Scrape the articles linked from a paginated listing.

    Listing pages are requested as ``base_list_url + str(page)`` for
    ``page`` in ``1..num_pages``; every article link found on a page is
    downloaded with ``scrape_article``.

    Args:
        base_list_url: URL prefix to which the page number is appended.
        num_pages: Number of listing pages to visit.
        timeout: Timeout in seconds for each listing-page request, so a
            stalled connection cannot hang the run indefinitely.

    Returns:
        List of non-empty article texts, in the order they were scraped.
        Scraping stops early (keeping what was collected so far) at the
        first listing page that does not return status 200 or raises.
    """
    texts = []
    for page in tqdm(range(1, num_pages + 1)):
        pagination_url = base_list_url + str(page)

        try:
            list_response = requests.get(pagination_url, timeout=timeout)
            if list_response.status_code != 200:
                print(f"Failed to retrieve {pagination_url}")
                break

            # leave=False removes the inner bar once a page is finished.
            for article_link in tqdm(get_article_links(list_response.content), leave=False):
                content = scrape_article(article_link)
                if content != '':
                    texts.append(content)
        except Exception as e:
            print(f"Error retrieving {pagination_url}: {e}")
            break
    return texts

Our chosen web pages: we are interested in transport news, specifically the railway ("železnice") category of zdopravy.cz

Let us now run the web scraping...

In [61]:
# Run the web scraping for the given number of pages on each source
# (the second argument, 70, is the number of listing pages to visit).
texts = scrape_paginated_list("https://zdopravy.cz/category/zeleznice/page/", 70) # should take about 10 minutes

100%|██████████| 70/70 [10:22<00:00,  8.90s/it]


... and count the number of tokens:

In [62]:
# Count GPT-2 BPE tokens over all scraped texts.
encoder = tiktoken.get_encoding("gpt2")
# encode_ordinary treats special-token strings such as "<|endoftext|>" as
# plain text; encode() would raise ValueError if a scraped page contained one.
total_tokens = sum(len(encoder.encode_ordinary(text)) for text in texts)

# The messages below read "Total number of tokens" / "Number of text items".
print(f"Celkový počet tokenů: {total_tokens}")
print(f"Počet textových bodů: {len(texts)}")

Celkový počet tokenů: 1279543
Počet textových bodů: 1032


Save the scraped text as a .csv file

In [63]:
# One row per scraped article, stored in a single "text" column.
data = pd.DataFrame({"text": texts})

# "utf-8-sig" prefixes the file with a UTF-8 byte-order mark
# (presumably so spreadsheet tools detect the encoding — confirm).
data.to_csv(
    "zeleznice_dataset.csv",
    index=False,
    escapechar="\\",
    encoding="utf-8-sig",
)