In [1]:
# from 2024-02-29 (page 231) until 2021-01-20 (page 721)

import locale
import os
import re
import uuid
from datetime import datetime

import requests
import yt_dlp
from bs4 import BeautifulSoup

# Switch LC_TIME to Romanian so that the '%B' (full month name) directive in
# the datetime.strptime call further down parses Romanian month names.
# NOTE(review): this is a process-wide setting and raises locale.Error when the
# 'ro_RO.UTF-8' locale is not installed on the host.
locale.setlocale(locale.LC_TIME, 'ro_RO.UTF-8')


# Prefix of the paginated episode listing; the page number gets appended to it.
url = 'https://www.protv.ro/emisiuni/stirile-pro-tv/videoclipuri/episoade/pagina-'

# Link texts to pick from a listing page: those that start (after optional
# whitespace) with "Stirile PRO TV # 06.00".
title_pattern = re.compile(pattern=r'^\s*Stirile PRO TV # 06\.00')

# Captures the URL of the first "src" entry ending in ".mpd" inside a
# `"DASH": [ { "src": "..."` fragment of an inline script.
dash_pattern = re.compile(
    pattern=r'"DASH"\s*:\s*\[\s*\{\s*"src"\s*:\s*"([^"]+?\.mpd)"',
    flags=re.DOTALL,
)


# Base yt-dlp options shared by all downloads; the per-episode 'outtmpl' is
# merged into a copy of this mapping right before each download.
ydl_opts = dict(
    # Format preference, left to right: best video capped at 720p merged with
    # best audio; otherwise the best combined stream capped at 720p; otherwise
    # whatever combined stream is best.
    format='bv[height<=720]+ba/b[height<=720]/b',
    merge_output_format='mkv',
    overwrites=False,
    retries=10,
    fragment_retries=10,
    concurrent_fragment_downloads=16,
    ignoreerrors='only_download',
    noprogress=False,
    quiet=False,
)


# (connect, read) timeouts in seconds applied to every HTTP request, so that a
# stalled connection cannot hang the whole run.
request_timeout = (10, 60)

# "<day> <month name> <year>" inside an episode title. Searching for the date
# itself avoids depending on the exact dash character that separates it from
# the show name.
date_pattern = re.compile(r'\d{1,2}\s+[^\W\d_]+\s+\d{4}')


def fetch_soup(session, page_url):
    """Fetch `page_url` through `session` and return the parsed HTML.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: no answer arrived within `request_timeout`.
    """
    page_response = session.get(page_url, timeout=request_timeout)
    page_response.raise_for_status()
    return BeautifulSoup(page_response.text, 'html.parser')


def find_dash_manifest(player_soup):
    """Return the first URL captured by `dash_pattern` in an inline script.

    Only <script> tags without a `src` attribute are searched. Returns None
    when no script matches.
    """
    for script in player_soup.find_all('script', src=False):
        if not script.string:
            continue
        found = dash_pattern.search(script.string)
        if found:
            return found.group(1)
    return None


# Output directories for which yt-dlp reported a failed download.
failed_dirs = []

# A single session reuses connections across the many page requests.
with requests.Session() as session:
    for page in range(721, 724):  # 231
        print(page)
        listing = fetch_soup(session, url + str(page))
        anchors = listing.find_all('a', string=title_pattern)  # type: ignore

        for a in anchors:
            title = a.get_text()
            date_match = date_pattern.search(title)
            if date_match is None:
                raise ValueError(f'No date found in episode title {title!r}')
            # '%B' follows the LC_TIME locale configured at the top of the cell.
            date = datetime.strptime(date_match.group(0), '%d %B %Y')
            out_dir = os.path.join('scraped', date.strftime('%Y/%m/%d'))

            # One directory per broadcast date; an existing directory means the
            # date was already handled by an earlier run.
            if os.path.exists(out_dir):
                print(f'Skipping {out_dir}')
                continue

            href = a['href']

            # Episode page -> player iframe -> inline script with the manifest.
            episode = fetch_soup(session, href)
            iframe = episode.find('iframe', class_='player-container js-player-container')
            if iframe is None:
                raise RuntimeError(f"Couldn't find player iframe on {href}")

            player = fetch_soup(session, iframe['data-src'])  # type: ignore
            dash_src = find_dash_manifest(player)
            if not dash_src:
                raise RuntimeError(f"Couldn't find DASH for {href}")

            # Created only after the manifest was found, so a scraping error
            # does not leave behind a directory that would be skipped later.
            os.makedirs(out_dir, exist_ok=False)
            name = os.path.join(out_dir, str(uuid.uuid4()))

            opts = ydl_opts | {
                'outtmpl': f'{name}.%(ext)s',
            }

            with yt_dlp.YoutubeDL(opts) as ydl:  # type: ignore
                print(f'\n=== Downloading: {name} ===')
                retcode = ydl.download([dash_src])

            # With ignoreerrors='only_download' a failed download does not
            # raise; it is only visible through the return code.
            if retcode:
                failed_dirs.append(out_dir)
                print(
                    f'!!! Download failed for {name} (yt-dlp return code {retcode}); '
                    f'remove {out_dir} to retry it on the next run'
                )

if failed_dirs:
    print(f'\n{len(failed_dirs)} download(s) failed:')
    for failed_dir in failed_dirs:
        print(f'  {failed_dir}')

721
Skipping scraped/2021/01/25/28c4e658-a317-45fe-94f3-5a77812caf3f
722

=== Downloading: scraped/2021/01/21/541160db-c9e9-4445-850c-387c2d9a4b8d ===
[generic] Extracting URL: https://cmero-ott-vod-web-prep-sec.ssl.cdn.cra.cz/P-1RDHKP6qbAFOG45xyuJQ==,1763556035/0276/5130/r...d2-S4kSoCAM.ism/.mpd
[generic] .mpd: Downloading webpage




[generic] .mpd: Extracting information
[info] .mpd: Downloading 1 format(s): vid=3478000+aud_rum=128000
[dashsegments] Total fragments: 3797
[download] Destination: scraped/2021/01/21/541160db-c9e9-4445-850c-387c2d9a4b8d.fvid=3478000.mp4
[download] 100% of    4.61GiB in 00:03:12 at 24.59MiB/s                    
[dashsegments] Total fragments: 3797
[download] Destination: scraped/2021/01/21/541160db-c9e9-4445-850c-387c2d9a4b8d.faud_rum=128000.m4a
[download] 100% of  178.15MiB in 00:02:11 at 1.35MiB/s                     
[Merger] Merging formats into "scraped/2021/01/21/541160db-c9e9-4445-850c-387c2d9a4b8d.mkv"
Deleting original file scraped/2021/01/21/541160db-c9e9-4445-850c-387c2d9a4b8d.faud_rum=128000.m4a (pass -k to keep)
Deleting original file scraped/2021/01/21/541160db-c9e9-4445-850c-387c2d9a4b8d.fvid=3478000.mp4 (pass -k to keep)
723

=== Downloading: scraped/2021/01/20/6a91b0eb-6cf9-48fe-a6b3-c7b5149fe8ab ===
[generic] Extracting URL: https://cmero-ott-vod-web-prep-sec.ssl.cdn



[generic] .mpd: Extracting information
[info] .mpd: Downloading 1 format(s): vid=3480000+aud_rum=128000
[dashsegments] Total fragments: 3792
[download] Destination: scraped/2021/01/20/6a91b0eb-6cf9-48fe-a6b3-c7b5149fe8ab.fvid=3480000.mp4
[download] 100% of    4.61GiB in 00:03:03 at 25.75MiB/s                    
[dashsegments] Total fragments: 3793
[download] Destination: scraped/2021/01/20/6a91b0eb-6cf9-48fe-a6b3-c7b5149fe8ab.faud_rum=128000.m4a
[download] 100% of  177.45MiB in 00:02:11 at 1.35MiB/s                     
[Merger] Merging formats into "scraped/2021/01/20/6a91b0eb-6cf9-48fe-a6b3-c7b5149fe8ab.mkv"
Deleting original file scraped/2021/01/20/6a91b0eb-6cf9-48fe-a6b3-c7b5149fe8ab.fvid=3480000.mp4 (pass -k to keep)
Deleting original file scraped/2021/01/20/6a91b0eb-6cf9-48fe-a6b3-c7b5149fe8ab.faud_rum=128000.m4a (pass -k to keep)
