# Scrape the ProTV news archive

In [None]:
import re
from datetime import date, datetime

import requests
from bs4 import BeautifulSoup


def parse_date(date_str: str) -> date:
    """Convert a ``DD.MM.YYYY`` string into a ``datetime.date``.

    ``datetime.strptime`` raises ``ValueError`` when the text does not match
    that layout or names an impossible calendar date.
    """
    parsed = datetime.strptime(date_str, '%d.%m.%Y')
    return parsed.date()


# Oldest broadcast date to collect; the crawl stops at the first older item.
since = parse_date('05.06.2015')

page = 1
stop = False

# One dict per video: {'date': 'DD.MM.YYYY', 'date_obj': date, 'href': str}.
results = []

# Walk the paginated listing until an item older than `since` shows up, the
# listing runs out of items, or the server answers with an error status.
# NOTE(review): the early stop assumes the listing is ordered newest-first.
while not stop:
    response = requests.get(
        f'https://stirileprotv.ro/video/stirile-diminetii/?page={page}',
        timeout=30,  # never hang forever on a stalled connection
    )
    if not response.ok:
        print(f'Stopping at page {page}: HTTP {response.status_code}')
        break
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    items = soup.select('div.vid_item')
    if not items:
        # A page without video items means we ran past the end of the
        # listing; without this check the loop would never terminate.
        break

    for item in items:
        a_tag = item.find('a', href=True)
        title_el = a_tag.find(class_='title') if a_tag is not None else None
        if a_tag is None or title_el is None:
            # Unexpected markup: skip this entry instead of crashing the run.
            print('Skipping item without a link or title')
            continue

        href = a_tag['href']
        title_text = title_el.get_text(strip=True)

        # The broadcast date is embedded in the title as DD.MM.YYYY.
        m = re.search(r'(\d{2}\.\d{2}\.\d{4})', title_text)
        if m is None:
            print(f'Could not match date format for {title_text}')
            continue
        date_str = m.group(1)
        try:
            date_obj = parse_date(date_str)
        except ValueError:
            # Digits matched the pattern but are not a real calendar date.
            print(f'Could not match date format for {title_text}')
            continue

        if since > date_obj:
            stop = True
            break

        results.append({'date': date_str, 'date_obj': date_obj, 'href': href})

    page += 1

# Newest first.
results.sort(key=lambda x: x['date_obj'], reverse=True)

In [None]:
import os
from datetime import timedelta
from pathlib import Path
import time


def alternating_pairs(n):
    """Yield -1, 1, -2, 2, ... up to -n, n.

    Produces nothing when ``n`` is less than 1.
    """
    for magnitude in range(1, n + 1):
        yield from (-magnitude, magnitude)


# A span of exactly one calendar day.
one_day = timedelta(days=1)


def stringify_date(date_obj: date) -> str:
    """Format a date as ``YYYY/MM/DD``, the layout used in the video URL path."""
    return '{:%Y/%m/%d}'.format(date_obj)


def try_get_video_url(r, max_offset_days=2 * 365, timeout=30):
    """Locate the MP4 for a scraped result by probing the video host.

    The candidate URL is
    ``https://vid2.stirileprotv.ro/<YYYY/MM/DD>/<id>-2.mp4`` where ``<id>`` is
    the first run of digits in ``r['href']``.  The date in ``r['date_obj']``
    is tried first; on a 404 the dates 1, 2, ... ``max_offset_days`` days
    before and after it are probed in turn with HEAD requests.

    Args:
        r: Result dict with the keys ``'href'``, ``'date'`` and ``'date_obj'``.
        max_offset_days: How many days on each side of the date to probe.
        timeout: Timeout in seconds applied to every HEAD request.

    Returns:
        A ``(vid_url, date_str, video_id)`` tuple for the first URL that
        answers 200; ``date_str`` is the ``YYYY/MM/DD`` path that matched.

    Raises:
        ValueError: If ``r['href']`` contains no digits, if the host answers
            with a status other than 200 or 404, or if no probed date works.
    """
    match = re.search(r'\d+', r['href'])
    if match is None:
        raise ValueError(f"No numeric video id in href {r['href']!r}")
    video_id = match.group()

    # Offset 0 is the date taken from the title; then -1, +1, -2, +2, ... days.
    for offset in (0, *alternating_pairs(max_offset_days)):
        date_str = stringify_date(r['date_obj'] + timedelta(offset))
        vid_url = f'https://vid2.stirileprotv.ro/{date_str}/{video_id}-2.mp4'
        status = requests.head(vid_url, timeout=timeout).status_code
        if status == 200:
            if offset:
                print(f'Found for {r["date"]} at {date_str} actually.')
            return vid_url, date_str, video_id
        if status != 404:
            # Anything other than "found"/"not found" is treated as fatal.
            raise ValueError(f"Something's wrong for {r['date']}")
    raise ValueError(f'Could not find for {r["date"]}')


# Bytes written since the last "Downloaded 1GB" progress message.
downloaded = 0

# Videos are stored as <root>/<YYYY/MM/DD>/<id>-2.mp4.
root = 'scraped'

chunk_size = 1024 * 1024
gigabyte = 1024 * 1024 * 1024

for r in results:
    vid_url, date_str, video_id = try_get_video_url(r)
    Path(os.path.join(root, date_str)).mkdir(parents=True, exist_ok=True)
    file_name = os.path.join(root, date_str, video_id + '-2.mp4')
    if os.path.exists(file_name):
        print(f'Skipping {file_name}')
        continue
    # Stream into a ".part" file and rename it only once the download has
    # finished, so an interrupted run never leaves a truncated file that the
    # existence check above would skip on the next run.
    part_name = file_name + '.part'
    with requests.get(vid_url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(part_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    if downloaded >= gigabyte:
                        print('Downloaded 1GB')
                        downloaded -= gigabyte
                # Be nice to the server: pause one second after every chunk.
                time.sleep(1)
    os.replace(part_name, file_name)
    print(f'Done with {vid_url}.')