# Scrape the PrimaTV FOCUS 22 & 23 YouTube archive

## Find interpreter in thumbnail for FOCUS 22 & 23

### Get thumbnails for all videos

In [None]:
import os
import re
import requests
from yt_dlp import YoutubeDL


# Directory that receives one JPEG per matching video; created up front so
# the download loop can write into it directly.
out_dir = 'thumbs'
os.makedirs(out_dir, exist_ok=True)

# Playlist whose entries are enumerated for thumbnails.
playlist_url = 'https://www.youtube.com/playlist?list=PL4GqKMm44AA10WhcE1-ZFdZjhxXLXHHhU'

# Only titles that start with "FOCUS 22" or "FOCUS 23" are kept
# (case-insensitive, optional whitespace before the number).
title_pattern = re.compile(r'^FOCUS\s*2[23]', flags=re.IGNORECASE)

# yt-dlp options for listing the playlist without downloading any media.
ydl_opts = {
    # Keep yt-dlp's own console output to a minimum.
    'quiet': True,
    # Metadata only; no video files are fetched in this cell.
    'skip_download': True,
    # Do not resolve every playlist entry individually.
    'extract_flat': 'in_playlist',
    'no_warnings': True,
    # Cookie file read by yt-dlp from the working directory.
    'cookiefile': 'cookies.txt',
}


# Enumerate the playlist and save one thumbnail per matching video into
# `out_dir`. Videos whose thumbnail file already exists are skipped, so the
# cell can be re-run to pick up where it left off.
with YoutubeDL(ydl_opts) as ydl:  # type: ignore
    playlist_info = ydl.extract_info(playlist_url, download=False) or {}

    # `entries` may be missing or None; treat both as an empty playlist.
    for entry in playlist_info.get('entries') or []:
        if not entry:
            continue

        title = entry.get('title') or ''
        video_id = entry.get('id')

        if not video_id:
            continue

        filename = os.path.join(out_dir, f'{video_id}.jpg')

        if os.path.exists(filename):
            print(f'skip {filename}')
            continue

        if not title_pattern.search(title):
            continue

        # Candidate thumbnail URLs, tried in order until one answers 200.
        thumb_urls = [
            f'https://i.ytimg.com/vi/{video_id}/maxresdefault.jpg',
            f'https://i.ytimg.com/vi/{video_id}/hqdefault.jpg',
        ]

        # Stream into a temporary file and rename on success, so an
        # interrupted download never leaves a truncated .jpg that a later
        # run would mistake for a finished thumbnail.
        part_name = f'{filename}.part'

        for thumb_url in thumb_urls:
            print(f'Trying thumbnail for: {title} -> {thumb_url}')
            try:
                # The timeout keeps one stalled connection from hanging the
                # whole run; the context manager always releases the
                # connection, including on the success path.
                with requests.get(thumb_url, stream=True, timeout=30) as resp:
                    if resp.status_code != 200:
                        continue
                    with open(part_name, 'wb') as f:
                        for chunk in resp.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                os.replace(part_name, filename)
            except requests.RequestException as exc:
                # A network failure on one URL should not abort the whole
                # playlist; fall through to the next candidate.
                print(f'Request failed for {thumb_url}: {exc}')
                continue
            finally:
                # Only present if the download did not complete.
                if os.path.exists(part_name):
                    os.remove(part_name)

            print(f'Saved to {filename}')
            break
        else:
            print(f'Could not get thumbnail for: {title}')

### Crop and find interpreter

In [None]:
import numpy as np


def crop_interpreter(img: np.ndarray):
    """Cut a fixed rectangle out of a thumbnail, chosen by image height.

    Two image heights are supported, 720 and 360 pixels, each with its own
    hard-coded crop box. Presumably the box marks where the interpreter
    appears in the frame; the coordinates are fixed, not detected.

    Args:
        img: Image array with at least two dimensions; the first is read as
            the height.

    Returns:
        The slice ``img[y:y + box_h, x:x + box_w]`` for the matching box.

    Raises:
        ValueError: If the image height is neither 720 nor 360.
    """
    height, _width = img.shape[:2]

    # image height -> (x, y, box width, box height)
    crop_boxes = {
        720: (1020, 480, 221, 237),
        360: (388, 225, 82, 88),
    }

    if height not in crop_boxes:
        raise ValueError('Height not supported')

    left, top, box_w, box_h = crop_boxes[height]
    return img[top : top + box_h, left : left + box_w]


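### Manually delete garbage thumbnails

Step through the cropped thumbnails in a preview window: `d` shows the next image, `a` the previous one, `x` deletes the current thumbnail file, and `q` quits.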

In [None]:
import cv2
import face_recognition


# Interactive review of the cropped thumbnails.
#   d - next image       a - previous image
#   x - delete the current thumbnail file
#   q - quit
# Sorted so the browsing order is stable between runs (os.listdir order is
# arbitrary).
files = sorted(os.listdir('thumbs'))
idx = 0

window_name = 'Viz'
cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)

try:
    # Stop once every file has been deleted instead of indexing into an
    # empty list.
    while files:
        # Re-wrap the index: deleting the last entry leaves idx one past the
        # end of the list.
        idx %= len(files)
        print(files[idx])

        img = crop_interpreter(
            face_recognition.load_image_file(os.path.join('thumbs', files[idx]))
        )
        # The loaded image is converted from RGB to BGR before display.
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        cv2.imshow(window_name, img)

        key = cv2.waitKey(0)

        if key == ord('q'):
            break
        elif key == ord('d'):
            idx += 1
        elif key == ord('a'):
            idx -= 1
        elif key == ord('x'):
            os.remove(os.path.join('thumbs', files[idx]))
            # idx now points at the entry that followed the deleted one.
            del files[idx]
    else:
        print('No thumbnails left to review.')
finally:
    # Close the window even if loading or cropping an image raised.
    cv2.destroyAllWindows()

### Download videos

In [None]:
import os
from yt_dlp import YoutubeDL

# File extensions that count as "this video has already been downloaded".
KNOWN_EXTS = {'mp4', 'mkv'}

# Downloaded videos are written to this directory.
os.makedirs('scraped', exist_ok=True)

# yt-dlp options for the actual video download.
ydl_opts = {
    # Format selector with two fallbacks after the `language=ro` audio
    # filter (see yt-dlp's format selection syntax for bv*/ba/b).
    'format': 'bv*+ba[language=ro]/bv*+ba/b',
    'merge_output_format': 'mkv',
    # Files are named after the video id inside the `scraped` directory.
    'outtmpl': {'default': os.path.join('scraped', '%(id)s.%(ext)s')},
    'overwrites': False,
    'retries': 10,
    'fragment_retries': 10,
    'concurrent_fragment_downloads': 16,
    'ignoreerrors': 'only_download',
    'noprogress': False,
    'quiet': True,
    # Cookie file read by yt-dlp from the working directory.
    'cookiefile': 'cookies.txt',
}


def is_already_scraped(video_id: str) -> bool:
    """Report whether a file for ``video_id`` already exists in ``scraped``.

    Args:
        video_id: Base file name (without extension) to look for.

    Returns:
        True if ``scraped/<video_id>.<ext>`` exists for at least one
        extension in ``KNOWN_EXTS``, otherwise False.
    """
    candidates = (
        os.path.join('scraped', f'{video_id}.{ext}') for ext in KNOWN_EXTS
    )
    return any(os.path.exists(path) for path in candidates)


# Build the download queue from the thumbnails that remain in `thumbs`:
# each `<video_id>.jpg` maps to one watch URL, unless a file for that id
# already exists in `scraped`.
urls = []
for thumb in sorted(os.listdir('thumbs')):
    # Split off the real extension rather than replacing '.jpg' anywhere in
    # the name, and ignore stray non-thumbnail files (e.g. OS metadata) that
    # would otherwise be turned into bogus video ids.
    video_id, ext = os.path.splitext(thumb)
    if ext != '.jpg':
        continue
    if not is_already_scraped(video_id):
        urls.append(f'https://www.youtube.com/watch?v={video_id}')

print(f'Downloading {len(urls)} new videos...')

# Only construct the downloader when there is actually something to fetch.
if urls:
    with YoutubeDL(ydl_opts) as ydl:  # type: ignore
        ydl.download(urls)
else:
    print('All videos already scraped.')
