# Extract and crop the interpreter

In [None]:
import os
import sys

import numpy as np

# Put the parent of the notebook's working directory at the front of sys.path
# (presumably so project packages such as `utils` can be imported from here).
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# IPython magics: enable the autoreload extension in mode 2 so edited modules
# are re-imported automatically.
%load_ext autoreload
%autoreload 2

# Directory tree that the os.walk loops in this notebook scan for videos.
ROOT_DIR = 'scraped'


# Interpreter box
def crop_interpreter(img: np.ndarray):
    """Return the fixed interpreter box cut out of a frame.

    The box position and size are hard-coded per supported frame height
    (720 or 432 rows). Any other height raises ``ValueError``.
    """
    # frame height -> (left, top, box width, box height), in pixels
    boxes = {
        720: (952, 347, 300, 322),
        432: (571, 208, 180, 193),
    }

    height, _ = img.shape[:2]
    if height not in boxes:
        raise ValueError('Height not supported')

    left, top, box_w, box_h = boxes[height]
    return img[top : top + box_h, left : left + box_w]

## Get frame indexes between which the interpreter is present

In [None]:
import json
import os
from cv2 import VideoCapture
import cv2
import face_recognition

from utils.video import get_vid_metadata, get_frame


# Reference gallery built from the images in 'known_people/'.
# known_faces[i] is the face encoding and known_people[i] the label (file name
# without '.png') of the same reference image.
known_faces = []
known_people = []
for file_name in os.listdir('known_people'):
    img = face_recognition.load_image_file(os.path.join('known_people', file_name))
    found = face_recognition.face_encodings(img)
    if not found:
        # face_encodings returns an empty list when no face is detected; fail
        # with the offending file name instead of a bare IndexError.
        raise ValueError(
            f'No face detected in reference image known_people/{file_name}'
        )
    enc = found[0]  # only the first detected face is used as the reference
    known_faces.append(enc)
    known_people.append(file_name.replace('.png', ''))


def is_known(idx: int, cap: VideoCapture, crop=True, shrink=False):
    """Return the label of a known person visible in frame ``idx``, else ``None``.

    The frame is fetched with ``get_frame``. With ``crop`` it is reduced to the
    interpreter box first, and with ``shrink`` it is downscaled to 60% before
    face detection. Detected faces are tested in order; for the first one that
    matches any entry of ``known_faces``, the label of the first matching
    entry is returned.
    """
    image = get_frame(idx, cap)
    image = crop_interpreter(image) if crop else image
    if shrink:
        image = cv2.resize(
            image, None, fx=0.6, fy=0.6, interpolation=cv2.INTER_AREA
        )

    # Keep the first three channels as a contiguous uint8 array for face_recognition.
    pixels = np.ascontiguousarray(image[:, :, :3], dtype=np.uint8)

    for candidate in face_recognition.face_encodings(pixels):
        hits = face_recognition.compare_faces(known_faces, candidate)
        if not any(hits):
            continue
        # argmax over booleans gives the index of the first True
        return known_people[np.argmax(hits)]
    return None


# false, ..., false, true, ..., true, false, ..., false
def bsearch(l: int, r: int, predicate):  # noqa: E741
    """Find the first index in ``[l, r]`` for which ``predicate`` is true.

    Expects ``predicate`` to look like [false, ..., false, true, ..., true]
    over the range. When it is true nowhere (or the range is empty), ``r`` is
    returned unchanged, so callers must re-check the result themselves.
    Used to locate the start and end frames of the interpreter with few probes.
    """
    lo, hi = l, r
    first_true = r
    while lo <= hi:
        mid = (lo + hi) // 2
        if not predicate(mid):
            lo = mid + 1
            continue
        first_true = mid
        hi = mid - 1
    return first_true

In [None]:
def do_segment(path: str):
    """Find the frame span in which a known interpreter is on screen.

    Returns ``(start, end, interpreter)``, where ``interpreter`` is the label
    reported by ``is_known`` for the first probe that hit, or ``None`` when no
    probed frame shows a known person.
    """
    cap, fps, frame_count = get_vid_metadata(path)

    # At least 30 min of sign language are required by law, but probe every
    # 15 min so a segment slightly shorter than 30 min is not jumped over.
    step = int(fps * 60 * 15)

    hit_idx, interpreter = -1, None
    for probe in range(0, frame_count, step):
        interpreter = is_known(probe, cap)
        if interpreter:
            hit_idx = probe
            break
    if hit_idx == -1:
        return None

    def present(frame_idx):
        return is_known(frame_idx, cap)

    def absent(frame_idx):
        return not is_known(frame_idx, cap)

    # Refine both edges around the hit with binary searches.
    start = bsearch(0, hit_idx, present)
    first_false = bsearch(hit_idx, frame_count - 1, absent)

    # bsearch returns its upper bound when nothing matched, so confirm that the
    # returned frame really lacks the interpreter before using it as the edge.
    if first_false < frame_count and not is_known(first_false, cap):
        end = first_false - 1
    else:
        end = frame_count - 1

    return start, end, interpreter


# Write a '<video>.json' segment description next to every raw video under
# ROOT_DIR that does not have one yet.
for root, _, files in os.walk(ROOT_DIR):
    for file in files:
        # Only raw .mp4/.mkv files are analysed; '.seg.*' files are skipped.
        if file.endswith(('.seg.mp4', '.seg.mkv')) or not file.endswith(
            ('.mp4', '.mkv')
        ):
            continue

        vid = os.path.join(root, file)
        seg = vid.replace('.mp4', '.json').replace('.mkv', '.json')
        if os.path.exists(seg):
            print(f'Skipping {seg}')
            continue

        segment = do_segment(vid)
        if segment:
            payload = dict(zip(('start', 'end', 'interpreter'), segment))
            with open(seg, 'w') as f:
                json.dump(payload, f)
            print(seg)
        else:
            print(f'NONE FOR {seg}!!!')

### Sometimes the interpreter disappears for a while and reappears much later. Find those cases.

In [None]:
import json
import os


# Reuse the do segment logic but apply it BEFORE the identified segment and AFTER the identified segment
# (Originally this function was used to FIND the segment. Now see if there are some other, weird segments
# we missed.)
def do_segment_weird(path: str, startx: int, endx: int):
    """Look for an interpreter appearance inside a frame window of a video.

    Same approach as ``do_segment``, with a coarse probe followed by binary
    searches, but every search stays inside ``[startx, endx]``. The original
    boundary searches always ran from frame 0 and up to the end of the video,
    so they could cross the segment that was already identified.

    Args:
        path: Video file passed to ``get_vid_metadata``.
        startx: First frame to probe; ``-1`` means frame 0.
        endx: Upper bound of the window (exclusive for the coarse probe);
            ``-1`` means up to the end of the video.

    Returns:
        ``None`` when no probed frame shows a known person, otherwise
        ``(start, end, interpreter)`` for the appearance that was hit.
    """
    cap, fps, frame_count = get_vid_metadata(path)
    frame_count -= 1
    if startx == -1:
        startx = 0
    if endx == -1:
        endx = frame_count

    # Probe every 5 minutes, or 1/12 of the video when that is shorter. Never
    # use a step below one frame, because range() rejects a step of 0.
    stride = max(1, int(min(fps * 60 * 5, frame_count // 12)))

    true_idx = -1
    interpreter = None
    for i in range(startx, endx, stride):
        interpreter = is_known(i, cap)
        if interpreter:
            true_idx = i
            break
    if true_idx == -1:
        return None

    # Highest frame the end search may touch. ``frame_count - 1`` (after the
    # decrement above) is the cap the original code used.
    # NOTE(review): confirm whether staying one frame short of the last index
    # is intentional.
    upper = max(true_idx, min(endx, frame_count - 1))

    start = bsearch(startx, true_idx, lambda x: is_known(x, cap))
    first_false = bsearch(true_idx, upper, lambda x: not is_known(x, cap))

    # bsearch returns ``upper`` when no frame matched, so confirm the frame
    # really lacks the interpreter before treating it as the edge.
    if not is_known(first_false, cap):
        end = first_false - 1
    else:
        end = upper

    return start, end, interpreter


# For every video that already has a single-segment JSON, report whether a known
# interpreter also shows up BEFORE or AFTER the recorded segment.
for root, _, files in os.walk(ROOT_DIR):
    for file in files:
        if file.endswith('.seg.mp4') or file.endswith('.seg.mkv'):
            continue

        if not file.endswith('.mp4') and not file.endswith('.mkv'):
            continue
        vid = os.path.join(root, file)
        seg = vid.replace('.mp4', '.json').replace('.mkv', '.json')
        if not os.path.exists(seg):
            print(f'Skipping {seg}')
            continue

        with open(seg) as f:
            seg = json.load(f)

        # Files already split into several segments are stored as a list and
        # have no single 'start'/'end' to search around.
        if isinstance(seg, list):
            continue

        print(f'trying {file}')

        # When the segment starts at frame 0 there is nothing before it. The
        # guard also stops ``seg['start'] - 1 == -1`` from being read by
        # do_segment_weird as the "no upper bound" sentinel.
        if seg['start'] > 0:
            segment = do_segment_weird(vid, -1, seg['start'] - 1)
            if segment:
                print(file + ' BEFORE')

        segment = do_segment_weird(vid, seg['end'] + 1, -1)
        if segment:
            print(file + ' AFTER')

In [None]:
# Sample frames around the recorded end of every single-segment video and print
# 'first_case' / 'second_case' lines for the later cells to consume.
for root, _, files in os.walk(ROOT_DIR):
    for file in files:
        if file.endswith(('.seg.mp4', '.seg.mkv')) or not file.endswith(
            ('.mp4', '.mkv')
        ):
            continue

        vid = os.path.join(root, file)
        seg = vid.replace('.mp4', '.json').replace('.mkv', '.json')
        if not os.path.exists(seg):
            print(f'Skipping {seg}')
            continue

        with open(seg) as f:
            seg = json.load(f)

        # A list means the video was already split into several segments.
        if isinstance(seg, list):
            continue

        cap, fps, frame_count = get_vid_metadata(vid)

        # First case: a known person shows up again within 10 minutes after
        # the recorded end (one sample every 5 seconds).
        start = seg['end'] + 1
        end = min(start + int(10 * 60 * fps), frame_count)
        for i in range(start, end, int(5 * fps)):
            if is_known(i, cap, shrink=True):
                print(vid + 'first_case  ' + str(i))
                break
        else:
            # Second case: start at end - 1 and walk towards the beginning,
            # sampling every 5 seconds for up to 10 minutes, looking for a
            # frame without the interpreter.
            start = seg['end'] - 1
            end = max(seg['start'], start - int(10 * 60 * fps))
            for i in range(start, end, int(-5 * fps)):
                if not is_known(i, cap, shrink=True):
                    print(vid + 'second_case  ' + str(i))
                    break

            print(f'Done with {vid}')

In [None]:
# Parse the saved 'first_case' log lines from a.txt and keep the
# (video path, frame index) pairs whose sampled window shows the interpreter
# absent at least as often as present.
first_case = set()

with open('a.txt', 'r', encoding='utf-8') as log_file:
    for line in log_file:
        if 'first_case' not in line:
            continue

        # Line format: '<video path>first_case  <frame index>'
        split = line.split('first_case  ')
        vid_path = split[0]
        idx = int(split[1])

        seg_path = vid_path.replace('.mp4', '.json').replace('.mkv', '.json')
        with open(seg_path) as seg_file:
            seg = json.load(seg_file)

        # Videos already split into several segments are stored as a list and
        # have no single 'end'. Skip them, as the second_case cell does.
        if isinstance(seg, list):
            continue

        yes = 0
        no = 0

        cap, fps, _ = get_vid_metadata(vid_path)

        # From idx on, check whether there are 3 seconds without content
        # (every 5th frame).
        # NOTE(review): if idx lies after seg['end'] (as the scan that prints
        # 'first_case' lines suggests), `end` is below `start`, the range is
        # empty and every entry is accepted with yes == no == 0. Confirm the
        # intended window.
        start = idx
        end = min(seg['end'] - 1, int(idx + 3 * fps))

        for i in range(start, end + 1, 5):
            if not is_known(i, cap):
                no += 1
            else:
                yes += 1

        if no >= yes:
            first_case.add((vid_path, idx))
            print(vid_path + ' !!!')

        print(f'Done with {vid_path}')

In [88]:
# No content ahead => binary-search the end of the extra segment in [idx, end]

# For each accepted first_case entry, append a second segment that begins at
# the logged frame and rewrite the JSON as a two-element list.
for vid_path, start in first_case:
    seg_path = vid_path.replace('.mp4', '.json').replace('.mkv', '.json')
    with open(seg_path) as f:
        seg = json.load(f)

    # Already rewritten as a list of segments.
    if isinstance(seg, list):
        continue

    cap, fps, frame_count = get_vid_metadata(vid_path)

    # The logged frame must still show the interpreter, otherwise the log is stale.
    if not is_known(start, cap, shrink=True):
        raise ValueError(vid_path)

    first_false = bsearch(
        start, frame_count - 1, lambda x: not is_known(x, cap, shrink=True)
    )

    # NOTE(review): this confirmation runs without shrink=True, unlike the
    # search predicate above. Confirm that the mismatch is intended.
    if first_false < frame_count and not is_known(first_false, cap):
        end = first_false - 1
    else:
        print(f'skipped {vid_path}')
        continue

    # Sanity check: the last frame of the new segment must show the interpreter.
    # Report the video being processed (the original used the unrelated,
    # stale global `vid` here).
    if not is_known(end, cap, shrink=True):
        raise ValueError(vid_path)

    seg = [seg, {'start': start, 'end': end, 'interpreter': seg['interpreter']}]
    with open(seg_path, 'w') as f:
        json.dump(seg, f)
        print(seg_path)

skipped scraped/2021/08/13/2e049f12-0d83-4448-9864-875a77f071ef.mkv
scraped/2021/12/14/cfd58d07-5b90-4601-8682-6c225dd7b4be.json
scraped/2021/08/06/cd3fa7ca-5bd3-4622-93f3-0bd569ee3ed5.json
scraped/2021/02/19/b4ea7651-3bf9-4be7-b68b-ea65337e173c.json
scraped/2021/06/03/afb09472-54a9-487d-ad3f-884406fa0079.json
scraped/2021/07/26/6a104128-9d43-4197-b625-816194b6a366.json
scraped/2021/02/15/f45e0ec0-58a0-4469-b3a4-1d0aa8f688a2.json
scraped/2021/06/29/6a303aa8-c6df-4dc3-bbe7-2309cca574e0.json
scraped/2021/03/11/c5470d67-b194-4a3a-9454-7f0465466050.json
scraped/2021/05/20/fe23b9b1-ca3a-4aae-a24a-085934fc1f5c.json
scraped/2021/02/22/5027f027-5939-46f9-a96f-aeeac5556c5e.json
scraped/2021/03/23/1c91ea95-d8d6-438b-89fe-89a21177f16c.json
scraped/2021/04/02/5f505e87-eba0-4f2c-a312-ff51bb1af062.json
scraped/2021/06/24/45625068-4f24-4cc2-ba13-0e31f8565359.json
scraped/2021/05/25/19d268b5-7e2a-4ba9-97d1-8f3cea3b0516.json
scraped/2021/06/10/a0f48b6c-80e5-4617-bbe6-0b1f73d3a6dc.json
scraped/2021/03/3

In [None]:
# Parse the saved 'second_case' log lines from b.txt and keep the
# (video path, frame index) pairs where, in the ~3 seconds before the logged
# frame, the interpreter is absent at least as often as present.
second_case = set()

with open('b.txt', 'r', encoding='utf-8') as log_file:
    for line in log_file:
        if 'second_case' not in line:
            continue

        # Line format: '<video path>second_case  <frame index>'
        split = line.split('second_case  ')
        vid_path, idx = split[0], int(split[1])

        seg = vid_path.replace('.mp4', '.json').replace('.mkv', '.json')
        with open(seg) as seg_file:
            seg = json.load(seg_file)

        # Already split into several segments.
        if isinstance(seg, list):
            continue

        cap, fps, _ = get_vid_metadata(vid_path)

        # From here, walking backwards, check whether there are 3 seconds
        # without content (every 5th frame), never going before the segment start.
        start = max(seg['start'], idx - 1)
        end = max(seg['start'], int(idx - 1 - 3 * fps))

        seen = [bool(is_known(i, cap)) for i in range(start, end - 1, -5)]
        yes = sum(seen)
        no = len(seen) - yes

        if no >= yes:
            second_case.add((vid_path, idx))
            print(vid_path + ' !!!')

        print(f'Done with {vid_path}')

In [92]:
# No content behind => binary-search over [start, idx] and [idx, end]

# For each accepted second_case entry, split the recorded segment in two around
# the gap containing the logged frame and rewrite the JSON as a list.
for vid_path, idx in second_case:
    seg_path = vid_path.replace('.mp4', '.json').replace('.mkv', '.json')
    with open(seg_path) as f:
        seg = json.load(f)

    # Already rewritten as a list of segments.
    if isinstance(seg, list):
        continue

    cap, fps, frame_count = get_vid_metadata(vid_path)

    # The logged frame must lie inside the gap (no interpreter visible).
    if is_known(idx, cap, shrink=True):
        raise ValueError(vid_path)

    # End of the first part: last frame before the gap begins.
    first_false = bsearch(seg['start'], idx, lambda x: not is_known(x, cap))
    if first_false < frame_count and not is_known(first_false, cap):
        end = first_false - 1
    else:
        print(f'skipped {vid_path}')
        continue

    # Sanity checks on the boundary. The errors name the video being processed
    # (the original raised with the unrelated, stale global `vid`).
    if not is_known(end, cap, shrink=True):
        raise ValueError(vid_path)
    if is_known(end + 1, cap, shrink=True):
        raise ValueError(vid_path)

    # Start of the second part: first frame after the gap.
    start = bsearch(idx, seg['end'], lambda x: is_known(x, cap))
    if not is_known(start, cap, shrink=True):
        raise ValueError(vid_path)
    if is_known(start - 1, cap, shrink=True):
        raise ValueError(vid_path)

    seg = [
        {'start': seg['start'], 'end': end, 'interpreter': seg['interpreter']},
        {'start': start, 'end': seg['end'], 'interpreter': seg['interpreter']},
    ]
    with open(seg_path, 'w') as f:
        json.dump(seg, f)
        print(seg_path)

scraped/2022/03/04/70c66d3f-98f0-4f81-9da9-87dc178f47a1.json
scraped/2021/07/05/06695085-ca1d-40d4-b3c4-bf3c2858b297.json
scraped/2021/03/16/e19db1f3-89b9-4585-af73-9c4718fa19c5.json
scraped/2021/04/21/67577aa5-0b31-46d8-a6be-eaf65c0f02fc.json
scraped/2021/05/04/a41c384a-3a5f-4144-92ff-8b846724ceaf.json
scraped/2021/04/13/86bf98a5-a53d-4856-b2b1-ed7c4bcaf7dc.json
scraped/2021/06/08/1ecb01ef-8e96-44c5-ba39-e701614092f9.json
scraped/2021/07/06/eee51705-961b-4f15-82f0-1858620902e4.json
scraped/2021/06/23/1ddca7d2-f7ab-4445-88ad-2541454f994f.json
scraped/2021/03/18/eada8597-c6b3-47ec-92b1-ee837d598c66.json
scraped/2021/02/26/d3859b70-d4e2-4afd-9669-d74a6c879c03.json
scraped/2021/06/15/fcf31a6b-43dc-4810-892a-9513795f049a.json
scraped/2021/03/01/d485f09d-dc2f-47ab-b1ef-5bf156f7ba94.json
scraped/2021/07/09/3245adbc-7a94-49c1-8736-11b56a9d6ab6.json
scraped/2021/05/10/21b433ba-f595-40e9-a1d7-ad3c5939a6d2.json
scraped/2021/03/17/d2f2546b-adbf-4a92-8953-ec031621eefd.json
scraped/2021/07/12/11bc4

### Viz

In [None]:
import matplotlib.pyplot as plt

# Show the interpreter crop of one hand-picked frame for visual inspection.
vid = '2021/05/31/5a936525-9441-47de-a7e5-b5050b3a640a.mkv'
idx = 122007

cap = get_vid_metadata(os.path.join('scraped', vid))[0]
frame = crop_interpreter(get_frame(idx, cap))

plt.imshow(frame)
plt.axis('off')
plt.show()

## Crop segments with ffmpeg -- high quality re-encode (quality 20 by default; 0 would be lossless)

In [1]:
import os
import json
import shlex
import subprocess


ENCODER_TYPE = 'cpu'  # Options: "nvidia", "cpu"
QUALITY_SETTING = 20  # 0 is lossless (huge files). 18-24 is high quality.


def _report_ffmpeg_failure(header, vid_path, output_path, command, detail):
    """Print a delimited failure report including the exact command line."""
    print(header)
    print(f'Error processing {vid_path} -> {output_path}')
    print(f'Command: {" ".join([shlex.quote(c) for c in command])}')
    print(detail)
    print('--------------------')


def _discard_partial_output(output_path):
    """Remove whatever a failed run left at ``output_path``, if anything."""
    try:
        os.remove(output_path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f'Could not remove partial output {output_path}: {e}')


def crop_segment(
    vid_path: str,
    output_path: str,
    start_frame: int,
    end_frame: int,
    fps: float,
    W: int,
    H: int,
    X: int,
    Y: int,
) -> bool:
    """Cut frames ``[start_frame, end_frame]`` out of a video and crop them.

    Frame indexes are converted to seconds with ``fps``. ffmpeg is asked to
    seek to the start (``-ss`` before ``-i``), keep the segment duration
    (``-t``), apply ``crop=W:H:X:Y`` and re-encode the video with the encoder
    selected by ``ENCODER_TYPE`` at ``QUALITY_SETTING``. Audio is stream-copied.

    Args:
        vid_path: Source video.
        output_path: Destination file; overwritten if present (``-y``).
        start_frame: First frame of the segment (inclusive).
        end_frame: Last frame of the segment (inclusive).
        fps: Frame rate used for the frame-to-seconds conversion.
        W, H: Crop width and height in pixels.
        X, Y: Top-left corner of the crop in pixels.

    Returns:
        ``True`` when ffmpeg exited successfully. ``False`` when it failed or
        could not be started; the failure is printed rather than raised.

    Raises:
        ValueError: If ``ENCODER_TYPE`` is neither ``'nvidia'`` nor ``'cpu'``.

    On any failure, including an interruption such as ``KeyboardInterrupt``,
    the file at ``output_path`` is removed. A truncated output would otherwise
    be mistaken for a finished one by callers that skip existing outputs.
    """
    start_sec = start_frame / fps
    duration_sec = (end_frame - start_frame + 1) / fps

    crop_filter = f'crop={W}:{H}:{X}:{Y}'
    quiet_opts = ['-hide_banner', '-loglevel', 'error']

    command = [
        'ffmpeg',
        *quiet_opts,
        '-y',
        '-ss',
        f'{start_sec:.6f}',
        '-i',
        vid_path,
        '-t',
        f'{duration_sec:.6f}',
        '-vf',
        crop_filter,
    ]

    if ENCODER_TYPE == 'nvidia':
        encoder_opts = [
            '-c:v',
            'h264_nvenc',
            '-rc',
            'constqp',
            '-qp',
            str(QUALITY_SETTING),
            '-preset',
            'p1',
        ]
    elif ENCODER_TYPE == 'cpu':
        encoder_opts = [
            '-c:v',
            'libx264',
            '-crf',
            str(QUALITY_SETTING),
            '-preset',
            'superfast',
        ]
    else:
        raise ValueError(f'Unknown ENCODER_TYPE: {ENCODER_TYPE}')

    command.extend(encoder_opts)
    command.extend(['-c:a', 'copy', output_path])

    succeeded = False
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
        )
        succeeded = True
    except subprocess.CalledProcessError as e:
        _report_ffmpeg_failure(
            '--- FFMPEG ERROR ---',
            vid_path,
            output_path,
            command,
            f'Error Output: {e.stderr}',
        )
    except Exception as e:
        # e.g. the ffmpeg executable is missing
        _report_ffmpeg_failure(
            '--- UNEXPECTED FFMPEG ERROR ---',
            vid_path,
            output_path,
            command,
            f'Exception: {e}',
        )
    finally:
        # Also runs while an interruption propagates.
        if not succeeded:
            _discard_partial_output(output_path)

    return succeeded


def get_w_h_fps(vid_path: str):
    """Read width, height and frame rate of the first video stream via ffprobe.

    Returns:
        ``(width, height, fps)`` with integer dimensions and ``fps`` computed
        from the ``r_frame_rate`` fraction.

    Raises:
        Whatever went wrong (ffprobe missing or failing, unexpected output, a
        zero denominator) is printed and re-raised unchanged. The original
        handler read ``result.stderr`` even when ffprobe itself had failed and
        ``result`` was never assigned, which replaced the real error with an
        ``UnboundLocalError``.
    """
    command = [
        'ffprobe',
        '-v',
        'error',
        '-select_streams',
        'v:0',
        '-show_entries',
        'stream=width,height,r_frame_rate',
        '-of',
        'json',
        vid_path,
    ]

    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # ffprobe ran but exited non-zero; its stderr travels on the exception
        print(f'Error getting metadata for {vid_path}: {e}')
        print(f'ffprobe output: {e.stderr}')
        raise
    except Exception as e:
        # ffprobe could not be started at all, so there is no output to show
        print(f'Error getting metadata for {vid_path}: {e}')
        raise

    try:
        metadata = json.loads(result.stdout)['streams'][0]

        w = int(metadata['width'])
        h = int(metadata['height'])

        # r_frame_rate is a fraction like "30/1" or "2997/100"
        num, den = metadata['r_frame_rate'].split('/')
        fps = float(num) / float(den)
    except Exception as e:
        print(f'Error getting metadata for {vid_path}: {e}')
        print(f'ffprobe output: {result.stderr}')
        raise

    return w, h, fps


def enqueue(path: str):
    """Append ``path`` as one line to ``queue.txt`` in the working directory."""
    with open('queue.txt', 'a') as queue_file:
        print(path, file=queue_file)


# Driver: for every segment JSON under 'scraped', crop the described segment(s)
# out of the matching video with ffmpeg, record each output in queue.txt and,
# when everything succeeded, delete the original video.
for root, _, files in os.walk('scraped'):
    for file in files:
        if not file.endswith('.json'):
            continue

        # Look for the source video next to the JSON: try .mp4 first, then .mkv.
        json_path = os.path.join(root, file)
        vid_path = json_path.replace('.json', '.mp4')
        base_out_path = json_path.replace('.json', '.seg.mp4')

        if not os.path.exists(vid_path):
            vid_path = json_path.replace('.json', '.mkv')
            base_out_path = json_path.replace('.json', '.seg.mkv')

        if not os.path.exists(vid_path):
            print(f'No video found for {json_path}, skipping.')
            continue

        # An existing base output marks the video as already handled.
        # NOTE(review): only existence is checked, not whether the file is complete.
        if os.path.exists(base_out_path):
            print(f'Skipping {vid_path} (base output already exists)')
            continue

        try:
            w, h, fps = get_w_h_fps(vid_path)
        except Exception as e:
            print(f'Skipping {vid_path} due to metadata error: {e}')
            continue

        # Interpreter box per supported frame height (same numbers as in
        # crop_interpreter).
        if h == 720:
            X, Y = 952, 347
            W, H = 300, 322
        elif h == 432:
            X, Y = 571, 208
            W, H = 180, 193
        else:
            print(f'Skipping {vid_path}, height {h} not supported')
            continue

        # The JSON holds either one segment dict or a list of segment dicts.
        with open(json_path) as f:
            segment = json.load(f)

        # Track whether everything for this video was processed OK
        process_ok = True

        if not isinstance(segment, list):
            print(f'Processing {vid_path}...')
            process_ok = crop_segment(
                vid_path,
                base_out_path,
                segment['start'],
                segment['end'],
                fps,
                W,
                H,
                X,
                Y,
            )
            if process_ok:
                enqueue(base_out_path)
        else:
            print(f'Processing {vid_path} as {len(segment)} segments...')
            # Multi-segment outputs are named '<base>_<i><ext>'.
            base_name, ext = os.path.splitext(base_out_path)

            if not segment:
                print('No segments defined; not deleting original.')
                process_ok = False
            else:
                for i, seg in enumerate(segment):
                    out_path_i = f'{base_name}_{i}{ext}'

                    # NOTE(review): an existing part is skipped without being
                    # enqueued and without clearing process_ok, so a truncated
                    # leftover file would still allow the original to be deleted.
                    if os.path.exists(out_path_i):
                        print(f'Skipping segment {i} (already exists)')
                        continue

                    ok = crop_segment(
                        vid_path,
                        out_path_i,
                        seg['start'],
                        seg['end'],
                        fps,
                        W,
                        H,
                        X,
                        Y,
                    )
                    if ok:
                        enqueue(out_path_i)
                    else:
                        process_ok = False

        # The original is removed only when no crop failed for this video.
        if process_ok:
            try:
                os.remove(vid_path)
                print(f'Deleted original video {vid_path}')
            except OSError as e:
                print(f'Failed to delete original video {vid_path}: {e}')

        print(f'Finished {vid_path}')


Processing scraped/2022/05/03/12a79215-1497-4547-a42d-18511d15925b.mkv...
Deleted original video scraped/2022/05/03/12a79215-1497-4547-a42d-18511d15925b.mkv
Finished scraped/2022/05/03/12a79215-1497-4547-a42d-18511d15925b.mkv
Processing scraped/2022/05/31/9c495d48-13f6-439d-92a3-bfc09b5cba6e.mkv...
Deleted original video scraped/2022/05/31/9c495d48-13f6-439d-92a3-bfc09b5cba6e.mkv
Finished scraped/2022/05/31/9c495d48-13f6-439d-92a3-bfc09b5cba6e.mkv
Processing scraped/2022/05/23/914513d1-2aeb-48d2-a6c9-2738d533eb21.mkv...
Deleted original video scraped/2022/05/23/914513d1-2aeb-48d2-a6c9-2738d533eb21.mkv
Finished scraped/2022/05/23/914513d1-2aeb-48d2-a6c9-2738d533eb21.mkv
Processing scraped/2022/05/13/6d59c679-f5a6-468a-823a-491ee2b8df52.mkv...
Deleted original video scraped/2022/05/13/6d59c679-f5a6-468a-823a-491ee2b8df52.mkv
Finished scraped/2022/05/13/6d59c679-f5a6-468a-823a-491ee2b8df52.mkv
Processing scraped/2022/05/18/548a0e1c-c4f3-4c2c-a915-f46b303376c4.mkv...
Deleted original video