In [1]:
#%pip install easyocr tqdm scipy opencv-python numpy

In [2]:
import cv2
import numpy as np
import easyocr
import json
from tqdm import tqdm
from PIL import Image
from scipy.fft import dct
from difflib import SequenceMatcher #find similarity between 2 strings

In [3]:
# Notebook configuration: input video and sampling density.
video_path = "sample_short.mp4"
sample_rate = 1  # frames sampled per second of video

In [4]:
def extract_frames(video_path, sample_rate=2):
    """Yield sampled grayscale frames from a video.

    Every ``step``-th frame is yielded, where
    ``step = max(1, int(fps / sample_rate))``.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.
        sample_rate: Desired number of sampled frames per second of video.
            Must be positive.

    Yields:
        Tuples ``(frame_no, timestamp_sec, gray_frame)`` where
        ``timestamp_sec`` is ``frame_no / fps`` and ``gray_frame`` is the
        frame converted with ``cv2.COLOR_BGR2GRAY``.

    Raises:
        ValueError: If ``sample_rate`` is not positive, or the container
            reports a non-positive / invalid FPS.
        OSError: If the video cannot be opened.

    Note:
        This is a generator, so the checks run on the first iteration,
        not when the function is called.
    """
    if not sample_rate > 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate!r}")

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Without this check a bad path silently produces zero frames.
            raise OSError(f"Cannot open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # A zero FPS would otherwise surface as ZeroDivisionError below.
            raise ValueError(f"Invalid FPS ({fps!r}) reported for {video_path}")

        step = max(1, int(fps / sample_rate))
        frame_no = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_no % step == 0:
                ts = frame_no / fps
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                yield frame_no, ts, gray
            frame_no += 1
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer
        # abandons the generator early, so the capture is never leaked.
        cap.release()


# Materialise every sampled frame as a list of (frame_no, timestamp_sec, gray_frame).
frames = list(extract_frames(video_path, sample_rate))
# Preview the first three entries (this displays the raw pixel arrays).
frames[:3]

[(0,
  0.0,
  array([[168, 168, 168, ..., 106,  99,  90],
         [168, 168, 168, ..., 115,  99,  96],
         [168, 168, 168, ..., 128, 103, 101],
         ...,
         [111, 110, 110, ..., 149, 144, 142],
         [111, 111, 111, ..., 148, 145, 144],
         [111, 111, 111, ..., 145, 145, 145]], shape=(900, 720), dtype=uint8)),
 (30,
  1.0,
  array([[224, 181, 124, ..., 103, 103, 102],
         [212, 167, 110, ..., 106, 106, 104],
         [185, 141,  84, ..., 109, 110, 110],
         ...,
         [ 58,  58,  57, ..., 146, 146, 146],
         [ 57,  57,  57, ..., 146, 146, 146],
         [ 56,  56,  56, ..., 146, 146, 146]], shape=(900, 720), dtype=uint8)),
 (60,
  2.0,
  array([[ 71,  71,  71, ..., 195, 195, 195],
         [ 71,  71,  71, ..., 194, 194, 194],
         [ 71,  71,  71, ..., 192, 192, 192],
         ...,
         [ 81,  81,  81, ..., 200, 200, 200],
         [ 82,  82,  82, ..., 199, 199, 199],
         [ 82,  82,  82, ..., 199, 199, 199]], shape=(900, 720), dtype

In [5]:
# Sanity checks on the extraction result:
# type and shape of the first grayscale frame, then the number of sampled frames.
print(type(frames[0][2]))
print(frames[0][2].shape)
print(len(frames))

<class 'numpy.ndarray'>
(900, 720)
256


I learned something new while looking for ways to filter out duplicate images: the Discrete Cosine Transform (DCT). I was initially thinking of using the Fourier transform (FFT).
- DCT concentrates most of the image's "energy" (structural information) into the top-left coefficients of the transformed matrix.
- FFT distributes energy more evenly across the frequency spectrum, making it harder to isolate key low-frequency components.

The DCT is the core of JPEG compression. It is computationally efficient and numerically stable, which makes it well suited to perceptual hashing, where the goal is to extract stable, low-frequency features while ignoring noise and minor distortions.

In [6]:
def _phash(image, hash_size=8, highfreq_factor=4):
    """Compute a DCT-based perceptual hash of ``image``.

    The image is resized to a square of side ``hash_size * highfreq_factor``,
    transformed with a 2-D DCT (``dct`` applied along axis 0, then axis 1),
    and the top-left ``hash_size x hash_size`` coefficients are compared
    against their median.

    Returns:
        A flat boolean array of length ``hash_size ** 2``; an entry is True
        where the coefficient is strictly greater than the median.
    """
    side = hash_size * highfreq_factor
    resized = cv2.resize(image, (side, side))
    # Separable 2-D DCT: columns first, then rows.
    coeffs = dct(dct(resized.astype(float), axis=0), axis=1)
    low_block = coeffs[:hash_size, :hash_size]
    threshold = np.median(low_block)
    return (low_block > threshold).flatten()


def filter_duplicates(frames, diff_thresh=10):
    """Drop frames whose perceptual hash is close to the last kept frame.

    Args:
        frames: Iterable of ``(frame_no, timestamp_sec, image)`` tuples.
        diff_thresh: A frame is kept only when the number of differing hash
            bits versus the previously kept frame is strictly greater than
            this value. The first frame is always kept.

    Returns:
        List of the kept ``(frame_no, timestamp_sec, image)`` tuples, in
        input order.
    """
    unique_frames = []
    last_hash = None
    for frame_no, timestamp, image in frames:
        fingerprint = _phash(image)
        if last_hash is not None:
            # Hamming distance between the two boolean fingerprints.
            distance = np.count_nonzero(fingerprint != last_hash)
            if not (distance > diff_thresh):
                continue
        unique_frames.append((frame_no, timestamp, image))
        # Only kept frames become the new reference.
        last_hash = fingerprint
    return unique_frames

# Remove near-duplicate frames using the default hash-distance threshold.
frames_filtered = filter_duplicates(frames)
# Preview the first three kept entries (this displays the raw pixel arrays).
frames_filtered[:3]

[(0,
  0.0,
  array([[168, 168, 168, ..., 106,  99,  90],
         [168, 168, 168, ..., 115,  99,  96],
         [168, 168, 168, ..., 128, 103, 101],
         ...,
         [111, 110, 110, ..., 149, 144, 142],
         [111, 111, 111, ..., 148, 145, 144],
         [111, 111, 111, ..., 145, 145, 145]], shape=(900, 720), dtype=uint8)),
 (30,
  1.0,
  array([[224, 181, 124, ..., 103, 103, 102],
         [212, 167, 110, ..., 106, 106, 104],
         [185, 141,  84, ..., 109, 110, 110],
         ...,
         [ 58,  58,  57, ..., 146, 146, 146],
         [ 57,  57,  57, ..., 146, 146, 146],
         [ 56,  56,  56, ..., 146, 146, 146]], shape=(900, 720), dtype=uint8)),
 (60,
  2.0,
  array([[ 71,  71,  71, ..., 195, 195, 195],
         [ 71,  71,  71, ..., 194, 194, 194],
         [ 71,  71,  71, ..., 192, 192, 192],
         ...,
         [ 81,  81,  81, ..., 200, 200, 200],
         [ 82,  82,  82, ..., 199, 199, 199],
         [ 82,  82,  82, ..., 199, 199, 199]], shape=(900, 720), dtype

In [7]:
# Shape of the first kept frame and number of frames remaining after deduplication.
print(frames_filtered[0][2].shape)
print(len(frames_filtered))

(900, 720)
198


In [8]:
def _crop_subtitle_band(frame, height_ratio=0.2):
    """Crop bottom `height_ratio` of the frame."""
    h = frame.shape[0]
    return frame[int(h * (1 - height_ratio)):, :]

def ocr_frames(frames, langs=['fr'], gpu=False):
    """Run EasyOCR on the subtitle band of each frame.

    Args:
        frames: Iterable of ``(frame_no, timestamp_sec, image)`` tuples.
        langs: Language codes passed to ``easyocr.Reader`` (not mutated here).
        gpu: Forwarded to ``easyocr.Reader``.

    Returns:
        List of ``(frame_no, timestamp_sec, text)`` tuples, one per frame
        whose recognised text is non-empty. ``text`` is the space-joined
        text of all boxes returned for that frame.

    Frames on which OCR raises are reported with ``print`` and skipped.
    """
    reader = easyocr.Reader(langs, gpu=gpu)
    results = []
    for fn, ts, img in tqdm(frames, desc="OCR"):
        crop = _crop_subtitle_band(img)
        arr = np.array(crop)
        try:
            boxes = reader.readtext(arr)
        except Exception as e:
            # Bind the exception: the message below uses it, and without the
            # binding the handler itself would fail with NameError.
            print(f"OCR error on frame {fn}: {e}")
            continue
        # Each box's second element is the recognised text.
        text = ' '.join(b[1] for b in boxes).strip()
        if text:
            results.append((fn, ts, text))
    return results


# OCR the deduplicated frames. gpu=True is requested here; the recorded output
# shows EasyOCR falling back to CPU when no CUDA/MPS device is available.
ocrs = ocr_frames(frames_filtered, gpu=True)
# Preview the first ten (frame_no, timestamp_sec, text) results.
ocrs[:10]

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
OCR: 100%|██████████| 198/198 [01:27<00:00,  2.27it/s]


[(0, 0.0, 'clideo com'),
 (30, 1.0, 'clideo com'),
 (60, 2.0, 'Chef de faille clideo com'),
 (90, 3.0, 'clideocom'),
 (120, 4.0, 'je vous ai appelé avez-vous eu des nouvelles de Rose clideo com'),
 (150, 5.0, 'je vous &i appelé avez-vous eu des nouvelles de Rose clideo com'),
 (180, 6.0, 'je vous &i appelé avez-vous eu des nouvelles de Rose clideo.com'),
 (210, 7.0, 'je vous ai appelé avez-vous eu des nouvelles de Rose clideo com'),
 (240, 8.0, "Tu n'as pas de téléphone ? clideo com"),
 (270, 9.0, "Tu n'as pas de telephone ? clideo com")]

In [9]:
# Number of frames that were sent to OCR.
print(len(frames_filtered))

198


In [10]:
def _similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def merge_segments(ocr_res, sim_thresh=0.8):
    """Merge consecutive OCR results with similar text into time segments.

    Args:
        ocr_res: Sequence of ``(frame_no, timestamp_sec, text)`` tuples.
        sim_thresh: Minimum ``_similar`` ratio for a text to be treated as a
            continuation of the current segment.

    Returns:
        List of ``(start, end, text)`` tuples. ``start`` and ``end`` are the
        timestamps of the first and last result in the segment; ``text`` is
        the longest text seen in it. Returns ``[]`` for empty input.
    """
    if not ocr_res:
        return []

    head = ocr_res[0]
    best_text = head[2]
    seg_start = seg_end = head[1]
    merged = []

    for _, stamp, candidate in ocr_res[1:]:
        same_subtitle = _similar(best_text, candidate) >= sim_thresh
        if not same_subtitle:
            # Close the current segment and open a new one at this result.
            merged.append((seg_start, seg_end, best_text))
            best_text, seg_start, seg_end = candidate, stamp, stamp
            continue
        seg_end = stamp
        # Prefer the longer reading as the segment's representative text.
        if len(candidate) > len(best_text):
            best_text = candidate

    # Flush the segment still open when the input ends.
    merged.append((seg_start, seg_end, best_text))
    return merged

# Merge consecutive OCR results with similar text into (start, end, text) segments.
segs = merge_segments(ocrs)
# Preview the first ten segments.
segs[:10]

[(0.0, 1.0, 'clideo com'),
 (2.0, 2.0, 'Chef de faille clideo com'),
 (3.0, 3.0, 'clideocom'),
 (4.0, 7.0, 'je vous ai appelé avez-vous eu des nouvelles de Rose clideo com'),
 (8.0, 9.0, "Tu n'as pas de téléphone ? clideo com"),
 (11.0, 11.0, 'mais je vous ai dit que je Fai appelë clideo com'),
 (12.0, 13.0, "dans ce cas continue à l'appeler clideo.com"),
 (15.0, 15.0, 'comment ça continuer à appeler ? clideo com'),
 (16.0, 17.0, "vous devriez le faire c'est votre femme clideo.com"),
 (18.0,
  20.0,
  "dernièrement j'ai appele quelle est reparti chez sa tante clideo com")]

In [11]:
def _format_ts(sec):
    ms = int((sec - int(sec)) * 1000)
    h, rem = divmod(int(sec), 3600)
    m, s = divmod(rem, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

def video_to_subs_json(video_path, sample_rate=2, gpu=True):
    """Run the full pipeline on a video and return JSON-ready subtitle entries.

    Steps: sample frames, drop near-duplicate frames, OCR the subtitle band,
    then merge similar consecutive texts into segments.

    Args:
        video_path: Path of the video to process.
        sample_rate: Frames sampled per second, forwarded to ``extract_frames``.
        gpu: Forwarded to ``ocr_frames``.

    Returns:
        List of dicts with keys ``"start_time"``, ``"end_time"`` (both
        formatted by ``_format_ts``) and ``"text"``.
    """
    # Feed the generator straight into the duplicate filter: it is consumed
    # in a single pass, so only the kept frames are held in memory.
    sampled = extract_frames(video_path, sample_rate)
    unique_frames = filter_duplicates(sampled)
    ocr_results = ocr_frames(unique_frames, gpu=gpu)
    segments = merge_segments(ocr_results)
    return [
        {"start_time": _format_ts(s), "end_time": _format_ts(e), "text": t}
        for s, e, t in segments
    ]


# Run the whole pipeline using the notebook's configured video and sample rate
# (instead of repeating the literals), then save the subtitles as JSON.
resultat = video_to_subs_json(video_path, sample_rate=sample_rate)
with open("resultat_first_test.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes accented characters as-is rather than as \u escapes.
    json.dump(resultat, f, ensure_ascii=False, indent=2)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
OCR: 100%|██████████| 198/198 [01:56<00:00,  1.70it/s]
