In [None]:
import cv2
import numpy as np
import json
import difflib #to find similarity between two strings
from IPython.display import clear_output

from easyocr import Reader

Using CPU. Note: This module is much faster with a GPU.


In [3]:
def _format_time(seconds):
    ms = int((seconds - int(seconds)) * 1000)
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

def _process_frame(img, crop_area=None):
    if crop_area:
        x, y, w, h = crop_area
        img = img[y:y+h, x:x+w]
    result = reader.readtext(img, detail=0)
    return ' '.join(result).strip()

In [None]:
VIDEO_PATH = 'sample_short.mp4'
SAMPLE_RATE = 1      # frames sampled for OCR per second of video
CROP_AREA = None     # (x, y, w, h) region to OCR, or None to OCR the whole frame
SIM_THRESHOLD = 0.8  # minimum difflib similarity ratio for two reads to be merged

# TODO: GPU inference is not working in this environment; retry in a conda env
# that has CUDA and cuDNN installed, then switch to gpu=True.
reader = Reader(['fr'], gpu=False)

cap = cv2.VideoCapture(VIDEO_PATH)
# Fail fast: an unopened capture would otherwise make the extraction loop exit
# immediately and silently produce an empty result.
if not cap.isOpened():
    raise OSError(f"Could not open video: {VIDEO_PATH!r}")

fps = cap.get(cv2.CAP_PROP_FPS)
# Timestamps are computed as frame_index / fps, so a missing or zero frame
# rate must be rejected here rather than surfacing as a ZeroDivisionError later.
if not fps or fps <= 0:
    cap.release()
    raise ValueError(f"Video reports an invalid frame rate ({fps!r}): {VIDEO_PATH!r}")

# Number of frames to advance between two OCR samples (at least every frame).
interval = max(1, int(fps // SAMPLE_RATE))

In [4]:
def _make_segment(text, first_frame, last_frame, frame_rate):
    """Build one subtitle record with formatted start/end timestamps."""
    return {
        'start': _format_time(first_frame / frame_rate),
        'end': _format_time(last_frame / frame_rate),
        'text': text,
    }


# Walk the video, OCR one frame every `interval` frames, and merge consecutive
# reads whose similarity to the segment's first text is >= SIM_THRESHOLD.
segments = []
current_text, start_frame, end_frame = None, 0, 0

frame_idx = 0
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_idx % interval == 0:
            # Colour conversion is only needed for frames that are actually OCR'd.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            text = _process_frame(rgb_frame, CROP_AREA)

            # Frames with no text are ignored: they neither close nor extend
            # the segment in progress.
            if text:
                if not current_text:
                    # First text seen: open the first segment.
                    current_text, start_frame = text, frame_idx
                elif difflib.SequenceMatcher(None, current_text, text).ratio() < SIM_THRESHOLD:
                    # Text changed: close the previous segment at the last
                    # frame it was seen on, then start a new one here.
                    segments.append(_make_segment(current_text, start_frame, end_frame, fps))
                    current_text, start_frame = text, frame_idx
                # Either way, the current segment now extends to this frame.
                end_frame = frame_idx

        frame_idx += 1
finally:
    # Release the capture even if OCR fails or the kernel is interrupted.
    cap.release()

# Flush the segment that was still open when the video ended.
if current_text:
    segments.append(_make_segment(current_text, start_frame, end_frame, fps))



In [7]:
OUTPUT_FORMAT = 'json'              # 'json' or 'srt'
OUTPUT_FILE = 'resultat_quick_try'  # output path without extension

if OUTPUT_FORMAT == 'json':
    # ensure_ascii=False keeps accented characters readable in the file.
    with open(f"{OUTPUT_FILE}.json", 'w', encoding='utf-8') as f:
        json.dump(segments, f, indent=2, ensure_ascii=False)

elif OUTPUT_FORMAT == 'srt':
    with open(f"{OUTPUT_FILE}.srt", 'w', encoding='utf-8') as f:
        for i, seg in enumerate(segments, 1):
            # SRT timestamps use a comma as the millisecond separator.
            start = seg['start'].replace('.', ',')
            end = seg['end'].replace('.', ',')
            f.write(f"{i}\n")
            f.write(f"{start} --> {end}\n")
            f.write(f"{seg['text']}\n\n")

else:
    # Do not silently skip the export when the format is mistyped.
    raise ValueError(
        f"Unsupported OUTPUT_FORMAT {OUTPUT_FORMAT!r}; expected 'json' or 'srt'"
    )