Tesseract (the OCR engine that `pytesseract` wraps) must be installed on the machine for this notebook to work. I installed it with pacman:
```bash
sudo pacman -S tesseract tesseract-data-eng
```

In [11]:
import cv2
import numpy as np
import pytesseract
import queue
import threading
import time
from PIL import Image
import io
import re

In [12]:
# Paths for a one-off, single-video run.
# NOTE(review): as far as this notebook shows, these are referenced only by the
# commented-out helper calls further down; process_video() derives its own
# output paths from the input file name.
INPUT_VIDEO_PATH = "data/videos/Rec16-1.mp4"
OUTPUT_VIDEO_PATH = "output/Rec16-1_trimmed.mp4"
OUTPUT_TXT_PATH = "output/timestamps/Rec16-1_trimmed.txt"

In [13]:
# Rectangle that is cropped out of every frame and passed to OCR.
# Used as frame[y1:y2, x1:x2], so x indexes columns and y indexes rows.
timestamp_region = (100, 831, 193, 865)  # (x1, y1, x2, y2)

In [14]:
def read_frames(video_path, frame_queue, stop_event):
    """Decode frames from ``video_path`` and feed them to ``frame_queue``.

    Frames are enqueued in decode order and followed by a single ``None``
    sentinel that tells the consumer no more frames will arrive.

    :param video_path: Path handed to ``cv2.VideoCapture``.
    :param frame_queue: ``queue.Queue`` receiving the frames and the sentinel.
    :param stop_event: ``threading.Event``; once set, reading stops early.
    """
    poll_interval = 0.1  # seconds between stop checks while the queue is full

    def put_unless_stopped(item):
        # A plain blocking put() never returns if the queue is bounded and full
        # and the consumer has already exited after a stop request, so wait in
        # short slices and give up once a stop has been requested.
        while True:
            try:
                frame_queue.put(item, timeout=poll_interval)
                return True
            except queue.Full:
                if stop_event.is_set():
                    return False

    cap = cv2.VideoCapture(video_path)
    try:
        while not stop_event.is_set():
            ret, frame = cap.read()
            if not ret:
                break
            if not put_unless_stopped(frame):
                break
    finally:
        # Release the capture even if decoding raised.
        cap.release()
        # Sentinel to signal end of video. If the queue is still full after a
        # stop request the consumer is not waiting on an empty queue, so the
        # sentinel can safely be dropped.
        put_unless_stopped(None)

In [15]:
def process_frames(frame_queue, output_queue, timestamp_queue, timestamp_region, stop_event):
    """OCR the timestamp of each frame and forward frames whose text changed.

    For every frame taken from ``frame_queue`` the ``timestamp_region`` crop is
    converted to grayscale, Otsu-thresholded and read with Tesseract
    (``--psm 7``). When the recognised text differs from the previous frame's
    text, ``(text, frame)`` is put on ``output_queue`` and the digits-only form
    of the text is put on ``timestamp_queue``.

    A ``None`` sentinel is always put on both output queues on exit, including
    when processing fails, so downstream consumers are never left waiting.
    On failure ``stop_event`` is set and the exception is re-raised.

    :param frame_queue: Queue of frames, terminated by ``None``.
    :param output_queue: Queue receiving ``(timestamp_text, frame)`` tuples.
    :param timestamp_queue: Queue receiving digits-only timestamp strings.
    :param timestamp_region: ``(x1, y1, x2, y2)`` crop used as ``frame[y1:y2, x1:x2]``.
    :param stop_event: ``threading.Event`` used to cancel the pipeline.
    """
    # The region does not change between frames, so unpack it once.
    x1, y1, x2, y2 = timestamp_region
    prev_timestamp = None

    try:
        while not stop_event.is_set():
            frame = frame_queue.get()
            try:
                if frame is None:
                    break

                timestamp_img = frame[y1:y2, x1:x2]

                gray = cv2.cvtColor(timestamp_img, cv2.COLOR_BGR2GRAY)
                thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

                timestamp = pytesseract.image_to_string(thresh, config='--psm 7').strip()

                if timestamp != prev_timestamp:
                    output_queue.put((timestamp, frame))
                    # Remove colons and keep only numbers
                    cleaned_timestamp = re.sub(r'[^\d]', '', timestamp)
                    print(f'Processed frame with timestamp: {cleaned_timestamp}')
                    timestamp_queue.put(cleaned_timestamp)
                    prev_timestamp = timestamp
            finally:
                # Keep the queue's task accounting balanced for every get().
                frame_queue.task_done()
    except Exception:
        # A failed stage cancels the whole pipeline instead of leaving the
        # other threads blocked on their queues.
        stop_event.set()
        raise
    finally:
        if stop_event.is_set():
            # The producer may be blocked in put() on a full queue; consuming
            # the leftovers lets it observe the stop request and finish.
            while True:
                try:
                    leftover = frame_queue.get(timeout=0.2)
                except queue.Empty:
                    break
                frame_queue.task_done()
                if leftover is None:
                    break
        output_queue.put(None)  # Sentinel to signal end of processing
        timestamp_queue.put(None)  # Sentinel for timestamp queue

In [16]:
def write_video(output_queue, output_path, frame_size, fps, stop_event):
    """Write queued frames to an ``mp4v``-encoded video file.

    Consumes ``(timestamp, frame)`` tuples from ``output_queue`` until a
    ``None`` sentinel arrives or ``stop_event`` is set. Only the frame is
    written; the timestamp text is ignored here.

    :param output_queue: Queue of ``(timestamp, frame)`` tuples ending with ``None``.
    :param output_path: Destination video file path.
    :param frame_size: ``(width, height)`` passed to ``cv2.VideoWriter``.
    :param fps: Frame rate passed to ``cv2.VideoWriter``.
    :param stop_event: ``threading.Event`` used to cancel the pipeline.
    """
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)
    frame_count = 0

    try:
        if not out.isOpened():
            # Keep consuming so upstream stages do not block, but make the
            # otherwise silent failure visible.
            print(f"Warning: could not open video writer for {output_path}; frames may not be saved.")

        while not stop_event.is_set():
            item = output_queue.get()
            if item is None:
                break

            _timestamp, frame = item
            out.write(frame)
            frame_count += 1

            if frame_count % 100 == 0:
                print(f"Processed {frame_count} unique frames")

            output_queue.task_done()
    finally:
        # Always finalise the container, even if writing a frame raised.
        out.release()

    print(f"Video processing complete. {frame_count} unique frames extracted.")


In [17]:
def write_timestamps(timestamp_queue, output_path, stop_event):
    """Persist queued timestamps to a text file, one per line.

    Entries are read from ``timestamp_queue`` until a ``None`` sentinel is
    received or ``stop_event`` is set; the file at ``output_path`` is
    overwritten.

    :param timestamp_queue: Queue of timestamp entries ending with ``None``.
    :param output_path: Destination text file path.
    :param stop_event: ``threading.Event`` checked before each queue read.
    """
    with open(output_path, 'w') as txt_file:
        while True:
            if stop_event.is_set():
                break
            entry = timestamp_queue.get()
            if entry is None:
                # End-of-stream marker from the producer.
                break
            txt_file.write(f"{entry}\n")
            timestamp_queue.task_done()
    print(f"Timestamp file created: {output_path}")

In [18]:
def extract_unique_timestamp_frames_to_video(input_video_path, output_video_path, output_timestamp_path, timestamp_region):
    """Run the read -> OCR -> write pipeline for one video.

    Four threads are started: a frame reader, the OCR/deduplication stage, a
    video writer and a timestamp-file writer, connected by queues. The call
    blocks until all four have finished.

    :param input_video_path: Path of the video to process.
    :param output_video_path: Path of the video file to write.
    :param output_timestamp_path: Path of the timestamp text file to write.
    :param timestamp_region: ``(x1, y1, x2, y2)`` crop containing the timestamp.
    :return: ``(output_video_path, output_timestamp_path)``.
    :raises OSError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_video_path)
    try:
        if not cap.isOpened():
            # Fail loudly rather than producing empty output files.
            raise OSError(f"Could not open video: {input_video_path}")
        # Keep the frame rate as reported: truncating to int would turn e.g.
        # 29.97 into 29 and anything below 1 into 0.
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    finally:
        cap.release()

    # Frame-carrying queues are bounded to cap memory use; timestamps are
    # small strings, so that queue is unbounded.
    frame_queue = queue.Queue(maxsize=100)
    output_queue = queue.Queue(maxsize=100)
    timestamp_queue = queue.Queue()
    stop_event = threading.Event()

    threads = [
        threading.Thread(target=read_frames, args=(input_video_path, frame_queue, stop_event)),
        threading.Thread(target=process_frames, args=(frame_queue, output_queue, timestamp_queue, timestamp_region, stop_event)),
        threading.Thread(target=write_video, args=(output_queue, output_video_path, frame_size, fps, stop_event)),
        threading.Thread(target=write_timestamps, args=(timestamp_queue, output_timestamp_path, stop_event)),
    ]

    for thread in threads:
        thread.start()

    try:
        for thread in threads:
            thread.join()
    except KeyboardInterrupt:
        # Python raises KeyboardInterrupt in the main thread only, so this
        # handler only takes effect when the function is called from there.
        print("Stopping processing...")
        stop_event.set()
        for thread in threads:
            thread.join()

    return output_video_path, output_timestamp_path


In [19]:
# for finding the timestamp region
def display_timestamp_region(video_path, timestamp_region):
    """
    Get the first frame of a video, draw a box around the timestamp region,
    display the frame, and save the annotated frame to
    ``first_frame_with_timestamp.jpg``.

    The display window stays open until a key is pressed.

    :param video_path: Path to the input video file
    :param timestamp_region: Tuple of (x1, y1, x2, y2) specifying the region for timestamp
    """
    # Open the video file and read the first frame; release the capture on
    # every path, including a failed read.
    cap = cv2.VideoCapture(video_path)
    try:
        ret, frame = cap.read()
    finally:
        cap.release()

    if not ret:
        print("Failed to read the video file.")
        return

    # Draw a rectangle around the timestamp region
    x1, y1, x2, y2 = timestamp_region
    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

    # Add text to indicate the timestamp region
    cv2.putText(frame, "Timestamp Region", (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    # Display the frame
    cv2.imshow("First Frame with Timestamp Region", frame)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    # Save the frame as an image file
    cv2.imwrite("first_frame_with_timestamp.jpg", frame)
    print("Frame saved as 'first_frame_with_timestamp.jpg'")

# display_timestamp_region(INPUT_VIDEO_PATH, timestamp_region)

In [20]:
# result_video = extract_unique_timestamp_frames_to_video(INPUT_VIDEO_PATH, OUTPUT_VIDEO_PATH, OUTPUT_TXT_PATH, timestamp_region)
# print(f"Output video saved to: {result_video}")

In [21]:
import os
def process_video(video_config):
    """Process one video described by a config dict.

    Output locations are derived from the input file name: the trimmed video
    goes to ``output/<name>_trimmed.mp4`` and the timestamps to
    ``output/timestamps/<name>_trimmed.txt``. Both directories are created
    when missing.

    :param video_config: Dict with the keys ``'input_path'`` and
        ``'timestamp_region'``.
    """
    source_path = video_config['input_path']
    stem, _extension = os.path.splitext(os.path.basename(source_path))

    # Make sure the output tree exists before any writer opens a file.
    root_dir = 'output'
    os.makedirs(root_dir, exist_ok=True)
    trimmed_video = os.path.join(root_dir, f"{stem}_trimmed.mp4")
    trimmed_txt = os.path.join(root_dir, 'timestamps', f"{stem}_trimmed.txt")
    os.makedirs(os.path.dirname(trimmed_txt), exist_ok=True)

    region = video_config['timestamp_region']
    extract_unique_timestamp_frames_to_video(source_path, trimmed_video, trimmed_txt, region)

In [22]:
import concurrent.futures

def process_videos(video_configs):
    """Process several videos concurrently, one worker thread per submission.

    Each config is passed to ``process_video``. A failure in one video is
    reported (naming the video and the exception type) and does not stop the
    others.

    :param video_configs: Iterable of config dicts accepted by ``process_video``.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Remember which config each future belongs to so failures can be
        # attributed to a specific video.
        future_to_config = {
            executor.submit(process_video, config): config
            for config in video_configs
        }

        for future in concurrent.futures.as_completed(future_to_config):
            try:
                future.result()
            except Exception as e:
                config = future_to_config[future]
                # The config itself may be what is malformed, so fall back to
                # showing it whole when no input path is available.
                label = config.get('input_path', config) if isinstance(config, dict) else config
                print(f"An error occurred while processing {label!r}: {type(e).__name__}: {e}")

In [23]:
# One entry per video to process. Each entry supplies the two keys that
# process_video() reads: 'input_path' and 'timestamp_region' (x1, y1, x2, y2).
# NOTE(review): this entry repeats the INPUT_VIDEO_PATH and timestamp_region
# values defined near the top of the notebook; keep them in sync.
video_configs = [
    {
        'input_path': 'data/videos/Rec16-1.mp4',
        'timestamp_region': (100, 831, 193, 865)
    }
]

In [24]:
# Run the full pipeline for every configured video; progress lines are printed
# by the OCR stage each time a new timestamp is detected.
process_videos(video_configs)

Processed frame with timestamp: 000000120
Processed frame with timestamp: 000000140
Processed frame with timestamp: 000000160
Processed frame with timestamp: 000000200
Processed frame with timestamp: 000000280
Processed frame with timestamp: 000000320
Processed frame with timestamp: 000000400
Processed frame with timestamp: 000000460
Processed frame with timestamp: 000000480
Processed frame with timestamp: 000000500
Processed frame with timestamp: 000000520
Processed frame with timestamp: 000000540
Processed frame with timestamp: 000000560
Processed frame with timestamp: 000000580
Processed frame with timestamp: 000000600
Processed frame with timestamp: 000000620
Processed frame with timestamp: 000000639
Processed frame with timestamp: 000000679
Processed frame with timestamp: 000000699
Processed frame with timestamp: 000000719
Processed frame with timestamp: 000000759
Processed frame with timestamp: 000000779
Processed frame with timestamp: 000000799
Processed frame with timestamp: 00