In [1]:
import numpy as np
import cv2

# Report the versions of the numeric / vision libraries so a reader can
# reproduce the environment this notebook was executed in.
for label, module in (("NumPy version:", np), ("OpenCV version:", cv2)):
    print(label, module.__version__)


NumPy version: 1.26.4
OpenCV version: 4.11.0


In [None]:
import pandas as pd
import numpy as np
import whisper
import os
from whisper.utils import format_timestamp
import cv2
import mediapipe as mp
import os
import json

#### Transcripts: speech-to-text with Whisper, saved as timestamped `.srt` files

In [None]:
# Transcribe every video in VIDEO_DIR with Whisper and write one SubRip (.srt)
# subtitle file per video into SRT_OUTPUT_DIR.
model = whisper.load_model("small")

VIDEO_DIR = "/workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/raw_videos"
SRT_OUTPUT_DIR = "/workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/transcripts_with_timestamps"
os.makedirs(SRT_OUTPUT_DIR, exist_ok=True)

# Container formats treated as videos (matched case-insensitively).
VIDEO_EXTENSIONS = (".mp4", ".webm", ".mkv", ".mov")

# os.listdir() returns entries in arbitrary order; sort so that re-runs process
# (and log) the videos in the same order.
for filename in sorted(os.listdir(VIDEO_DIR)):
    if not filename.lower().endswith(VIDEO_EXTENSIONS):
        continue

    video_path = os.path.join(VIDEO_DIR, filename)
    print(f"Transcribing to .srt: {filename}")

    result = model.transcribe(video_path)
    srt_path = os.path.join(SRT_OUTPUT_DIR, os.path.splitext(filename)[0] + ".srt")

    # Explicit UTF-8: transcripts may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be able to represent them.
    with open(srt_path, "w", encoding="utf-8") as f:
        for cue_number, segment in enumerate(result["segments"], start=1):
            # SubRip timestamps are HH:MM:SS,mmm -- the millisecond separator
            # is a comma, whereas format_timestamp() defaults to a period.
            start = format_timestamp(
                segment["start"], always_include_hours=True, decimal_marker=","
            )
            end = format_timestamp(
                segment["end"], always_include_hours=True, decimal_marker=","
            )
            # A literal "-->" inside the text would be mistaken for the cue
            # timing separator by SRT parsers.
            text = segment["text"].strip().replace("-->", "->")

            f.write(f"{cue_number}\n{start} --> {end}\n{text}\n\n")

    print(f"Saved SRT: {srt_path}")

Transcribing to .srt: InasFavoriteThings.webm




Saved SRT: /workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/InasFavoriteThings.srt
Transcribing to .srt: EasyBlueberryMuffinsRecipe.mp4




Saved SRT: /workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/EasyBlueberryMuffinsRecipe.srt
Transcribing to .srt: PeruvianChickenRecipe.webm




Saved SRT: /workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/PeruvianChickenRecipe.srt
Transcribing to .srt: PerfectWeeknightShrimpFriedRice.mkv




Saved SRT: /workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/PerfectWeeknightShrimpFriedRice.srt


#### Poses: per-frame body landmarks with MediaPipe Pose, saved as JSON

In [None]:
# Run MediaPipe Pose over every video in VIDEO_DIR and write, per video, a JSON
# file mapping frame index -> list of landmark dicts (x, y, z, visibility).
# Frames in which no pose is detected are omitted from the mapping.
VIDEO_DIR = "/workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/raw_videos"
OUTPUT_DIR = "/workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/pose_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Container formats treated as videos (matched case-insensitively).
VIDEO_EXTENSIONS = (".mp4", ".webm", ".mkv", ".mov")

mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils  # not used in this cell; kept for other cells

# os.listdir() returns entries in arbitrary order; sort for reproducible runs.
for filename in sorted(os.listdir(VIDEO_DIR)):
    if not filename.lower().endswith(VIDEO_EXTENSIONS):
        continue

    video_path = os.path.join(VIDEO_DIR, filename)
    print(f"Processing: {filename}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Without this check an unreadable video would silently produce an
        # empty JSON file, indistinguishable from "no pose was ever detected".
        cap.release()
        print(f"WARNING: could not open video, skipping: {video_path}\n")
        continue

    frame_data = {}
    frame_idx = 0

    try:
        # A fresh Pose instance per video: with static_image_mode=False the
        # solution tracks landmarks from frame to frame, so reusing a single
        # instance would carry tracking state from one video into the next.
        # The context manager also closes the instance if a frame raises.
        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV decodes frames as BGR; MediaPipe expects RGB input.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(frame_rgb)

                if results.pose_landmarks:
                    frame_data[frame_idx] = [
                        {
                            "x": lm.x,
                            "y": lm.y,
                            "z": lm.z,
                            "visibility": lm.visibility,
                        }
                        for lm in results.pose_landmarks.landmark
                    ]

                # Count every decoded frame, detected or not, so the keys stay
                # aligned with the video's real frame numbers.
                frame_idx += 1
    finally:
        cap.release()

    output_path = os.path.join(OUTPUT_DIR, os.path.splitext(filename)[0] + "_pose.json")
    with open(output_path, "w") as f:
        # Note: json serialises the integer frame indices as string keys.
        json.dump(frame_data, f)

    print(f"Saved pose data to: {output_path}\n")