In [3]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_video
from collections import defaultdict

## Preprocess

In [4]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import cv2

In [6]:
!wget -q -O detector.tflite -q https://storage.googleapis.com/mediapipe-models/face_detector/blaze_face_short_range/float16/1/blaze_face_short_range.tflite

In [12]:
model_path = 'detector.tflite'
IMAGE_FILE = '1405992.jpg'

In [13]:
base_options = python.BaseOptions(model_asset_path=model_path)
options = vision.FaceDetectorOptions(base_options=base_options)
detector = vision.FaceDetector.create_from_options(options)

# STEP 3: Load the input image.
image = mp.Image.create_from_file(IMAGE_FILE)

# STEP 4: Detect faces in the input image.
detection_result = detector.detect(image)

In [17]:
detection_result.detections[0].bounding_box

BoundingBox(origin_x=683, origin_y=215, width=240, height=240)

In [24]:
detection_result.detections[0].categories[0].score

0.9633539915084839

In [None]:
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=numpy_frame_from_opencv)

In [26]:
import mediapipe as mp

BaseOptions = mp.tasks.BaseOptions
FaceDetector = mp.tasks.vision.FaceDetector
FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Create a face detector instance with the video mode:
options = FaceDetectorOptions(
    base_options=BaseOptions(model_asset_path=model_path),
    running_mode=VisionRunningMode.VIDEO)
with FaceDetector.create_from_options(options) as detector:
    detection_result = detector.detect(image)

ValueError: Task is not initialized with the image mode. Current running mode:VIDEO

In [None]:
import sys

if sys.version_info[0] < 3 and sys.version_info[1] < 2:
	raise Exception("Must be using >= Python 3.2")

from os import listdir, path

if not path.isfile('face_detection/detection/sfd/s3fd.pth'):
	raise FileNotFoundError('Save the s3fd model to face_detection/detection/sfd/s3fd.pth \
							before running this script!')

import multiprocessing as mp
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import argparse, os, cv2, traceback, subprocess
from tqdm import tqdm
from glob import glob
import audio
from hparams import hparams as hp

import face_detection

parser = argparse.ArgumentParser()

parser.add_argument('--ngpu', help='Number of GPUs across which to run in parallel', default=1, type=int)
parser.add_argument('--batch_size', help='Single GPU Face detection batch size', default=32, type=int)
parser.add_argument("--data_root", help="Root folder of the LRS2 dataset", required=True)
parser.add_argument("--preprocessed_root", help="Root folder of the preprocessed dataset", required=True)

args = parser.parse_args()

fa = [face_detection.FaceAlignment(face_detection.LandmarksType._2D, flip_input=False, 
									device='cuda:{}'.format(id)) for id in range(args.ngpu)]

template = 'ffmpeg -loglevel panic -y -i {} -strict -2 {}'
# template2 = 'ffmpeg -hide_banner -loglevel panic -threads 1 -y -i {} -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 {}'

def process_video_file(vfile, args, gpu_id):
	video_stream = cv2.VideoCapture(vfile)
	
	frames = []
	while 1:
		still_reading, frame = video_stream.read()
		if not still_reading:
			video_stream.release()
			break
		frames.append(frame)
	
	vidname = os.path.basename(vfile).split('.')[0]
	dirname = vfile.split('/')[-2]

	fulldir = path.join(args.preprocessed_root, dirname, vidname)
	os.makedirs(fulldir, exist_ok=True)

	batches = [frames[i:i + args.batch_size] for i in range(0, len(frames), args.batch_size)]

	i = -1
	for fb in batches:
		preds = fa[gpu_id].get_detections_for_batch(np.asarray(fb))

		for j, f in enumerate(preds):
			i += 1
			if f is None:
				continue

			x1, y1, x2, y2 = f
			cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), fb[j][y1:y2, x1:x2])

def process_audio_file(vfile, args):
	vidname = os.path.basename(vfile).split('.')[0]
	dirname = vfile.split('/')[-2]

	fulldir = path.join(args.preprocessed_root, dirname, vidname)
	os.makedirs(fulldir, exist_ok=True)

	wavpath = path.join(fulldir, 'audio.wav')

	command = template.format(vfile, wavpath)
	subprocess.call(command, shell=True)

	
def mp_handler(job):
	vfile, args, gpu_id = job
	try:
		process_video_file(vfile, args, gpu_id)
	except KeyboardInterrupt:
		exit(0)
	except:
		traceback.print_exc()
		
def main(args):
	print('Started processing for {} with {} GPUs'.format(args.data_root, args.ngpu))

	filelist = glob(path.join(args.data_root, '*/*.mp4'))

	jobs = [(vfile, args, i%args.ngpu) for i, vfile in enumerate(filelist)]
	p = ThreadPoolExecutor(args.ngpu)
	futures = [p.submit(mp_handler, j) for j in jobs]
	_ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))]

	print('Dumping audios...')

	for vfile in tqdm(filelist):
		try:
			process_audio_file(vfile, args)
		except KeyboardInterrupt:
			exit(0)
		except:
			traceback.print_exc()
			continue

if __name__ == '__main__':
	main(args)

In [None]:
def read_align_file(align_file_path):
    timestamps = []
    phonetic_labels = []

    with open(align_file_path, 'r') as align_file:
        for line in align_file:
            parts = line.strip().split()
            if len(parts) >= 3:
                timestamp = float(parts[0])
                label = parts[2]
                timestamps.append(timestamp)
                phonetic_labels.append(label)

    return timestamps, phonetic_labels

align_file_path = 'path_to_your_align_file.align'

timestamps, phonetic_labels = read_align_file(align_file_path)

for timestamp, label in zip(timestamps, phonetic_labels):
    print(f"Timestamp: {timestamp}, Label: {label}")


In [3]:
class CustomDataset(Dataset):
    def __init__(self, video_root, align_root):
        self.video_root = video_root
        self.align_root = align_root

        self.samples = self._prepare_samples()

    def _prepare_samples(self):
        samples = []
        speakers = os.listdir(self.video_root)
        for speaker in speakers:
            video_files = os.listdir(os.path.join(self.video_root, speaker))
            align_files = os.listdir(os.path.join(self.align_root, speaker))

            # Match video files with corresponding align files
            for video_file in video_files:
                video_id = os.path.splitext(video_file)[0]
                if f"{video_id}.align" in align_files:
                    samples.append((speaker, video_id))

        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        speaker, video_id = self.samples[idx]
        video_path = os.path.join(self.video_root, speaker, f"{video_id}.mpg")
        align_path = os.path.join(self.align_root, speaker, f"{video_id}.align")

        video, _, _ = read_video(video_path)
        timestamps, labels = self._read_align_file(align_path)

        return video, timestamps, labels

    def _read_align_file(self, align_file_path):
        timestamps = []
        labels = []

        with open(align_file_path, 'r') as align_file:
            for line in align_file:
                parts = line.strip().split()
                if len(parts) >= 3:
                    timestamp = float(parts[0])
                    label = parts[2]
                    timestamps.append(timestamp)
                    labels.append(label)

        return timestamps, labels

video_root = '/home2/souvikg544/gridcorpus/video'
align_root = '/home2/souvikg544/gridcorpus/transcription'

dataset = CustomDataset(video_root, align_root)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Example usage of DataLoader
# for video, timestamps, labels in dataloader:
#     # Process the video frames and labels here
#     print(f"Video shape: {video.shape}, Timestamps: {timestamps}, Labels: {labels}")


In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_video
from torchvision import transforms
from collections import defaultdict

class CustomDataset(Dataset):
    def __init__(self, video_root, align_root, transform=None):
        self.video_root = video_root
        self.align_root = align_root
        self.transform = transform

        self.samples = self._prepare_samples()

     def _prepare_samples(self):
        samples = []
        speakers = os.listdir(self.video_root)
        for speaker in speakers:
            video_files = os.listdir(os.path.join(self.video_root, speaker))
            align_files = os.listdir(os.path.join(self.align_root, speaker))

            # Match video files with corresponding align files
            for video_file in video_files:
                video_id = os.path.splitext(video_file)[0]
                if f"{video_id}.align" in align_files:
                    samples.append((speaker, video_id))

        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        speaker, video_id = self.samples[idx]
        video_path = os.path.join(self.video_root, speaker, f"{video_id}.mpg")
        align_path = os.path.join(self.align_root, speaker, f"{video_id}.align")

        video, _, _ = read_video(video_path)
        timestamps, labels = self._read_align_file(align_path)

        frame_tensors = []
        for timestamp in timestamps:
            frame_idx = int(timestamp * video.shape[0])
            frame = video[frame_idx]
            if self.transform:
                frame = self.transform(frame)
            frame_tensors.append(frame)

        frame_tensors = torch.stack(frame_tensors).to(torch.device("cuda"))  # Move to GPU

        label_tensors = torch.tensor(labels).to(torch.device("cuda"))  # Move to GPU

        return frame_tensors, label_tensors


     def _read_align_file(self, align_file_path):
        timestamps = []
        labels = []

        with open(align_file_path, 'r') as align_file:
            for line in align_file:
                parts = line.strip().split()
                if len(parts) >= 3:
                    timestamp = float(parts[0])
                    label = parts[2]
                    timestamps.append(timestamp)
                    labels.append(label)

        return timestamps, labels

# Set your image transformation
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

video_root = '/home2/souvikg544/gridcorpus/video'
align_root = '/home2/souvikg544/gridcorpus/transcription'

# Create dataset and dataloader
dataset = CustomDataset(video_root, align_root, transform=transform)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Example usage of DataLoader
for frames, labels in dataloader:
    frames = frames.to(torch.device("cuda"))  # Move to GPU
    labels = labels.to(torch.device("cuda"))  # Move to GPU

    # Process frames and labels on GPU
    print(f"Frames shape: {frames.shape}, Labels: {labels}")
