In [1]:
"""
This script splits the WLASL dataset into sub-datasets that contain only the
first k glosses. The glosses are taken in the order in which they appear in
the JSON file made available by the authors of the WLASL dataset.

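Usage: python k_gloss_splitting.py param1 param2
 - param1: path to the full dataset (e.g. ./WLASL_full/)
 - param2: number of glosses to be considered for the split (e.g. 2000)
Note: in this notebook the two values are not read from the command line;
set the `path_dataset` and `glosses` global variables instead.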
"""
import json
import os
import shutil
import sys

import cv2
from tqdm import tqdm

In [19]:
# global variables
# Location of the WLASL metadata JSON; loaded by dataset_processing().
PATH_JSON = r'../json/WLASL_v0.3.json'
# Directory holding the source '<video_id>.mp4' files (trailing slash included).
path_dataset = r'../video_data/videos/'
# Number of glosses to keep in the sub-dataset; main() accepts 1 to 2000.
glosses = 2000

In [20]:
def dataset_processing(glosses, path_k_glosses_dir, path_dataset):
    """Build the k-gloss sub-dataset on disk.

    Loads the metadata from PATH_JSON, maps each video of the first
    `glosses` entries to its split and gloss, recreates the target directory
    tree and finally copies the videos into it.

    Args:
        glosses: number of leading JSON entries (glosses) to keep.
        path_k_glosses_dir: root directory of the sub-dataset to create.
        path_dataset: directory containing the source videos.
    """
    metadata = read_json(PATH_JSON)
    video_targets = splitting_train_val_test(metadata, glosses)
    make_target_dirs(metadata, glosses, path_k_glosses_dir)
    save_in_dirs(path_dataset, path_k_glosses_dir, video_targets)

def read_json(file_path):
    """Load a JSON file and return its parsed content.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The Python object produced by `json.load`.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(file_path, encoding='utf-8') as json_file:
        return json.load(json_file)

def splitting_train_val_test(json_file, glosses):
    """Map every video of the first `glosses` entries to its destination.

    Args:
        json_file: iterable of gloss entries; each entry is indexed with the
            keys 'gloss' and 'instances', and each instance with 'video_id'
            and 'split'.
        glosses: number of leading entries to take into account.

    Returns:
        A dict `{video_id: (split, gloss_name)}`.
    """
    print('[log] > Splitting videos in train, val and test ...')
    split_by_video = {}
    for position, entry in tqdm(enumerate(json_file)):
        # Entries are consumed in file order; stop once the quota is reached.
        if not position < glosses:
            break
        split_by_video.update(
            (instance['video_id'], (instance['split'], entry['gloss']))
            for instance in entry['instances']
        )

    return split_by_video

In [21]:
def make_target_dirs(json_file, glosses, path_k_glosses_dir):
    """(Re)create the directory tree of the sub-dataset.

    Any existing tree at `path_k_glosses_dir` is deleted first. The result is
    `<root>/{train,val,test}/<gloss>/` for each of the first `glosses`
    entries of `json_file`.

    Args:
        json_file: iterable of gloss entries, each indexed with the key 'gloss'.
        glosses: number of leading entries to create directories for.
        path_k_glosses_dir: root directory of the sub-dataset, relative or
            absolute, with or without a trailing separator.

    Raises:
        OSError: if a directory cannot be removed or created.
    """
    # Check the path exactly as given: prefixing it with './' would turn an
    # absolute path into a non-existent relative one and skip the cleanup.
    if os.path.isdir(path_k_glosses_dir):
        shutil.rmtree(path_k_glosses_dir)
    # makedirs also creates missing parent directories.
    os.makedirs(path_k_glosses_dir)

    # create the train, val and test dirs
    split_dirs = [
        os.path.join(path_k_glosses_dir, split)
        for split in ('train', 'val', 'test')
    ]
    for split_dir in split_dirs:
        os.mkdir(split_dir)

    print('\n[log] > Creating dirs ...')
    for k, gloss in tqdm(enumerate(json_file)):
        if not k < glosses:
            break
        for split_dir in split_dirs:
            os.mkdir(os.path.join(split_dir, gloss['gloss']))

def save_in_dirs(path_dataset, path_k_glosses_dir, videos):
    """Copy every available video into `<root>/<split>/<gloss>/`.

    Args:
        path_dataset: directory containing the source '<video_id>.mp4' files,
            with or without a trailing separator.
        path_k_glosses_dir: root directory of the sub-dataset, with or without
            a trailing separator.
        videos: dict `{video_id: (split, gloss_name)}`.

    Raises:
        OSError: if a copy fails, e.g. because the destination directory does
            not exist.
    """
    print('\n[log] > Copying videos in their own dir ...')
    for video_id, data in tqdm(videos.items()):
        file_name = video_id + '.mp4'
        source_path = os.path.join(path_dataset, file_name)
        if not os.path.exists(source_path):
            # Best effort: videos listed in the metadata but absent on disk
            # are skipped.
            continue
        split, gloss_name = data[0], data[1]
        # Spell out the destination file name so that a missing target
        # directory raises instead of producing a file named after the gloss.
        destination_path = os.path.join(
            path_k_glosses_dir, split, gloss_name, file_name
        )
        shutil.copy(source_path, destination_path)

In [22]:
def show_info(path_k_glosses_dir):
    """Print the summary of the sub-dataset rooted at `path_k_glosses_dir`.

    First prints the per-split file counts, then the shortest and longest
    video found under the root.
    """
    print_entries(path_k_glosses_dir)
    print_videos_info(path_k_glosses_dir)

def _count_files(path):
    """Return the number of files found recursively under `path`."""
    return sum(len(files) for _, _, files in os.walk(path))


def _percentage(part, total):
    """Return `part` as a percentage of `total`, or 0.0 when `total` is 0."""
    return part / total * 100 if total else 0.0


def print_entries(path_root):
    """Print how many files the dataset holds overall and per split.

    Files are counted recursively under `path_root` and under its 'train',
    'val' and 'test' sub-directories. A missing or empty tree is reported
    with zero counts and 0% shares instead of failing on a division by zero.

    Args:
        path_root: root directory of the sub-dataset, with or without a
            trailing separator.
    """
    n_tot = _count_files(path_root)
    n_train = _count_files(os.path.join(path_root, 'train'))
    n_val = _count_files(os.path.join(path_root, 'val'))
    n_test = _count_files(os.path.join(path_root, 'test'))

    print('\n[log] > Dataset summary:')
    print(f'Total videos: {n_tot}')
    print(f'Videos in train: {n_train} - {_percentage(n_train, n_tot):,.0f}%')
    print(f'Videos in val:   {n_val} - {_percentage(n_val, n_tot):,.0f}%')
    print(f'Videos in test:  {n_test} - {_percentage(n_test, n_tot):,.0f}%')


def print_videos_info(path_root):
    """Print the shortest and the longest video found under `path_root`.

    The metadata list returned by `get_videos_info` is sorted by duration, so
    its first and last items are the extremes. When no video is found a short
    notice is printed instead of indexing into an empty list.

    Args:
        path_root: root directory of the sub-dataset.
    """
    videos = get_videos_path(path_root)
    info = get_videos_info(videos)
    print('\n[log] > Dataset info:')
    if not info:
        print('No videos found.')
        return

    shortest, longest = info[0], info[-1]
    print(
        f'The video {shortest[0]} has the MIN length: {shortest[1]} - '
        f'Total frames: {shortest[2]}'
    )
    print(
        f'The video {longest[0]} has the MAX length: {longest[1]} - '
        f'Total frames: {longest[2]}'
    )

In [23]:
def get_videos_path(path_root):
    """Return the paths of all files found recursively under `path_root`.

    The tree is walked from `path_root` expressed relative to the current
    working directory, so the returned paths are relative as well.
    """
    start_dir = os.path.relpath(path_root)
    return [
        os.path.join(folder, file_name)
        for folder, _, file_names in os.walk(start_dir)
        for file_name in file_names
    ]


def get_videos_info(videos):
    """Collect the metadata of every video, ordered by ascending duration.

    Args:
        videos: iterable of video file paths.

    Returns:
        A list with one `get_meta_data` result per path, sorted on the second
        item of each result (the duration).
    """
    print('\n[log] > Retrieving videos metadata ...')
    records = []
    for video_path in tqdm(videos):
        records.append(get_meta_data(video_path))
    # In-place sort; stable, so equal durations keep their input order.
    records.sort(key=lambda record: record[1])
    return records
    
def get_meta_data(file_path):
    """Return `(file_name, duration, frame_count)` for a video file.

    The duration is `frame_count / fps`, using the values reported by OpenCV.
    When the reported frame rate is not positive (presumably a file that
    OpenCV could not open or decode - TODO confirm), the duration is set to
    0.0 instead of failing on a division by zero.

    Args:
        file_path: path of the video file.

    Returns:
        A tuple with the base name of the file, the duration as a float and
        the frame count as an int.
    """
    video_cap = cv2.VideoCapture(file_path)
    try:
        fps = video_cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Always free the capture handle, even if a property query fails.
        video_cap.release()
    duration = frame_count / fps if fps > 0 else 0.0
    file_name = os.path.basename(os.path.normpath(file_path))
    return file_name, duration, frame_count


In [24]:
def main():
    """Validate the configuration, build the sub-dataset and print its summary.

    Reads the module-level `glosses` and `path_dataset` variables. When
    `glosses` is not an integer between 1 and 2000 a hint is printed and
    nothing is processed. Errors raised while processing propagate to the
    caller, so a failure such as a malformed JSON file (a `ValueError`
    subclass) is not reported as an invalid gloss count.
    """
    # Validate up front rather than wrapping the whole pipeline in a
    # `try/except ValueError`, which would also hide unrelated errors.
    if not isinstance(glosses, int) or not 1 <= glosses <= 2000:
        print('Insert an integer: 1~2000')
        return

    path_k_glosses_dir = r'../video_data/WLASL_' + str(glosses) + '/'
    print('[log] > START DATASET PROCESSING ...\n')
    dataset_processing(glosses, path_k_glosses_dir, path_dataset)
    show_info(path_k_glosses_dir)
    print('\n[log] > DONE!')


In [25]:
# Entry point: runs the whole pipeline when this cell (or the exported script)
# is executed as the main module.
if __name__ == '__main__':
    main()

[log] > START DATASET PROCESSING ...

[log] > Splitting videos in train, val and test ...


2000it [00:00, 370816.37it/s]



[log] > Creating dirs ...


2000it [00:01, 1093.27it/s]



[log] > Copying videos in their own dir ...


100%|██████████| 21083/21083 [00:36<00:00, 572.17it/s]



[log] > Dataset summary:
Total videos: 19049
Videos in train: 12859 - 68%
Videos in val:   3670 - 19%
Videos in test:  2520 - 13%

[log] > Retrieving videos metadata ...


100%|██████████| 19049/19049 [01:31<00:00, 207.30it/s]


[log] > Dataset info:
The video 15144.mp4 has the MIN length: 0.3003003003003003 - Total frames: 9
The video 57628.mp4 has the MAX length: 8.125 - Total frames: 195

[log] > DONE!



