# data reader

In [None]:
import json

file_path = 'WLASL_v0.3.json'

# Load the whole index into memory; it is a list of gloss entries.
with open(file_path) as ipf:
    content = json.load(ipf)

# One tally per known split label; any other label aborts the count.
split_counts = {'train': 0, 'val': 0, 'test': 0}

for ent in content:
    gloss = ent['gloss']  # read but not used by the tally below

    for inst in ent['instances']:
        split = inst['split']

        if split not in split_counts:
            raise ValueError("Invalid split.")
        split_counts[split] += 1

# Keep the individual counters available under their usual names.
cnt_train = split_counts['train']
cnt_val = split_counts['val']
cnt_test = split_counts['test']

print('total glosses: {}'.format(len(content)))
print('total samples: {}'.format(cnt_train + cnt_val + cnt_test))


# video downloader

In [None]:
import os
import json
import time
import sys
import urllib.request
from multiprocessing.dummy import Pool

import random

import logging
# Send every record (DEBUG and above) to a per-run log file whose name carries
# the start time, truncating any file that already has that name.
logging.basicConfig(
    level=logging.DEBUG,
    filename='download_{}.log'.format(int(time.time())),
    filemode='w',
)
# Additionally attach a stdout handler to the root logger.
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

# Command-line tool used for YouTube links. Set this to "youtube-dl" if you
# want to use youtube-dl instead.
# See the README for an explanation regarding yt-dlp vs youtube-dl.
youtube_downloader = "yt-dlp"

def request_video(url, referer=''):
    """Download ``url`` and return the complete response body.

    The request is sent with a fixed browser-like ``User-Agent`` header.

    Args:
        url: Address of the resource to fetch.
        referer: Value for the ``Referer`` header; the header is omitted when
            this is empty.

    Returns:
        The response body as ``bytes``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request
        (for example ``urllib.error.URLError``); nothing is caught here.
    """
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'

    headers = {'User-Agent': user_agent}
    if referer:
        headers['Referer'] = referer

    request = urllib.request.Request(url, None, headers)  # The assembled request

    logging.info('Requesting {}'.format(url))
    # The context manager closes the response as soon as the body has been
    # read, so long batch runs do not accumulate open connections.
    with urllib.request.urlopen(request) as response:
        return response.read()


def save_video(data, saveto):
    """Write ``data`` to the file ``saveto``, then pause for a short while.

    Args:
        data: Bytes to store.
        saveto: Destination path; an existing file is overwritten.
    """
    with open(saveto, 'wb+') as out_file:
        out_file.write(data)

    # please be nice to the host - take pauses and avoid spamming
    pause_seconds = random.uniform(0.5, 1.5)
    time.sleep(pause_seconds)


def download_youtube(url, dirname, video_id):
    """Placeholder for YouTube links; always raises.

    The parameters mirror the other ``download_*`` functions but are unused.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("Urllib cannot deal with YouTube links.")


def download_aslpro(url, dirname, video_id):
    """Fetch an aslpro video and store it as ``<dirname>/<video_id>.swf``.

    Nothing is downloaded when the target file already exists.

    Args:
        url: Address of the video.
        dirname: Directory the file is written into.
        video_id: Identifier used as the file name stem.
    """
    saveto = os.path.join(dirname, '{}.swf'.format(video_id))

    if not os.path.exists(saveto):
        # The request carries the aslpro CGI page as its Referer header.
        payload = request_video(url, referer='http://www.aslpro.com/cgi-bin/aslpro/aslpro.cgi')
        # data = request_video(url, referer='https://www.psl.org.pk/dictionary/category/9')
        save_video(payload, saveto)
        return

    logging.info('{} exists at {}'.format(video_id, saveto))


def download_others(url, dirname, video_id):
    """Fetch a video from any other host into ``<dirname>/<video_id>.mp4``.

    Nothing is downloaded when the target file already exists.

    Args:
        url: Address of the video.
        dirname: Directory the file is written into.
        video_id: Identifier used as the file name stem.
    """
    saveto = os.path.join(dirname, '{}.mp4'.format(video_id))

    if not os.path.exists(saveto):
        payload = request_video(url)
        save_video(payload, saveto)
        return

    logging.info('{} exists at {}'.format(video_id, saveto))


def select_download_method(url):
    """Return the ``download_*`` function matching the host named in ``url``.

    The match is a plain substring test. 'aslpro' is checked first, then the
    YouTube markers; every other URL gets the generic downloader.
    """
    if 'aslpro' in url:
        return download_aslpro

    is_youtube = 'youtube' in url or 'youtu.be' in url
    return download_youtube if is_youtube else download_others


def download_nonyt_videos(indexfile, saveto='raw_videos'):
    """Download every non-YouTube video listed in the index file.

    YouTube entries are skipped with a warning. A failed download is logged
    and does not stop the run.

    Args:
        indexfile: Path to the JSON index, a list of entries each holding a
            'gloss' and a list of 'instances' with 'url' and 'video_id'.
        saveto: Output directory; created (including parents) if missing.
    """
    # Close the index file deterministically instead of leaking the handle.
    with open(indexfile) as index_fp:
        content = json.load(index_fp)

    # makedirs also copes with nested paths and an already existing directory.
    os.makedirs(saveto, exist_ok=True)

    for entry in content:
        gloss = entry['gloss']

        for inst in entry['instances']:
            video_url = inst['url']
            video_id = inst['video_id']

            logging.info('gloss: {}, video: {}.'.format(gloss, video_id))

            download_method = select_download_method(video_url)

            if download_method is download_youtube:
                logging.warning('Skipping YouTube video {}'.format(video_id))
                continue

            try:
                download_method(video_url, saveto, video_id)
            except Exception as e:
                # Best effort: keep going, but record why this one failed.
                logging.error('Unsuccessful downloading - video {}: {}'.format(video_id, e))


def check_youtube_dl_version():
    """Check that the configured YouTube downloader can be run.

    Runs ``<youtube_downloader> --version`` through the shell and treats
    empty output as "not installed".

    Raises:
        AssertionError: If the command produced no output.
    """
    # Close the pipe explicitly instead of leaving it to garbage collection.
    with os.popen(f'{youtube_downloader} --version') as pipe:
        ver = pipe.read()

    # Raised explicitly (not via ``assert``) so the check is not stripped when
    # Python runs with -O; the exception type is kept for existing callers.
    if not ver:
        raise AssertionError(
            f"{youtube_downloader} cannot be found in PATH. Please verify your installation."
        )


def download_yt_videos(indexfile, saveto='raw_videos'):
    """Download every YouTube video listed in the index file.

    Non-YouTube entries are ignored. A video is skipped when a ``.mp4`` or
    ``.mkv`` file named after the last 11 characters of its URL already
    exists in ``saveto``. Failures are logged and do not stop the run.

    Args:
        indexfile: Path to the JSON index, a list of entries each holding a
            list of 'instances' with a 'url'.
        saveto: Output directory; created (including parents) if missing.
    """
    import subprocess  # local import: not among this cell's top-level imports

    # Close the index file deterministically instead of leaking the handle.
    with open(indexfile) as index_fp:
        content = json.load(index_fp)

    # makedirs also copes with nested paths and an already existing directory.
    os.makedirs(saveto, exist_ok=True)

    # Output template handed to the downloader, which fills in id and ext.
    output_template = os.path.join(saveto, '%(id)s.%(ext)s')

    for entry in content:
        for inst in entry['instances']:
            video_url = inst['url']

            if 'youtube' not in video_url and 'youtu.be' not in video_url:
                continue

            # The last 11 characters of the URL are used as the file name stem.
            yt_identifier = video_url[-11:]
            if (os.path.exists(os.path.join(saveto, yt_identifier + '.mp4'))
                    or os.path.exists(os.path.join(saveto, yt_identifier + '.mkv'))):
                logging.info('YouTube videos {} already exists.'.format(video_url))
                continue

            # Pass the arguments as a list so a URL taken from the index file
            # can never be interpreted by a shell.
            cmd = [youtube_downloader, video_url, '-o', output_template]
            try:
                rv = subprocess.run(cmd).returncode
            except OSError as e:
                # The downloader executable could not be started at all.
                logging.error('Could not run {}: {}'.format(youtube_downloader, e))
                rv = 1

            if not rv:
                logging.info('Finish downloading youtube video url {}'.format(video_url))
            else:
                logging.error('Unsuccessful downloading - youtube video url {}'.format(video_url))

            # please be nice to the host - take pauses and avoid spamming
            time.sleep(random.uniform(1.0, 1.5))
    

if __name__ == '__main__':
    # Pass 1: everything that can be fetched with plain HTTP requests.
    logging.info('Start downloading non-youtube videos.')
    download_nonyt_videos(indexfile='WLASL_v0.3.json')

    # Pass 2: YouTube links, after confirming the external downloader runs.
    check_youtube_dl_version()
    logging.info('Start downloading youtube videos.')
    download_yt_videos(indexfile='WLASL_v0.3.json')



# preprocessing

In [None]:
# # preprocessing script for WLASL dataset
# # 1. Convert .swf, .mkv file to mp4.
# # 2. Extract YouTube frames and create video instances.

import os
import json
import cv2
import shutil

# def convert_everything_to_mp4():
#     cmd = 'bash scripts/swf2mp4.sh'
#     os.system(cmd)

# def video_to_frames(video_path, size=None):
#     cap = cv2.VideoCapture(video_path)
#     frames = []
    
#     while True:
#         ret, frame = cap.read()
#         if ret:
#             if size:
#                 frame = cv2.resize(frame, size)
#             frames.append(frame)
#         else:
#             break

#     cap.release()

#     return frames

# import cv2

def video_to_frames(video_path):
    """Read the video at ``video_path`` and return its frames as a list.

    Frames are collected in reading order until ``read()`` reports failure,
    so the list is empty when nothing could be read.
    """
    capture = cv2.VideoCapture(video_path)
    frames = []

    ok, frame = capture.read()
    while ok:
        frames.append(frame)
        ok, frame = capture.read()

    capture.release()
    return frames

import os
import subprocess

def convert_everything_to_mp4(source_directory=r"C:/Users/Pc/Desktop/WLASL/start_kit",
                              ffmpeg_path=r"C:/ffmpeg-6.1.1-essentials_build/bin/ffmpeg.exe"):
    """Convert every ``.swf`` file in ``source_directory`` to MP4 with FFmpeg.

    Each ``name.swf`` is converted to ``name.mp4`` in the same directory.
    Only ``.swf`` files are picked up; other formats (e.g. ``.mkv``) are not
    touched by this function. A failed conversion is reported and the
    remaining files are still processed.

    Args:
        source_directory: Directory containing the files to be converted.
        ffmpeg_path: Path to the FFmpeg executable.
    """
    swf_files = [f for f in os.listdir(source_directory) if f.endswith('.swf')]

    for swf_file in swf_files:
        input_file = os.path.join(source_directory, swf_file)
        output_file = os.path.join(source_directory, f"{os.path.splitext(swf_file)[0]}.mp4")
        # An argument list (no shell) keeps paths with spaces or quotes intact.
        command = [
            ffmpeg_path, '-i', input_file,
            '-c:v', 'libx264', '-crf', '23', '-preset', 'veryfast',
            '-c:a', 'aac', '-b:a', '128k',
            '-movflags', '+faststart',
            output_file,
        ]
        try:
            # check=True turns a non-zero FFmpeg exit status into an exception,
            # so a failed conversion is no longer reported as a success.
            subprocess.run(command, check=True)
        except (OSError, subprocess.CalledProcessError) as e:
            # OSError covers an FFmpeg executable that cannot be started.
            print(f"Error occurred while converting file '{swf_file}' to MP4 format:", e)
        else:
            print(f"File '{swf_file}' converted to MP4 format successfully.")

def convert_frames_to_video(frame_array, path_out, size, fps=25):
    """Write the frames in ``frame_array`` to ``path_out`` as an 'mp4v' video.

    Args:
        frame_array: Frames to write, in playback order.
        path_out: Path of the video file to create.
        size: Frame size handed to the writer; callers in this file pass
            (width, height).
        fps: Frame rate handed to the writer.
    """
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(path_out, fourcc, fps, size)

    for frame in frame_array:
        writer.write(frame)

    writer.release()

def extract_frame_as_video(src_video_path, start_frame, end_frame):
    """Return the frames ``start_frame``..``end_frame`` (inclusive) of a video.

    The whole video is decoded first and then sliced, so both indices are
    0-based positions in the decoded frame list.
    """
    all_frames = video_to_frames(src_video_path)
    stop = end_frame + 1  # slice end is exclusive, the requested range is not
    return all_frames[start_frame:stop]

def extract_all_yt_instances(content):
    """Create one ``videos/<video_id>.mp4`` per instance in the index.

    YouTube instances are cut out of ``raw_videos_mp4/<id>.mp4`` (``<id>``
    being the last 11 characters of the URL) using the instance's
    'frame_start'/'frame_end'; all other instances are copied from
    ``raw_videos_mp4/<video_id>.mp4``. Instances whose source file is missing
    or whose destination already exists are skipped.

    Args:
        content: Parsed index, a list of entries each holding a list of
            'instances' with 'url', 'video_id' and, for YouTube instances,
            'frame_start' and 'frame_end'.
    """
    cnt = 1

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('videos', exist_ok=True)

    for entry in content:
        for inst in entry['instances']:
            url = inst['url']
            video_id = inst['video_id']
            dst_video_path = os.path.join('videos', video_id + '.mp4')
            cnt += 1

            if 'youtube' in url or 'youtu.be' in url:
                yt_identifier = url[-11:]
                src_video_path = os.path.join('raw_videos_mp4', yt_identifier + '.mp4')

                if not os.path.exists(src_video_path):
                    continue

                if os.path.exists(dst_video_path):
                    print('{} exists.'.format(dst_video_path))
                    continue

                # because the JSON file indexes from 1.
                start_frame = inst['frame_start'] - 1
                end_frame = inst['frame_end'] - 1

                # No usable end frame: keep the source video as it is.
                if end_frame <= 0:
                    shutil.copyfile(src_video_path, dst_video_path)
                    continue

                selected_frames = extract_frame_as_video(src_video_path, start_frame, end_frame)

                # An unreadable source or a range outside the video yields no
                # frames; skip it rather than crash on selected_frames[0].
                if not selected_frames:
                    print('No frames in range {}-{} of {}; skipping {}.'.format(
                        start_frame, end_frame, src_video_path, video_id))
                    continue

                # when OpenCV reads an image, it returns size in (h, w, c)
                # when OpenCV creates a writer, it requires size in (w, h).
                size = selected_frames[0].shape[:2][::-1]

                convert_frames_to_video(selected_frames, dst_video_path, size)

                save_raw_video(src_video_path, video_id)

                print(cnt, dst_video_path)
            else:
                src_video_path = os.path.join('raw_videos_mp4', video_id + '.mp4')

                if os.path.exists(dst_video_path):
                    print('{} exists.'.format(dst_video_path))
                    continue

                if not os.path.exists(src_video_path):
                    continue

                print(cnt, dst_video_path)
                shutil.copyfile(src_video_path, dst_video_path)

                save_raw_video(src_video_path, video_id)

def save_raw_video(raw_video_path, video_id):
    """Copy ``raw_video_path`` to ``new_raw_vid/<video_id>.mp4``.

    The ``new_raw_vid`` directory is created on first use.
    """
    target_dir = 'new_raw_vid'

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    shutil.copyfile(raw_video_path, os.path.join(target_dir, video_id + '.mp4'))

def main():
    """Run the preprocessing pipeline.

    Steps: convert the downloaded .swf files to mp4, build the per-instance
    videos from the WLASL index, then re-encode one hard-coded sample video
    into ``videos/`` unless it is already there.
    """
    # 1. Convert .swf files to mp4.
    convert_everything_to_mp4()

    # 2. Build one video per instance; the index file is closed after loading.
    with open('WLASL_v0.3.json') as index_fp:
        content = json.load(index_fp)
    extract_all_yt_instances(content)

    video_path = r"C:/Users/Pc/Desktop/WLASL/start_kit/022.mp4"

    # Process the provided video
    if video_path:
        video_id = os.path.splitext(os.path.basename(video_path))[0]
        print(video_id)
        dst_video_path = os.path.join('videos/', video_id + '.mp4')
        print(dst_video_path)

        if os.path.exists(dst_video_path):
            print("Video already exists:", dst_video_path)
            return

        frames = video_to_frames(video_path)
        # A missing or unreadable file yields no frames; report it instead of
        # failing on frames[0] below.
        if not frames:
            print("No frames could be read from:", video_path)
            return

        # OpenCV frames are (h, w, c) while the writer expects (w, h).
        size = frames[0].shape[:2][::-1]
        convert_frames_to_video(frames, dst_video_path, size)
        save_raw_video(video_path, video_id)
        print("Video processed and saved:", dst_video_path)

if __name__ == "__main__":
    main()











# import os
# import json
# import cv2

# import shutil

# def convert_everything_to_mp4():
#     cmd = 'bash scripts/swf2mp4.sh'

#     os.system(cmd)


# def video_to_frames(video_path, size=None):
#     """
#     video_path -> str, path to video.
#     size -> (int, int), width, height.
#     """

#     cap = cv2.VideoCapture(video_path)

#     frames = []
    
#     while True:
#         ret, frame = cap.read()
    
#         if ret:
#             if size:
#                 frame = cv2.resize(frame, size)
#             frames.append(frame)
#         else:
#             break

#     cap.release()

#     return frames


# def convert_frames_to_video(frame_array, path_out, size, fps=25):
#     out = cv2.VideoWriter(path_out, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)

#     for i in range(len(frame_array)):
#         # writing to a image array
#         out.write(frame_array[i])
#     out.release()


# def extract_frame_as_video(src_video_path, start_frame, end_frame):
#     frames = video_to_frames(src_video_path)
#     # print(frames)
#     return frames[start_frame: end_frame+1]


# def extract_all_yt_instances(content):
#     cnt = 1

#     if not os.path.exists('videos'):
#         os.mkdir('videos')

#     for entry in content:
#         instances = entry['instances']

#         for inst in instances:
#             url = inst['url']
#             video_id = inst['video_id']

#             if 'youtube' in url or 'youtu.be' in url:
#                 cnt += 1
                
#                 yt_identifier = url[-11:]

#                 src_video_path = os.path.join('raw_videos_mp4', yt_identifier + '.mp4')
#                 dst_video_path = os.path.join('videos', video_id + '.mp4')

#                 if not os.path.exists(src_video_path):
#                     continue

#                 if os.path.exists(dst_video_path):
#                     print('{} exists.'.format(dst_video_path))
#                     continue

#                 # because the JSON file indexes from 1.
#                 start_frame = inst['frame_start'] - 1
#                 end_frame = inst['frame_end'] - 1

#                 if end_frame <= 0:
#                     shutil.copyfile(src_video_path, dst_video_path)
#                     continue

#                 selected_frames = extract_frame_as_video(src_video_path, start_frame, end_frame)
                
#                 # when OpenCV reads an image, it returns size in (h, w, c)
#                 # when OpenCV creates a writer, it requres size in (w, h).
#                 size = selected_frames[0].shape[:2][::-1]
                
#                 convert_frames_to_video(selected_frames, dst_video_path, size)

#                 print(cnt, dst_video_path)
#             else:
#                 cnt += 1

#                 src_video_path = os.path.join('raw_videos_mp4', video_id + '.mp4')
#                 dst_video_path = os.path.join('videos', video_id + '.mp4')

#                 if os.path.exists(dst_video_path):
#                     print('{} exists.'.format(dst_video_path))
#                     continue

#                 if not os.path.exists(src_video_path):
#                     continue

#                 print(cnt, dst_video_path)
#                 shutil.copyfile(src_video_path, dst_video_path)

# # ===============new line ========================
#                 save_raw_video(dst_video_path, video_id)

# # ===============new function ========================
# def save_raw_video(raw_video_path, video_id):
#     raw_videos_directory = 'new_videos_for_processing'

#     if not os.path.exists(raw_videos_directory):
#         os.makedirs(raw_videos_directory)

#     new_raw_video_path = os.path.join(raw_videos_directory, video_id + '.mp4')
#     shutil.copyfile(raw_video_path, new_raw_video_path)

        
# def main():
#     # 1. Convert .swf, .mkv file to mp4.
#     convert_everything_to_mp4()

#     content = json.load(open('WLASL_v0.3.json'))
#     extract_all_yt_instances(content)


# if __name__ == "__main__":
#     main()

