In [1]:
%load_ext autoreload

%autoreload 2

from api import data_extractor, data_extractor__common, beam__common, fileio, fidscs_globals

from apache_beam.io.filesystems import FileSystems
from apache_beam.io.localfilesystem import LocalFileSystem

import google.cloud.storage as gcs
from apache_beam.io.gcp import gcsio

import zipfile
import cv2
import io

beam_gcp_project = 'sc-fids-capstone'
gcs_bucket = 'sc-fids-capstone-bucket-sc-fids-capstone'
fidscs_globals.WORK_DIR = 'gs://'+gcs_bucket
print(f"fidscs_globals.WORK_DIR: {fidscs_globals.WORK_DIR}")

fidscs_globals.DATA_ROOT_DIR = fidscs_globals.WORK_DIR+'/data'
print(f"fidscs_globals.DATA_ROOT_DIR: {fidscs_globals.DATA_ROOT_DIR}")

fidscs_globals.TMP_DIR = fidscs_globals.DATA_ROOT_DIR+'/tmp'
print(f"fidscs_globals.TMP_DIR: {fidscs_globals.TMP_DIR}")

gcp_auth_key_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
fidscs_globals.GCS_CLIENT = gcs.Client.from_service_account_json(gcp_auth_key_path)
print(f"fidscs_globals.GCS_CLIENT: {fidscs_globals.GCS_CLIENT}")

fidscs_globals.GCS_IO = gcsio.GcsIO()   # gcsio.GcsIO(storage_client=fidscs_globals.GCS_CLIENT)
print(f"fidscs_globals.GCS_IO: {fidscs_globals.GCS_IO}")

fidscs_globals.WORK_DIR: gs://sc-fids-capstone-bucket-sc-fids-capstone
fidscs_globals.DATA_ROOT_DIR: gs://sc-fids-capstone-bucket-sc-fids-capstone/data
fidscs_globals.TMP_DIR: gs://sc-fids-capstone-bucket-sc-fids-capstone/data/tmp
fidscs_globals.GCS_CLIENT: <google.cloud.storage.client.Client object at 0x7f803d0f0e50>
fidscs_globals.GCS_IO: <apache_beam.io.gcp.gcsio.GcsIO object at 0x7f803d0f0fd0>


In [2]:
fidscs_globals.GCS_BUCKET = gcs.Bucket(fidscs_globals.GCS_CLIENT, name=gcs_bucket, user_project=beam_gcp_project)
print(f"fidscs_globals.GCS_BUCKET: {fidscs_globals.GCS_BUCKET}")

all_blobs = list(fidscs_globals.GCS_CLIENT.list_blobs(gcs_bucket))
print(f"\nblobs in gcs bucket ({gcs_bucket}):\n{all_blobs}")

fidscs_globals.GCS_BUCKET: <Bucket: sc-fids-capstone-bucket-sc-fids-capstone>

blobs in gcs bucket (sc-fids-capstone-bucket-sc-fids-capstone):
[<Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/, 1612504101117991>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/stitched_video_frames/, 1612463029130356>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/tmp/, 1612504101237567>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/tmp/video_index-20120129/, 1612465940293460>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/tmp/video_index-20120129/files_by_video_name.csv, 1612504107881996>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/videos/, 1612463028877583>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/videos/539_219_small_0.mov, 1612463124996612>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/videos/539_219_small_1.mov, 1612463131509443>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/videos/539_219_small_2.mov, 1612463140920613

In [3]:
fileio.make_dirs(fidscs_globals.DATA_ROOT_DIR)
fileio.make_dirs(fidscs_globals.TMP_DIR)

TEST_FOLDER_PATH = 'gs://'+gcs_bucket+'/data/test_folder'
fileio.make_dirs(TEST_FOLDER_PATH)

In [4]:
l = fileio.list_dir(fidscs_globals.DATA_ROOT_DIR, exclude_subdir=False)
print(f"{l if len(l)>0 else 'empty'}")

['stitched_video_frames/', 'test_folder/', 'tmp/', 'tmp/video_index-20120129/', 'tmp/video_index-20120129/files_by_video_name.csv', 'videos/', 'videos/539_219_small_0.mov', 'videos/539_219_small_1.mov', 'videos/539_219_small_2.mov', 'videos/548_small_0.mov', 'videos/548_small_1.mov', 'videos/549_small_0.mov', 'videos/549_small_1.mov', 'videos/633_208_small_0.mov']


In [5]:
data_extractor__common.download('http://secrets.rutgers.edu/dai/xml/ncslgr-xml.zip', 'gs://'+gcs_bucket+'/data/test_folder/ncslgr-corpus-index.csv')

  2%|▏         | 16384/835568 [00:00<00:07, 111590.20it/s]Downloading http://secrets.rutgers.edu/dai/xml/ncslgr-xml.zip (filesize: 835568 bytes) to gs://sc-fids-capstone-bucket-sc-fids-capstone/data/test_folder/ncslgr-corpus-index.csv...

100%|██████████| 835568/835568 [00:03<00:00, 214296.78it/s]Successfully downloaded 835568/835568 bytes from URL file http://secrets.rutgers.edu/dai/xml/ncslgr-xml.zip to local file gs://sc-fids-capstone-bucket-sc-fids-capstone/data/test_folder/ncslgr-corpus-index.csv!



In [6]:
remote_archive_path = 'http://www.bu.edu/asllrp/ncslgr-for-download/'+fidscs_globals.VIDEO_INDEXES_ARCHIVE

memfile = data_extractor__common.download_to_memfile(remote_archive_path, block_sz=8192, display=False)
zip_ref = zipfile.ZipFile(memfile, 'r')
print(f"unzipping {remote_archive_path} in-memory...")
zip_ref.printdir()
vid_indexes_dir = fidscs_globals.TMP_DIR+'/'+fidscs_globals.VIDEO_INDEX_BASE
sel_vid_index_path = vid_indexes_dir+'/files_by_video_name.csv'     #d_vid_indexes_info['sel_vid_index_path']
sel_vid_index_path_suffix = fidscs_globals.VIDEO_INDEX_BASE+'/files_by_video_name.csv'
sel_vid_index_fname = sel_vid_index_path_suffix.split('/')[-1]
print(f"we need to pull {sel_vid_index_path_suffix} out of in-memory extracted archive")
bytes_unzipped = zip_ref.read(sel_vid_index_path_suffix)
zip_ref.close()
if not fileio.path_exists(vid_indexes_dir, is_dir=True):
    fileio.make_dirs(vid_indexes_dir)
with fileio.open_file_write(vid_indexes_dir+'/'+sel_vid_index_fname) as f:
    f.write(bytes_unzipped)
    f.close()
print(f"\tDONE")

unzipping http://www.bu.edu/asllrp/ncslgr-for-download/video_index-20120129.zip in-memory...
File Name                                             Modified             Size
video_index-20120129/                          2012-01-29 16:57:24            0
video_index-20120129/files_by_video_name.csv   2012-01-29 16:52:50       621910
video_index-20120129/files_by_video_name.xls   2012-01-29 16:54:24       878080
video_index-20120129/files_by_xml_file.csv     2012-01-29 16:50:56      2604016
video_index-20120129/files_by_xml_file.xls     2012-01-29 16:56:16      1391616
we need to pull video_index-20120129/files_by_video_name.csv out of in-memory extracted archive
	DONE


In [7]:
blob_test_folder = fidscs_globals.GCS_BUCKET.blob('test_folder/')
print(f"\ntest_folder: {blob_test_folder}, exists: {blob_test_folder.exists(fidscs_globals.GCS_CLIENT)}")

blob_not_exist_folder = fidscs_globals.GCS_BUCKET.blob('not_exist/')
print(f"not_exist_folder: {blob_not_exist_folder}, exists: {blob_not_exist_folder.exists(fidscs_globals.GCS_CLIENT)}")

blob_video_index_file = fidscs_globals.GCS_BUCKET.blob('test_folder/ncslgr-corpus-index.csv')
print(f"video_index-20120129.csv file: {blob_video_index_file}, exists: {blob_video_index_file.exists(fidscs_globals.GCS_CLIENT)}")

test_folder_blobs = list(fidscs_globals.GCS_CLIENT.list_blobs(gcs_bucket, prefix='test_folder/'))
print(f"\nblobs in gcs bucket ({gcs_bucket}) test_folder:\n{test_folder_blobs}")

test_folder_sub_dir_blobs = list(fidscs_globals.GCS_CLIENT.list_blobs(gcs_bucket, prefix='test_folder/sub_dir'))
print(f"\nblobs in gcs bucket ({gcs_bucket}) test_folder_sub_dir_blobs:\n{test_folder_sub_dir_blobs}")

blob_data_folder = fidscs_globals.GCS_BUCKET.blob('data/')
print(f"\ndata_folder: {blob_data_folder}, exists: {blob_data_folder.exists(fidscs_globals.GCS_CLIENT)}")

data_folder_blobs = list(fidscs_globals.GCS_CLIENT.list_blobs(gcs_bucket, prefix='data/'))
print(f"\nblobs in gcs bucket ({gcs_bucket}) data/:\n{data_folder_blobs}")


test_folder: <Blob: sc-fids-capstone-bucket-sc-fids-capstone, test_folder/, None>, exists: False
not_exist_folder: <Blob: sc-fids-capstone-bucket-sc-fids-capstone, not_exist/, None>, exists: False
video_index-20120129.csv file: <Blob: sc-fids-capstone-bucket-sc-fids-capstone, test_folder/ncslgr-corpus-index.csv, None>, exists: False

blobs in gcs bucket (sc-fids-capstone-bucket-sc-fids-capstone) test_folder:
[]

blobs in gcs bucket (sc-fids-capstone-bucket-sc-fids-capstone) test_folder_sub_dir_blobs:
[]

data_folder: <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/, None>, exists: True

blobs in gcs bucket (sc-fids-capstone-bucket-sc-fids-capstone) data/:
[<Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/, 1612504241615677>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/stitched_video_frames/, 1612463029130356>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/test_folder/, 1612504241838199>, <Blob: sc-fids-capstone-bucket-sc-fids-capstone, data/test_folder/ncs

In [8]:
l = fileio.list_dir(TEST_FOLDER_PATH, exclude_subdir=True)
print(f"{l if len(l)>0 else 'empty'}")

['ncslgr-corpus-index.csv']


In [9]:
fileio.delete_file(TEST_FOLDER_PATH, recursive=True)

In [10]:
fileio.path_exists(TEST_FOLDER_PATH)

(False, 'gs://sc-fids-capstone-bucket-sc-fids-capstone/data/test_folder/')

In [11]:
l = fileio.list_dir(fidscs_globals.DATA_ROOT_DIR, exclude_subdir=False)
print(f"{l if len(l)>0 else 'empty'}")

['stitched_video_frames/', 'tmp/', 'tmp/video_index-20120129/', 'tmp/video_index-20120129/files_by_video_name.csv', 'videos/', 'videos/539_219_small_0.mov', 'videos/539_219_small_1.mov', 'videos/539_219_small_2.mov', 'videos/548_small_0.mov', 'videos/548_small_1.mov', 'videos/549_small_0.mov', 'videos/549_small_1.mov', 'videos/633_208_small_0.mov']


In [12]:
fidscs_globals.VIDEO_DIR = fidscs_globals.DATA_ROOT_DIR+'/videos'
exists, canon_videos_dir_path = fileio.path_exists(fidscs_globals.VIDEO_DIR, is_dir=True)
# print(f"{fidscs_globals.VIDEO_DIR} (canon path: {canon_videos_dir_path}) exists: {exists}")
fidscs_globals.STICHED_VIDEO_FRAMES_DIR = fidscs_globals.DATA_ROOT_DIR+'/'+fidscs_globals.STICHED_VIDEO_FRAMES_DIR_BASE

# this section assumes the following:
    # 1. fidscs_globals.VIDEO_DIR already exists
    # 2. videos have already been downloaded to it

existing_video_fnames = fileio.list_dir(fidscs_globals.VIDEO_DIR, exclude_subdir=True)
local_vid_segment_paths = [fileio.path_join(fidscs_globals.VIDEO_DIR, existing_video_fname) for existing_video_fname in existing_video_fnames]
# print(f"\nvideos (in {fidscs_globals.VIDEO_DIR}):\n{local_vid_segment_paths}")

truly_local_vid_dir = None
if fidscs_globals.GCS_CLIENT:
    truly_local_vid_dir = '~/tmp'+'/'.join(fidscs_globals.VIDEO_DIR.split('/')[1:])
    # print(f"\nGCS storage detected! Using truly_local_vid_dir: {truly_local_vid_dir}")
    # print(f"\t\t{truly_local_vid_dir} exists: {fileio.path_exists(truly_local_vid_dir, is_dir=True)}")

    if not fileio.path_exists(truly_local_vid_dir, is_dir=True)[0]:
        # print(f"\tcreating {truly_local_vid_dir}...")
        truly_local_vid_dir_path_segs = truly_local_vid_dir.split('/')
        # print(f"\t\ttruly_local_vid_dir_path_segs: {truly_local_vid_dir_path_segs}")
        s_cum_path = '~'
        for i, truly_local_vid_dir_path_seg in enumerate(truly_local_vid_dir_path_segs[1:]):
            s_cum_path += '/'+truly_local_vid_dir_path_seg
            fileio.make_dirs(s_cum_path)
        # print(f"\t\t{s_cum_path} exists: {fileio.path_exists(s_cum_path, is_dir=True)}")

for local_vid_segment_path in local_vid_segment_paths:
    video_fname = local_vid_segment_path.split('/')[-1]

    truly_local_id_segment_path = None
    truly_local_target_video_frames_dir = None

    if fidscs_globals.GCS_CLIENT:
        # then we need to save the video locally - this is especially import (required) for GCP DataFlow workers
        # print(f"\n\n\tattempting to open video {local_vid_segment_path} for reading...")
        with fileio.open_file_read(local_vid_segment_path) as f:
            # print(f"\t\tSUCCESS")
            buffer = f.read()
            truly_local_id_segment_path = truly_local_vid_dir+'/'+video_fname
            # print(f"\t\tattempting to write {truly_local_id_segment_path} (truly) locally...")
            with fileio.open_file_write(truly_local_id_segment_path) as f_local:
                f_local.write(buffer)
                f_local.close()
                # print(f"\t\t\tSUCCESS")
            f.close()
            local_vid_segment_path = truly_local_id_segment_path
            truly_local_target_video_frames_dir = truly_local_vid_dir+'/'+fidscs_globals.STICHED_VIDEO_FRAMES_DIR_BASE+'/'+video_fname.split('.')[0]
            # print(f"\t\t\tattempting to create directory {truly_local_target_video_frames_dir} (truly_local_target_video_frames_dir) for frames extracted from video {local_vid_segment_path}...")
            if not fileio.path_exists(truly_local_target_video_frames_dir, is_dir=True)[0]:
                # print(f"\t\t\t\tcreating {truly_local_target_video_frames_dir}...")
                fileio.make_dirs(truly_local_target_video_frames_dir)
            truly_local_target_video_frames_dir_exists = fileio.path_exists(truly_local_target_video_frames_dir, is_dir=True)
            # print(f"\t\t\t\t\t{truly_local_target_video_frames_dir} exists: {truly_local_target_video_frames_dir_exists}")
            if not truly_local_target_video_frames_dir_exists:
                raise Exception(f"required directory truly_local_target_video_frames_dir {truly_local_target_video_frames_dir_exists} does not exist")

    # print(f"\t\t\tattempting to capture (cv2.VideoCapture) video {local_vid_segment_path})...")
    vid_cap = cv2.VideoCapture(local_vid_segment_path)
    vid_cap.set(cv2.CAP_PROP_FPS, fidscs_globals.FPS)
    # print(f"\t\t\t\tSUCCESS")

    frame_count = int(vid_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # print(f"\t\t\t\tvideo {local_vid_segment_path} has {frame_count} frames")

    target_video_frames_dir = fidscs_globals.STICHED_VIDEO_FRAMES_DIR+'/'+video_fname.split('.')[0]
    # print(f"\t\t\t\tattempting to create directory {target_video_frames_dir} (target_video_frames_dir) for frames extracted from video {local_vid_segment_path}...")
    fileio.make_dirs(target_video_frames_dir)
    target_video_frames_dir_exists = fileio.path_exists(target_video_frames_dir, is_dir=True)
    # print(f"\t\t\t\t\t{target_video_frames_dir} exists: {target_video_frames_dir_exists}")
    if not target_video_frames_dir_exists:
        raise Exception(f"required directory target_video_frames_dir {target_video_frames_dir} does not exist")

    n_stitched_frames = 0

    success, frame = vid_cap.read()
    n_frames_extracted = 0
    while success:
        local_frame_path = fileio.path_join(target_video_frames_dir, f"{n_stitched_frames}.jpg") # this is the final frame path

        if truly_local_target_video_frames_dir is not None:
            # write truly local frame file
            truly_local_frame_path = truly_local_target_video_frames_dir+'/'+f"{n_stitched_frames}.jpg"
            # print(f"\t\t\t\t\t\tattempting to write {truly_local_frame_path} frame from video {local_vid_segment_path}...")
            cv2.imwrite(truly_local_frame_path, frame)
            # print(f"\t\t\t\t\t\t\tSUCCESS")
            # print(f"\t\t\t\t\t\t\tattempting to open {truly_local_frame_path} for read...")
            with fileio.open_file_read(truly_local_frame_path) as f_truly_local_frame:
                buffer = f_truly_local_frame.read()
                # print(f"\t\t\t\t\t\t\t\tSUCCESS")
                # print(f"\t\t\t\t\t\t\t\t\tattempting to open {local_frame_path} for final write...")
                with fileio.open_file_write(local_frame_path) as f_frame_final:
                    f_frame_final.write(buffer)
                    f_frame_final.close()
                    # print(f"\t\t\t\t\t\t\t\t\t\tSUCCESS")
                f_truly_local_frame.close()

        else:
            # print(f"\t\t\t\t\t\t\t\t\tattempting to open {local_frame_path} for final write...")
            cv2.imwrite(local_frame_path, frame)
            # print(f"\t\t\t\t\t\t\t\t\t\tSUCCESS")
            cv2.imwrite(fileio.path_join(target_video_frames_dir, f"{n_stitched_frames}.jpg"), frame)

        n_frames_extracted += 1
        n_stitched_frames += 1
        success, frame = vid_cap.read()

    if n_frames_extracted != frame_count:
        print(f"\t\t\t\t\t{local_vid_segment_path} has {frame_count} but we only extracted/wrote {n_frames_extracted} frames!")
    else:
        print(f"\t\t\t\t\tsuccessfully extracted all {frame_count} from {frame_count} from {local_vid_segment_path} to ")

if truly_local_vid_dir is not None and fileio.path_exists(truly_local_vid_dir, is_dir=True)[0]:
    print(f"\n\n\tdeleting {truly_local_vid_dir}...")
    fileio.delete_file(truly_local_vid_dir, recursive=True)
    print(f"\t\t{truly_local_vid_dir} exists: {fileio.path_exists(truly_local_vid_dir, is_dir=True)}")

					successfully extracted all 61 from 61 from ~/tmp/sc-fids-capstone-bucket-sc-fids-capstone/data/videos/539_219_small_0.mov to 
					successfully extracted all 61 from 61 from ~/tmp/sc-fids-capstone-bucket-sc-fids-capstone/data/videos/539_219_small_1.mov to 
					successfully extracted all 61 from 61 from ~/tmp/sc-fids-capstone-bucket-sc-fids-capstone/data/videos/539_219_small_2.mov to 
					successfully extracted all 734 from 734 from ~/tmp/sc-fids-capstone-bucket-sc-fids-capstone/data/videos/548_small_0.mov to 
					successfully extracted all 734 from 734 from ~/tmp/sc-fids-capstone-bucket-sc-fids-capstone/data/videos/548_small_1.mov to 
					successfully extracted all 221 from 221 from ~/tmp/sc-fids-capstone-bucket-sc-fids-capstone/data/videos/549_small_0.mov to 
					successfully extracted all 221 from 221 from ~/tmp/sc-fids-capstone-bucket-sc-fids-capstone/data/videos/549_small_1.mov to 
					successfully extracted all 84 from 84 from ~/tmp/sc-fids-capstone-bucket-sc-fids-capst