In [1]:
import argparse
import os

import apache_beam as beam
import tensorflow as tf
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam.runners.interactive.interactive_beam as ib
import apache_beam.transforms.sql

import beam__common
import fidscs_globals

import data_extractor
import preprocessor

In [2]:
# Root directory of the dataset files. It is passed to the extractor and
# preprocessor runs below and is later assigned to fidscs_globals.DATA_ROOT_DIR.
data_dir = "/tmp/fids-capstone-data/data"

In [3]:
# Run the data extractor against data_dir with its Beam code path enabled (use_beam=True).
# NOTE(review): max_target_videos=-1 presumably means "no limit on target videos" — confirm in data_extractor.
data_extractor.run(max_target_videos=-1, data_dir=data_dir, use_beam=True)

use_beam: True
Found dataset /tmp/fids-capstone-data/data/consultant-index.csv
Found dataset /tmp/fids-capstone-data/data/document-consultant-targetvideo-index.csv
Found dataset /tmp/fids-capstone-data/data/document-consultant-utterance-index.csv
Found dataset /tmp/fids-capstone-data/data/document-consultant-utterance-targetvideo-index.csv
Found dataset /tmp/fids-capstone-data/data/document-consultant-utterance-token-index.csv
Found dataset /tmp/fids-capstone-data/data/ncslgr-corpus-index.csv
Found dataset /tmp/fids-capstone-data/data/document-consultant-index.csv
Found dataset /tmp/fids-capstone-data/data/document-consultant-targetvideo-utterance-token-frame-index.csv
Found dataset /tmp/fids-capstone-data/data/vocabulary-index.csv


In [4]:
# Run the preprocessor against the same data_dir used by the extractor above.
# The recorded output shows it reporting the train-assoc, val and train CSV files.
preprocessor.run(data_dir=data_dir)

Found train/val dataset /tmp/fids-capstone-data/data/train-assoc.csv
Found train/val dataset /tmp/fids-capstone-data/data/val.csv
Found train/val dataset /tmp/fids-capstone-data/data/train.csv


In [5]:
# Keyword settings for the Beam pipeline; unpacked into PipelineOptions below.
options = dict(
    project='my-project',  # change
    runner='InteractiveRunner',
    direct_num_workers=0,  # 0 is use all available cores
    # one of ['in_memory', 'multi_threading', 'multi_processing'];
    # 'multi_processing' doesn't seem to work for DirectRunner?
    direct_running_mode='multi_threading',
    streaming=False,  # set to True if data source is unbounded (e.g. GCP PubSub)
)

# flags=[] keeps PipelineOptions from parsing command-line flags itself;
# it is easier to pass in options from the command line this way
pipeline_options = PipelineOptions(flags=[], **options)

# Echo the fully resolved option set (explicit values plus defaults).
print(f"PipelineOptions:\n{pipeline_options.get_all_options()}\n")

PipelineOptions:
{'runner': 'InteractiveRunner', 'streaming': False, 'beam_services': {}, 'type_check_strictness': 'DEFAULT_TO_ANY', 'type_check_additional': '', 'pipeline_type_check': True, 'runtime_type_check': False, 'performance_runtime_type_check': False, 'direct_runner_use_stacked_bundle': True, 'direct_runner_bundle_repeat': 0, 'direct_num_workers': 0, 'direct_running_mode': 'multi_threading', 'dataflow_endpoint': 'https://dataflow.googleapis.com', 'project': 'my-project', 'job_name': None, 'staging_location': None, 'temp_location': None, 'region': None, 'service_account_email': None, 'no_auth': False, 'template_location': None, 'labels': None, 'update': False, 'transform_name_mapping': None, 'enable_streaming_engine': False, 'dataflow_kms_key': None, 'flexrs_goal': None, 'hdfs_host': None, 'hdfs_port': None, 'hdfs_user': None, 'hdfs_full_urls': False, 'num_workers': None, 'max_num_workers': None, 'autoscaling_algorithm': None, 'machine_type': None, 'disk_size_gb': None, 'disk_t

In [6]:
# Point the shared fidscs_globals module at the data directory. The validation
# cell that follows derives every other directory and dataset path from this value.
fidscs_globals.DATA_ROOT_DIR = data_dir

In [7]:
# Validate the data root directory and, when it is usable, derive every
# directory/dataset path stored on fidscs_globals from it.
# NOTE(review): can_proceed is set here but is not read by any later cell in this notebook.
can_proceed = True

_data_root = fidscs_globals.DATA_ROOT_DIR
# listdir is only consulted when the directory exists (short-circuit evaluation)
_data_root_is_populated = tf.io.gfile.exists(_data_root) and len(tf.io.gfile.listdir(_data_root)) > 0

if _data_root_is_populated:
    # (fidscs_globals attribute to set, literal sub-directory name)
    # NOTE: 'STICHED' (sic) matches the attribute name as spelled on fidscs_globals.
    for _dir_attr, _subdir_name in (
        ('VIDEO_DIR', 'videos'),
        ('STICHED_VIDEO_FRAMES_DIR', 'stitched_video_frames'),
    ):
        setattr(fidscs_globals, _dir_attr, os.path.join(_data_root, _subdir_name))

    # (fidscs_globals attribute to set, fidscs_globals attribute holding the file name)
    for _path_attr, _fname_attr in (
        ('CORPUS_DS_PATH', 'CORPUS_DS_FNAME'),
        ('DOCUMENT_ASL_CONSULTANT_DS_PATH', 'DOCUMENT_ASL_CONSULTANT_DS_FNAME'),
        ('ASL_CONSULTANT_DS_PATH', 'ASL_CONSULTANT_DS_FNAME'),
        ('VIDEO_DS_PATH', 'VIDEO_DS_FNAME'),
        ('VIDEO_SEGMENT_DS_PATH', 'VIDEO_SEGMENT_DS_FNAME'),
        ('VIDEO_FRAME_DS_PATH', 'VIDEO_FRAME_DS_FNAME'),
        ('UTTERANCE_DS_PATH', 'UTTERANCE_DS_FNAME'),
        ('UTTERANCE_VIDEO_DS_PATH', 'UTTERANCE_VIDEO_DS_FNAME'),
        ('UTTERANCE_TOKEN_DS_PATH', 'UTTERANCE_TOKEN_DS_FNAME'),
        ('UTTERANCE_TOKEN_FRAME_DS_PATH', 'UTTERANCE_TOKEN_FRAME_DS_FNAME'),
        ('VOCABULARY_DS_PATH', 'VOCABULARY_DS_FNAME'),
        ('TRAIN_ASSOC_DS_PATH', 'TRAIN_FRAME_SEQ_ASSOC_DS_FNAME'),
        ('VAL_DS_PATH', 'VAL_FRAME_SEQ_DS_FNAME'),
        ('TRAIN_DS_PATH', 'TRAIN_FRAME_SEQ_DS_FNAME'),
    ):
        setattr(
            fidscs_globals,
            _path_attr,
            os.path.join(_data_root, getattr(fidscs_globals, _fname_attr)),
        )
else:
    print(f"{fidscs_globals.VALIDATION_FATAL_ERROR_TEXT} data directory does not exist or is empty!")
    can_proceed = False

In [8]:
# Create the Beam pipeline from the options assembled earlier (runner: 'InteractiveRunner').
# The read/transform steps below are all attached to this pipeline object.
pl = beam.Pipeline(options=pipeline_options)

def train_val_index_schemad_pcoll_rows__to__ordered_tuples(train_val_index_schemad_pcoll_row):
    """
    Flatten one schema'd train/val index row into a plain, positionally ordered tuple.

    Args:
        train_val_index_schemad_pcoll_row: any object exposing the attributes
            TokenID, CameraPerspective, ASLConsultantID, TargetVideoFilename,
            UtteranceSequence, TokenSequence and FrameSequence.

    Returns:
        A 7-tuple holding those attribute values, in exactly that order.

    Raises:
        AttributeError: if the row lacks any of the attributes listed above.
    """
    # Field order mirrors SCHEMA_COL_NAMES__TRAIN_OR_VAL_INDEX; the DataFrame
    # cells later in this notebook label the collected columns by position.
    ordered_field_names = (
        'TokenID',
        'CameraPerspective',
        'ASLConsultantID',
        'TargetVideoFilename',
        'UtteranceSequence',
        'TokenSequence',
        'FrameSequence',
    )
    return tuple(
        getattr(train_val_index_schemad_pcoll_row, field_name)
        for field_name in ordered_field_names
    )

def _rows_to_ordered_tuples(schemad_pcoll, step_label):
    """Attach a labelled Map step that turns each schema'd row of `schemad_pcoll` into an ordered tuple."""
    to_tuples = beam.Map(train_val_index_schemad_pcoll_rows__to__ordered_tuples)
    return schemad_pcoll | step_label >> to_tuples


# NOTE: each PCollection is bound to its own notebook-level name because the
# ib.watch(locals()) cell that follows must see it in order to show/collect it.

# train/assoc index: read the CSV, then convert its rows to ordered tuples
train_frame_sequences__assoc_index_schemad_pcoll = beam__common.pl__1__read_train_frame_sequences__assoc_index_csv(pl)
train_frame_sequences__assoc_index = _rows_to_ordered_tuples(
    train_frame_sequences__assoc_index_schemad_pcoll,
    "Beam PL: transform train_frame_sequences__assoc_index_schemad_pcoll rows to ordered tuples (according to schema)",
)

# validation index: read the CSV, then convert its rows to ordered tuples
val_frame_sequences_index_schemad_pcoll = beam__common.pl__1__read_val_frame_sequences__index_csv(pl)
val_frame_sequences_index = _rows_to_ordered_tuples(
    val_frame_sequences_index_schemad_pcoll,
    "Beam PL: transform val_frame_sequences_index_schemad_pcoll rows to ordered tuples (according to schema)",
)

# train index: read the CSV, then convert its rows to ordered tuples
train_frame_sequences_index_schemad_pcoll = beam__common.pl__1__read_train_frame_sequences_index_csv(pl)
train_frame_sequences_index = _rows_to_ordered_tuples(
    train_frame_sequences_index_schemad_pcoll,
    "Beam PL: transform train_frame_sequences_index_schemad_pcoll rows to ordered tuples (according to schema)",
)

In [9]:
# Register this notebook's current local variables with Interactive Beam.
# This is required in order to use ib.show() (which visualizes the specified
# PCollections) or ib.collect() (which creates a pandas DataFrame from a
# PCollection).
# NOTE: every PCollection we wish to visualize or collect must already have
# been created before the following line executes.
ib.watch(locals())

In [10]:
# Collect the train/assoc index PCollection (ordered tuples built in the pipeline
# cell) into a pandas DataFrame via Interactive Beam.
df_train_frame_sequences__assoc_index = ib.collect(train_frame_sequences__assoc_index)

In [11]:
# Label, index and sort the frame collected in the previous cell, then display it.
#
# The guard makes this cell safe to re-run: once the primary-key columns have been
# moved into the index, the frame has fewer columns than the schema, so assigning
# the full list of schema column names again would raise "ValueError: Length mismatch".
if list(df_train_frame_sequences__assoc_index.index.names) != list(fidscs_globals.SCHEMA_PK__TRAIN_OR_VAL_INDEX):
    # collected columns are positional; name them according to the schema
    df_train_frame_sequences__assoc_index.columns = fidscs_globals.SCHEMA_COL_NAMES__TRAIN_OR_VAL_INDEX
    # Rebind from a single method chain instead of mutating with inplace=True, so a
    # failure part-way through leaves the previously bound frame untouched.
    df_train_frame_sequences__assoc_index = (
        df_train_frame_sequences__assoc_index
        .set_index(fidscs_globals.SCHEMA_PK__TRAIN_OR_VAL_INDEX)
        # order by the schema's column at position 6 first (FrameSequence in the displayed output) ...
        .sort_values(axis=0, by=[fidscs_globals.SCHEMA_COL_NAMES__TRAIN_OR_VAL_INDEX[6]], ignore_index=False)
        # ... then group rows by primary key
        .sort_index()
    )

df_train_frame_sequences__assoc_index

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,FrameSequence
TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence,Unnamed: 6_level_1
1,0,3,lapd_story_1083_small_0.mov,15,2,1218
1,0,3,lapd_story_1083_small_0.mov,15,2,1219
1,0,3,lapd_story_1083_small_0.mov,15,2,1220
1,0,3,lapd_story_1083_small_0.mov,15,2,1221
1,0,3,lapd_story_1083_small_0.mov,15,2,1222
...,...,...,...,...,...,...
2409,2,3,boston-la_1088_small_2.mov,4,12,884
2409,2,3,boston-la_1088_small_2.mov,4,12,885
2409,2,3,boston-la_1088_small_2.mov,4,12,886
2409,2,3,boston-la_1088_small_2.mov,4,12,887


In [12]:
# Collect the validation index PCollection (ordered tuples built in the pipeline
# cell) into a pandas DataFrame via Interactive Beam.
df_val_frame_sequences_index = ib.collect(val_frame_sequences_index)

In [13]:
# Label, index and sort the frame collected in the previous cell, then display it.
#
# The guard makes this cell safe to re-run: once the primary-key columns have been
# moved into the index, the frame has fewer columns than the schema, so assigning
# the full list of schema column names again would raise "ValueError: Length mismatch".
if list(df_val_frame_sequences_index.index.names) != list(fidscs_globals.SCHEMA_PK__TRAIN_OR_VAL_INDEX):
    # collected columns are positional; name them according to the schema
    df_val_frame_sequences_index.columns = fidscs_globals.SCHEMA_COL_NAMES__TRAIN_OR_VAL_INDEX
    # Rebind from a single method chain instead of mutating with inplace=True, so a
    # failure part-way through leaves the previously bound frame untouched.
    df_val_frame_sequences_index = (
        df_val_frame_sequences_index
        .set_index(fidscs_globals.SCHEMA_PK__TRAIN_OR_VAL_INDEX)
        # order by the schema's column at position 6 first (FrameSequence in the displayed output) ...
        .sort_values(axis=0, by=[fidscs_globals.SCHEMA_COL_NAMES__TRAIN_OR_VAL_INDEX[6]], ignore_index=False)
        # ... then group rows by primary key
        .sort_index()
    )

df_val_frame_sequences_index

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,FrameSequence
TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence,Unnamed: 6_level_1
1,0,3,dorm_prank_1053_small_0.mov,32,1,3206
1,0,3,dorm_prank_1053_small_0.mov,32,1,3207
1,0,3,dorm_prank_1053_small_0.mov,32,1,3208
1,0,3,dorm_prank_1053_small_0.mov,32,1,3209
1,0,3,dorm_prank_1053_small_0.mov,32,1,3210
...,...,...,...,...,...,...
2409,2,3,boston-la_1088_small_2.mov,6,5,1189
2409,2,3,boston-la_1088_small_2.mov,6,5,1190
2409,2,3,boston-la_1088_small_2.mov,6,5,1191
2409,2,3,boston-la_1088_small_2.mov,6,5,1192


In [14]:
# Collect the train index PCollection (ordered tuples built in the pipeline
# cell) into a pandas DataFrame via Interactive Beam.
df_train_frame_sequences_index = ib.collect(train_frame_sequences_index)

In [15]:
# Label, index and sort the frame collected in the previous cell, then display it.
#
# The guard makes this cell safe to re-run: once the primary-key columns have been
# moved into the index, the frame has fewer columns than the schema, so assigning
# the full list of schema column names again would raise "ValueError: Length mismatch".
if list(df_train_frame_sequences_index.index.names) != list(fidscs_globals.SCHEMA_PK__TRAIN_OR_VAL_INDEX):
    # collected columns are positional; name them according to the schema
    df_train_frame_sequences_index.columns = fidscs_globals.SCHEMA_COL_NAMES__TRAIN_OR_VAL_INDEX
    # Rebind from a single method chain instead of mutating with inplace=True, so a
    # failure part-way through leaves the previously bound frame untouched.
    df_train_frame_sequences_index = (
        df_train_frame_sequences_index
        .set_index(fidscs_globals.SCHEMA_PK__TRAIN_OR_VAL_INDEX)
        # order by the schema's column at position 6 first (FrameSequence in the displayed output) ...
        .sort_values(axis=0, by=[fidscs_globals.SCHEMA_COL_NAMES__TRAIN_OR_VAL_INDEX[6]], ignore_index=False)
        # ... then group rows by primary key
        .sort_index()
    )

df_train_frame_sequences_index

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,FrameSequence
TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence,Unnamed: 6_level_1
0,0,7,ben_story_439_small_0.mov,7,0,585
0,0,7,ben_story_439_small_0.mov,7,0,586
0,0,7,ben_story_439_small_0.mov,7,0,587
0,0,7,ben_story_439_small_0.mov,7,0,588
0,0,7,ben_story_439_small_0.mov,7,0,589
...,...,...,...,...,...,...
2411,2,3,boston-la_1088_small_2.mov,81,5,13186
2411,2,3,boston-la_1088_small_2.mov,81,5,13187
2411,2,3,boston-la_1088_small_2.mov,81,5,13188
2411,2,3,boston-la_1088_small_2.mov,81,5,13189
