In [1]:
import argparse
import os

import apache_beam as beam
import tensorflow as tf
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam.runners.interactive.interactive_beam as ib
import apache_beam.transforms.sql

import beam__common
import fidscs_globals
import random

import data_extractor

In [None]:
# Root data directory: passed to data_extractor.run() and later assigned to fidscs_globals.DATA_ROOT_DIR.
# NOTE(review): hard-coded absolute /tmp path — adjust for your environment.
data_dir = "/tmp/fids-capstone-data/data"

In [2]:
# Run the data-extraction step against data_dir with Beam enabled.
# NOTE(review): max_target_videos=-1 presumably means "no limit" — confirm against data_extractor.run.
data_extractor.run(max_target_videos=-1, data_dir=data_dir, use_beam=True)

use_beam: True
INFO:tensorflow:Using MirroredStrategy with devices ('/device:CPU:0',)
INFO:tensorflow:Single-worker MultiWorkerMirroredStrategy with local_devices = ('/device:CPU:0',), communication = CollectiveCommunication.AUTO
Number of devices available for parallel processing: 1
PipelineOptions:
{'runner': 'DirectRunner', 'streaming': False, 'beam_services': {}, 'type_check_strictness': 'DEFAULT_TO_ANY', 'type_check_additional': '', 'pipeline_type_check': True, 'runtime_type_check': False, 'performance_runtime_type_check': False, 'direct_runner_use_stacked_bundle': True, 'direct_runner_bundle_repeat': 0, 'direct_num_workers': 0, 'direct_running_mode': 'multi_threading', 'dataflow_endpoint': 'https://dataflow.googleapis.com', 'project': 'my-project', 'job_name': None, 'staging_location': None, 'temp_location': None, 'region': None, 'service_account_email': None, 'no_auth': False, 'template_location': None, 'labels': None, 'update': False, 'transform_name_mapping': None, 'enable_str

In [3]:
# Settings for the Beam pipeline that is built in the following cells.
options = dict(
    project='my-project',  # change
    runner='InteractiveRunner',
    direct_num_workers=0,  # 0 is use all available cores
    # one of ['in_memory', 'multi_threading', 'multi_processing']; 'multi_processing' doesn't seem to work for DirectRunner?
    direct_running_mode='multi_threading',
    streaming=False,  # set to True if data source is unbounded (e.g. GCP PubSub)
)

# No command-line flags are parsed here; every option is supplied as a keyword argument,
#   which keeps it easy to pass in options from the command line later.
pipeline_options = PipelineOptions(flags=[], **options)
print(f"PipelineOptions:\n{pipeline_options.get_all_options()}\n")

PipelineOptions:
{'runner': 'InteractiveRunner', 'streaming': False, 'beam_services': {}, 'type_check_strictness': 'DEFAULT_TO_ANY', 'type_check_additional': '', 'pipeline_type_check': True, 'runtime_type_check': False, 'performance_runtime_type_check': False, 'direct_runner_use_stacked_bundle': True, 'direct_runner_bundle_repeat': 0, 'direct_num_workers': 0, 'direct_running_mode': 'multi_threading', 'dataflow_endpoint': 'https://dataflow.googleapis.com', 'project': 'my-project', 'job_name': None, 'staging_location': None, 'temp_location': None, 'region': None, 'service_account_email': None, 'no_auth': False, 'template_location': None, 'labels': None, 'update': False, 'transform_name_mapping': None, 'enable_streaming_engine': False, 'dataflow_kms_key': None, 'flexrs_goal': None, 'hdfs_host': None, 'hdfs_port': None, 'hdfs_user': None, 'hdfs_full_urls': False, 'num_workers': None, 'max_num_workers': None, 'autoscaling_algorithm': None, 'machine_type': None, 'disk_size_gb': None, 'disk_t

In [4]:
# Point the shared globals module at the notebook's data directory; the path setup in the next cell reads this value.
fidscs_globals.DATA_ROOT_DIR = data_dir

In [5]:
# The data root is usable only if it exists and contains at least one entry.
#   (listdir is only consulted when the directory exists.)
can_proceed = (
    tf.io.gfile.exists(fidscs_globals.DATA_ROOT_DIR)
    and len(tf.io.gfile.listdir(fidscs_globals.DATA_ROOT_DIR)) != 0
)

if can_proceed:
    # working directories that live directly under the data root
    fidscs_globals.VIDEO_DIR = os.path.join(fidscs_globals.DATA_ROOT_DIR, 'videos')
    fidscs_globals.STICHED_VIDEO_FRAMES_DIR = os.path.join(fidscs_globals.DATA_ROOT_DIR, 'stitched_video_frames')

    # every dataset path is <data root>/<file name constant>; pairs are (path attribute, file-name attribute)
    for path_attr, fname_attr in (
        ('CORPUS_DS_PATH', 'CORPUS_DS_FNAME'),
        ('DOCUMENT_ASL_CONSULTANT_DS_PATH', 'DOCUMENT_ASL_CONSULTANT_DS_FNAME'),
        ('ASL_CONSULTANT_DS_PATH', 'ASL_CONSULTANT_DS_FNAME'),
        ('VIDEO_DS_PATH', 'VIDEO_DS_FNAME'),
        ('VIDEO_SEGMENT_DS_PATH', 'VIDEO_SEGMENT_DS_FNAME'),
        ('VIDEO_FRAME_DS_PATH', 'VIDEO_FRAME_DS_FNAME'),
        ('UTTERANCE_DS_PATH', 'UTTERANCE_DS_FNAME'),
        ('UTTERANCE_VIDEO_DS_PATH', 'UTTERANCE_VIDEO_DS_FNAME'),
        ('UTTERANCE_TOKEN_DS_PATH', 'UTTERANCE_TOKEN_DS_FNAME'),
        ('UTTERANCE_TOKEN_FRAME_DS_PATH', 'UTTERANCE_TOKEN_FRAME_DS_FNAME'),
        ('VOCABULARY_DS_PATH', 'VOCABULARY_DS_FNAME'),
        ('TRAIN_ASSOC_DS_PATH', 'TRAIN_FRAME_SEQ_ASSOC_DS_FNAME'),
        ('VAL_DS_PATH', 'VAL_FRAME_SEQ_DS_FNAME'),
        ('TRAIN_DS_PATH', 'TRAIN_FRAME_SEQ_DS_FNAME'),
    ):
        setattr(
            fidscs_globals,
            path_attr,
            os.path.join(fidscs_globals.DATA_ROOT_DIR, getattr(fidscs_globals, fname_attr))
        )
else:
    print(f"{fidscs_globals.VALIDATION_FATAL_ERROR_TEXT} data directory does not exist or is empty!")

In [6]:
# Create the pipeline that every PCollection in the following cells hangs off.
pl = beam.Pipeline(options=pipeline_options)

# The readers below are intentionally disabled; they are kept for reference only.
# full_target_vid_index_schemad_pcoll = beam__common.pl__1__read_target_vid_index_csv(pl)
# corpus_index_schemad_pcoll = beam__common.pl__1__read_corpus_index_csv(pl) # XML is base-64 encoded but we no longer need it (to decode it) since it is only used to create the datasets
# # corpus_index_decoded_XML_pcoll = pl__2__decode_XML(corpus_index_schemad_pcoll) # see above

# asl_consultant_index_schemad_pcoll = beam__common.pl__1__read_asl_consultant_index_csv(pl)
# document_asl_consultant_utterance_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_utterance_index_csv(pl)
# document_asl_consultant_target_video_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_target_video_index_csv(pl)
# document_asl_consultant_utterance_video_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_utterance_video_index_csv(pl)
# document_target_video_segment_index_schemad_pcoll = beam__common.pl__1__read_document_target_video_segment_index_csv(pl)
# vocabulary_index_schemad_pcoll = beam__common.pl__1__read_vocabulary_index_csv(pl)
# document_asl_consultant_utterance_token_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_utterance_token_index_csv(pl)
# document_asl_consultant_target_video_frame_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_target_video_frame_index_csv(pl)

# as it turns out, this is all we need
document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_target_video_utterance_token_frame_index_csv(pl)

In [7]:
# document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll is the main table we use for training.
#     This will ultimately provide which frame sequences correspond to individual tokens.

# But our first measure is to build train and validation sets (for tokens).
#   In order to split up train vs validation sets, we need to compare "apples to apples".
#   That is, in order for a token (TokenID) to be considered a candidate for the split,
#   we require at least two of the same (TokenID, CameraPerspective) wherein the ASL
#   consultant for each differs.  We would prefer more than two of these tuples, each
#   having unique ASL consultants in the set of occurrences, with the majority of said
#   tuples being assigned to the training set and the remainder (at least one) being
#   assigned to the validation set.  We would like to achieve a 90/10 split, ideally,
#   but we will take what we get.

# document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll:
    # beam.Row(
    #   DocumentID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[0]]),
    #   ASLConsultantID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[1]]),
    #   CameraPerspective=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[2]]),
    #   TargetVideoFilename=str(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[3]]),
    #   UtteranceSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[4]]),
    #   TokenSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[5]]),
    #   FrameSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[6]]),
    #   TokenID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[7]])
    # )

# We will transform each schema'd row into a plain tuple of the form:
    # (
    #     TokenID,
    #     CameraPerspective,
    #     ASLConsultantID,
    #     TargetVideoFilename,
    #     UtteranceSequence,
    #     TokenSequence,
    #     FrameSequence
    # )
# Note that DocumentID is NOT carried forward into this tuple.

dctvustsfs = (
    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll
    | "Beam PL: extract (TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence,FrameSequence) from dctvustsfs schemad pcoll" >> beam.Map(
            lambda row: (
                row.TokenID, row.CameraPerspective,
                row.ASLConsultantID, row.TargetVideoFilename,
                row.UtteranceSequence, row.TokenSequence,
                row.FrameSequence
            )
        )
)

# For the train-validation split we key every token occurrence by (TokenID, CameraPerspective) and gather
#   its distinct (ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence) occurrences;
#   keys with more than one distinct occurrence become split candidates further down.
ctvusts_by_tcp = (
    dctvustsfs
    | "Beam PL: extract ((TokenID,CameraPerspective), (ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)) from dctvustsfs" >> beam.Map(
            # incoming tuple layout: (TokenID, CameraPerspective, ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence, FrameSequence)
            #   first two fields form the key, the next four form the value; FrameSequence is dropped
            lambda occurrence: (occurrence[0:2], occurrence[2:6])
        )
    | "Beam PL: select distinct ((TokenID,CameraPerspective), (ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)) from ctvusts_by_tcp" >> beam.Distinct()
    | "Beam PL: group (ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence) by key (TokenID,CameraPerspective)" >> beam.GroupByKey()
    # resulting elements:
        # (
        #     (TokenID, CameraPerspective),                                                     # key
        #     listof((ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence))  # grouped values
        # )
)


def flatten_ctvusts_by_tcp(ctvusts_by_tcp_tpl):
    """
    Expand one grouped element back into flat rows.

    ctvusts_by_tcp_tpl
        (
            (TokenID,CameraPerspective), # key
            listof(
                (ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)
            )
        )

    returns a list with one
        (TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)
    tuple per grouped value, in the order the values are iterated.
    """
    flattened = []
    for occurrence in ctvusts_by_tcp_tpl[1]:
        flattened.append(
            (
                ctvusts_by_tcp_tpl[0][0],   # TokenID
                ctvusts_by_tcp_tpl[0][1],   # CameraPerspective
                occurrence[0],              # ASLConsultantID
                occurrence[1],              # TargetVideoFilename
                occurrence[2],              # UtteranceSequence
                occurrence[3]               # TokenSequence
            )
        )
    return flattened

# Split candidates: (TokenID, CameraPerspective) keys with more than one distinct occurrence,
#   flattened back to one row per occurrence.
ctvusts_by_tcp__gt_1 = (
    ctvusts_by_tcp
    | "Beam PL: filter candidate (TokenID,CameraPerspective) for test-validation split" >> beam.Filter(
            lambda grouped: len(set(grouped[1])) > 1  # grouped[1] holds the occurrences for the key
        )
    | "Beam PL: flatten filtered (TokenID,CameraPerspective) candidates for test-validation split" >> beam.FlatMap(
            flatten_ctvusts_by_tcp
        )
)

# Non-candidates: (TokenID, CameraPerspective) keys with at most one distinct occurrence,
#   flattened back to one row per occurrence.
ctvusts_by_tcp__lte_1 = (
    ctvusts_by_tcp
    | "Beam PL: filter non-candidate (TokenID,CameraPerspective) for test-validation split" >> beam.Filter(
            lambda grouped: len(set(grouped[1])) <= 1  # grouped[1] holds the occurrences for the key
        )
    | "Beam PL: flatten filtered (TokenID,CameraPerspective) non-candidates for test-validation split" >> beam.FlatMap(
            flatten_ctvusts_by_tcp
        )
)

<p><br>

#### Finally, execute validation/train split on ctvusts_by_tcp__gt_1

In [8]:
# first, we need to put ctvusts_by_tcp__gt_1 back into ((TokenID, CameraPerspective), (ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence)) form
def rekey_ctvusts_by_tcp(ctvusts_by_tcp_tpl):
    """
    Turn a flat row back into a (key, value) pair.

    ctvusts_by_tcp_tpl
        (TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)

    returns
        (
            (TokenID,CameraPerspective), # key
            (ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)
        )
    """
    token_camera_key = (
        ctvusts_by_tcp_tpl[0],  # TokenID
        ctvusts_by_tcp_tpl[1]   # CameraPerspective
    )
    occurrence = (
        ctvusts_by_tcp_tpl[2],  # ASLConsultantID
        ctvusts_by_tcp_tpl[3],  # TargetVideoFilename
        ctvusts_by_tcp_tpl[4],  # UtteranceSequence
        ctvusts_by_tcp_tpl[5]   # TokenSequence
    )
    return token_camera_key, occurrence

def val_train_split__ctvusts_by_tcp__gt_1__tpl(ctvusts_list__by__tcp__gt_1__tpl):
    """
    Randomly split the occurrences of one (TokenID,CameraPerspective) key into train and validation subsets.

    ctvusts_list__by__tcp__gt_1__tpl
        (
            (TokenID,CameraPerspective), # key
            iterable of
                (ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)
        )

    returns
        (
            (TokenID,CameraPerspective), # key
            (
                train_list_of(ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence),
                val_list_of(ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)
            )
        )

    The validation subset always receives at least one occurrence (as long as there is any);
    its size otherwise follows fidscs_globals.VALIDATION_SIZE_RATIO.  The shuffle uses the
    module-level `random` generator, so results vary between runs unless it is seeded.
    """
    # Materialize into a fresh list: the grouped values are not guaranteed to be a list
    #   (so .copy() may not exist), and shuffling must not disturb the caller's data.
    ctvusts_list = list(ctvusts_list__by__tcp__gt_1__tpl[1])
    random.shuffle(ctvusts_list)

    len_ctvusts_list = len(ctvusts_list)
    validation_ratio = fidscs_globals.VALIDATION_SIZE_RATIO
    if len_ctvusts_list > int(((1-validation_ratio)*100)/10):
        # max(1, ...) guards against small ratios truncating the validation size to zero
        val_len_ctvusts_list = max(1, int(len_ctvusts_list*validation_ratio))
    else:
        # too few occurrences for a proportional split: hold out exactly one
        val_len_ctvusts_list = 1

    val__ctvusts_list = ctvusts_list[:val_len_ctvusts_list]
    train__ctvusts_list = ctvusts_list[val_len_ctvusts_list:]
    return (
        (
            ctvusts_list__by__tcp__gt_1__tpl[0][0],    # TokenID
            ctvusts_list__by__tcp__gt_1__tpl[0][1]     # CameraPerspective
        ),
        (
            train__ctvusts_list,
            val__ctvusts_list
        )
    )

# Regroup the flattened split candidates by (TokenID,CameraPerspective), then split each group's
#   occurrences into a train sublist and a validation sublist.
val_train_split_basis__ctvusts_by_tcp__gt_1 = (
    ctvusts_by_tcp__gt_1
    | "Beam PL: rekey ctvusts_by_tcp__gt_1 for validation/train split" >> beam.Map(rekey_ctvusts_by_tcp)
    | "Beam PL: group (ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence) rekeyed by (TokenID,CameraPerspective)" >> beam.GroupByKey()
    # the above produces tuples of the form:
        # (
        #     (TokenID,CameraPerspective), # key
        #     listof(
        #       (ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)
        #     )
        # )
    | "Beam PL: split rekeyed ctvusts_list_by_tcp__gt_1" >> beam.Map(val_train_split__ctvusts_by_tcp__gt_1__tpl)
    # the above produces tuples of the form:
        # (
        #     (TokenID,CameraPerspective), # key
        #     (
        #       train_list_of(ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence),  # index [1][0]
        #       val_list_of(ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence),    # index [1][1]
        #     )
        # )
)

# Training rows of the split: one (TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)
#   tuple per occurrence in each key's train sublist.
train__ctvusts_by_tcp__gt_1 = (
    val_train_split_basis__ctvusts_by_tcp__gt_1
    | "Beam PL: select train sublist from val_train_split_basis__ctvusts_by_tcp__gt_1" >> beam.Map(
            # split_tpl is ((TokenID,CameraPerspective), (train sublist, validation sublist)); [1][0] is the train sublist
            lambda split_tpl: [
                (
                    split_tpl[0][0], split_tpl[0][1],   # TokenID, CameraPerspective
                    occurrence[0], occurrence[1],       # ASLConsultantID, TargetVideoFilename
                    occurrence[2], occurrence[3]        # UtteranceSequence, TokenSequence
                )
                for occurrence in split_tpl[1][0]
            ]
        )
    | "Beam PL: 'explode list_train__ctvusts_by_tcp__gt_1_tpl" >> beam.FlatMap(lambda train_rows: train_rows)
)

# Validation rows of the split: one (TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence)
#   tuple per occurrence in each key's validation sublist.
val__ctvusts_by_tcp__gt_1 = (
    val_train_split_basis__ctvusts_by_tcp__gt_1
    | "Beam PL: select validation sublist from val_train_split_basis__ctvusts_by_tcp__gt_1" >> beam.Map(
            # split_tpl is ((TokenID,CameraPerspective), (train sublist, validation sublist)); [1][1] is the validation sublist
            lambda split_tpl: [
                (
                    split_tpl[0][0], split_tpl[0][1],   # TokenID, CameraPerspective
                    occurrence[0], occurrence[1],       # ASLConsultantID, TargetVideoFilename
                    occurrence[2], occurrence[3]        # UtteranceSequence, TokenSequence
                )
                for occurrence in split_tpl[1][1]
            ]
        )
    | "Beam PL: 'explode list_val__ctvusts_by_tcp__gt_1_tpl" >> beam.FlatMap(lambda val_rows: val_rows)
)

In [9]:
# join train__ctvusts_by_tcp__gt_1 to dctvustsfs
# Join keys for the training occurrences: each row becomes (six-field key, marker string).
train__ctvusts_by_tcp__gt_1__keys = (
    train__ctvusts_by_tcp__gt_1
    | "Beam PL: extract ((TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence), '<train__ctvusts_by_tcp__gt_1__has_key>') for join to dctvustsfs" >> beam.Map(
            lambda train_row: (
                # key: (TokenID, CameraPerspective, ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence)
                (train_row[0], train_row[1], train_row[2], train_row[3], train_row[4], train_row[5]),
                # marker: its presence after the join shows the key belongs to this set
                "<train__ctvusts_by_tcp__gt_1__has_key>"
            )
        )
)

# Frame sequences keyed by the six identifying fields, ready to be joined against a set of join keys.
dctvustsfs__frame_sequences = (
    dctvustsfs
    | "Beam PL: extract ((TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence), FrameSequence) for join to train__ctvusts_by_tcp__gt_1/val__ctvusts_by_tcp__gt_1" >> beam.Map(
            # frame_row layout: (TokenID, CameraPerspective, ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence, FrameSequence)
            lambda frame_row: (
                frame_row[0:6],     # key: the first six fields
                frame_row[6]        # value: FrameSequence
            )
        )
)

# Attach frame sequences to the training occurrences (split candidates) via a co-group join.
train_dctvustsfs__gt__1 = (
    dict(
        has_key=train__ctvusts_by_tcp__gt_1__keys,
        frame_sequences=dctvustsfs__frame_sequences
    )
    | "Beam PL: join train__ctvusts_by_tcp__gt_1 to dctvustsfs" >> beam.CoGroupByKey()
    # joined elements look like:
        # (
        #     (<TokenID>, <CameraPerspective>, <ASLConsultantID>, <TargetVideoFilename>, <UtteranceSequence>, <TokenSequence>),
        #     {
        #         'has_key': listof('<train__ctvusts_by_tcp__gt_1__has_key>'),    # expected to hold a single marker
        #         'frame_sequences': listof(<FrameSequence>)                      # many
        #     }
        # )
    | "Beam PL: filter out mismatches from joined train__ctvusts_by_tcp__gt_1 to dctvustsfs" >> beam.Filter(
            # keep only keys present on BOTH sides of the join
            lambda joined: len(joined[1]['has_key'])>0 and len(joined[1]['frame_sequences'])>0
        )
    | "Beam PL: 'explode' listof(<FrameSequence>) from joined train__ctvusts_by_tcp__gt_1 to dctvustsfs to list of tuples" >> beam.Map(
            lambda joined: [
                (
                    joined[0][0], joined[0][1], joined[0][2],   # TokenID, CameraPerspective, ASLConsultantID
                    joined[0][3], joined[0][4], joined[0][5],   # TargetVideoFilename, UtteranceSequence, TokenSequence
                    frame_seq                                   # FrameSequence
                )
                for frame_seq in sorted(joined[1]['frame_sequences'])
            ]
        )
    | "Beam PL: 'explode' listof((TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence, FrameSequence)) from joined train__ctvusts_by_tcp__gt_1 to dctvustsfs" >> beam.FlatMap(
            lambda frame_rows: frame_rows
        )
)

In [10]:
# join val__ctvusts_by_tcp__gt_1 to dctvustsfs
# Join keys for the validation occurrences: each row becomes (six-field key, marker string).
val__ctvusts_by_tcp__gt_1__keys = (
    val__ctvusts_by_tcp__gt_1
    | "Beam PL: extract ((TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence), '<val__ctvusts_by_tcp__gt_1__has_key>') for join to dctvustsfs" >> beam.Map(
            lambda val_row: (
                # key: (TokenID, CameraPerspective, ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence)
                (val_row[0], val_row[1], val_row[2], val_row[3], val_row[4], val_row[5]),
                # marker: its presence after the join shows the key belongs to this set
                "<val__ctvusts_by_tcp__gt_1__has_key>"
            )
        )
)

# Attach frame sequences to the validation occurrences via a co-group join.
val_dctvustsfs__gt__1 = (
    dict(
        has_key=val__ctvusts_by_tcp__gt_1__keys,
        frame_sequences=dctvustsfs__frame_sequences
    )
    | "Beam PL: join val__ctvusts_by_tcp__gt_1 to dctvustsfs" >> beam.CoGroupByKey()
    # joined elements look like:
        # (
        #     (<TokenID>, <CameraPerspective>, <ASLConsultantID>, <TargetVideoFilename>, <UtteranceSequence>, <TokenSequence>),
        #     {
        #         'has_key': listof('<val__ctvusts_by_tcp__gt_1__has_key>'),      # expected to hold a single marker
        #         'frame_sequences': listof(<FrameSequence>)                      # many
        #     }
        # )
    | "Beam PL: filter out mismatches from joined val__ctvusts_by_tcp__gt_1 to dctvustsfs" >> beam.Filter(
            # keep only keys present on BOTH sides of the join
            lambda joined: len(joined[1]['has_key'])>0 and len(joined[1]['frame_sequences'])>0
        )
    | "Beam PL: 'explode' listof(<FrameSequence>) from joined val__ctvusts_by_tcp__gt_1 to dctvustsfs to list of tuples" >> beam.Map(
            lambda joined: [
                (
                    joined[0][0], joined[0][1], joined[0][2],   # TokenID, CameraPerspective, ASLConsultantID
                    joined[0][3], joined[0][4], joined[0][5],   # TargetVideoFilename, UtteranceSequence, TokenSequence
                    frame_seq                                   # FrameSequence
                )
                for frame_seq in sorted(joined[1]['frame_sequences'])
            ]
        )
    | "Beam PL: 'explode' listof((TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence, FrameSequence)) from joined val__ctvusts_by_tcp__gt_1 to dctvustsfs" >> beam.FlatMap(
            lambda frame_rows: frame_rows
        )
)

In [11]:
# Join keys for the non-candidate occurrences (these are used as training data): (six-field key, marker string).
train__ctvusts_by_tcp__lte_1__keys = (
    ctvusts_by_tcp__lte_1
    | "Beam PL: extract ((TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence), '<ctvusts_by_tcp__lte_1_tpl__has_key>') for join to dctvustsfs" >> beam.Map(
            lambda lte_1_row: (
                # key: (TokenID, CameraPerspective, ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence)
                (lte_1_row[0], lte_1_row[1], lte_1_row[2], lte_1_row[3], lte_1_row[4], lte_1_row[5]),
                # marker: its presence after the join shows the key belongs to this set
                "<ctvusts_by_tcp__lte_1_tpl__has_key>"
            )
        )
)

# Attach frame sequences to the non-candidate occurrences via a co-group join.
train_dctvustsfs__lte_1 = (
    dict(
        has_key=train__ctvusts_by_tcp__lte_1__keys,
        frame_sequences=dctvustsfs__frame_sequences
    )
    | "Beam PL: join ctvusts_by_tcp__lte_1 to dctvustsfs" >> beam.CoGroupByKey()
    # joined elements look like:
        # (
        #     (<TokenID>, <CameraPerspective>, <ASLConsultantID>, <TargetVideoFilename>, <UtteranceSequence>, <TokenSequence>),
        #     {
        #         'has_key': listof('<ctvusts_by_tcp__lte_1_tpl__has_key>'),      # expected to hold a single marker
        #         'frame_sequences': listof(<FrameSequence>)                      # many
        #     }
        # )
    | "Beam PL: filter out mismatches from joined train__ctvusts_by_tcp__lte_1 to dctvustsfs" >> beam.Filter(
            # keep only keys present on BOTH sides of the join
            lambda joined: len(joined[1]['has_key'])>0 and len(joined[1]['frame_sequences'])>0
        )
    | "Beam PL: 'explode' listof(<FrameSequence>) from joined train__ctvusts_by_tcp__lte_1 to dctvustsfs to list of tuples" >> beam.Map(
            lambda joined: [
                (
                    joined[0][0], joined[0][1], joined[0][2],   # TokenID, CameraPerspective, ASLConsultantID
                    joined[0][3], joined[0][4], joined[0][5],   # TargetVideoFilename, UtteranceSequence, TokenSequence
                    frame_seq                                   # FrameSequence
                )
                for frame_seq in sorted(joined[1]['frame_sequences'])
            ]
        )
    | "Beam PL: 'explode' listof((TokenID,CameraPerspective,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence, FrameSequence)) from joined ttrain__ctvusts_by_tcp__lte_1 to dctvustsfs" >> beam.FlatMap(
            lambda frame_rows: frame_rows
        )
)

# Full training set: training rows from the split candidates plus all non-candidate rows.
train_dctvustsfs__all = (
    (
        train_dctvustsfs__gt__1,
        train_dctvustsfs__lte_1
    )
    | "Beam PL: merge train_dctvustsfs__gt__1 with train_dctvustsfs__lte_1" >> beam.Flatten()
)

In [12]:
# find all COMPLETE utterances that can be formed with token-cameraperspective pairs from the validation set

# Distinct (TokenID, CameraPerspective) pairs that occur in the validation set.
val_tcp__gt__1 = (
    val_dctvustsfs__gt__1
    | "Beam PL: extract (TokenID, CameraPerspective) from val_dctvustsfs__gt__1" >> beam.Map(
            lambda val_frame_row: (val_frame_row[0], val_frame_row[1])  # (TokenID, CameraPerspective)
        )
    | "Beam PL: select distinct (TokenID, CameraPerspective) from val_dctvustsfs__gt__1" >> beam.Distinct()
)

# Utterances (per ASL consultant, target video and camera perspective) whose every (TokenID, CameraPerspective)
#   pair is present in val_tcp__gt__1, together with their token ids ordered by token sequence.
complete_utterances__with__val_tcp__gt__1 = (
    dctvustsfs
    | "Beam PL: extract (ASLConsultantID,TargetVideoFilename,CameraPerspective,UtteranceSequence,TokenSequence,TokenID) from dctvustsfs" >> beam.Map(
            # NOTE: the emitted field order is
            #   (ASLConsultantID, TargetVideoFilename, UtteranceSequence, CameraPerspective, TokenSequence, TokenID)
            lambda frame_row: (frame_row[2], frame_row[3], frame_row[4], frame_row[1], frame_row[5], frame_row[0])
        )
    | "Beam PL: select distinct (ASLConsultantID,TargetVideoFilename,CameraPerspective,UtteranceSequence,TokenSequence,TokenID) from dctvustsfs" >> beam.Distinct()
    | "Beam PL: transform distinct ctvcpustst tuples to tst_by_ctvuscp" >> beam.Map(
            lambda token_row: (
                (token_row[0], token_row[1], token_row[2], token_row[3]),   # key: (ASLConsultantID, TargetVideoFilename, UtteranceSequence, CameraPerspective)
                (token_row[4], token_row[5])                                # value: (TokenSequence, TokenID)
            )
        )
    | "Beam PL: collect list of tokenseq-tokenid for each (<ASLConsultantID>, <TargetVideoFilename>, <UtteranceSequence>, <CameraPerspective>)" >> beam.GroupByKey()
    # elements are now:
        # (
        #     (<ASLConsultantID>,<TargetVideoFilename>,<UtteranceSequence>,<CameraPerspective>), # key
        #     listof((<TokenSequence>,<TokenID>))
        # )
    | "Beam PL: sort list of tokenseq-tokenid by tokenseq for each (<ASLConsultantID>, <TargetVideoFilename>, <UtteranceSequence>, <CameraPerspective>)" >> beam.Map(
            lambda utterance: (
                (utterance[0][0], utterance[0][1], utterance[0][2], utterance[0][3]),
                # order tokens by <TokenSequence>, then pair each <TokenID> with the key's <CameraPerspective>
                [
                    (seq_and_token[1], utterance[0][3])
                    for seq_and_token in sorted(utterance[1], key=lambda seq_and_token: seq_and_token[0])
                ]
            )
        )
    # elements are now:
        # (
        #     (<ASLConsultantID>,<TargetVideoFilename>,<UtteranceSequence>,<CameraPerspective>), # key
        #     listof((<TokenID>, <CameraPerspective>)) # sorted by <TokenSequence>
        # )

    # keep only the utterances where every (<TokenID>, <CameraPerspective>) in the list exists in val_tcp__gt__1
    # NOTE(review): the side input is a plain iterable, so each `in` test is a linear scan — consider a dict/set-backed side input if this becomes slow
    | "Beam PL: filter matching rows from vid index" >> beam.Filter(
        lambda utterance, existing_val_tcp_tpls: all(
            token_camera in existing_val_tcp_tpls for token_camera in utterance[1]
        ),
        existing_val_tcp_tpls=beam.pvalue.AsIter(val_tcp__gt__1)
      )
    | "Beam PL: extract (<ASLConsultantID>,<TargetVideoFilename>,<UtteranceSequence>,<CameraPerspective>,listof(<TokenID>))" >> beam.Map(
            lambda utterance: (
                utterance[0][0], utterance[0][1],   # <ASLConsultantID>, <TargetVideoFilename>
                utterance[0][2], utterance[0][3],   # <UtteranceSequence>, <CameraPerspective>
                [token_camera[0] for token_camera in utterance[1]]  # listof(<TokenID>)
            )
        )
)

In [13]:
# Have Interactive Beam watch the current local scope.
# We require this in order to make use of ib.show() (which provides visualization of
# the pcolls specified) or ib.collect() (which creates a pandas dataframe from a pcoll).
# NOTE: all pcolls we wish to visualize must be created prior to executing the following line.
ib.watch(locals())

#### Show those with counts > 1

In [14]:
# Collect the ctvusts_by_tcp__gt_1 pcoll into a pandas dataframe for inspection.
df_ctvusts_by_tcp__gt_1 = ib.collect(ctvusts_by_tcp__gt_1)

In [15]:
# Name the collected tuple fields, key the frame by (TokenID, CameraPerspective),
# order rows by the remaining identifying columns and then by the index.
df_ctvusts_by_tcp__gt_1.columns = [
    'TokenID', 'CameraPerspective',
    'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence'
]
df_ctvusts_by_tcp__gt_1 = (
    df_ctvusts_by_tcp__gt_1
    .set_index(['TokenID', 'CameraPerspective'])
    .sort_values(
        by=['ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence'],
        axis=0,
        ignore_index=False
    )
    .sort_index()
)
df_ctvusts_by_tcp__gt_1

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,3,dorm_prank_1053_small_0.mov,32,1
1,0,3,lapd_story_1083_small_0.mov,15,2
1,0,7,ben_story_439_small_0.mov,10,1
1,0,7,ben_story_439_small_0.mov,29,1
1,1,7,ben_story_439_small_1.mov,10,1
...,...,...,...,...,...
2406,3,3,scary_story_1048_small_3.mov,35,0
2409,0,3,boston-la_1088_small_0.mov,4,12
2409,0,3,boston-la_1088_small_0.mov,6,5
2409,2,3,boston-la_1088_small_2.mov,4,12


In [16]:
# df_ctvusts_by_tcp__gt_1.loc[
#     (
#         [2369],         # TokenID
#         [2]             # CameraPerspective
#     ), 
#     :
# ].sort_index(ascending=[True, True])

In [17]:
# Number of rows per (TokenID, CameraPerspective) key, largest counts first.
# The count is taken over the ASLConsultantID column and exposed as 'count'.
df_ctvusts_by_tcp__gt_1__count = (
    df_ctvusts_by_tcp__gt_1
    .reset_index()
    .groupby(['TokenID', 'CameraPerspective'])[['ASLConsultantID']]
    .count()
    .rename(columns={'ASLConsultantID': 'count'})
    .sort_values(by=['count'], axis=0, ascending=False)
)
# (re-ordering by index via sort_index() is deliberately left disabled here)
df_ctvusts_by_tcp__gt_1__count

Unnamed: 0_level_0,Unnamed: 1_level_0,count
TokenID,CameraPerspective,Unnamed: 2_level_1
365,0,685
1809,0,621
365,2,574
1809,2,553
380,0,413
...,...,...
1221,0,2
1751,0,2
1759,0,2
1759,2,2


#### Now show those with counts <= 1

In [18]:
# Collect the ctvusts_by_tcp__lte_1 pcoll into a pandas dataframe for inspection.
df_ctvusts_by_tcp__lte_1 = ib.collect(ctvusts_by_tcp__lte_1)

In [19]:
# Name the collected tuple fields, key the frame by (TokenID, CameraPerspective),
# order rows by the remaining identifying columns and then by the index.
df_ctvusts_by_tcp__lte_1.columns = [
    'TokenID', 'CameraPerspective',
    'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence'
]
df_ctvusts_by_tcp__lte_1 = (
    df_ctvusts_by_tcp__lte_1
    .set_index(['TokenID', 'CameraPerspective'])
    .sort_values(
        by=['ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence'],
        axis=0,
        ignore_index=False
    )
    .sort_index()
)
df_ctvusts_by_tcp__lte_1

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,7,ben_story_439_small_0.mov,7,0
0,1,7,ben_story_439_small_1.mov,7,0
0,2,7,ben_story_439_small_2.mov,7,0
0,3,7,ben_story_439_small_3.mov,7,0
4,0,3,siblings_1066_small_0.mov,21,1
...,...,...,...,...,...
2408,0,0,DSP%2520Immigrants.mov,16,5
2410,0,3,boston-la_1088_small_0.mov,39,5
2410,2,3,boston-la_1088_small_2.mov,39,5
2411,0,3,boston-la_1088_small_0.mov,81,5


In [20]:
# Number of rows per (TokenID, CameraPerspective) key, largest counts first.
# The count is taken over the ASLConsultantID column and exposed as 'count'.
df_ctvusts_by_tcp__lte_1__count = (
    df_ctvusts_by_tcp__lte_1
    .reset_index()
    .groupby(['TokenID', 'CameraPerspective'])[['ASLConsultantID']]
    .count()
    .rename(columns={'ASLConsultantID': 'count'})
    .sort_values(by=['count'], axis=0, ascending=False)
)
# (re-ordering by index via sort_index() is deliberately left disabled here)
df_ctvusts_by_tcp__lte_1__count

Unnamed: 0_level_0,Unnamed: 1_level_0,count
TokenID,CameraPerspective,Unnamed: 2_level_1
0,0,1
1637,0,1
1638,3,1
1639,0,1
1639,2,1
...,...,...
888,2,1
888,3,1
889,0,1
889,1,1


In [21]:
# Inner-join the "> 1" and "<= 1" frames on their shared (TokenID, CameraPerspective)
# index; any row in the result is an index key that appears in both frames.
df_ctvusts_by_tcp__intersection = df_ctvusts_by_tcp__gt_1.merge(
    df_ctvusts_by_tcp__lte_1,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('_left', '_right')
)
df_ctvusts_by_tcp__intersection

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID_left,TargetVideoFilename_left,UtteranceSequence_left,TokenSequence_left,ASLConsultantID_right,TargetVideoFilename_right,UtteranceSequence_right,TokenSequence_right
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


#### Now show train/validation split

In [22]:
# Collect the train__ctvusts_by_tcp__gt_1 pcoll into a pandas dataframe for inspection.
df_train__ctvusts_by_tcp__gt_1 = ib.collect(train__ctvusts_by_tcp__gt_1)

In [23]:
# Name the collected tuple fields, key the frame by (TokenID, CameraPerspective),
# order rows by the remaining identifying columns and then by the index.
df_train__ctvusts_by_tcp__gt_1.columns = [
    'TokenID', 'CameraPerspective',
    'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence'
]
df_train__ctvusts_by_tcp__gt_1 = (
    df_train__ctvusts_by_tcp__gt_1
    .set_index(['TokenID', 'CameraPerspective'])
    .sort_values(
        by=['ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence'],
        axis=0,
        ignore_index=False
    )
    .sort_index()
)
df_train__ctvusts_by_tcp__gt_1

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,3,dorm_prank_1053_small_0.mov,32,1
1,0,7,ben_story_439_small_0.mov,10,1
1,0,7,ben_story_439_small_0.mov,29,1
1,1,7,ben_story_439_small_1.mov,29,1
1,2,3,dorm_prank_1053_small_2.mov,32,1
...,...,...,...,...,...
2406,0,3,scary_story_1048_small_0.mov,35,0
2406,2,3,scary_story_1048_small_2.mov,21,0
2406,3,3,scary_story_1048_small_3.mov,35,0
2409,0,3,boston-la_1088_small_0.mov,4,12


In [24]:
# Collect the val__ctvusts_by_tcp__gt_1 pcoll into a pandas dataframe for inspection.
df_val__ctvusts_by_tcp__gt_1 = ib.collect(val__ctvusts_by_tcp__gt_1)

In [25]:
# Name the collected tuple fields, key the frame by (TokenID, CameraPerspective),
# order rows by the remaining identifying columns and then by the index.
df_val__ctvusts_by_tcp__gt_1.columns = [
    'TokenID', 'CameraPerspective',
    'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence'
]
df_val__ctvusts_by_tcp__gt_1 = (
    df_val__ctvusts_by_tcp__gt_1
    .set_index(['TokenID', 'CameraPerspective'])
    .sort_values(
        by=['ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence'],
        axis=0,
        ignore_index=False
    )
    .sort_index()
)
df_val__ctvusts_by_tcp__gt_1

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,3,lapd_story_1083_small_0.mov,15,2
1,1,7,ben_story_439_small_1.mov,10,1
1,2,3,lapd_story_1083_small_2.mov,15,2
1,3,3,dorm_prank_1053_small_3.%2520mov,32,1
2,0,7,ben_story_439_small_0.mov,43,8
...,...,...,...,...,...
2406,0,3,scary_story_1048_small_0.mov,21,0
2406,2,3,scary_story_1048_small_2.mov,35,0
2406,3,3,scary_story_1048_small_3.mov,21,0
2409,0,3,boston-la_1088_small_0.mov,6,5


In [26]:
# Spot check: look up the training-split rows for a single index key
# (TokenID=2409, CameraPerspective=0). The key is passed as a tuple of lists,
# one list per index level.
df_train__ctvusts_by_tcp__gt_1.loc[
    (
        [2409],         # TokenID
        [0]             # CameraPerspective
    ), 
    :
].sort_index(ascending=[True, True])

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2409,0,3,boston-la_1088_small_0.mov,4,12


In [27]:
# Spot check: look up the validation-split rows for the same index key
# (TokenID=2409, CameraPerspective=0). The key is passed as a tuple of lists,
# one list per index level.
df_val__ctvusts_by_tcp__gt_1.loc[
    (
        [2409],         # TokenID
        [0]             # CameraPerspective
    ), 
    :
].sort_index(ascending=[True, True])

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2409,0,3,boston-la_1088_small_0.mov,6,5


<p><br>

#### View final training/validation sets (with associated frame sequences)

<p><br>

##### Training subset (tuples that have at least one corresponding token/camera perspective in the validation set)

In [28]:
# Collect the train_dctvustsfs__gt__1 pcoll into a pandas dataframe for inspection.
df_train_dctvustsfs__gt__1 = ib.collect(train_dctvustsfs__gt__1)

In [29]:
# Name the collected tuple fields (now including FrameSequence), key the frame by
# (TokenID, CameraPerspective), order rows by the remaining columns and then by the index.
df_train_dctvustsfs__gt__1.columns = [
    'TokenID', 'CameraPerspective',
    'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'
]
df_train_dctvustsfs__gt__1 = (
    df_train_dctvustsfs__gt__1
    .set_index(['TokenID', 'CameraPerspective'])
    .sort_values(
        by=['ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'],
        axis=0,
        ignore_index=False
    )
    .sort_index()
)
df_train_dctvustsfs__gt__1

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence,FrameSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,3,dorm_prank_1053_small_0.mov,32,1,3206
1,0,3,dorm_prank_1053_small_0.mov,32,1,3207
1,0,3,dorm_prank_1053_small_0.mov,32,1,3208
1,0,3,dorm_prank_1053_small_0.mov,32,1,3209
1,0,3,dorm_prank_1053_small_0.mov,32,1,3210
...,...,...,...,...,...,...
2409,2,3,boston-la_1088_small_2.mov,4,12,884
2409,2,3,boston-la_1088_small_2.mov,4,12,885
2409,2,3,boston-la_1088_small_2.mov,4,12,886
2409,2,3,boston-la_1088_small_2.mov,4,12,887


<p><br>

##### Validation set

In [30]:
# Collect the val_dctvustsfs__gt__1 pcoll into a pandas dataframe for inspection.
df_val_dctvustsfs__gt__1 = ib.collect(val_dctvustsfs__gt__1)

In [31]:
# Name the collected tuple fields (now including FrameSequence), key the frame by
# (TokenID, CameraPerspective), order rows by the remaining columns and then by the index.
df_val_dctvustsfs__gt__1.columns = [
    'TokenID', 'CameraPerspective',
    'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'
]
df_val_dctvustsfs__gt__1 = (
    df_val_dctvustsfs__gt__1
    .set_index(['TokenID', 'CameraPerspective'])
    .sort_values(
        by=['ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'],
        axis=0,
        ignore_index=False
    )
    .sort_index()
)
df_val_dctvustsfs__gt__1

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence,FrameSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,3,lapd_story_1083_small_0.mov,15,2,1218
1,0,3,lapd_story_1083_small_0.mov,15,2,1219
1,0,3,lapd_story_1083_small_0.mov,15,2,1220
1,0,3,lapd_story_1083_small_0.mov,15,2,1221
1,0,3,lapd_story_1083_small_0.mov,15,2,1222
...,...,...,...,...,...,...
2409,2,3,boston-la_1088_small_2.mov,6,5,1189
2409,2,3,boston-la_1088_small_2.mov,6,5,1190
2409,2,3,boston-la_1088_small_2.mov,6,5,1191
2409,2,3,boston-la_1088_small_2.mov,6,5,1192


##### The complete training set (the union of the training subset whose token/camera-perspective tuples have corresponding validation-set tuples and the training subset whose tuples have none)

In [32]:
# Collect the train_dctvustsfs__all pcoll into a pandas dataframe for inspection.
df_train_dctvustsfs__all = ib.collect(train_dctvustsfs__all)

In [33]:
# Name the collected tuple fields (now including FrameSequence), key the frame by
# (TokenID, CameraPerspective), order rows by the remaining columns and then by the index.
df_train_dctvustsfs__all.columns = [
    'TokenID', 'CameraPerspective',
    'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'
]
df_train_dctvustsfs__all = (
    df_train_dctvustsfs__all
    .set_index(['TokenID', 'CameraPerspective'])
    .sort_values(
        by=['ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'],
        axis=0,
        ignore_index=False
    )
    .sort_index()
)
df_train_dctvustsfs__all

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename,UtteranceSequence,TokenSequence,FrameSequence
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,7,ben_story_439_small_0.mov,7,0,585
0,0,7,ben_story_439_small_0.mov,7,0,586
0,0,7,ben_story_439_small_0.mov,7,0,587
0,0,7,ben_story_439_small_0.mov,7,0,588
0,0,7,ben_story_439_small_0.mov,7,0,589
...,...,...,...,...,...,...
2411,2,3,boston-la_1088_small_2.mov,81,5,13186
2411,2,3,boston-la_1088_small_2.mov,81,5,13187
2411,2,3,boston-la_1088_small_2.mov,81,5,13188
2411,2,3,boston-la_1088_small_2.mov,81,5,13189


<p><br>

##### Show (complete) utterances that can be represented by token-cameraperspective tuples from the validation set

In [34]:
# Collect the complete_utterances__with__val_tcp__gt__1 pcoll into a pandas dataframe for inspection.
df_complete_utterances__with__val_tcp__gt__1 = ib.collect(complete_utterances__with__val_tcp__gt__1)

In [35]:
# Name the collected tuple fields, then key and order the frame by
# (ASLConsultantID, TargetVideoFilename, UtteranceSequence, CameraPerspective),
# leaving TokenIDSequence as the only data column.
df_complete_utterances__with__val_tcp__gt__1.columns = [
    'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'CameraPerspective',
    'TokenIDSequence'
]
df_complete_utterances__with__val_tcp__gt__1 = (
    df_complete_utterances__with__val_tcp__gt__1
    .set_index(['ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'CameraPerspective'])
    .sort_index()
)
df_complete_utterances__with__val_tcp__gt__1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TokenIDSequence
ASLConsultantID,TargetVideoFilename,UtteranceSequence,CameraPerspective,Unnamed: 4_level_1
0,DSP%2520Immigrants.mov,4,0,"[846, 1512, 357]"
0,DSP%2520Immigrants.mov,11,0,"[846, 1512, 357, 365]"
0,DSP%2520Immigrants.mov,17,0,"[367, 2156, 1673, 846, 1814, 1250]"
1,640_master_small.mov,0,0,"[380, 234, 1835, 17, 845, 2150]"
1,640_master_small.mov,3,0,"[382, 1190, 264, 2097, 2150]"
...,...,...,...,...
7,ch7-607_197_small_1.mov,7,1,"[1210, 380, 24]"
7,ch7-607_197_small_2.mov,7,2,"[1210, 380, 24]"
7,ch7-608_198_small_0.mov,24,0,"[1210, 380, 24]"
7,ch7-608_198_small_1.mov,24,1,"[1210, 380, 24]"
