In [1]:
import argparse
import os

import apache_beam as beam
import tensorflow as tf
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam.runners.interactive.interactive_beam as ib
import apache_beam.transforms.sql

import beam__common
import fidscs_globals

from importlib import import_module
data_extractor = import_module('data-extractor', '.')

In [2]:
data_dir = "/tmp/fids-capstone-data/data"

data_extractor.run(max_data_files=-1, data_dir=data_dir, use_beam=True)

use_beam: True
INFO:tensorflow:Using MirroredStrategy with devices ('/device:CPU:0',)
INFO:tensorflow:Single-worker MultiWorkerMirroredStrategy with local_devices = ('/device:CPU:0',), communication = CollectiveCommunication.AUTO
Number of devices available for parallel processing: 1
PipelineOptions:
{'runner': 'DirectRunner', 'streaming': False, 'beam_services': {}, 'type_check_strictness': 'DEFAULT_TO_ANY', 'type_check_additional': '', 'pipeline_type_check': True, 'runtime_type_check': False, 'performance_runtime_type_check': False, 'direct_runner_use_stacked_bundle': True, 'direct_runner_bundle_repeat': 0, 'direct_num_workers': 0, 'direct_running_mode': 'multi_threading', 'dataflow_endpoint': 'https://dataflow.googleapis.com', 'project': 'my-project', 'job_name': None, 'staging_location': None, 'temp_location': None, 'region': None, 'service_account_email': None, 'no_auth': False, 'template_location': None, 'labels': None, 'update': False, 'transform_name_mapping': None, 'enable_str

In [3]:
options = {
    'project': 'my-project', # change
    # 'runner': 'DirectRunner',
    'runner': 'InteractiveRunner',
    'direct_num_workers': 0, # 0 is use all available cores
    'direct_running_mode': 'multi_threading', # ['in_memory', 'multi_threading', 'multi_processing'] # 'multi_processing' doesn't seem to work for DirectRunner?
    'streaming': False # set to True if data source is unbounded (e.g. GCP PubSub)
}
pipeline_options = PipelineOptions(flags=[], **options) # easier to pass in options from command-line this way
print(f"PipelineOptions:\n{pipeline_options.get_all_options()}\n")

PipelineOptions:
{'runner': 'InteractiveRunner', 'streaming': False, 'beam_services': {}, 'type_check_strictness': 'DEFAULT_TO_ANY', 'type_check_additional': '', 'pipeline_type_check': True, 'runtime_type_check': False, 'performance_runtime_type_check': False, 'direct_runner_use_stacked_bundle': True, 'direct_runner_bundle_repeat': 0, 'direct_num_workers': 0, 'direct_running_mode': 'multi_threading', 'dataflow_endpoint': 'https://dataflow.googleapis.com', 'project': 'my-project', 'job_name': None, 'staging_location': None, 'temp_location': None, 'region': None, 'service_account_email': None, 'no_auth': False, 'template_location': None, 'labels': None, 'update': False, 'transform_name_mapping': None, 'enable_streaming_engine': False, 'dataflow_kms_key': None, 'flexrs_goal': None, 'hdfs_host': None, 'hdfs_port': None, 'hdfs_user': None, 'hdfs_full_urls': False, 'num_workers': None, 'max_num_workers': None, 'autoscaling_algorithm': None, 'machine_type': None, 'disk_size_gb': None, 'disk_t

In [4]:
fidscs_globals.DATA_ROOT_DIR = data_dir

In [5]:
can_proceed = True

if not tf.io.gfile.exists(fidscs_globals.DATA_ROOT_DIR) or len(tf.io.gfile.listdir(fidscs_globals.DATA_ROOT_DIR))==0:
    print(f"{fidscs_globals.VALIDATION_FATAL_ERROR_TEXT} data directory does not exist or is empty!")
    can_proceed = False
else:
    fidscs_globals.VIDEO_DIR = os.path.join(fidscs_globals.DATA_ROOT_DIR, 'videos')
    fidscs_globals.STICHED_VIDEO_FRAMES_DIR = os.path.join(fidscs_globals.DATA_ROOT_DIR, 'stitched_video_frames')
    fidscs_globals.CORPUS_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.CORPUS_DS_FNAME)
    fidscs_globals.DOCUMENT_ASL_CONSULTANT_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.DOCUMENT_ASL_CONSULTANT_DS_FNAME)
    fidscs_globals.ASL_CONSULTANT_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.ASL_CONSULTANT_DS_FNAME)
    fidscs_globals.VIDEO_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.VIDEO_DS_FNAME)
    fidscs_globals.VIDEO_SEGMENT_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.VIDEO_SEGMENT_DS_FNAME)
    fidscs_globals.VIDEO_FRAME_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.VIDEO_FRAME_DS_FNAME)
    fidscs_globals.UTTERANCE_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.UTTERANCE_DS_FNAME)
    fidscs_globals.UTTERANCE_VIDEO_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.UTTERANCE_VIDEO_DS_FNAME)
    fidscs_globals.UTTERANCE_TOKEN_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.UTTERANCE_TOKEN_DS_FNAME)
    fidscs_globals.UTTERANCE_TOKEN_FRAME_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.UTTERANCE_TOKEN_FRAME_DS_FNAME)
    fidscs_globals.VOCABULARY_DS_PATH = os.path.join(fidscs_globals.DATA_ROOT_DIR, fidscs_globals.VOCABULARY_DS_FNAME)

In [6]:
pl = beam.Pipeline(options=pipeline_options)

full_target_vid_index_schemad_pcoll = beam__common.pl__1__read_target_vid_index_csv(pl)
corpus_index_schemad_pcoll = beam__common.pl__1__read_corpus_index_csv(pl) # XML is base-64 encode but we no longer need it (to decode it) since it is only used to create the datasets
# corpus_index_decoded_XML_pcoll = pl__2__decode_XML(corpus_index_schemad_pcoll) # see above

asl_consultant_index_schemad_pcoll = beam__common.pl__1__read_asl_consultant_index_csv(pl)
document_asl_consultant_utterance_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_utterance_index_csv(pl)
document_asl_consultant_target_video_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_target_video_index_csv(pl)
document_asl_consultant_utterance_video_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_utterance_video_index_csv(pl)
document_target_video_segment_index_schemad_pcoll = beam__common.pl__1__read_document_target_video_segment_index_csv(pl)
vocabulary_index_schemad_pcoll = beam__common.pl__1__read_vocabulary_index_csv(pl)
document_asl_consultant_utterance_token_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_utterance_token_index_csv(pl)
document_asl_consultant_target_video_frame_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_target_video_frame_index_csv(pl)
document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll = beam__common.pl__1__read_document_asl_consultant_target_video_utterance_token_frame_index_csv(pl)

In [7]:
# document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll is the main table we use for training.
#     This will ultimately provide which frame sequences correspond to individual tokens.

# But our first measure is to build train and validation sets (for tokens).
#   In order to split up train vs validation sets, we need to compare "apples to apples".
#   That is, in order for a token (TokenID) to be considered a candidate for the split,
#   we require at least two of the same (TokenID, CameraPerspective) wherein the ASL
#   consultant for each differs.  We would prefer more than two of these tuples, each
#   having unique ASL consultants in the set of occurrences, with the majority of said
#   tuples being assigned to the training set and the remainder (at least one) being
#   assigned to the validation set.  We would like to achieve a 90/10 split, ideally,
#   but we will take what we get.

# document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll:
    # beam.Row(
    #   DocumentID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[0]]),
    #   ASLConsultantID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[1]]),
    #   CameraPerspective=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[2]]),
    #   TargetVideoFilename=str(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[3]]),
    #   UtteranceSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[4]]),
    #   TokenSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[5]]),
    #   FrameSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[6]]),
    #   TokenID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[7]])
    # )
distinct_consultant_targetvideo_by_token_camera_perspective_pcoll = (
    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll
    | "Beam PL: extract ((TokenID, CameraPerspective), (ASLConsultantID, TargetVideoFilename)) from document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll" >> beam.Map(
            lambda schemad_pcoll_row: ((schemad_pcoll_row.TokenID, schemad_pcoll_row.CameraPerspective), (schemad_pcoll_row.ASLConsultantID, schemad_pcoll_row.TargetVideoFilename))
        )
    | "Beam PL: select distinct ((TokenID, CameraPerspective), (ASLConsultantID, TargetVideoFilename)) from document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll" >> beam.Distinct()
)

doc_consultant_targetvideo_utterance_tokenseq_frameseq_by_token_cameraperspective = (
    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll
        # beam.Row(
        #   DocumentID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[0]]),
        #   ASLConsultantID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[1]]),
        #   CameraPerspective=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[2]]),
        #   TargetVideoFilename=str(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[3]]),
        #   UtteranceSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[4]]),
        #   TokenSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[5]]),
        #   FrameSequence=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[6]]),
        #   TokenID=int(d_document_asl_consultant_target_video_utterance_token_frame_info[fidscs_globals.SCHEMA_COL_NAMES__UTTERANCE_TOKEN_FRAME_DS[7]])
        # )
    | "Beam PL: transform document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll to ((TokenID, CameraPerspective), (DocumentID, ASLConsultantID, TargetVideoFilename, UtteranceSequence, TokenSequence, FrameSequence))" >> beam.Map(
            lambda document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row: (
                (
                    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row.TokenID,
                    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row.CameraPerspective
                ),
                (
                    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row.DocumentID,
                    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row.ASLConsultantID,
                    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row.TargetVideoFilename,
                    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row.UtteranceSequence,
                    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row.TokenSequence,
                    document_asl_consultant_target_video_utterance_token_frame_index_schemad_pcoll_row.FrameSequence
                )
            )
        )
)

In [8]:
consultant_target_videos_GROUPED_by_token_camera_perspective_pcoll = (
    distinct_consultant_targetvideo_by_token_camera_perspective_pcoll
    | "Beam PL: group (ASLConsultantID, TargetVideoFilename) by (TokenID, CameraPerspective)" >> beam.GroupByKey()
    # the above produces tuples of the form:
    #   ((<TokenID>, <CameraPerspective>), listof((<ASLConsultantID>, <TargetVideoFilename>)))
)

def flatten_ctvgbtcpp_tpl(ctvgbtcpp_tpl):
    return [
        (
            ctvgbtcpp_tpl[0][0],            # TokenID
            ctvgbtcpp_tpl[0][1],            # CameraPerspective
            consultant_targetvideo_tpl[0],  # ASLConsultantID
            consultant_targetvideo_tpl[1]   # TargetVideoFilename
        ) for consultant_targetvideo_tpl in ctvgbtcpp_tpl[1]
    ]

In [9]:
flattened_ctvgbtcpp__lte_1 = (
    consultant_target_videos_GROUPED_by_token_camera_perspective_pcoll
    | "Beam PL: filter non-candidates for test-validation split" >> beam.Filter(
            lambda ctvgbtcppt: len(set(ctvgbtcppt[1]))<=1
        )
    | "Beam PL: flatten filter non-candidates for test-validation split" >> beam.FlatMap(flatten_ctvgbtcpp_tpl)
    # the above produces tuples of the form:
        # (
        #     <TokenID>,                # TokenID
        #     <CameraPerspective>,      # CameraPerspective
        #     <ASLConsultantID>,        # ASLConsultantID
        #     <TargetVideoFilename>     # TargetVideoFilename
        # )
)

token_camera_perspective_keys__for__consultant_targetvideo__lte_1__pcoll = (
    flattened_ctvgbtcpp__lte_1
    # the above has tuples of the form:
        # (
        #     <TokenID>,                # TokenID
        #     <CameraPerspective>,      # CameraPerspective
        #     <ASLConsultantID>,        # ASLConsultantID
        #     <TargetVideoFilename>     # TargetVideoFilename
        # )
    | "Beam PL: extract ((TokenID, CameraPerspective), 'TokenID_CameraPerspective___with__ASLConsultantID_TargetVideoFilename__lte_1') from flattened_ctvgbtcpp__lte_1" >> beam.Map(
            lambda flattened_ctvgbtcpp__lte_1_tpl: ((flattened_ctvgbtcpp__lte_1_tpl[0], flattened_ctvgbtcpp__lte_1_tpl[1]), 'TokenID_CameraPerspective___with__ASLConsultantID_TargetVideoFilename__lte_1')
        )
    | "Beam PL: select distinct ((TokenID, CameraPerspective), 'TokenID_CameraPerspective___with__ASLConsultantID_TargetVideoFilename__lte_1') from flattened_ctvgbtcpp__lte_1" >> beam.Distinct()
)

doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__lte_1 = (
    ({
      'token_camera_perspective_keys__for__consultant_targetvideo__lte_1__pcoll': token_camera_perspective_keys__for__consultant_targetvideo__lte_1__pcoll,
      'doc_consultant_targetvideo_utterance_tokenseq_frameseq_map': doc_consultant_targetvideo_utterance_tokenseq_frameseq_by_token_cameraperspective
    })
    | "Beam PL: join token_camera_perspective_keys__for__consultant_targetvideo__lte_1__pcoll with doc_consultant_targetvideo_utterance_tokenseq_frameseq_by_token_cameraperspective" >> beam.CoGroupByKey()
        # the above produces tuples of the form:
        # (
        #     (<TokenID>, <CameraPerspective>), # key
        #     {
        #         'token_camera_perspective_keys__for__consultant_targetvideo__lte_1__pcoll': listof('TokenID_CameraPerspective___with__ASLConsultantID_TargetVideoFilename__lte_1'),
        #         'doc_consultant_targetvideo_utterance_tokenseq_frameseq_map': listof(
        #             (
        #                 <DocumentID>,
        #                 <ASLConsultantID>,
        #                 <TargetVideoFilename>,
        #                 <UtteranceSequence>,
        #                 <TokenSequence>,
        #                 <FrameSequence>
        #             )
        #         )
        #     }
        # )
    | "Beam PL: 'explode' doc_consultant_targetvideo_utterance_tokenseq_frameseq_map in tcptdctvustsfsm__lte_1__tpl to list of tuples" >> beam.Map(
            lambda tcptdctvustsfsm__lte_1__tpl: [
                (
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[0],  # <DocumentID>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[1],  # <ASLConsultantID>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[2],  # <TargetVideoFilename>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[3],  # <UtteranceSequence>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[4],  # <TokenSequence>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[5],  # <FrameSequence>
                    tcptdctvustsfsm__lte_1__tpl[0][1],                              # <CameraPerspective>
                    tcptdctvustsfsm__lte_1__tpl[0][0]                               # <TokenID>
                ) for doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl in tcptdctvustsfsm__lte_1__tpl[1]['doc_consultant_targetvideo_utterance_tokenseq_frameseq_map']
            ]
        )
    | "Beam PL: 'explode' list_tcptdctvustsfsm__lte_1__tpl to tuples" >> beam.FlatMap(lambda list_dctvustscptifs__lte_1__tpl: list_dctvustscptifs__lte_1__tpl)
  )
doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__lte_1 = beam__common.pl__X__sort_pcoll(doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__lte_1, "doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__lte_1")

In [10]:
flattened_ctvgbtcpp__gt_1 = (
    consultant_target_videos_GROUPED_by_token_camera_perspective_pcoll
    | "Beam PL: filter candidates for test-validation split" >> beam.Filter(
            lambda ctvgbtcppt: len(set(ctvgbtcppt[1]))>1
        )
    | "Beam PL: flatten filter candidates for test-validation split" >> beam.FlatMap(flatten_ctvgbtcpp_tpl)
    # the above produces tuples of the form:
        # (
        #     <TokenID>,                # TokenID
        #     <CameraPerspective>,      # CameraPerspective
        #     <ASLConsultantID>,        # ASLConsultantID
        #     <TargetVideoFilename>     # TargetVideoFilename
        # )
)

token_camera_perspective_keys__for__consultant_targetvideo__gt_1__pcoll = (
    flattened_ctvgbtcpp__gt_1
    # the above has tuples of the form:
        # (
        #     <TokenID>,                # TokenID
        #     <CameraPerspective>,      # CameraPerspective
        #     <ASLConsultantID>,        # ASLConsultantID
        #     <TargetVideoFilename>     # TargetVideoFilename
        # )
    | "Beam PL: extract ((TokenID, CameraPerspective), 'TokenID_CameraPerspective___with__ASLConsultantID_TargetVideoFilename__gt_1') from flattened_ctvgbtcpp__gt_1" >> beam.Map(
            lambda flattened_ctvgbtcpp__gt_1_tpl: ((flattened_ctvgbtcpp__gt_1_tpl[0], flattened_ctvgbtcpp__gt_1_tpl[1]), 'TokenID_CameraPerspective___with__ASLConsultantID_TargetVideoFilename__gt_1')
        )
    | "Beam PL: select distinct ((TokenID, CameraPerspective), 'TokenID_CameraPerspective___with__ASLConsultantID_TargetVideoFilename__gt_1') from flattened_ctvgbtcpp__gt_1" >> beam.Distinct()
)

doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1 = (
    ({
      'token_camera_perspective_keys__for__consultant_targetvideo__gt_1__pcoll': token_camera_perspective_keys__for__consultant_targetvideo__gt_1__pcoll,
      'doc_consultant_targetvideo_utterance_tokenseq_frameseq_map': doc_consultant_targetvideo_utterance_tokenseq_frameseq_by_token_cameraperspective
    })
    | "Beam PL: join token_camera_perspective_keys__for__consultant_targetvideo__gt_1__pcoll with doc_consultant_targetvideo_utterance_tokenseq_frameseq_by_token_cameraperspective" >> beam.CoGroupByKey()
        # the above produces tuples of the form:
        # (
        #     (<TokenID>, <CameraPerspective>), # key
        #     {
        #         'token_camera_perspective_keys__for__consultant_targetvideo__gt_1__pcoll': listof('TokenID_CameraPerspective___with__ASLConsultantID_TargetVideoFilename__gt_1'),
        #         'doc_consultant_targetvideo_utterance_tokenseq_frameseq_map': listof(
        #             (
        #                 <DocumentID>,
        #                 <ASLConsultantID>,
        #                 <TargetVideoFilename>,
        #                 <UtteranceSequence>,
        #                 <TokenSequence>,
        #                 <FrameSequence>
        #             )
        #         )
        #     }
        # )
    | "Beam PL: 'explode' doc_consultant_targetvideo_utterance_tokenseq_frameseq_map in tcptdctvustsfsm__gt_1__tpl to list of tuples" >> beam.Map(
            lambda tcptdctvustsfsm__gt_1__tpl: [
                (
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[0],  # <DocumentID>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[1],  # <ASLConsultantID>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[2],  # <TargetVideoFilename>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[3],  # <UtteranceSequence>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[4],  # <TokenSequence>
                    doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl[5],  # <FrameSequence>
                    tcptdctvustsfsm__gt_1__tpl[0][1],                               # <CameraPerspective>
                    tcptdctvustsfsm__gt_1__tpl[0][0]                                # <TokenID>
                ) for doc_consultant_targetvideo_utterance_tokenseq_frameseq_tpl in tcptdctvustsfsm__gt_1__tpl[1]['doc_consultant_targetvideo_utterance_tokenseq_frameseq_map']
            ]
        )
    | "Beam PL: 'explode' list_tcptdctvustsfsm__gt_1__tpl to tuples" >> beam.FlatMap(lambda list_dctvustscptifs__gt_1__tpl: list_dctvustscptifs__gt_1__tpl)
  )
doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1 = beam__common.pl__X__sort_pcoll(doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1, "doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1")

In [11]:
# document_asl_consultant_utterance_token_index_schemad_pcoll
    # beam.Row(
    #     DocumentID=document_asl_consultant_utterance_token_tpl[0],
    #     DocumentFilename=document_asl_consultant_utterance_token_tpl[1],
    #     ASLConsultantID=document_asl_consultant_utterance_token_tpl[2],
    #     ParticipantName=document_asl_consultant_utterance_token_tpl[3],
    #     UtteranceSequence=document_asl_consultant_utterance_token_tpl[4],
    #     TokenSequence=document_asl_consultant_utterance_token_tpl[7],
    #     StartTime=document_asl_consultant_utterance_token_tpl[8],
    #     EndTime=document_asl_consultant_utterance_token_tpl[9],
    #     TokenID=document_asl_consultant_utterance_token_tpl[5],
    #     Field='', # blank for now
    #     FieldValue='' # blank for now
    # )

# join:
    # document_asl_consultant_utterance_token_index_schemad_pcoll:  
        # beam.Row(
        #     <DocumentID>,
        #     <DocumentFilename>,
        #     <ASLConsultantID>,
        #     <ParticipantName>,
        #     <UtteranceSequence>,
        #     <TokenSequence>,
        #     <StartTime>,
        #     <EndTime>,
        #     <TokenID>,
        #     <Field>,
        #     <FieldValue>
        # )

        # keyed by:
            # (<DocumentID>, <ASLConsultantID>, <UtteranceSequence>, <TokenSequence>, <TokenID>)

        # we'll call this data set:
            # document_consultant_completeutterance_tokenseq_token_keys
document_consultant_completeutterance_tokenseq_token_keys = (
    document_asl_consultant_utterance_token_index_schemad_pcoll
    | "Beam PL: extract (<DocumentID>, <ASLConsultantID>, <UtteranceSequence>, <TokenSequence>, <TokenID>) from document_asl_consultant_utterance_token_index_schemad_pcoll" >> beam.Map(
            lambda document_asl_consultant_utterance_token_index_schemad_pcoll_row: (
                document_asl_consultant_utterance_token_index_schemad_pcoll_row.DocumentID,
                document_asl_consultant_utterance_token_index_schemad_pcoll_row.ASLConsultantID,
                document_asl_consultant_utterance_token_index_schemad_pcoll_row.UtteranceSequence,
                document_asl_consultant_utterance_token_index_schemad_pcoll_row.TokenSequence,
                document_asl_consultant_utterance_token_index_schemad_pcoll_row.TokenID
            )
        )
)
document_consultant_completeutterance_tokenseq_token_keys = beam__common.pl__X__sort_pcoll(document_consultant_completeutterance_tokenseq_token_keys, "document_consultant_completeutterance_tokenseq_token_keys")

    # to 

    # doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1:
        # (
        #     <DocumentID>,
        #     <ASLConsultantID>,
        #     <TargetVideoFilename>,
        #     <UtteranceSequence>,
        #     <TokenSequence>,
        #     <FrameSequence>,
        #     <CameraPerspective>,
        #     <TokenID>,
        # )

        # keyed by:
            # (<DocumentID>, <ASLConsultantID>, <UtteranceSequence>, <TokenSequence>, <TokenID>)

        # data:
            # (<FrameSequence>, <TargetVideoFilename>, <CameraPerspective>)

        # we'll call this data set:
            # frameseq_targetvideo_cameraperspective__by__document_consultant_utterance_tokenseq_token__gt_1
frameseq_targetvideo_cameraperspective__by__document_consultant_utterance_tokenseq_token__gt_1 = (
    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1
    | "Beam PL: extract ((<DocumentID>, <ASLConsultantID>, <UtteranceSequence>, <TokenSequence>, <TokenID>), (<FrameSequence>, <TargetVideoFilename>, <CameraPerspective>)) from doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1" >> beam.Map(
            lambda doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl: (
                (
                    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl[0],   # <DocumentID>
                    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl[1],   # <ASLConsultantID>
                    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl[3],   # <UtteranceSequence>
                    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl[4],   # <TokenSequence>
                    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl[7]    # <TokenID>
                ),
                (
                    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl[5],   # <FrameSequence>
                    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl[2],   # <TargetVideoFilename>
                    doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1_tpl[6],   # <CameraPerspective>
                )
            )
        )
)

# we MUST find all records from frameseq_targetvideo_cameraperspective__by__document_consultant_utterance_tokenseq_token__gt_1
    # with COMPLETE sequences in document_consultant_completeutterance_tokenseq_token_keys

    # we'll call this data set:
        # doc_consultant_utteranceseq_tokenseq_frameseq_targetvideo_cameraperspective_token__gt_1
doc_consultant_utteranceseq_tokenseq_frameseq_targetvideo_cameraperspective_token__gt_1 = (
    frameseq_targetvideo_cameraperspective__by__document_consultant_utterance_tokenseq_token__gt_1
    | "Beam PL: filter tuples from fstvcpbdcustst_gt_1 with complete utterances" >> beam.Filter(
        lambda fstvcpbdcustst_gt_1_entry, matching_document_consultant_completeutterance_tokenseq_token_keys: (fstvcpbdcustst_gt_1_entry[0][0], fstvcpbdcustst_gt_1_entry[0][1], fstvcpbdcustst_gt_1_entry[0][2], fstvcpbdcustst_gt_1_entry[0][3], fstvcpbdcustst_gt_1_entry[0][4]) in matching_document_consultant_completeutterance_tokenseq_token_keys,
        matching_document_consultant_completeutterance_tokenseq_token_keys=beam.pvalue.AsIter(document_consultant_completeutterance_tokenseq_token_keys),
      )
    | "Beam PL: transform filtered tuples from fstvcpbdcustst_gt_1 with complete utterances to (<DocumentID>, <ASLConsultantID>, <UtteranceSequence>, <TokenSequence>, <FrameSequence>, <TargetVideoFilename>, <CameraPerspective>, <TokenID>)" >> beam.Map(
            lambda fstvcpbdcustst_gt_1_entry: (
                fstvcpbdcustst_gt_1_entry[0][0],    # <DocumentID>
                fstvcpbdcustst_gt_1_entry[0][1],    # <ASLConsultantID>
                fstvcpbdcustst_gt_1_entry[0][2],    # <UtteranceSequence>
                fstvcpbdcustst_gt_1_entry[0][3],    # <TokenSequence>
                fstvcpbdcustst_gt_1_entry[1][0],    # <FrameSequence>
                fstvcpbdcustst_gt_1_entry[1][1],    # <TargetVideoFilename>
                fstvcpbdcustst_gt_1_entry[1][2],    # <CameraPerspective>
                fstvcpbdcustst_gt_1_entry[0][4]     # <TokenID>
            )
        )
)

In [12]:
# we require this in order to make use of ib.show() (which provides visualization/to_df of the pcolls specified) or ib.collect() (which creates a pandas dataframe from a pcoll)
    # but all pcolls we wish to visualize must be created prior to executing the following line
ib.watch(locals())

In [13]:
# ib.show(ctvgbtcpp__gt_1, visualize_data=True)
df_ctvgbtcpp__gt_1 = ib.collect(flattened_ctvgbtcpp__gt_1)
# ib.evict_recorded_data(pl)

In [14]:
df_ctvgbtcpp__gt_1.columns = ['TokenID', 'CameraPerspective', 'ASLConsultantID', 'TargetVideoFilename']
df_ctvgbtcpp__gt_1.set_index(['TokenID', 'CameraPerspective'], inplace=True)
df_ctvgbtcpp__gt_1.sort_index(inplace=True)
df_ctvgbtcpp__gt_1

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,4,roadtrip2_1051_small_0.mov
0,0,1,ben_story_439_small_0.mov
0,0,1,ben_story_441_small_0.mov
0,1,1,ben_story_439_small_1.mov
0,1,1,ben_story_441_small_1.mov
...,...,...,...
2375,1,4,_1450_small_1.mov
2375,3,4,_1397_small_3.mov
2375,3,4,_1450_small_3.mov
2401,0,4,muhammed_ali_1052_small_0.mov


In [15]:
df_ctvgbtcpp__gt_1__count = df_ctvgbtcpp__gt_1.reset_index().groupby(['TokenID', 'CameraPerspective']).count()
df_ctvgbtcpp__gt_1__count = df_ctvgbtcpp__gt_1__count[['ASLConsultantID']]
df_ctvgbtcpp__gt_1__count.columns = ['count']
df_ctvgbtcpp__gt_1__count

Unnamed: 0_level_0,Unnamed: 1_level_0,count
TokenID,CameraPerspective,Unnamed: 2_level_1
0,0,3
0,1,2
0,2,3
0,3,3
1,0,5
...,...,...
2357,2,2
2375,0,2
2375,1,2
2375,3,2


In [16]:
df_dctvustscptifs__gt_1 = ib.collect(doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__gt_1)

In [17]:
df_dctvustscptifs__gt_1.columns = ['DocumentID', 'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence', 'FrameSequence', 'CameraPerspective', 'TokenID']
df_dctvustscptifs__gt_1.set_index(['DocumentID', 'ASLConsultantID', 'TargetVideoFilename', 'CameraPerspective', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'], inplace=True)
df_dctvustscptifs__gt_1.sort_index(inplace=True)
df_dctvustscptifs__gt_1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,TokenID
DocumentID,ASLConsultantID,TargetVideoFilename,CameraPerspective,UtteranceSequence,TokenSequence,FrameSequence,Unnamed: 7_level_1
0,1,ben_story_439_small_0.mov,0,0,0,20,935
0,1,ben_story_439_small_0.mov,0,0,0,21,935
0,1,ben_story_439_small_0.mov,0,0,1,31,728
0,1,ben_story_439_small_0.mov,0,0,1,32,728
0,1,ben_story_439_small_0.mov,0,0,1,33,728
...,...,...,...,...,...,...,...
37,4,biker_buddy_1069_small_2.mov,2,15,14,2359,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2360,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2361,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2362,676


In [18]:
df_ctvgbtcpp__lte_1 = ib.collect(flattened_ctvgbtcpp__lte_1)

In [19]:
df_ctvgbtcpp__lte_1.columns = ['TokenID', 'CameraPerspective', 'ASLConsultantID', 'TargetVideoFilename']
df_ctvgbtcpp__lte_1.set_index(['TokenID', 'CameraPerspective'], inplace=True)
df_ctvgbtcpp__lte_1.sort_index(inplace=True)
df_ctvgbtcpp__lte_1

Unnamed: 0_level_0,Unnamed: 1_level_0,ASLConsultantID,TargetVideoFilename
TokenID,CameraPerspective,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0,1,ben_story_439_small_0.mov
2,1,1,ben_story_439_small_1.mov
2,2,1,ben_story_439_small_2.mov
2,3,1,ben_story_439_small_3.mov
12,0,4,dorm_prank_1053_small_0.mov
...,...,...,...
2409,0,5,DSP%2520Immigrants.mov
2410,0,4,boston-la_1088_small_0.mov
2410,2,4,boston-la_1088_small_2.mov
2411,0,4,biker_buddy_1069_small_0.mov


In [20]:
df_ctvgbtcpp__lte_1_count = df_ctvgbtcpp__lte_1.reset_index().groupby(['TokenID', 'CameraPerspective']).count()
df_ctvgbtcpp__lte_1_count = df_ctvgbtcpp__lte_1_count[['ASLConsultantID']]
df_ctvgbtcpp__lte_1_count.columns = ['count']
df_ctvgbtcpp__lte_1_count

Unnamed: 0_level_0,Unnamed: 1_level_0,count
TokenID,CameraPerspective,Unnamed: 2_level_1
2,0,1
2,1,1
2,2,1
2,3,1
12,0,1
...,...,...
2409,0,1
2410,0,1
2410,2,1
2411,0,1


In [21]:
df_dctvustscptifs__lte_1 = ib.collect(doc_consultant_targetvideo_utteranceseq_tokenseq_cameraperspective_tokenid_frameseq__lte_1)

In [22]:
df_dctvustscptifs__lte_1.columns = ['DocumentID', 'ASLConsultantID', 'TargetVideoFilename', 'UtteranceSequence', 'TokenSequence', 'FrameSequence', 'CameraPerspective', 'TokenID']
df_dctvustscptifs__lte_1.set_index(['DocumentID', 'ASLConsultantID', 'TargetVideoFilename', 'CameraPerspective', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'], inplace=True)
df_dctvustscptifs__lte_1.sort_index(inplace=True)
df_dctvustscptifs__lte_1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,TokenID
DocumentID,ASLConsultantID,TargetVideoFilename,CameraPerspective,UtteranceSequence,TokenSequence,FrameSequence,Unnamed: 7_level_1
0,1,ben_story_439_small_0.mov,0,0,0,20,935
0,1,ben_story_439_small_0.mov,0,0,0,21,935
0,1,ben_story_439_small_0.mov,0,0,1,31,728
0,1,ben_story_439_small_0.mov,0,0,1,32,728
0,1,ben_story_439_small_0.mov,0,0,1,33,728
...,...,...,...,...,...,...,...
37,4,biker_buddy_1069_small_2.mov,2,15,14,2359,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2360,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2361,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2362,676


#### Now we need to find all tokens corresponding to complete utterances from the filtered tokens with more than one occurrence (of unique consultant/camera perspectives)

In [23]:
df_dcustsfstvcpt__gt_1 = ib.collect(doc_consultant_utteranceseq_tokenseq_frameseq_targetvideo_cameraperspective_token__gt_1)

In [24]:
df_dcustsfstvcpt__gt_1.columns = ['DocumentID', 'ASLConsultantID', 'UtteranceSequence', 'TokenSequence', 'FrameSequence', 'TargetVideoFilename', 'CameraPerspective', 'TokenID']
df_dcustsfstvcpt__gt_1.set_index(['DocumentID', 'ASLConsultantID', 'TargetVideoFilename', 'CameraPerspective', 'UtteranceSequence', 'TokenSequence', 'FrameSequence'], inplace=True)
df_dcustsfstvcpt__gt_1.sort_index(inplace=True)
df_dcustsfstvcpt__gt_1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,TokenID
DocumentID,ASLConsultantID,TargetVideoFilename,CameraPerspective,UtteranceSequence,TokenSequence,FrameSequence,Unnamed: 7_level_1
0,1,ben_story_439_small_0.mov,0,0,0,20,935
0,1,ben_story_439_small_0.mov,0,0,0,21,935
0,1,ben_story_439_small_0.mov,0,0,1,31,728
0,1,ben_story_439_small_0.mov,0,0,1,32,728
0,1,ben_story_439_small_0.mov,0,0,1,33,728
...,...,...,...,...,...,...,...
37,4,biker_buddy_1069_small_2.mov,2,15,14,2359,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2360,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2361,676
37,4,biker_buddy_1069_small_2.mov,2,15,14,2362,676
