# Imports

In [1]:
import os
from pathlib import Path
import itertools
import shutil
import time
import h5py
from itertools import cycle, islice

import torch
from torchvision import transforms
from torchvision.datasets.folder import default_loader

import numpy as np
import pandas as pd
import seaborn as sns

import ipywidgets as widgets
from ipywidgets import interact

import IPython
from IPython.display import Markdown, display

from tqdm.notebook import tqdm

In [2]:
import sys

# Make the locally checked-out project packages importable from this notebook.
# Order is preserved: earlier entries win on name clashes.
sys.path.extend([
    '/Users/staveshemesh/Projects/shstav2/vokenization',
    '/Users/staveshemesh/Projects/shstav2/vokenization/vokenization',
    '/Users/staveshemesh/Projects/shstav2/FECNet',
])

In [3]:
# Notebook-wide pandas display settings: floats rendered with two decimals,
# and cell text allowed up to 300 characters before being truncated.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_colwidth', 300)

# Helpers

## Paths Resolvers

In [4]:
def get_interval_row(interval_id):
    """Return the first row of the global ``df_intervals`` matching ``interval_id``.

    Args:
        interval_id: value compared against the ``interval_id`` column
            (the notebook stores that column as strings).

    Returns:
        pandas.Series: the first matching row.

    Raises:
        IndexError: if no row matches (``.iloc[0]`` on an empty selection).
    """
    matching_rows = df_intervals[df_intervals['interval_id'] == interval_id]
    return matching_rows.iloc[0]

def get_video_id(interval_id):
    """Return the ``video_id`` field of the interval's row in ``df_intervals``."""
    return get_interval_row(interval_id)['video_id']

def get_duration(interval_id):
    """Return the ``delta_time`` field of the interval's row in ``df_intervals``."""
    return get_interval_row(interval_id)['delta_time']

def get_frame_count(interval_id):
    """Return ``end_frame`` of the last row in the interval's word table.

    The table is loaded through ``read_text`` on every call.
    """
    last_word_row = read_text(interval_id).iloc[-1]
    return last_word_row.end_frame

def resolve_interval_video_path(interval_id):
    """Build the path of the interval's ``.mp4`` clip.

    Layout: ``<PATS_DATA_ROOT>/Youtube/<SPEAKER_NAME>/<video_id>/<interval_id>/<interval_id>.mp4``.
    The path is only composed here; nothing checks that the file exists.
    """
    return os.path.join(
        PATS_DATA_ROOT, 'Youtube', SPEAKER_NAME,
        get_video_id(interval_id),
        interval_id,
        f'{interval_id}.mp4',
    )

def resolve_interval_frames_dir(interval_id, create=True):
    """Return the ``frames`` directory that sits next to the interval's video file.

    Args:
        interval_id: interval whose frames directory is wanted.
        create: when true, make sure the directory exists before returning.

    Returns:
        str: path of the frames directory.
    """
    interval_video_dir = os.path.dirname(resolve_interval_video_path(interval_id))
    interval_frames_dir = os.path.join(interval_video_dir, 'frames')
    if create:
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(interval_frames_dir, exist_ok=True)
    return interval_frames_dir

def resolve_speaker_intervals_text_dir():
    """Return ``<PATS_SPEAKER_DATA_DIR>/processed/<SPEAKER_NAME>``.

    Example: '/Users/staveshemesh/Projects/PATS_DATA/Processed/oliver/data/' + 'processed/oliver'
    """
    path_parts = (PATS_SPEAKER_DATA_DIR, 'processed', SPEAKER_NAME)
    return os.path.join(*path_parts)

def resolve_interval_text_path(interval_id):
    """Return the path of the interval's ``<interval_id>.h5`` file in the speaker's text directory."""
    return os.path.join(resolve_speaker_intervals_text_dir(), f'{interval_id}.h5')

def read_text(interval_id, debug=False):
    """Load the interval's HDF5 table and normalise its frame columns.

    Args:
        interval_id: interval whose ``.h5`` file is read.
        debug: when true, print the resolved file path before reading.

    Returns:
        pandas.DataFrame: the stored table with ``start_frame`` / ``end_frame``
        cast to int and an added ``frames_count`` column
        (``end_frame - start_frame``).
    """
    h5_path = resolve_interval_text_path(interval_id)
    if debug:
        print('resolve_interval_text_path: ', h5_path)
    df_words = pd.read_hdf(h5_path)
    for frame_col in ('start_frame', 'end_frame'):
        df_words[frame_col] = df_words[frame_col].astype(int)
    df_words['frames_count'] = df_words['end_frame'] - df_words['start_frame']
    return df_words

def resolve_interval_face_annot_224_dir(interval_id, create=True):
    """Return the ``vokens/face_annot_224`` directory next to the interval's video file.

    Args:
        interval_id: interval whose face-annotation directory is wanted.
        create: when true, make sure the directory exists before returning.

    Returns:
        str: path of the directory.
    """
    interval_video_dir = os.path.dirname(resolve_interval_video_path(interval_id))
    interval_face_annot_dir = os.path.join(interval_video_dir, 'vokens', 'face_annot_224')
    if create:
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(interval_face_annot_dir, exist_ok=True)
    return interval_face_annot_dir

def resolve_224_voken_path(interval_id, frame_id):
    """Return the path of one frame's image in the interval's ``face_annot_224`` directory.

    Args:
        interval_id: interval the frame belongs to.
        frame_id: integer frame number; rendered as a zero-padded 5-digit file name.

    Returns:
        str: ``<face_annot_224 dir>/<frame_id:05d>.png``.
    """
    # Do not forward frame_id here: the second parameter of the directory
    # resolver is its `create` flag, so frame 0 used to turn creation off.
    face_annot_dir = resolve_interval_face_annot_224_dir(interval_id)
    return os.path.join(face_annot_dir, f'{frame_id:05d}.png')

## Display Utils

In [5]:
def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

# Table-style entry applied to the <caption> element of styled DataFrames.
CAPTION_STYLE = dict(
    selector='caption',
    props=[('color', 'blue'), ('font-size', '16px')],
)

def df_with_caption(df, title):
    """Return a Styler for ``df`` captioned with ``title`` and styled by ``CAPTION_STYLE``."""
    captioned = df.style.set_caption(title)
    return captioned.set_table_styles([CAPTION_STYLE])
    
def style_value_counts(mask, title):
    """Display the (up to five) most frequent values of ``mask`` as a captioned table.

    Args:
        mask: array-like with a ``reshape`` method; it is flattened before counting.
        title: caption shown above the table.
    """
    flat_values = pd.Series(mask.reshape(-1))
    top_counts = flat_values.value_counts().to_frame().head()
    display(df_with_caption(top_counts, title))

## Constants

In [6]:
# Speaker whose data is processed; used as a path component.
SPEAKER_NAME = 'oliver'
# Interval id kept at hand as a sample.
SAMPLE_INTERVAL = '214625'
# Factor between seconds and frame numbers (presumably frames per second — confirm).
FRAME_RATE = 15
# Four non-breaking spaces, used to indent Markdown output.
TAB = '&nbsp;' * 4

# Read Data

## Constants for Paths

In [7]:
# Root of this project's data; the sample subset lives under <DATA_ROOT>/sample.
DATA_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'
SAMPLE_DATA_ROOT = os.path.join(DATA_ROOT, 'sample')


# Input files
# ============
#   1. DataFrames: CSV of the sampled intervals (loaded into df_intervals)
SAMPLE_INTERVALS_PATH = os.path.join(SAMPLE_DATA_ROOT, 'df_sample_intervals_64.csv')
#   2. PATS: per-speaker directories under the PATS data root
PATS_DATA_ROOT = '/Users/staveshemesh/Projects/PATS_DATA/'
PATS_SPEAKER_VIZ_DIR = os.path.join(PATS_DATA_ROOT, f'Youtube/{SPEAKER_NAME}')
PATS_SPEAKER_DATA_DIR = os.path.join(PATS_DATA_ROOT, f'Processed/{SPEAKER_NAME}/data')
# Same location that resolve_speaker_intervals_text_dir() computes.
PATS_SPEAKER_PATS_INTERVAL_DIR = os.path.join(PATS_SPEAKER_DATA_DIR, 'processed', SPEAKER_NAME)

# Output files:
#   1. DataFrames
#   2. Model Input files
# Output Dir: one sub-directory per run, named by the time this cell was executed.
# NOTE: these assignments only build path strings; no directory is created here.
TIMESTR = time.strftime("%Y%m%d_%H%M%S")
OUTPUT_ITERATION_DIR = os.path.join(SAMPLE_DATA_ROOT, TIMESTR)
OUTPUT_DATAFRAMES_ITERATION_DIR = os.path.join(OUTPUT_ITERATION_DIR, 'dataframes')
#  Model Input files ('{count}' / '{dataset}' are placeholders left for a later str.format call)
OUTPUT_TOKENS_PATH = os.path.join(OUTPUT_ITERATION_DIR, 'sample_token_ids_{count}.hdf5')
OUTPUT_VOKENS_PATH = os.path.join(OUTPUT_ITERATION_DIR, 'sample_voken_ids_{count}.hdf5')
OUTPUT_VOKEN_IDS_PATH = os.path.join(OUTPUT_ITERATION_DIR, '{dataset}_{count}.ids')
OUTPUT_VOKEN_KEYS_PATH = os.path.join(OUTPUT_ITERATION_DIR, 'keys')

## DataFrames

### df_intervals

In [8]:
# Load the sampled intervals; interval ids are kept as strings so they can be
# used directly as path components and join keys.
df_intervals = pd.read_csv(SAMPLE_INTERVALS_PATH).astype({'interval_id': str})
print(f'df_intervals shape: {df_intervals.shape}')

interval_ids = df_intervals['interval_id'].unique().tolist()
# Interval length in frames, truncated to a whole number.
df_intervals['frame_count'] = df_intervals['delta_time'].mul(FRAME_RATE).astype(int)
df_intervals.head()

df_intervals shape: (64, 20)


Unnamed: 0.1,Unnamed: 0,dataset,delta_time,end_time,interval_id,speaker,start_time,video_fn,video_link,org_start_time,...,start_time_string,end_time_string,video_id,max_frames_token,valid,valid_max_frames_per_token,valid_duration,valid_single_token_per_frame,video_downloded,frame_count
0,43551,train,21.02,0 days 00:11:55.080000,214625,oliver,0 days 00:11:34.060000,Charter_Schools_-_Last_Week_Tonight_with_John_Oliver_HBO-l_htSPGAY7I.webm,http://www.youtube.com/watch?v=l_htSPGAY7I,0 days 00:11:34.060000,...,00:11:34.6,00:11:55.8,l_htSPGAY7I,71,True,True,True,True,True,315
1,43560,train,24.89,0 days 00:05:51.610000,215346,oliver,0 days 00:05:26.720000,Charter_Schools_-_Last_Week_Tonight_with_John_Oliver_HBO-l_htSPGAY7I.webm,http://www.youtube.com/watch?v=l_htSPGAY7I,0 days 00:05:26.720000,...,00:05:26.72,00:05:51.61,l_htSPGAY7I,24,True,True,True,True,True,373
2,43565,train,45.05,0 days 00:07:04.690000,215366,oliver,0 days 00:06:19.640000,Charter_Schools_-_Last_Week_Tonight_with_John_Oliver_HBO-l_htSPGAY7I.webm,http://www.youtube.com/watch?v=l_htSPGAY7I,0 days 00:06:19.640000,...,00:06:19.64,00:07:04.69,l_htSPGAY7I,48,True,True,True,True,True,675
3,43571,train,61.66,0 days 00:11:02.090000,215431,oliver,0 days 00:10:00.430000,Charter_Schools_-_Last_Week_Tonight_with_John_Oliver_HBO-l_htSPGAY7I.webm,http://www.youtube.com/watch?v=l_htSPGAY7I,0 days 00:10:00.430000,...,00:10:00.43,00:11:02.9,l_htSPGAY7I,42,True,True,True,True,True,924
4,43578,train,27.23,0 days 00:17:06.050000,216197,oliver,0 days 00:16:38.830000,Charter_Schools_-_Last_Week_Tonight_with_John_Oliver_HBO-l_htSPGAY7I.webm,http://www.youtube.com/watch?v=l_htSPGAY7I,0 days 00:16:38.830000,...,00:16:38.83,00:17:06.5,l_htSPGAY7I,50,True,True,True,True,True,408


In [9]:
# One word table per interval, each tagged with the interval it came from.
all_interval_texts = []

for interval_id in interval_ids:
    df_interval_words = read_text(interval_id).assign(interval_id=interval_id)
    all_interval_texts.append(df_interval_words)

### df_token_voken

In [10]:
# Stack the per-interval word tables; the raw 'Word' column becomes 'word_original'.
df_token_voken = pd.concat(all_interval_texts).rename(columns={'Word': 'word_original'})

# The frame paired with each word is the midpoint of its first and last frame
# (the mean is truncated to an int).
df_token_voken['frame_selected'] = df_token_voken[['start_frame', 'end_frame']].mean(axis=1).astype(int)

# join with interval data
df_token_voken = df_token_voken.merge(df_intervals, on='interval_id')

df_token_voken['frame_path'] = \
    df_token_voken['frame_selected'].astype(str) + '_' + \
    df_token_voken['interval_id'] + '_' + \
    df_token_voken['video_id']
# NOTE: this overwrites the interval-level 'frame_count' that arrived through the
# merge with the number of frames spanned by the word.
df_token_voken['frame_count'] = df_token_voken['end_frame'] - df_token_voken['start_frame']

# /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png
# Don't use resolvers, to vectorize this calculation
df_token_voken['frame_full_path'] = PATS_SPEAKER_VIZ_DIR + '/' + \
    df_token_voken['video_id'] + '/' + df_token_voken['interval_id'] + \
    '/vokens/face_annot_224/' + \
    df_token_voken['frame_selected'].astype(str).str.zfill(5) + '.png'

# Calculate each word's character span inside its interval text, assuming the
# words are joined by single spaces: every word occupies len(word) + 1
# characters, so the running total marks the position just past the word's
# trailing space.
df_token_voken['word_len_plus_1'] = df_token_voken['word_original'].str.len() + 1
chars_through_word = df_token_voken.groupby('interval_id')['word_len_plus_1'].cumsum()
df_token_voken['word_end'] = chars_through_word - 1  # exclusive end, space not included
df_token_voken['word_start'] = chars_through_word - df_token_voken['word_len_plus_1']
df_token_voken['word_time'] = (df_token_voken['frame_selected'] / FRAME_RATE).round(1)


df_token_voken = df_token_voken.rename(
    columns={'start_frame': 'frame_start', 'end_frame': 'frame_end',
             'delta_time': 'interval_time'})

# reorder columns
TOKEN_VOKEN_COLS = [
    # token-voken
    'word_original', 'frame_selected',
    # token metadata
     'word_time', 'word_len_plus_1', 'word_start', 'word_end',
    # voken metadata
    'frame_start', 'frame_end', 'frame_count', 'frame_path', 'frame_full_path',
    # additional info
    'interval_id', 'video_id', 'interval_time'
]
# Select and sort in one expression: sorting a column subset in place operates on
# a derived frame and can trigger pandas' SettingWithCopy warning.
df_token_voken = df_token_voken[TOKEN_VOKEN_COLS].sort_values(['interval_id', 'frame_selected'])

print(f'Number of word-frame pairs: {df_token_voken.shape[0]:,}')

Number of word-frame pairs: 5,618


In [11]:
df_token_voken.head()

Unnamed: 0,word_original,frame_selected,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,interval_id,video_id,interval_time
4045,are,0,0.0,4,0,3,0,1,1,0_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png,100983,hWQiXv0sn9Y,26.56
4046,not,2,0.1,4,4,7,1,4,3,2_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00002.png,100983,hWQiXv0sn9Y,26.56
4047,understanding,7,0.5,14,8,21,4,10,6,7_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00007.png,100983,hWQiXv0sn9Y,26.56
4048,Liquid,19,1.3,7,22,28,10,29,19,19_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00019.png,100983,hWQiXv0sn9Y,26.56
4049,Gold,42,2.8,5,29,33,29,56,27,42_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00042.png,100983,hWQiXv0sn9Y,26.56


### df_interval_texts

In [12]:
df_token_voken.head()

Unnamed: 0,word_original,frame_selected,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,interval_id,video_id,interval_time
4045,are,0,0.0,4,0,3,0,1,1,0_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png,100983,hWQiXv0sn9Y,26.56
4046,not,2,0.1,4,4,7,1,4,3,2_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00002.png,100983,hWQiXv0sn9Y,26.56
4047,understanding,7,0.5,14,8,21,4,10,6,7_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00007.png,100983,hWQiXv0sn9Y,26.56
4048,Liquid,19,1.3,7,22,28,10,29,19,19_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00019.png,100983,hWQiXv0sn9Y,26.56
4049,Gold,42,2.8,5,29,33,29,56,27,42_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00042.png,100983,hWQiXv0sn9Y,26.56


In [13]:
# One row per interval: its words joined by single spaces into 'text'.
df_interval_text = (
    df_token_voken
    .groupby('interval_id')['word_original']
    .apply(' '.join)
    .reset_index()
    .rename(columns={'word_original': 'text'})
    .sort_values('interval_id')
)

print(f'Intervals: {df_interval_text.shape[0]}')
df_interval_text.head(n=2).style.set_properties(**{'text-align': 'left', 'font-size': '12pt'})

Intervals: 64


Unnamed: 0,interval_id,text
0,100983,are not understanding Liquid Gold everything about this industry is incredibly difficult to navigate which is dangerous because even one of its own trains groups its own tree groups has wounds that some sectors of it are out of control and percent of just how helpless you can be in the face of old is just listen to Tom McClellan he was a ball as Deputy drugs are you would think that if anyone knows this world it would be him
1,101126,plus it's a phone call so you'll say whatever it takes to make it end cuz getting so but might be hard but nothing is harder than an 8 minute phone call with another human being nothing nothing at all those claims of success it is very hard to know exactly what you're getting at any given rehab 12-step bicep wrote which undoubtedly what's the song with the most experts argue that they should also be consistent access to other newer treatments like sun behavioral therapy


## Data Validity Check

### Original Word Bounds

In [14]:
df_token_voken.head(n=4)

Unnamed: 0,word_original,frame_selected,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,interval_id,video_id,interval_time
4045,are,0,0.0,4,0,3,0,1,1,0_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png,100983,hWQiXv0sn9Y,26.56
4046,not,2,0.1,4,4,7,1,4,3,2_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00002.png,100983,hWQiXv0sn9Y,26.56
4047,understanding,7,0.5,14,8,21,4,10,6,7_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00007.png,100983,hWQiXv0sn9Y,26.56
4048,Liquid,19,1.3,7,22,28,10,29,19,19_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00019.png,100983,hWQiXv0sn9Y,26.56


In [15]:
print(df_token_voken.head(n=10)['word_original'].tolist())

['are', 'not', 'understanding', 'Liquid', 'Gold', 'everything', 'about', 'this', 'industry', 'is']


In [16]:
# Spot check: slicing an interval's text with a word's [word_start:word_end]
# span must give back that word.
test_items = {
    '100983': ['are', 'not', 'understanding', 'Liquid', 'Gold', 'everything', 'about', 'this', 'industry', 'is'],
    '102545': ['problems', 'because', 'the', 'industry', 'boomed', 'that', 'I', 'found', 'it', 'in', 'kickbacks']
}

for test_interval_id, test_words in test_items.items():
    interval_text = df_interval_text[df_interval_text['interval_id'] == test_interval_id].iloc[0]['text']
    printmd('**interval full text:**')
    df_token_voken_interval = df_token_voken[df_token_voken['interval_id'] == test_interval_id]
    print(interval_text)
    for word in test_words:
        # A repeated word is checked at its first occurrence in the interval.
        start, end = df_token_voken_interval[df_token_voken_interval['word_original'] == word].iloc[0][['word_start', 'word_end']]
        sliced_word = interval_text[start:end]
        # Assert before printing, so the check mark only appears for words that passed.
        assert word == sliced_word, \
            f'interval {test_interval_id}: expected {word!r} at [{start}:{end}], got {sliced_word!r}'
        printmd(f'{TAB} ✓ **{sliced_word.replace(" ","SPACE")}** text[start={start}:end={end}]:')

**interval full text:**

are not understanding Liquid Gold everything about this industry is incredibly difficult to navigate which is dangerous because even one of its own trains groups its own tree groups has wounds that some sectors of it are out of control and percent of just how helpless you can be in the face of old is just listen to Tom McClellan he was a ball as Deputy drugs are you would think that if anyone knows this world it would be him


&nbsp;&nbsp;&nbsp;&nbsp; ✓ **are** text[start=0:end=3]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **not** text[start=4:end=7]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **understanding** text[start=8:end=21]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **Liquid** text[start=22:end=28]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **Gold** text[start=29:end=33]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **everything** text[start=34:end=44]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **about** text[start=45:end=50]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **this** text[start=51:end=55]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **industry** text[start=56:end=64]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **is** text[start=65:end=67]:

**interval full text:**

I'm floored actually provides a window into held the flood insurance money is a treatment centers high school with massive problems because the industry boomed that I found it in the system take urine testing insurance companies drug test on some places have exploded at arranging for kickbacks from testing facility the more you take the more


&nbsp;&nbsp;&nbsp;&nbsp; ✓ **problems** text[start=123:end=131]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **because** text[start=132:end=139]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **the** text[start=49:end=52]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **industry** text[start=144:end=152]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **boomed** text[start=153:end=159]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **that** text[start=160:end=164]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **I** text[start=165:end=166]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **found** text[start=167:end=172]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **it** text[start=173:end=175]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **in** text[start=176:end=178]:

&nbsp;&nbsp;&nbsp;&nbsp; ✓ **kickbacks** text[start=285:end=294]:

### Word's Selected Frame

In [17]:
# Interval and words inspected in the selected-frame check below.
test_interval_id, test_words = '102545', ['boomed', 'insurance']

In [18]:
df_interval_text[df_interval_text['interval_id'] == test_interval_id].iloc[0]['text']

"I'm floored actually provides a window into held the flood insurance money is a treatment centers high school with massive problems because the industry boomed that I found it in the system take urine testing insurance companies drug test on some places have exploded at arranging for kickbacks from testing facility the more you take the more"

In [19]:
# For each test word, show its first occurrence in the test interval together
# with up to `n` rows on either side.
n = 3
for test_word in test_words:
    mask = (df_token_voken['interval_id'] == test_interval_id) & (df_token_voken['word_original'] == test_word)
    selected_frame, word_time = df_token_voken[mask].iloc[0][['frame_selected', 'word_time']]
    idx = df_token_voken[mask].index[0]
    # Slice by row position rather than by label: df_token_voken was sorted, so
    # its index labels are no longer in order and `idx - n` / `idx + n` may point
    # to a distant row or to no row at all.
    pos = int(np.flatnonzero(mask.to_numpy())[0])
    printmd(f'**{test_word}** - frame {selected_frame} at {word_time}')
    display(df_token_voken.iloc[max(pos - n, 0):pos + n + 1])

**boomed** - frame 145 at 9.7

Unnamed: 0,word_original,frame_selected,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,interval_id,video_id,interval_time
4632,because,131,8.7,8,132,139,129,134,5,131_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00131.png,102545,hWQiXv0sn9Y,25.89
4633,the,134,8.9,4,140,143,134,135,1,134_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00134.png,102545,hWQiXv0sn9Y,25.89
4634,industry,138,9.2,9,144,152,135,141,6,138_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00138.png,102545,hWQiXv0sn9Y,25.89
4635,boomed,145,9.7,7,153,159,141,150,9,145_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00145.png,102545,hWQiXv0sn9Y,25.89
4636,that,152,10.1,5,160,164,150,155,5,152_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00152.png,102545,hWQiXv0sn9Y,25.89
4637,I,158,10.5,2,165,166,155,161,6,158_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00158.png,102545,hWQiXv0sn9Y,25.89
4638,found,174,11.6,6,167,172,161,188,27,174_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00174.png,102545,hWQiXv0sn9Y,25.89


**insurance** - frame 87 at 5.8

Unnamed: 0,word_original,frame_selected,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,interval_id,video_id,interval_time
4618,held,70,4.7,5,44,48,69,72,3,70_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00070.png,102545,hWQiXv0sn9Y,25.89
4619,the,73,4.9,4,49,52,72,74,2,73_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00073.png,102545,hWQiXv0sn9Y,25.89
4620,flood,77,5.1,6,53,58,74,81,7,77_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00077.png,102545,hWQiXv0sn9Y,25.89
4621,insurance,87,5.8,10,59,68,81,93,12,87_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00087.png,102545,hWQiXv0sn9Y,25.89
4622,money,94,6.3,6,69,74,93,96,3,94_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00094.png,102545,hWQiXv0sn9Y,25.89
4623,is,97,6.5,3,75,77,96,98,2,97_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00097.png,102545,hWQiXv0sn9Y,25.89
4624,a,98,6.5,2,78,79,98,99,1,98_102545_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/102545/vokens/face_annot_224/00098.png,102545,hWQiXv0sn9Y,25.89


In [20]:
IPython.display.Video(resolve_interval_video_path(test_interval_id), embed=True, width=500, height=500)

In [21]:
@interact(frame_id=widgets.IntSlider(min=0, max=get_frame_count(test_interval_id), step=1, value=10))
def show_img(frame_id):
    """Display frame ``frame_id`` of the test interval (file ``<frame_id:05d>.png``)."""
    frame_file_name = f'{frame_id:05d}.png'
    frame_path = os.path.join(resolve_interval_frames_dir(test_interval_id), frame_file_name)
    display(IPython.display.Image(filename=frame_path, width=500, height=500))

interactive(children=(IntSlider(value=10, description='frame_id', max=389), Output()), _dom_classes=('widget-i…

### Frame Value Count 

In [22]:
style_value_counts(df_token_voken['frame_path'].values, 'Words per Frame')

Unnamed: 0,0
616_102439_5d667Bb_iYA,2
226_103077_5d667Bb_iYA,2
119_215366_l_htSPGAY7I,2
660_101174_hWQiXv0sn9Y,2
224_101190_5d667Bb_iYA,2


# Prepare Training Data

## Bert Input Ids

### Encode Text

In [23]:
!rm -rf /Users/staveshemesh/.cache/huggingface/transformers/transformers

In [24]:
from transformers import AutoTokenizer

In Transformers v4.0.0, the default path to cache downloaded models changed from '~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden and '~/.cache/torch/transformers' is a directory that exists, we're moving it to '~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should only see this message once.


In [25]:
# Tokenizer shared by the tokenization helpers below (read as the global `tokenizer`).
# NOTE(review): tokenize_text asks for return_offsets_mapping=True, which per the
# transformers docs requires a fast tokenizer — keep use_fast=True.
tokenizer_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)

In [26]:
df_token_voken.head()

Unnamed: 0,word_original,frame_selected,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,interval_id,video_id,interval_time
4045,are,0,0.0,4,0,3,0,1,1,0_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png,100983,hWQiXv0sn9Y,26.56
4046,not,2,0.1,4,4,7,1,4,3,2_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00002.png,100983,hWQiXv0sn9Y,26.56
4047,understanding,7,0.5,14,8,21,4,10,6,7_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00007.png,100983,hWQiXv0sn9Y,26.56
4048,Liquid,19,1.3,7,22,28,10,29,19,19_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00019.png,100983,hWQiXv0sn9Y,26.56
4049,Gold,42,2.8,5,29,33,29,56,27,42_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00042.png,100983,hWQiXv0sn9Y,26.56


In [27]:
df_interval_text.head()

Unnamed: 0,interval_id,text
0,100983,are not understanding Liquid Gold everything about this industry is incredibly difficult to navigate which is dangerous because even one of its own trains groups its own tree groups has wounds that some sectors of it are out of control and percent of just how helpless you can be in the face of o...
1,101126,plus it's a phone call so you'll say whatever it takes to make it end cuz getting so but might be hard but nothing is harder than an 8 minute phone call with another human being nothing nothing at all those claims of success it is very hard to know exactly what you're getting at any given rehab ...
2,101127,medications like these are not required to offer Lowe's because we have to essentially be whatever is proprietor says it is that means they can frame almost anything as treatment for instance many Hawaiians rehabs off of something cold Equine Therapy and again if you respond to that that's great...
3,101174,true even the nation's Deputy drugs are couldn't confidently navigate their system in the tragedy is his son died and remember the funny guy who hated horses he died too so this is a matter of life and death are you if you're wondering what experts would advise you to do in seeking treatment mon...
4,101178,horse ratio like in that I'm so ugly right now it can be way too difficult to get all answers to those questions which is crazy because so much about battling addiction is really hard guessing clean is hard staying clean is hard but getting good evidence-based trustworthy help should be the fuck...


In [28]:
# Column order of the final token-voken dataset, assembled group by group.
COLS_DATASET = (
    # main elements
    ['word_original', 'bert_token', 'token_id', 'frame_selected', 'interval_id']
    # bert token metadata
    + ['offset_start', 'offset_end']
    # original word metadata
    + ['word_time', 'word_len_plus_1', 'word_start', 'word_end']
    # word frames metadata
    + ['frame_start', 'frame_end', 'frame_count', 'frame_path', 'frame_full_path']
    # additional info
    + ['video_id', 'interval_time']
)

In [29]:
# https://stackoverflow.com/questions/44367672/best-way-to-join-merge-by-range-in-pandas

def map_bert_token_to_original_word(A, B):
    """Range-join BERT word pieces (``A``) onto the original words (``B``).

    A row of ``A`` is paired with a row of ``B`` when both have the same
    ``interval_id`` and the piece's ``offset_start`` lies in the word's
    half-open span ``[word_start, word_end)``.

    Args:
        A: DataFrame with at least ``offset_start``, ``interval_id`` and
            ``token_id`` columns.
        B: DataFrame with at least ``word_start``, ``word_end`` and
            ``interval_id`` columns.

    Returns:
        pandas.DataFrame: the columns of ``A`` followed by those of ``B``, one row
        per matched pair; of columns whose name appears more than once only the
        first is kept. ``token_id`` is cast to int, the other columns keep the
        object dtype produced by stacking the raw values.

    Side effects:
        Displays value counts of the three boolean masks and the first ten rows
        of the result.

    Note:
        The masks are dense ``len(A) x len(B)`` boolean arrays, so memory grows
        with the product of the two row counts.
    """
    # offset mask: piece start falls inside the word's character span
    token_starts = A.offset_start.values
    word_starts = B.word_start.values
    word_ends = B.word_end.values
    mask_offset = (token_starts[:, None] >= word_starts) & (token_starts[:, None] < word_ends)
    style_value_counts(mask_offset, 'Offset Mask')

    # interval mask: both rows belong to the same interval
    mask_interval = A.interval_id.values[:, None] == B.interval_id.values
    style_value_counts(mask_interval, 'Interval Mask')

    # combine masks
    mask_combined = mask_offset & mask_interval
    rows_a, rows_b = np.where(mask_combined)
    style_value_counts(mask_combined, 'Combined Mask')

    df = pd.DataFrame(
        np.column_stack([A.values[rows_a], B.values[rows_b]]),
        columns=A.columns.append(B.columns)
    )

    # Keep the first column of every name. Repeated names are compared by
    # position: selecting by label would return all same-named columns at once
    # and compare that selection with itself.
    first_position = {}
    keep_positions = []
    for position, col_name in enumerate(df.columns):
        if col_name in first_position:
            assert df.iloc[:, position].equals(df.iloc[:, first_position[col_name]]), \
                f'columns named {col_name!r} disagree between the two frames'
        else:
            first_position[col_name] = position
            keep_positions.append(position)

    df = df.iloc[:, keep_positions].copy()
    df['token_id'] = df['token_id'].astype(int)
    display(df_with_caption(df.head(n=10), 'Original Word - Bert Token Mapping'))
    return df

In [30]:
def tokenize_text(text):
    """Tokenize ``text`` with the global ``tokenizer`` (no special tokens added).

    Args:
        text: string to tokenize.

    Returns:
        tuple: ``(tokenized_text, tokenized_line, offset_mapping)`` —
        the word-piece strings, their input ids, and an ``(n_pieces, 2)`` array
        of the ``offset_mapping`` pairs returned by the tokenizer.

    Raises:
        AssertionError: if the three sequences differ in length.
    """
    tokenized_text = tokenizer.tokenize(text, add_special_tokens=False)
    tokenized_output = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    tokenized_line = tokenized_output['input_ids']
    # reshape keeps the array two-dimensional even when there are no pieces,
    # so callers can always index [:, 0] and [:, 1].
    offset_mapping = np.array(tokenized_output['offset_mapping']).reshape(-1, 2)
    #       bert input ids     ==     word pieces     ==     word piece bounds
    assert len(tokenized_line) == len(tokenized_text) == len(offset_mapping)
    return tokenized_text, tokenized_line, offset_mapping

def interval_bert_tokens(interval_id, text):
    """Return one row per word piece of ``text``, tagged with ``interval_id``.

    Columns: ``bert_token``, ``token_id``, ``offset_start``, ``offset_end``,
    ``interval_id``.
    """
    word_pieces, piece_ids, piece_offsets = tokenize_text(text)
    columns = {
        'bert_token': word_pieces,
        'token_id': piece_ids,
        'offset_start': piece_offsets[:, 0],
        'offset_end': piece_offsets[:, 1],
        'interval_id': interval_id,
    }
    return pd.DataFrame(columns)

def create_bert_tokens(df_interval_texts):
    """Tokenize every interval text and stack the per-interval word-piece tables.

    Args:
        df_interval_texts: DataFrame with ``interval_id`` and ``text`` columns.

    Returns:
        pandas.DataFrame: concatenation of ``interval_bert_tokens`` for each row.
        Each piece keeps its per-interval row number, so index values repeat
        across intervals.
    """
    interval_ids = df_interval_texts['interval_id'].tolist()
    interval_texts = df_interval_texts['text'].tolist()
    # zip() has no length, so pass the total explicitly for a determinate bar.
    interval_pairs = tqdm(zip(interval_ids, interval_texts), total=len(interval_ids))
    all_bert_tokens = [interval_bert_tokens(interval_id, text) for interval_id, text in interval_pairs]
    df_bert_tokens = pd.concat(all_bert_tokens)
    return df_bert_tokens

def create_bert_token_voken_dataset(df_interval_texts, df_token_voken):
    """Tokenize the interval texts, map the word pieces onto ``df_token_voken``
    and return the result restricted to ``COLS_DATASET``.

    :param df_interval_texts: DataFrame handed to ``create_bert_tokens``
    :param df_token_voken: DataFrame handed to ``map_bert_token_to_original_word``
    :return: the mapped DataFrame, columns selected and ordered by ``COLS_DATASET``
    """
    word_pieces = create_bert_tokens(df_interval_texts)
    mapped = map_bert_token_to_original_word(word_pieces, df_token_voken)
    return mapped[COLS_DATASET]

In [31]:
# Build the per-word-piece dataset: tokenize every interval text, map the pieces
# onto df_token_voken and keep only the COLS_DATASET columns.
df_dataset = create_bert_token_voken_dataset(df_interval_text, df_token_voken)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,0
False,34486595
True,260735


Unnamed: 0,0
False,34121355
True,625975


Unnamed: 0,0
False,34741145
True,6185


Unnamed: 0,bert_token,token_id,offset_start,offset_end,interval_id,word_original,frame_selected,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,video_id,interval_time
0,are,2024,0,3,100983,are,0,0.0,4,0,3,0,1,1,0_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png,hWQiXv0sn9Y,26.5599
1,not,2025,4,7,100983,not,2,0.1,4,4,7,1,4,3,2_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00002.png,hWQiXv0sn9Y,26.5599
2,understanding,4824,8,21,100983,understanding,7,0.5,14,8,21,4,10,6,7_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00007.png,hWQiXv0sn9Y,26.5599
3,liquid,6381,22,28,100983,Liquid,19,1.3,7,22,28,10,29,19,19_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00019.png,hWQiXv0sn9Y,26.5599
4,gold,2751,29,33,100983,Gold,42,2.8,5,29,33,29,56,27,42_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00042.png,hWQiXv0sn9Y,26.5599
5,everything,2673,34,44,100983,everything,63,4.2,11,34,44,56,70,14,63_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00063.png,hWQiXv0sn9Y,26.5599
6,about,2055,45,50,100983,about,70,4.7,6,45,50,70,71,1,70_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00070.png,hWQiXv0sn9Y,26.5599
7,this,2023,51,55,100983,this,73,4.9,5,51,55,71,76,5,73_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00073.png,hWQiXv0sn9Y,26.5599
8,industry,3068,56,64,100983,industry,79,5.3,9,56,64,76,83,7,79_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00079.png,hWQiXv0sn9Y,26.5599
9,is,2003,65,67,100983,is,84,5.6,3,65,67,83,85,2,84_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00084.png,hWQiXv0sn9Y,26.5599


### Validations

In [32]:
# Spot-check "McClellan": it is split into the pieces 'mcc' + '##lellan',
# and both rows should carry the same word / frame columns.
df_dataset.iloc[58:62]

Unnamed: 0,word_original,bert_token,token_id,frame_selected,interval_id,offset_start,offset_end,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,video_id,interval_time
58,Tom,tom,3419,291,100983,317,320,19.4,4,317,320,289,293,4,291_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00291.png,hWQiXv0sn9Y,26.56
59,McClellan,mcc,23680,297,100983,321,324,19.8,10,321,330,293,301,8,297_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00297.png,hWQiXv0sn9Y,26.56
60,McClellan,##lellan,25839,297,100983,324,330,19.8,10,321,330,293,301,8,297_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00297.png,hWQiXv0sn9Y,26.56
61,he,he,2002,304,100983,331,333,20.3,3,331,333,301,308,7,304_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00304.png,hWQiXv0sn9Y,26.56


In [33]:
# Spot-check "cuz": it is split into the pieces 'cu' + '##z', which share one selected frame.
df_dataset.iloc[100:105]

Unnamed: 0,word_original,bert_token,token_id,frame_selected,interval_id,offset_start,offset_end,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,video_id,interval_time
100,end,end,2203,125,101126,66,69,8.3,4,66,69,122,128,6,125_101126_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00125.png,hWQiXv0sn9Y,36.7
101,cuz,cu,12731,131,101126,70,72,8.7,4,70,73,128,134,6,131_101126_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00131.png,hWQiXv0sn9Y,36.7
102,cuz,##z,2480,131,101126,72,73,8.7,4,70,73,128,134,6,131_101126_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00131.png,hWQiXv0sn9Y,36.7
103,getting,getting,2893,138,101126,74,81,9.2,8,74,81,134,142,8,138_101126_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00138.png,hWQiXv0sn9Y,36.7
104,so,so,2061,144,101126,82,84,9.6,3,82,84,142,146,4,144_101126_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00144.png,hWQiXv0sn9Y,36.7


In [34]:
# Spot-check "Specter" (2 word pieces) and "impeachment" (3 word pieces):
# every piece of a word should repeat that word's frame columns.
df_dataset.iloc[2648:2658]

Unnamed: 0,word_original,bert_token,token_id,frame_selected,interval_id,offset_start,offset_end,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,video_id,interval_time
2648,has,has,2038,43,103998,39,42,2.9,4,39,42,42,45,3,43_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00043.png,FVFdsl29s_Q,55.67
2649,raised,raised,2992,46,103998,43,49,3.1,7,43,49,45,48,3,46_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00046.png,FVFdsl29s_Q,55.67
2650,the,the,1996,48,103998,50,53,3.2,4,50,53,48,49,1,48_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00048.png,FVFdsl29s_Q,55.67
2651,Specter,spec,28699,50,103998,54,58,3.3,8,54,61,49,52,3,50_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00050.png,FVFdsl29s_Q,55.67
2652,Specter,##ter,3334,50,103998,58,61,3.3,8,54,61,49,52,3,50_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00050.png,FVFdsl29s_Q,55.67
2653,impeachment,imp,17727,59,103998,62,65,3.9,12,62,73,52,67,15,59_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00059.png,FVFdsl29s_Q,55.67
2654,impeachment,##ea,5243,59,103998,65,67,3.9,12,62,73,52,67,15,59_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00059.png,FVFdsl29s_Q,55.67
2655,impeachment,##chment,22729,59,103998,67,73,3.9,12,62,73,52,67,15,59_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00059.png,FVFdsl29s_Q,55.67
2656,just,just,2074,70,103998,74,78,4.7,5,74,78,67,73,6,70_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00070.png,FVFdsl29s_Q,55.67
2657,four,four,2176,77,103998,79,83,5.1,5,79,83,73,82,9,77_103998_FVFdsl29s_Q,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/FVFdsl29s_Q/103998/vokens/face_annot_224/00077.png,FVFdsl29s_Q,55.67


## Generate Voken Ids

In [35]:
# Column selection and order applied to df_dataset once 'voken_id' has been attached.
COLS_DATASET_FINAL = [
    # main elements
    'word_original', 'bert_token', 'token_id', 'frame_selected', 'voken_id', 'interval_id',
    # bert token metadata
    'offset_start', 'offset_end',
    # original word metadata
    'word_time', 'word_len_plus_1', 'word_start', 'word_end',
    # word frames metadata
    'frame_start', 'frame_end', 'frame_count', 'frame_path', 'frame_full_path',
    # additional info
    'video_id', 'interval_time'
]

### Create Image Ids

Based on `create_image_ids.py`

In [36]:
# Every distinct frame_path becomes one voken; ids are 1-based and follow
# the order of first appearance in df_dataset.
unique_voken_paths = df_dataset['frame_path'].unique()
unique_voken_ids = [position for position, _ in enumerate(unique_voken_paths, start=1)]

print(f'Unique vokens: {len(unique_voken_ids):,}')
df_vokens = pd.DataFrame({'frame_path': unique_voken_paths, 'voken_id': unique_voken_ids})
df_vokens.head()

Unique vokens: 5,611


Unnamed: 0,frame_path,voken_id
0,0_100983_hWQiXv0sn9Y,1
1,2_100983_hWQiXv0sn9Y,2
2,7_100983_hWQiXv0sn9Y,3
3,19_100983_hWQiXv0sn9Y,4
4,42_100983_hWQiXv0sn9Y,5


In [37]:
# Attach the voken id of each row's frame_path.
# - A stale 'voken_id' is dropped first so the cell can be re-run: without it a
#   second run produces voken_id_x / voken_id_y and the column selection fails.
# - how='left' keeps df_dataset's row order (tokens and vokens are exported by row).
# - validate='many_to_one' fails loudly if df_vokens ever holds a duplicated frame_path.
df_dataset = (
    df_dataset
    .drop(columns=['voken_id'], errors='ignore')
    .merge(df_vokens, on='frame_path', how='left', validate='many_to_one')
    [COLS_DATASET_FINAL]
)
assert df_dataset['voken_id'].notna().all(), 'some frame_path values have no voken id'
df_dataset['voken_id'] = df_dataset['voken_id'].astype(int)
df_dataset.head()

Unnamed: 0,word_original,bert_token,token_id,frame_selected,voken_id,interval_id,offset_start,offset_end,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,video_id,interval_time
0,are,are,2024,0,1,100983,0,3,0.0,4,0,3,0,1,1,0_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png,hWQiXv0sn9Y,26.56
1,not,not,2025,2,2,100983,4,7,0.1,4,4,7,1,4,3,2_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00002.png,hWQiXv0sn9Y,26.56
2,understanding,understanding,4824,7,3,100983,8,21,0.5,14,8,21,4,10,6,7_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00007.png,hWQiXv0sn9Y,26.56
3,Liquid,liquid,6381,19,4,100983,22,28,1.3,7,22,28,10,29,19,19_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00019.png,hWQiXv0sn9Y,26.56
4,Gold,gold,2751,42,5,100983,29,33,2.8,5,29,33,29,56,27,42_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00042.png,hWQiXv0sn9Y,26.56


### Extract Keys

In [38]:
# Count how many of the referenced frame images are actually present on disk.
has_path = df_dataset['frame_full_path'].map(os.path.exists)
existing_frame_counts = has_path.value_counts().to_frame()
df_with_caption(existing_frame_counts, 'Existing Frames')

Unnamed: 0,frame_full_path
False,6130
True,55


In [39]:
from models.FECNet import FECNet

class VisnFECNetModel(torch.nn.Module):
    """Frozen FECNet backbone that maps a batch of images to feature vectors."""

    def __init__(self, arch='fecnet', pretrained=True):
        """
        :param arch: backbone architecture name; accepted for interface
            compatibility, a FECNet backbone is built regardless of its value
        :param pretrained: forwarded to ``FECNet`` (previously ignored and
            hard-coded to ``True``)
        """
        super().__init__()
        fecnet = FECNet(pretrained=pretrained) # Setup Backbone
        # The backbone is only used as a feature extractor: freeze every weight.
        for param in fecnet.parameters():
            param.requires_grad = False
        self.backbone = fecnet

    def forward(self, img):
        """
        :param img: a batch of image tensors in the layout FECNet expects
            (NOTE(review): the previous docstring said [batch_size, H, W, C], but
            this notebook feeds ``transforms.ToTensor()`` output — confirm the layout)
        :return: the backbone output, detached from the autograd graph
        """
        x = self.backbone(img)
        x = x.detach()
        # L2 normalisation is intentionally left disabled:
        # x = x / x.norm(2, dim=-1, keepdim=True)
        return x

In [40]:
# Instantiate the feature extractor and switch it to inference mode;
# eval() must propagate to the wrapped backbone as well.
visn_model = VisnFECNetModel(arch='fecnet')
visn_model.eval()
assert not visn_model.backbone.training
assert not visn_model.training

In [41]:
# Inputs of the key-extraction loop.
img_sets = [SPEAKER_NAME]  # a single image set, named after the speaker
img_paths = df_dataset['frame_full_path'].tolist()  # one path per dataset row
img_ids = df_dataset['voken_id'].tolist()  # voken id of the same row
img_transform = transforms.Compose([transforms.ToTensor()])  # only ToTensor is applied, no resize/normalisation
batch_size = 32

In [42]:
def _encode_batch(batch_tensors, expected_dim):
    """Run ``visn_model`` on one batch.

    :param batch_tensors: list of image tensors to stack into a batch
    :param expected_dim: feature width seen so far, or -1 when none was seen yet
    :return: tuple ``(features as numpy array, feature width of this batch)``
    :raises AssertionError: if the feature width differs from ``expected_dim``
    """
    visn_input = torch.stack(batch_tensors)
    with torch.no_grad():
        visn_output = visn_model(visn_input)
    feat_dim = visn_output.shape[-1]
    # Check sizes of features are equal.
    assert expected_dim in (-1, feat_dim), f'feature dim changed: {expected_dim} -> {feat_dim}'
    return visn_output.detach().cpu().numpy(), feat_dim


saved_img_paths = []
saved_img_ids = []
img_keys = []
tensor_imgs = []
debug_count = 0
last_dim = -1

for i, img_path in enumerate(tqdm(img_paths)):
    try:
        pil_img = default_loader(img_path)
        img_tensor = img_transform(pil_img)
    except Exception as e:
        # Best effort: images that cannot be loaded are skipped; the skip is
        # only reported for paths containing SAMPLE_INTERVAL.
        if str(SAMPLE_INTERVAL) in img_path:
            print(e)
            print("Skip image %s" % img_path)
        continue

    # Record the image only once it is fully loaded and transformed, so the
    # saved paths / ids always line up with the extracted features.
    saved_img_paths.append(img_path)
    saved_img_ids.append(img_ids[i])
    tensor_imgs.append(img_tensor)
    # debug
    if i < 5000 and debug_count < 10:
        debug_count += 1
        print("Loaded image %s" % img_path)

    if len(tensor_imgs) == batch_size:
        batch_keys, last_dim = _encode_batch(tensor_imgs, last_dim)
        img_keys.extend(batch_keys)
        tensor_imgs = []

# Trailing partial batch: goes through the same helper so last_dim is set and
# checked even when fewer than batch_size images were loaded in total.
if len(tensor_imgs) > 0:
    batch_keys, last_dim = _encode_batch(tensor_imgs, last_dim)
    img_keys.extend(batch_keys)

HBox(children=(FloatProgress(value=0.0, max=6185.0), HTML(value='')))

Loaded image /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/l_htSPGAY7I/214625/vokens/face_annot_224/00003.png
Loaded image /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/l_htSPGAY7I/214625/vokens/face_annot_224/00008.png
Loaded image /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/l_htSPGAY7I/214625/vokens/face_annot_224/00023.png
Loaded image /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/l_htSPGAY7I/214625/vokens/face_annot_224/00037.png
Loaded image /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/l_htSPGAY7I/214625/vokens/face_annot_224/00040.png
Loaded image /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/l_htSPGAY7I/214625/vokens/face_annot_224/00043.png
Loaded image /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/l_htSPGAY7I/214625/vokens/face_annot_224/00044.png
Loaded image /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/l_htSPGAY7I/214625/vokens/face_annot_224/00048.png
Loaded image /Users/staveshemesh/Projects/PATS_DATA/Yout

In [43]:
from models.FECNet import FECNet

# NOTE(review): this cell re-defines VisnFECNetModel, which is already defined in
# an earlier cell of this notebook; the later definition shadows the earlier one.
# Consider deleting one of the two cells.
class VisnFECNetModel(torch.nn.Module):
    """Frozen FECNet backbone that maps a batch of images to feature vectors."""

    def __init__(self, arch='fecnet', pretrained=True):
        """
        :param arch: backbone architecture name; accepted for interface
            compatibility, a FECNet backbone is built regardless of its value
        :param pretrained: forwarded to ``FECNet`` (previously ignored and
            hard-coded to ``True``)
        """
        super().__init__()
        fecnet = FECNet(pretrained=pretrained) # Setup Backbone
        # The backbone is only used as a feature extractor: freeze every weight.
        for param in fecnet.parameters():
            param.requires_grad = False
        self.backbone = fecnet

    def forward(self, img):
        """
        :param img: a batch of image tensors in the layout FECNet expects
            (NOTE(review): the previous docstring said [batch_size, H, W, C] —
            confirm the layout against the transform used to build the batch)
        :return: the backbone output, detached from the autograd graph
        """
        x = self.backbone(img)
        x = x.detach()
        # L2 normalisation is intentionally left disabled:
        # x = x / x.norm(2, dim=-1, keepdim=True)
        return x

# Export Data

In [44]:
# Announce the iteration output directory (nothing is created in this cell).
print('Saving iteration outputs:')
printmd(f'{TAB}**/{TIMESTR}** → mkdir {OUTPUT_ITERATION_DIR}..')

Saving iteration outputs:


&nbsp;&nbsp;&nbsp;&nbsp;**/20210416_212032** → mkdir /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032..

## DataFrames

In [45]:
# exist_ok keeps the cell re-runnable within the same iteration directory.
os.makedirs(OUTPUT_DATAFRAMES_ITERATION_DIR, exist_ok=True)

In [46]:
def _export_dataframe(df, name):
    """Write ``df`` to ``<TIMESTR>_<name>_<rows>.csv`` in the dataframes output dir.

    :param df: DataFrame to export (the index is written too, as before)
    :param name: label used both in the file name and in the printed summary
    :return: tuple ``(row count, path of the written CSV)``
    """
    n_rows = df.shape[0]
    csv_path = os.path.join(OUTPUT_DATAFRAMES_ITERATION_DIR, f'{TIMESTR}_{name}_{n_rows}.csv')
    printmd(f'{TAB}**{name}** → {n_rows:,} rows {csv_path}')
    df.to_csv(csv_path, header=True)
    return n_rows, csv_path


# The row-count / path names are kept so anything that reads them still works.
rows_df_dataset, path_df_dataset = _export_dataframe(df_dataset, 'df_dataset')
rows_df_token_voken, path_df_token_voken = _export_dataframe(df_token_voken, 'df_token_voken')
# The printed label used to read 'rows_df_interval_text'; it now matches the frame name.
rows_df_interval_text, path_df_interval_text = _export_dataframe(df_interval_text, 'df_interval_text')

display(df_with_caption(df_dataset.head(), 'df_dataset'))
display(df_with_caption(df_token_voken.head(), 'df_token_voken'))
display(df_with_caption(df_interval_text.head(), 'df_interval_text'))

&nbsp;&nbsp;&nbsp;&nbsp;**df_dataset** → 6,185 rows /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/dataframes/20210416_212032_df_dataset_6185.csv

&nbsp;&nbsp;&nbsp;&nbsp;**df_token_voken** → 5,618 rows /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/dataframes/20210416_212032_df_token_voken_5618.csv

&nbsp;&nbsp;&nbsp;&nbsp;**rows_df_interval_text** → 64 rows /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/dataframes/20210416_212032_df_interval_text_64.csv

Unnamed: 0,word_original,bert_token,token_id,frame_selected,voken_id,interval_id,offset_start,offset_end,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,video_id,interval_time
0,are,are,2024,0,1,100983,0,3,0.0,4,0,3,0,1,1,0_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png,hWQiXv0sn9Y,26.5599
1,not,not,2025,2,2,100983,4,7,0.1,4,4,7,1,4,3,2_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00002.png,hWQiXv0sn9Y,26.5599
2,understanding,understanding,4824,7,3,100983,8,21,0.5,14,8,21,4,10,6,7_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00007.png,hWQiXv0sn9Y,26.5599
3,Liquid,liquid,6381,19,4,100983,22,28,1.3,7,22,28,10,29,19,19_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00019.png,hWQiXv0sn9Y,26.5599
4,Gold,gold,2751,42,5,100983,29,33,2.8,5,29,33,29,56,27,42_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00042.png,hWQiXv0sn9Y,26.5599


Unnamed: 0,word_original,frame_selected,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_full_path,interval_id,video_id,interval_time
4045,are,0,0.0,4,0,3,0,1,1,0_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00000.png,100983,hWQiXv0sn9Y,26.5599
4046,not,2,0.1,4,4,7,1,4,3,2_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00002.png,100983,hWQiXv0sn9Y,26.5599
4047,understanding,7,0.5,14,8,21,4,10,6,7_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00007.png,100983,hWQiXv0sn9Y,26.5599
4048,Liquid,19,1.3,7,22,28,10,29,19,19_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00019.png,100983,hWQiXv0sn9Y,26.5599
4049,Gold,42,2.8,5,29,33,29,56,27,42_100983_hWQiXv0sn9Y,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/vokens/face_annot_224/00042.png,100983,hWQiXv0sn9Y,26.5599


Unnamed: 0,interval_id,text
0,100983,are not understanding Liquid Gold everything about this industry is incredibly difficult to navigate which is dangerous because even one of its own trains groups its own tree groups has wounds that some sectors of it are out of control and percent of just how helpless you can be in the face of old is just listen to Tom McClellan he was a ball as Deputy drugs are you would think that if anyone knows this world it would be him
1,101126,plus it's a phone call so you'll say whatever it takes to make it end cuz getting so but might be hard but nothing is harder than an 8 minute phone call with another human being nothing nothing at all those claims of success it is very hard to know exactly what you're getting at any given rehab 12-step bicep wrote which undoubtedly what's the song with the most experts argue that they should also be consistent access to other newer treatments like sun behavioral therapy
2,101127,medications like these are not required to offer Lowe's because we have to essentially be whatever is proprietor says it is that means they can frame almost anything as treatment for instance many Hawaiians rehabs off of something cold Equine Therapy and again if you respond to that that's great but there is no empirical evidence that as a treatment for addiction it works which means some people who paid a high price for it can end up
3,101174,"true even the nation's Deputy drugs are couldn't confidently navigate their system in the tragedy is his son died and remember the funny guy who hated horses he died too so this is a matter of life and death are you if you're wondering what experts would advise you to do in seeking treatment money that we spoke to suggested beginning not with a rehab which may try and sell you on its own substance but with the doctor who is board certified in addiction medicine now it's only recently become an official specialty so they just aren't many of them around but you can find those the doing sitting out this website here that you may be able to go with you to a treatment the best suits your needs which might well not be a fancy Center where you have to pay $73,000 to have this guy lose you look up this system clearly Berkeley needs more expertise and it might be really important to understand the word rehab is so broadly defined as to be close to meaningless it is honestly barely better defined in the word building and if someone were to tell you I have a drug problem but don't worry I'm going to building in Florida would not"
4,101178,horse ratio like in that I'm so ugly right now it can be way too difficult to get all answers to those questions which is crazy because so much about battling addiction is really hard guessing clean is hard staying clean is hard but getting good evidence-based trustworthy help should be the fucking easy pops and right now it is way too easy to literally one. Pissing


In [47]:
# 47 == len('/Users/staveshemesh/Projects/PATS_DATA/Youtube/'): cutting that many
# characters leaves the path relative to the Youtube directory.
# NOTE(review): this adds a column to df_dataset after it was exported to CSV.
SHOW_COLS = ['word_original', 'bert_token', 'frame_selected', 'frame_full_path', 'voken_file']
df_dataset['voken_file'] = df_dataset['frame_full_path'].str[47:]
preview_rows = df_dataset[SHOW_COLS].iloc[90:106]
preview_rows.style.set_properties(**{'text-align': 'left', 'font-size': '12pt'})

Unnamed: 0,word_original,bert_token,frame_selected,frame_full_path,voken_file
90,you'll,you,98,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00098.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00098.png
91,you'll,',98,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00098.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00098.png
92,you'll,ll,98,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00098.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00098.png
93,say,say,101,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00101.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00101.png
94,whatever,whatever,105,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00105.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00105.png
95,it,it,109,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00109.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00109.png
96,takes,takes,114,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00114.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00114.png
97,to,to,117,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00117.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00117.png
98,make,make,118,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00118.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00118.png
99,it,it,120,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00120.png,oliver/hWQiXv0sn9Y/101126/vokens/face_annot_224/00120.png


## Training Data

## Tokens

### Save tokens.hdf5

In [48]:
# Persist the token id of every dataset row, in row order, as the 'tokens' dataset.
token_ids = df_dataset['token_id']
token_output_path = OUTPUT_TOKENS_PATH.format(count=len(token_ids))
with h5py.File(token_output_path, mode='w') as hf:
    hf.create_dataset(name='tokens', data=token_ids)

print(f'Saved {len(token_ids):,} to {token_output_path}.')

Saved 6,185 to /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/sample_token_ids_6185.hdf5.


In [49]:
# Re-open the file read-only and show the stored dataset (shape / dtype).
# The context manager closes the file even if reading the dataset raises.
with h5py.File(token_output_path, 'r') as token_hdf5:
    token_ids_from_file = token_hdf5['tokens']
    print(token_ids_from_file)

<HDF5 dataset "tokens": shape (6185,), type "<i8">


## Vokens

### Save vokens.hdf5

In [50]:
# Voken id of every dataset row, in row order (same ordering as the exported token ids).
voken_ids = df_dataset['voken_id'].tolist()

**Save File**

In [51]:
# Persist the per-row voken ids as the 'vokens' dataset.
voken_output_path = OUTPUT_VOKENS_PATH.format(count=len(voken_ids))
with h5py.File(voken_output_path, mode='w') as hf:
    hf.create_dataset(name='vokens', data=voken_ids)

printmd(f'{TAB}**voken keys** → {len(voken_ids):,} to {voken_output_path}.')

&nbsp;&nbsp;&nbsp;&nbsp;**voken keys** → 6,185 to /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/sample_voken_ids_6185.hdf5.

**Validate File**

In [52]:
# Read the ids back and compare them with the in-memory list.
# The file is read inside a context manager so it is closed even when the
# assertion below fails.
with h5py.File(voken_output_path, 'r') as voken_hdf5:
    voken_ids_from_file = voken_hdf5['vokens']
    print(voken_ids_from_file)
    voken_ids_from_file_list = voken_ids_from_file[:].tolist()
assert voken_ids_from_file_list == voken_ids

<HDF5 dataset "vokens": shape (6185,), type "<i8">


### Save vokens.ids (vg_nococo/241)

In [53]:
# TODO: use pandas
# Write one "<image set>/<voken id>" line per unique voken.
# NOTE(review): `img_ids` is rebound here from the per-row voken ids (ints) to
# prefixed strings; re-running the key-extraction cell afterwards would pick up
# these strings instead.
img_set = img_sets[0]
img_ids = [f'{img_set}/{voken_id}' for voken_id in df_vokens['voken_id']]

ids_path = OUTPUT_VOKEN_IDS_PATH.format(dataset=img_set, count=len(unique_voken_ids))

with open(ids_path, 'w') as f:
    f.writelines(str(img_id) + '\n' for img_id in img_ids)

printmd(f'{TAB}**voken unique ids** → {len(img_ids):,} to {ids_path}.')

&nbsp;&nbsp;&nbsp;&nbsp;**voken unique ids** → 5,611 to /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/oliver_5611.ids.

### Save voken keys

In [54]:
keys_output_dir = OUTPUT_VOKEN_KEYS_PATH
printmd(f'{TAB}**keys dir** → {keys_output_dir}')
# exist_ok keeps the cell re-runnable within the same iteration directory.
os.makedirs(keys_output_dir, exist_ok=True)

&nbsp;&nbsp;&nbsp;&nbsp;**keys dir** → /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/keys

In [55]:
# Row count here is what the keys file is padded to in the next cell (TARGET_LEN).
df_dataset.shape

(6185, 20)

In [56]:
h5_path = os.path.join(keys_output_dir, img_set + '.hdf5')
# The dataset is created under the name "keys" (h5py names are case-sensitive).
print(f"\tSave features (keys) to {h5_path} with hdf5 dataset 'keys'.")
#TODO: TMP. FIXME
# Temporary padding: the extracted keys are repeated cyclically until there is
# one row per dataset row, so rows do NOT correspond to their own image.
TARGET_LEN = df_dataset.shape[0]
assert len(img_keys) > 0 and last_dim > 0, 'no feature keys were extracted; nothing to save'
# The context manager closes the file even if a write fails part-way.
with h5py.File(h5_path, 'w') as h5_file:
    dset = h5_file.create_dataset("keys", (TARGET_LEN, last_dim))
    # dset = h5_file.create_dataset("keys", (len(saved_img_paths), last_dim))
    for i, img_key in enumerate(islice(cycle(img_keys), TARGET_LEN)):
        dset[i] = img_key

	Save features (keys) to /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/keys/oliver.hdf5 with hdf5 dataset 'Keys'.


In [57]:
# Read the keys back; the context manager closes the file even when one of the
# assertions below fails.
with h5py.File(h5_path, 'r') as voken_feat_hdf5:
    voken_keys_from_file = voken_feat_hdf5['keys']
    print(voken_keys_from_file)
    voken_keys_from_file_list = voken_keys_from_file[:].tolist()
assert TARGET_LEN == len(voken_keys_from_file_list)
# assert len(img_keys) == len(voken_keys_from_file_list)
# Only the first len(img_keys) rows are compared; the remaining rows are the
# cyclic padding written by the save cell.
assert all((img_keys[i] == voken_keys_from_file_list[i]).all() for i in range(len(img_keys)))
print(f'✓ Same Len: {len(img_keys)}')
print(f'✓ Same Features: {len(img_keys)}x{len(img_keys[0])}')

<HDF5 dataset "keys": shape (6185, 16), type "<f4">
✓ Same Len: 55
✓ Same Features: 55x16


In [58]:
# Number of images that were actually loaded, and the width of one feature key.
print(f'Save {len(saved_img_paths):,} feature keys of size {img_keys[0].shape[0]} each.')

Save 55 feature keys of size 16 each.


In [59]:
# Save Image Paths
# NOTE(review): this writes every entry of img_paths (one per dataset row),
# not only the images that were loaded (saved_img_paths).
curr_paths_fname = os.path.join(keys_output_dir, img_set + '.path')
print("\tSave img paths to ", curr_paths_fname)
with open(curr_paths_fname, 'w') as f:
    f.writelines(path + "\n" for path in img_paths)

	Save img paths to  /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/keys/oliver.path


In [60]:
# Save Within Set Image Ids
curr_ids_fname = os.path.join(keys_output_dir, img_set + '.ids')
local_img_ids = df_dataset['voken_id'].tolist()
print("\tSave img ids to ", curr_ids_fname)
with open(curr_ids_fname, 'w') as f:
    for idx in local_img_ids:
        f.write(str(idx) + "\n")

	Save img ids to  /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/keys/oliver.ids


## Show Content

In [61]:
# List everything written for this iteration (requires the `tree` command).
!tree {OUTPUT_ITERATION_DIR}

[01;34m/Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032[00m
├── [01;34mdataframes[00m
│   ├── 20210416_212032_df_dataset_6185.csv
│   ├── 20210416_212032_df_interval_text_64.csv
│   └── 20210416_212032_df_token_voken_5618.csv
├── [01;34mkeys[00m
│   ├── oliver.hdf5
│   ├── oliver.ids
│   └── oliver.path
├── oliver_5611.ids
├── sample_token_ids_6185.hdf5
└── sample_voken_ids_6185.hdf5

2 directories, 9 files


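List of per-token voken ids (`keys/oliver.ids`): an id repeats when several tokens share a voken. The **unique** voken ids are stored separately in `oliver_5611.ids`.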

### Vokens

In [62]:
# Show the first and last five lines of the per-row voken ids file.
! echo {curr_ids_fname}
! (head -5 && tail -n5) < {curr_ids_fname}

/Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/keys/oliver.ids
1
2
3
4
5
5608
5609
5610
5610
5611


In [63]:
!cat {OUTPUT_ITERATION_DIR}/keys/pats.path | head -n 5

cat: /Users/staveshemesh/Projects/shstav2/token_voken/data/sample/20210416_212032/keys/pats.path: No such file or directory
