In [None]:
import os
# Input folders: one with the subtitle files, one with the video files.
subs_dir, vids_dir = 'subtitles', 'videos'

# Every entry of the subtitles folder, ordered by name.
subs_files = os.listdir(subs_dir)
subs_files.sort()

# Only the entries of the videos folder whose name ends in '.m4v', ordered by name.
vids_files = [name for name in sorted(os.listdir(vids_dir)) if name.endswith('.m4v')]

print(subs_files)
print(vids_files)

In [None]:
import webvtt

# Parse every subtitle file with webvtt.read, keeping the order of `subs_files`.
subs = [webvtt.read(os.path.join(subs_dir, name)) for name in subs_files]

# Dump the captions of the first subtitle file to inspect the parsed fields.
for caption in subs[0]:
    for label, value in (('start:', caption.start),
                         ('caption:', caption.text),
                         ('end:', caption.end)):
        print(label, value)
    print('----------------')

# Spot check: text of the first caption of the subtitle file at index 6.
print(subs[6][0].text)

In [None]:
"""
The data uses middle minute of the videos, thus we need to remove captions that don't belong
to that timeframe. Also there should be a caption per each .5 sec in the video to match eeg data rows.
"""
from moviepy.editor import VideoFileClip  # This library is complete overkill for this purpose
import time
     
# Read the duration of every video, in the same order as `vids_files`.
# `video_durations[k]` therefore belongs to `vids_files[k]`.
video_durations = []
for videofile in vids_files:
    clip = VideoFileClip(os.path.join(vids_dir, videofile))
    try:
        print(videofile, clip.duration)
        video_durations.append(clip.duration)
    finally:
        # Only the duration is needed; close the clip right away so the
        # resources it holds open are released instead of piling up
        # (one per video) for the lifetime of the kernel.
        clip.close()

In [None]:
import pandas as pd
# Load the EEG table, show its first row, and coerce both ID columns to numbers.
eeg_df = pd.read_csv('EEG_data.csv')
print(eeg_df.iloc[0])
for id_col in ('VideoID', 'SubjectID'):
    eeg_df[id_col] = pd.to_numeric(eeg_df[id_col])

# Number of rows per video and per subject, ordered by ID.
print(eeg_df['VideoID'].value_counts().sort_index())
print(eeg_df['SubjectID'].value_counts().sort_index())

# Rows per video for each subject. The counts differ between subjects, i.e.
# subjects appear to have different watch times -- assuming the .5 second
# measurement interval is correct.
for subject_id in sorted(eeg_df['SubjectID'].unique()):
    print('SubjecttId', subject_id)
    subject_rows = eeg_df[eeg_df['SubjectID'] == subject_id]
    print(subject_rows['VideoID'].value_counts().sort_index())

In [None]:
# filter captions from subs that are not in the middle minute
import numpy as np

def invalid_sub(sub, duration):
    """Tell whether a caption lies entirely outside the middle minute of a video.

    The window spans from 30 seconds before to 30 seconds after the midpoint
    of the video. Its bounds are rendered as 'HH:MM:SS' strings and compared
    lexicographically with the caption's own timestamp strings.

    Args:
        sub: caption object exposing string attributes ``start`` and ``end``.
        duration: length of the video in seconds.

    Returns:
        True when the caption ends before the window starts or starts after
        the window ends, False otherwise.
    """
    midpoint = duration / 2
    # For videos shorter than a minute the lower bound would be negative, and
    # time.gmtime() of a negative value wraps around to the previous day
    # (or fails on some platforms). Clamp so the window starts at 00:00:00.
    window_start = max(0.0, midpoint - 30)
    window_end = midpoint + 30
    # '%H:%M:%S' has no sub-second field, so both bounds have whole-second
    # precision only.
    start_time = time.strftime('%H:%M:%S', time.gmtime(window_start))
    end_time = time.strftime('%H:%M:%S', time.gmtime(window_end))
    return sub.end < start_time or sub.start > end_time

#filtered_subs = [list(filter(lambda x: not invalid_sub(x, duration), sub)) 
#                 for sub, duration in zip(subs, video_durations)]

def strf_seconds(seconds):
    """Format a number of seconds as a zero-padded 'HH:MM:SS.mmm' timestamp.

    The result is compared lexicographically with caption ``start``/``end``
    strings elsewhere in this notebook, so every field has a fixed width
    (presumably matching the caption timestamp layout -- TODO confirm).

    Args:
        seconds: time offset in seconds (int or float); rounded to the
            nearest millisecond.

    Returns:
        The timestamp string, prefixed with '-' for negative offsets.
    """
    # Work in integer milliseconds so that hours/minutes/seconds are floored
    # (not rounded) and the fraction never depends on how the float prints.
    total_ms = int(round(abs(seconds) * 1000))
    whole_seconds, millis = divmod(total_ms, 1000)
    whole_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    sign = '-' if seconds < 0 and total_ms else ''
    return '{}{:02d}:{:02d}:{:02d}.{:03d}'.format(sign, hours, minutes, secs, millis)

def get_video_intervals(vid_duration, n_rows):
    """Return one timestamp string per EEG row, centred on the video midpoint.

    Each row stands for .5 seconds, so ``n_rows`` rows cover ``n_rows / 2``
    seconds: half of that before the midpoint and half after it.

    Args:
        vid_duration: length of the video in seconds.
        n_rows: number of EEG rows recorded for this video.

    Returns:
        List of ``n_rows`` strings produced by ``strf_seconds``, spaced
        .5 seconds apart.
    """
    midpoint = vid_duration / 2
    half_span = n_rows / 4  # seconds covered on each side of the midpoint
    first, last = midpoint - half_span, midpoint + half_span

    # Count in half-second units with a step of 1, then scale back to seconds.
    ticks = np.arange(2 * first, 2 * last) / 2
    if len(ticks) > n_rows:
        # Drop the trailing extra tick when arange yields one too many.
        ticks = ticks[:-1]
    assert len(ticks) == n_rows
    return list(map(strf_seconds, ticks))

def get_subs_for_rows(vid_subs, vid_intervals):
    """Pick the caption text shown at each timestamp of a video.

    Both inputs are walked once, in order. Timestamps are compared as strings.

    Args:
        vid_subs: indexable sequence of captions exposing ``start``, ``end``
            and ``text``.
        vid_intervals: sequence of timestamp strings, one per row.

    Returns:
        List with one entry per timestamp: the text of the caption whose
        (start, end) range strictly contains the timestamp, or '<empty>'.
    """
    n_subs = len(vid_subs)
    cursor = 0
    texts = []
    for stamp in vid_intervals:
        # Skip captions that ended before this timestamp.
        while cursor < n_subs and vid_subs[cursor].end < stamp:
            cursor += 1
        inside = (cursor < n_subs
                  and vid_subs[cursor].start < stamp < vid_subs[cursor].end)
        texts.append(vid_subs[cursor].text if inside else '<empty>')
    assert len(texts) == len(vid_intervals)
    return texts

# Smoke test: show how strf_seconds renders 119.5 seconds.
sample_stamp = strf_seconds(119.5)
print(sample_stamp)
#for i, row in eeg_df.iterrows():
#    print(row.VideoID, row.SubjectID, sep=', ')

In [None]:
# Build one caption per EEG row, treating each row as a sample taken every
# .5 seconds. Rows are produced subject by subject and, within a subject,
# video by video (both in ascending ID order).
subs_per_row = []
subject_ids = sorted(eeg_df.SubjectID.unique())
video_ids = sorted(eeg_df.VideoID.unique())
for subject_id in subject_ids:
    for video_id in video_ids:
        vid_idx = int(video_id)  # VideoID doubles as index into video_durations / subs
        pair_mask = (eeg_df.SubjectID == subject_id) & (eeg_df.VideoID == video_id)
        n_rows = int(pair_mask.sum())
        video_intervals = get_video_intervals(video_durations[vid_idx], n_rows)
        subs_per_row.extend(get_subs_for_rows(subs[vid_idx], video_intervals))
            

In [None]:
# elmo embed subs, requires https://github.com/HIT-SCIR/ELMoForManyLangs
from elmoformanylangs import Embedder
import numpy as np
# Location of the pre-trained English model handed to the Embedder.
ELMO_MODEL_DIR = '../../../text_embedding_repos/ELMoForManyLangs/Elmo_english_pre'
e = Embedder(ELMO_MODEL_DIR)


In [None]:
# Split every per-row caption on whitespace and embed the token lists.
subs_sents = list(map(str.split, subs_per_row))
elmo_sents = e.sents2elmo(subs_sents)

# Collapse each embedded caption to a single vector by averaging over its
# first axis (presumably one entry per token -- TODO confirm).
elmo_avg_sents = []
for token_vecs in elmo_sents:
    elmo_avg_sents.append(np.mean(token_vecs, axis=0))

In [None]:
import pandas as pd

# Assemble the output table: SubjectID, VideoID, then the averaged embedding
# components (columns named '0', '1', ...), all stored as float32.
#
# Embeddings are paired with EEG rows by position: elmo_avg_sents[k] goes with
# the k-th row of eeg_df. Guard that pairing explicitly instead of relying on
# the index labels of eeg_df happening to equal positions.
n_eeg_rows = len(eeg_df)
assert len(elmo_avg_sents) == n_eeg_rows, (
    'expected one embedding per EEG row, got {} embeddings for {} rows'
    .format(len(elmo_avg_sents), n_eeg_rows))

# Stack the two ID columns in front of the embedding matrix in one step
# rather than inserting into every row inside a Python loop.
id_block = eeg_df[['SubjectID', 'VideoID']].to_numpy(dtype=np.float32)
vec_block = np.asarray(elmo_avg_sents, dtype=np.float32)
data = np.hstack([id_block, vec_block])

df = pd.DataFrame(data, columns=['SubjectID', 'VideoID', *[str(x) for x in range(vec_block.shape[1])]])
# Left disabled as in the original notebook; uncomment to persist the full table.
#df.to_csv('elmo_embedded_subs.csv', index=False)

In [None]:
import pandas as pd
# Reload the full embedding table from disk when a saved copy exists.
# The line that writes this file is commented out in the cell that builds
# `df`, so on a fresh run the file may be absent; in that case keep using the
# `df` that is already in memory instead of failing.
import os

EMBEDDED_SUBS_CSV = 'elmo_embedded_subs.csv'
if os.path.exists(EMBEDDED_SUBS_CSV):
    df = pd.read_csv(EMBEDDED_SUBS_CSV)
elif 'df' not in globals():
    # Nothing on disk and nothing in memory: fail with the same error type the
    # unconditional read would have raised.
    raise FileNotFoundError(
        "{} not found and no in-memory 'df' is available".format(EMBEDDED_SUBS_CSV))

In [None]:
# The whole table takes 257M of space, so store it as one CSV per VideoID.
for video_id, vid_df in df.groupby('VideoID'):
    out_name = 'vid_{}_elmo_embedded_subs.csv'.format(int(video_id))
    vid_df.to_csv(out_name, index=False)

In [None]:
# Combining vid_dfs
import numpy as np
# Read the ten per-video CSVs back and stitch them into a single table,
# ordered by subject and then by video, with a fresh 0..n-1 index.
per_video_frames = [pd.read_csv('vid_{}_elmo_embedded_subs.csv'.format(i))
                    for i in range(10)]
stacked = pd.concat(per_video_frames, ignore_index=True)
ordered = stacked.sort_values(['SubjectID', 'VideoID'])
vid_dfs = ordered.reset_index(drop=True)

# Embedding columns are named '0' .. '1023'; store them as float32 for speed.
vec_cols = list(map(str, range(1024)))
vid_dfs[vec_cols] = vid_dfs[vec_cols].astype(np.float32)

In [None]:
def round5(x):
    """Return ``x`` rounded to five decimal places with the built-in ``round``."""
    decimals = 5
    return round(x, decimals)

# Round-trip check: after rounding every cell to five decimals, the table
# rebuilt from the per-video files must equal the full table.
vid_dfs_rounded = vid_dfs.applymap(round5)
df_rounded = df.applymap(round5)
assert vid_dfs_rounded.equals(df_rounded)