In [None]:
%matplotlib inline
from query.datasets.prelude import *
import pysrt
import textacy
import gensim

In [None]:
# Load transcripts and compute time offsets for captions
# Materialize all Video rows into a plain list (presumably from a Django-style
# manager -- confirm) so it can be sliced and handed to par_for later in this cell.
videos = list(Video.objects.all())

def load_transcript(video):
    """Read the caption file of `video` and flatten it into one string.

    The caption file is looked up at /app/subs/<video file stem>.cc5.srt and
    every caption is shifted 5 seconds earlier before it is used.

    Returns:
        None when no caption file exists, otherwise a tuple
        (video, transcript, offsets). `offsets` has one
        (char_offset, start, end) entry per caption, where `char_offset` is
        the position in `transcript` at which that caption's text begins.
    """
    stem = os.path.splitext(os.path.basename(video.path))[0]
    srt_path = '/app/subs/{}.cc5.srt'.format(stem)
    if not os.path.isfile(srt_path):
        return None

    captions = pysrt.open(srt_path)
    captions.shift(seconds=-5)

    pieces = []
    offsets = []
    cursor = 0  # length of the transcript assembled so far
    for caption in captions:
        offsets.append((cursor, caption.start, caption.end))
        # Newlines and '>' speaker markers become spaces; captions are space-separated.
        piece = caption.text.replace('\n', ' ').replace('>', ' ') + ' '
        pieces.append(piece)
        cursor += len(piece)
    return (video, ''.join(pieces), offsets)

# Run load_transcript over (at most) the first 10000 videos via par_for.
# load_transcript returns None for videos without a caption file, so the result
# presumably contains None entries that consumers must filter out.
transcripts = par_for(load_transcript, videos[:10000])

In [None]:
# Build a textacy corpus from the loaded transcripts (adding the texts implicitly
# runs the NLP pipeline). Entries that are None (no transcript loaded) are dropped.
loaded = [entry for entry in tqdm(transcripts) if entry is not None]
texts, metadatas = unzip([
    (transcript, {'path': video.path, 'offsets': offsets})
    for (video, transcript, offsets) in loaded
])

timer_label = 'Creating corpus of {} docs'.format(len(texts))
with Timer(timer_label):
    corpus = textacy.Corpus(u'en')
    corpus.add_texts(texts, metadatas, n_threads=8)

In [None]:
# Helper functions
def token_time(doc, tok):
    """Return the (start, end) time of the caption that contains `tok`.

    `doc.metadata['offsets']` is a list of (char_offset, start, end) entries in
    increasing `char_offset` order, where `char_offset` is the position in the
    document text at which a caption begins. The caption containing the token
    is therefore the last entry whose offset is <= the token's character index.

    Args:
        doc: object exposing `metadata['offsets']` as described above.
        tok: object exposing `idx`, its character offset in the document text.

    Returns:
        The (start, end) pair stored for the containing caption.

    Raises:
        Exception: if no caption starts at or before the token (e.g. the
            offsets list is empty).
    """
    tok_offset = tok.idx
    containing = None
    for offset, start, end in doc.metadata['offsets']:
        if offset > tok_offset:
            # First caption that begins after the token: the previous one holds it.
            break
        containing = (start, end)
    if containing is None:
        raise Exception("No token time?")
    return containing
    
def compute_tfidf(corpus):
    """Compute TF-IDF weights for the words of a tokenized corpus.

    Args:
        corpus: iterable of documents, each a list of word strings (the input
            shape accepted by gensim's Dictionary and doc2bow).

    Returns:
        dict mapping word -> TF-IDF weight. Weights are computed per document,
        so for a word that occurs in several documents the weight from the
        last such document is the one kept.
    """
    # The documents are iterated twice (vocabulary, then bag-of-words), so
    # materialize them in case a one-shot iterator was passed.
    documents = list(corpus)
    gensim_dict = gensim.corpora.Dictionary(documents)
    gensim_corpus = [gensim_dict.doc2bow(text) for text in documents]
    tfidf_model = gensim.models.tfidfmodel.TfidfModel(gensim_corpus, normalize=True)
    return {
        gensim_dict.get(word_id): value
        for doc in tfidf_model[gensim_corpus]
        for word_id, value in doc
    }

In [None]:
import tempfile
import random
import string


def video_url(path):
    """Return a signed URL for the object `path` in the gs://esper bucket.

    Shells out to `gsutil signurl` (using /app/service-key.json, duration `1m`)
    and uses awk to print the fifth field of the second output line. The value
    returned is that pipeline's output with surrounding whitespace stripped.
    """
    command = "gsutil signurl -d 1m /app/service-key.json gs://esper/{} | awk 'FNR==2{{print $5}}'".format(path)
    signed = sp.check_output(command, shell=True)
    return signed.strip()
    

def precise_time(doc, tokens):
    """Refine the caption-level timing of `tokens` by aligning text against audio.

    A short audio clip around the captions containing `tokens` (padded by 3
    seconds on each side) is extracted with ffmpeg and posted, together with a
    transcript snippet of roughly 50 tokens of context on each side, to the
    alignment service at http://gentle:8765 (presumably a Gentle forced
    aligner -- confirm). The first run of aligned words whose text equals the
    tokens' text, all found in the audio, gives the result.

    Args:
        doc: document exposing `tokens` and `metadata['path']`, and usable
            with token_time().
        tokens: consecutive tokens of `doc` (each exposing `.i` and `.text`).

    Returns:
        (start, end) of the phrase, computed as the clip's start in whole
        seconds plus the aligner's word offsets, or None when the snippet could
        not be written or no matching aligned run was found.

    Raises:
        subprocess.CalledProcessError: if ffmpeg or curl exits non-zero.
    """
    # Clamp the lower bound: a negative slice start would wrap around to the
    # end of the document and produce an empty or unrelated snippet for tokens
    # within the first 50 positions.
    doc_tokens = list(doc.tokens)
    window_start = max(tokens[0].i - 50, 0)
    transcript = ' '.join(
        [t.text for t in doc_tokens[window_start:tokens[-1].i+50]])
    start, _ = token_time(doc, tokens[0])
    _, end = token_time(doc, tokens[-1])

    path = doc.metadata['path']
    url = video_url(path)

    def tmpname():
        return '/tmp/{}'.format(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(8)))

    def fmt_time(t):
        # Whole-second resolution; start_sec below uses the same truncation.
        return '{:02d}:{:02d}:{:02d}'.format(t.hours, t.minutes, t.seconds)

    faudio = tmpname() + '.aac'
    ftext = tmpname()

    start2 = start - {'seconds': 3}
    end2 = end + {'seconds': 3}

    try:
        sp.check_call('ffmpeg -ss {} -i "{}" -t {} {}'.format(
            fmt_time(start2), url, fmt_time(end2-start2), faudio), shell=True)

        try:
            with open(ftext, 'wb') as f:
                s = transcript.encode('utf-8')
                if len(s) == 0:
                    raise Exception("WTF", doc.metadata['path'], tokens)
                f.write(s)
        except Exception:
            # Best effort: report the problem and skip this instance.
            traceback.print_exc()
            return None

        aligned = json.loads(sp.check_output(
            'curl -F "audio=@{}" -F "transcript=@{}" "http://gentle:8765/transcriptions?async=false"'.format(
                faudio, ftext), shell=True))
    finally:
        # Remove the temp files on every exit path (success, early return, error).
        for tmp_path in (faudio, ftext):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    words = aligned['words']
    for i in range(len(words)-len(tokens)+1):
        candidate = words[i:i+len(tokens)]
        matched = all(
            w['word'] == tok.text and w['case'] != 'not-found-in-audio'
            for w, tok in zip(candidate, tokens))
        if matched:
            start_sec = start2.hours * 3600 + start2.minutes * 60 + start2.seconds
            return start_sec + candidate[0]['start'], start_sec + candidate[-1]['end']

    return None

In [None]:
def get_segment(item):
    """Cut one clip out of a remote video into /tmp/segmentNNNN.mp4 with ffmpeg.

    Args:
        item: a nested tuple `(i, (path, start, end))`. `i` is the index used
            in the output file name, `path` is handed to video_url() to obtain
            the input URL, and `start`/`end` are clip bounds in seconds.

    Any existing file at the output path is removed first. The clip is scaled
    to 640x360 (keeping the original aspect ratio) at 30 fps.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero.
    """
    # Unpack in the body: tuple parameters in the signature are Python-2-only
    # syntax (removed by PEP 3113). Callers still pass one positional tuple.
    i, (path, start, end) = item
    url = video_url(path)

    def fmt_time(t):
        # Seconds -> HH:MM:SS.mmm as expected by ffmpeg's -ss / -t options.
        return '{:02d}:{:02d}:{:02d}.{:03d}'.format(
            int(t / 3600), int(t / 60 % 60), int(t % 60), int(t * 1000 % 1000))

    seg_path = '/tmp/segment{:04d}.mp4'.format(i)
    if os.path.isfile(seg_path):
        os.remove(seg_path)
    sp.check_call('ffmpeg -ss {} -i "{}" -t {} -vf "scale=640:360:force_original_aspect_ratio" -r 30 {}'.format(fmt_time(start), url, fmt_time(end - start), seg_path), shell=True)

In [None]:
import glob

# words = ['crooked hillary']
# N=100
# ngram=2
# output_path='/app/supercut.mp4'

def supercut(words, ngram=1, N=300, output_path='/app/supercut.mp4'):
    """Build a video supercut of every spoken occurrence of the given phrases.

    Steps: scan the corpus for n-grams whose space-joined lemmas are in
    `words`; align a random sample of up to N of them to the audio with
    precise_time(); dump the resulting (path, start, end) triples to
    /app/frames.json; cut each one out with get_segment(); and concatenate the
    clips with ffmpeg.

    Args:
        words: collection of phrases, each a string of `ngram` lemmas joined by
            single spaces.
        ngram: number of consecutive tokens per candidate phrase.
        N: maximum number of (shuffled) instances sent to audio alignment.
        output_path: destination of the concatenated video; an existing file
            there is removed first.

    Raises:
        subprocess.CalledProcessError: if an ffmpeg or rm invocation fails.
    """
    log.debug('Finding tokens')
    instances = []
    # Iterate the notebook's corpus; no bare `docs` name is defined anywhere.
    for doc in tqdm(corpus.docs):
        tokens = list(doc.tokens)
        for i in range(len(tokens)-ngram+1):
            if ' '.join([tok.lemma_ for tok in tokens[i:i+ngram]]) in words:
                instances.append((doc, tokens[i:i+ngram]))

    def get_time(inst):
        tup = precise_time(*inst)
        if tup is not None:
            return (inst[0].metadata['path'], tup[0], tup[1])
        else:
            return None

    log.debug('Aliging to audio')
    random.shuffle(instances)
    frames = par_for(get_time, instances[:N])
    frames2 = [f for f in frames if f is not None]
    with open('/app/frames.json', 'w') as f:
        json.dump(frames2, f)

    log.debug('Fetching video segments')
    par_for(get_segment, list(enumerate(frames2)))

    log.debug('Concatenating {} segments'.format(len(frames2)))
    if os.path.isfile(output_path):
        os.remove(output_path)

    # glob returns files in arbitrary order; sort so the concat order is stable
    # (the zero-padded names sort numerically).
    segment_paths = sorted(glob.glob('/tmp/segment*.mp4'))
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        f.write('\n'.join(["file '{}'".format(p) for p in segment_paths]))
        list_path = f.name

    try:
        sp.check_call(
            "ffmpeg -f concat -safe 0 -i {} {}".format(list_path, output_path),
            shell=True)
    finally:
        # delete=False means the list file must be removed explicitly.
        os.remove(list_path)

    sp.check_call('rm /tmp/segment*.mp4', shell=True)
    
# Build the supercut for the two-lemma phrase "crooked hillary"; N=1000 caps how
# many shuffled occurrences are sent to audio alignment.
supercut(['crooked hillary'], ngram=2, N=1000)

In [None]:
# Load Word2Vec model
# Vectors are read from a binary word2vec-format file; `model` is the global that
# sim_score() and the similarity check below query.
model = gensim.models.KeyedVectors.load_word2vec_format('/app/deps/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

def sim_score(a, b):
    """Return the word2vec similarity of words `a` and `b`.

    Falls back to 0 when the lookup raises KeyError (presumably because a word
    is missing from the model's vocabulary).
    """
    try:
        score = model.wv.similarity(a, b)
    except KeyError:
        score = 0
    return score

In [None]:
# Sanity check: similarity score of two words; the value is shown as the cell output.
model.wv.similarity('economy', 'jobs')

In [None]:
# TODO: incorporate TFIDF segments

# Topic words whose mentions are tracked in every transcript.
topics = ['immigration', 'china', 'syria', 'terrorism', 'economy', 'election']
# One token list per corpus document, restricted by the keyword arguments below
# to PROPN/NOUN words with numbers filtered out.
transcript_tokens = [
    list(textacy.extract.words(doc, filter_nums=True, include_pos=['PROPN', 'NOUN'])) 
    for doc in corpus.docs]
  
# N is only referenced by the commented-out plotting code below.
N = len(topics)    
#fig, ax = plt.subplots(2, N / 2, figsize=(18, 16))
#ax = ax.flatten()
# Parameters read by topic_peaks():
# W: number of tokens per sliding window (and the margin skipped at both ends).
W = 30
# MENTION_THRESHOLD: minimum count of on-topic tokens for a window to count as "hot".
MENTION_THRESHOLD = 3
# PEAK_DURATION_THRESHOLD: a run of hot windows must be longer than this to be a peak.
PEAK_DURATION_THRESHOLD = 5

def topic_peaks(topic, tokens):
    """Find stretches of `tokens` that densely mention `topic`.

    Each token is marked on-topic when sim_score(lowercased text, topic) > 0.6.
    Window position s covers tokens[s+W : s+2*W] and is "hot" when it holds at
    least MENTION_THRESHOLD on-topic tokens. Every maximal run of consecutive
    hot positions longer than PEAK_DURATION_THRESHOLD becomes a peak, including
    a run that is still open at the last window position.

    Args:
        topic: topic word to compare tokens against.
        tokens: sequence of tokens exposing `.text`.

    Returns:
        List of (token, peak_duration) pairs, where `token` is
        tokens[(run_start + run_stop) // 2 + W] and `peak_duration` is the
        number of hot positions in the run. Empty when there are too few tokens
        to form a window.
    """
    scores = np.array([sim_score(token.text.lower(), topic) for token in tokens])
    scores = scores > 0.6
    sums = np.array([np.sum(scores[j:j+W]) for j in range(W, len(scores) - W)]) >= MENTION_THRESHOLD

    peaks = []

    def close_run(run_start, run_stop):
        # run_stop is exclusive. Floor division keeps idx an integer index.
        idx = (run_start + run_stop) // 2 + W
        peak_duration = run_stop - run_start
        if peak_duration > PEAK_DURATION_THRESHOLD:
            peaks.append((tokens[idx], peak_duration))

    start = None
    for j in range(len(sums)):
        if sums[j] and start is None:
            start = j
        elif not sums[j] and start is not None:
            close_run(start, j)
            start = None

    # A run that lasts until the final window has no closing "cold" position;
    # close it explicitly so it is not silently dropped.
    if start is not None:
        close_run(start, len(sums))

    return peaks


# Turn every detected topic peak into an (unsaved) TopicTrack of up to four minutes
# centred on the caption that contains the peak token.
topic_labeler, _ = Labeler.objects.get_or_create(name='word2vec')
# Topic rows are fetched/created once per topic, and only for topics that actually
# produce a peak, instead of issuing a get_or_create for every single peak.
topic_models = {}
topic_tracks = []
for tokens, doc in zip(transcript_tokens, corpus.docs):
    print(doc.metadata['path'])
    video = Video.objects.get(path=doc.metadata['path'])
    for topic in topics:
        peaks = topic_peaks(topic, tokens)
        print(topic, peaks)
        for (token, _) in peaks:
            (start, end) = token_time(doc, token)
            start = start.to_time()
            seconds = (((start.hour * 60) + start.minute) * 60 + start.second)

            # Naively assume 4 minute segment size
            min_time = max(seconds - 2 * 60, 0)
            max_time = min(seconds + 2 * 60, int(video.num_frames / video.fps))

            if topic not in topic_models:
                topic_models[topic], _ = Topic.objects.get_or_create(name=topic)
            topic_model = topic_models[topic]
            track = TopicTrack(
                video=video, min_frame=min_time*video.fps, max_frame=max_time*video.fps, topic=topic_model,
                labeler=topic_labeler)
            topic_tracks.append(track)
#_ = TopicTrack.objects.bulk_create(topic_tracks)            