In [1]:
%matplotlib inline
from query.datasets.prelude import *
import pysrt
import textacy
import gensim

DEBUG 2017-12-08 15:24:46,193 doc2vec.py:073] Fast version of gensim.models.doc2vec is being used
INFO 2017-12-08 15:24:46,203 textcleaner.py:020] 'pattern' package not found; tag filters are not available for English


In [2]:
# Read each non-segment video's caption file and flatten it into a single
# transcript string, remembering the character offset at which every caption
# begins together with that caption's start/end timestamps.
transcripts = []
for video in Video.objects.exclude(path__contains='segment'):
    stem, _ext = os.path.splitext(video.path)
    base = os.path.basename(stem)
    subs = pysrt.open('/app/subs/{}.cc5.srt'.format(base))

    offsets = []
    pieces = []
    cursor = 0  # length of the transcript assembled so far
    for sub in subs:
        offsets.append((cursor, sub.start, sub.end))
        piece = sub.text.replace('\n', ' ').replace('>', ' ') + ' '
        pieces.append(piece)
        cursor += len(piece)

    transcript = ''.join(pieces)
    transcripts.append((video, transcript, offsets))

In [3]:
# Wrap every transcript in a textacy Doc (this implicitly runs the NLP
# pipeline) and carry the video path and caption offsets along as metadata.
docs = []
for (video, transcript, offsets) in transcripts:
    doc_metadata = {'path': video.path, 'offsets': offsets}
    docs.append(textacy.doc.Doc(transcript, lang=u'en', metadata=doc_metadata))

# Collect all of the documents into one corpus.
corpus = textacy.Corpus(u'en', docs=docs)

DEBUG 2017-12-08 15:24:56,616 cache.py:088] Loading "en" spaCy pipeline


In [4]:
# Helper functions
def token_time(doc, tok):
    """Return the (start, end) times of the caption that contains `tok`.

    `doc.metadata['offsets']` is a list of (char_offset, start, end) tuples,
    one per caption, where char_offset is the position in the transcript at
    which that caption's text begins. The entries are expected to be in
    ascending char_offset order (they are appended as the transcript grows).

    The caption containing the token is the last one whose starting offset
    is less than or equal to the token's character index (`tok.idx`).

    Raises:
        Exception: if no caption starts at or before the token (for example
            when the offsets list is empty).
    """
    tok_offset = tok.idx
    match = None
    for (offset, start, end) in doc.metadata['offsets']:
        if offset > tok_offset:
            # This caption begins after the token, so the previous one
            # (already stored in `match`) is the caption containing it.
            break
        match = (start, end)
    if match is None:
        raise Exception("No token time?")
    return match
    
def compute_tfidf(corpus):
    """Compute TF-IDF weights for the words of a tokenized corpus.

    Args:
        corpus: iterable of documents, each document being an iterable of
            word strings (the input format of gensim.corpora.Dictionary).

    Returns:
        dict mapping each word to a TF-IDF weight. The dict is keyed by word
        only, so when a word occurs in several documents the weight from the
        last such document is the one kept.
    """
    # Materialize once: the corpus is walked twice (dictionary + bag-of-words),
    # which would silently yield nothing the second time for a generator.
    texts = [list(text) for text in corpus]
    gensim_dict = gensim.corpora.Dictionary(texts)
    gensim_corpus = [gensim_dict.doc2bow(text) for text in texts]
    tfidf_model = gensim.models.tfidfmodel.TfidfModel(gensim_corpus, normalize=True)
    return {
        gensim_dict.get(token_id): value
        for doc in tfidf_model[gensim_corpus]
        for token_id, value in doc
    }

In [5]:
# Load the pretrained GoogleNews word2vec vectors (binary word2vec format)
model = gensim.models.KeyedVectors.load_word2vec_format(
    '/app/deps/word2vec/GoogleNews-vectors-negative300.bin',
    binary=True,
)

def sim_score(a, b):
    """Return the word2vec similarity between words `a` and `b`.

    Returns 0 when the lookup raises KeyError, i.e. when the model has no
    vector for one of the words.
    """
    try:
        # `model` is already a KeyedVectors instance, so query it directly;
        # the extra `.wv` hop is not available on KeyedVectors in gensim 4.
        return model.similarity(a, b)
    except KeyError:
        return 0

INFO 2017-12-08 15:28:06,556 keyedvectors.py:201] loading projection weights from /app/deps/word2vec/GoogleNews-vectors-negative300.bin
DEBUG 2017-12-08 15:28:06,557 smart_open_lib.py:149] {'kw': {}, 'mode': 'rb', 'uri': '/app/deps/word2vec/GoogleNews-vectors-negative300.bin'}
DEBUG 2017-12-08 15:28:06,578 smart_open_lib.py:621] encoding_wrapper: {'errors': 'strict', 'mode': 'rb', 'fileobj': <open file '/app/deps/word2vec/GoogleNews-vectors-negative300.bin', mode 'rb' at 0x7f0a01b76150>, 'encoding': None}
INFO 2017-12-08 15:29:04,257 keyedvectors.py:263] loaded (3000000, 300) matrix from /app/deps/word2vec/GoogleNews-vectors-negative300.bin


In [10]:
# Sanity check of the loaded vectors; query the KeyedVectors object directly
# (KeyedVectors has no `.wv` attribute in gensim 4).
model.similarity('economy', 'jobs')

0.40456532973171394

In [6]:
# TODO: incorporate TFIDF segments

# Topic words each transcript is scanned for
topics = ['immigration', 'china', 'syria', 'terrorism', 'economy', 'election']

# One list per document holding its noun / proper-noun tokens (numbers filtered out)
transcript_tokens = []
for doc in corpus.docs:
    doc_words = textacy.extract.words(doc, filter_nums=True, include_pos=['PROPN', 'NOUN'])
    transcript_tokens.append(list(doc_words))

N = len(topics)
# Plotting scaffolding, currently disabled:
#fig, ax = plt.subplots(2, N / 2, figsize=(18, 16))
#ax = ax.flatten()

# Tuning knobs read by topic_peaks
W = 30                       # sliding window width, in tokens
MENTION_THRESHOLD = 3        # on-topic tokens a window needs to count as active
PEAK_DURATION_THRESHOLD = 5  # a run of active windows must be longer than this

def topic_peaks(topic, tokens):
    """Find stretches of a token sequence that repeatedly mention `topic`.

    Each token is marked on-topic when `sim_score` of its lowercased text and
    `topic` exceeds 0.6. A window of W tokens is slid over those marks and a
    window is "active" when it holds at least MENTION_THRESHOLD on-topic
    tokens. Every run of consecutive active windows longer than
    PEAK_DURATION_THRESHOLD is reported as a peak.

    Args:
        topic: topic word to compare tokens against.
        tokens: indexable sequence of tokens exposing a `.text` attribute.

    Returns:
        list of (token, peak_duration) tuples, where `token` is the token at
        the midpoint of the run (shifted by W, the index the first window
        starts at) and `peak_duration` is the number of windows in the run.
    """
    scores = np.array([sim_score(token.text.lower(), topic) for token in tokens])
    scores = scores > 0.6
    sums = np.array([np.sum(scores[j:j+W]) for j in range(W, len(scores) - W)]) >= MENTION_THRESHOLD

    start = None
    peaks = []
    # A trailing False sentinel closes a run that is still open when `sums`
    # ends; without it a peak reaching the end of the transcript is lost.
    for j, active in enumerate(list(sums) + [False]):
        if active and start is None:
            start = j
        elif not active and start is not None:
            # Floor division keeps the index an int under true division too.
            idx = (start + j) // 2 + W
            peak_duration = j - start
            if peak_duration > PEAK_DURATION_THRESHOLD:
                peaks.append((tokens[idx], peak_duration))
            start = None

    #ax[i].set_ylim([0, 1])
    #ax[i].set_title(topic)
    #ax[i].plot(sums)

    return peaks


# Build (without saving) one TopicTrack around every topic peak found in
# every document; the bulk insert at the bottom is left disabled.
topic_labeler, _ = Labeler.objects.get_or_create(name='word2vec')
topic_tracks = []
for tokens, doc in zip(transcript_tokens, corpus.docs):
    video_path = doc.metadata['path']
    print(video_path)
    video = Video.objects.get(path=video_path)
    for i, topic in enumerate(topics):
        peaks = topic_peaks(topic, tokens)
        print(topic, peaks)
        for (peak_token, _) in peaks:
            # Only the caption's start time is used; convert it to whole seconds.
            caption_start = token_time(doc, peak_token)[0].to_time()
            seconds = caption_start.hour * 3600 + caption_start.minute * 60 + caption_start.second

            # Naively assume 4 minute segment size
            min_time = max(seconds - 2 * 60, 0)
            max_time = min(seconds + 2 * 60, int(video.num_frames / video.fps))

            topic_model, _ = Topic.objects.get_or_create(name=topic)
            topic_tracks.append(TopicTrack(
                video=video,
                min_frame=min_time*video.fps,
                max_frame=max_time*video.fps,
                topic=topic_model,
                labeler=topic_labeler))
#_ = TopicTrack.objects.bulk_create(topic_tracks)

tvnews/videos/CNN_20120429_170000_Fareed_Zakaria_GPS.mp4
('immigration', [(UNITED, 30)])
('china', [(DETAINED, 14), (DICTATORSHIP, 23)])
('syria', [(SECURITY, 25)])
('terrorism', [])
('economy', [(BUDGET, 30)])
('election', [])
tvnews/videos/CNN_20121102_200000_The_Situation_Room.mp4
('immigration', [])
('china', [])
('syria', [(VIEW, 30)])
('terrorism', [])
('economy', [(UNEMPLOYMENT, 27)])
('election', [(PRESIDENT, 30), (WORRIES, 14), (DAY, 25), (SMALL, 18), (JOBS, 10)])
tvnews/videos/CNNW_20120809_200000_The_Situation_Room.mp4
('immigration', [(PROGRAMS, 20)])
('china', [])
('syria', [(LEE, 15), (FIGHT, 6), (SERIOUS, 29)])
('terrorism', [])
('economy', [(PRESIDENT, 24)])
('election', [(OBAMA, 9), (JOURNAL, 9)])
tvnews/videos/FOXNEWSW_20160401_190000_Shepard_Smith_Reporting.mp4
('immigration', [])
('china', [])
('syria', [(ISIS, 8)])
('terrorism', [])
('economy', [])
('election', [(CAMPAIGN, 8)])
tvnews/videos/FOXNEWSW_20120421_200000_Americas_News_Headquarters.mp4
('immigration', [(

('syria', [])
('terrorism', [])
('economy', [])
('election', [(LEADER, 29)])
tvnews/videos/FOXNEWSW_20160527_210000_The_Five.mp4
('immigration', [])
('china', [(WHO, 30)])
('syria', [])
('terrorism', [])
('economy', [])
('election', [])
tvnews/videos/FOXNEWSW_20120623_170000_Americas_News_Headquarters.mp4
('immigration', [(FACT, 9), (ECONOMY, 21)])
('china', [])
('syria', [(YESTERDAY, 23), (MUJAHEDDIN, 6), (POWERS, 37)])
('terrorism', [])
('economy', [(WE'VE, 30)])
('election', [])
tvnews/videos/FOXNEWSW_20160601_190000_Shepard_Smith_Reporting.mp4
('immigration', [])
('china', [])
('syria', [])
('terrorism', [])
('economy', [(PLACE, 19)])
('election', [(B, 20)])
tvnews/videos/FOXNEWSW_20160603_180000_The_Real_Story_With_Gretchen_Carlson.mp4
('immigration', [(POLL, 7), (PREVIOUS, 9), (ISSUE, 19)])
('china', [])
('syria', [])
('terrorism', [])
('economy', [(GDP, 28), (JOBS, 13)])
('election', [])
tvnews/videos/FOXNEWSW_20120312_210000_The_Five.mp4
('immigration', [])
('china', [])
('syri

KeyboardInterrupt: 