In [34]:
import os
import re
import time

import lda
import nltk
import numpy as np
from datetime import datetime, timedelta
from nltk.tokenize import word_tokenize
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from scipy.sparse import dok_matrix
import matplotlib.pyplot as plt

In [2]:
# Connection string for the super-glue MongoDB instance. The host used to be
# hard-coded; it can now be overridden with the SUPER_GLUE_MONGO_URL
# environment variable, and falls back to the original value when unset.
MONGO_URL = os.environ.get('SUPER_GLUE_MONGO_URL',
                           'mongodb://um.media.mit.edu:27017/super-glue')
# get_default_database() selects the database named in the URL itself.
db = MongoClient(MONGO_URL).get_default_database()

In [3]:
# Load the shared NLP resources from the first document of the 'nlp_data'
# collection.
nlp_data = db['nlp_data'].find_one()
if nlp_data is None:
    # find_one() returns None when the collection is empty; fail here with a
    # clear message instead of an opaque "NoneType is not subscriptable".
    raise RuntimeError("'nlp_data' collection is empty: cannot load stopwords/vocabulary")
stopwords = nlp_data["stopwords"]
# vocab is indexed by position: get_topic_words and run_lda use it to map
# column indices of the document-term matrix back to words.
vocab = nlp_data["vocab_non_stemmed"]
# NOTE(review): presumably a word -> index mapping for vocab; it is not used
# in the visible cells, so confirm against its producer.
vocab_dict = nlp_data["vocab_non_stemmed_dict"]

In [29]:
# Durations expressed in milliseconds (same unit as millis()).
HOUR = 60 * 60 * 1000
DAY = 24 * HOUR

def millis():
    """Return the current Unix time as a whole number of milliseconds."""
    now_in_seconds = time.time()
    return int(round(now_in_seconds * 1000))
def millis_since(num_days='2'):
    """Return the millisecond timestamp of a point some days in the past.

    The number of days is read from the TIME_FRAME_DAYS environment variable;
    ``num_days`` is only used as the fallback when that variable is not set.
    Either value is passed through ``int()``.
    """
    configured_days = os.environ.get('TIME_FRAME_DAYS', num_days)
    offset = int(configured_days) * DAY
    return millis() - offset
def millis_since_hours(hours):
    """Return the millisecond timestamp of the moment ``hours`` hours ago."""
    now = millis()
    return now - hours * HOUR

def get_vectors(media, min_words=3, min_length=4000):
    """Collect the usable story segments of one media document.

    A segment is kept only when it carries a "word_count" vector with more
    than ``min_words`` distinct entries and lasts longer than ``min_length``
    (same unit as the segment "start"/"end" values, which are divided by
    1000 to build the ``#t=`` part of the URL).

    Args:
        media: media document (dict) with "story_segments", "channel",
            "date_added", "_id" and the media URL fields.
        min_words: a segment needs strictly more distinct words than this.
        min_length: a segment needs to be strictly longer than this.

    Returns:
        A list of dicts, one per kept segment, with the keys "start", "end",
        "url", "channel", "length", "date", "thumbnail", "media_id",
        "segment_index" and "vector".
    """
    # Use the commercial-free rendition only when the commercial-skip module
    # reports that commercials were removed. A document without that report
    # falls back to the plain URL instead of raising KeyError, so one odd
    # document cannot abort a whole batch.
    module_reports = media.get("module_reports") or {}
    skip_report = module_reports.get("commercial_skip_module") or {}
    if skip_report.get("removed_commercials"):
        media_url = "media_url_no_comm"
    else:
        media_url = "media_url"

    segs_vectors = []
    for index, seg in enumerate(media["story_segments"]):
        if "word_count" not in seg:
            continue
        vector = seg["word_count"]
        start = seg["start"]
        end = seg["end"]
        length = float(end) - float(start)
        # Skip segments that are too sparse or too short to be meaningful.
        if not (len(vector) > min_words and length > min_length):
            continue
        segs_vectors.append({
            "start": start,
            "end": end,
            "url": "%s#t=%.2f,%.2f" % (media[media_url], start / 1000.0, end / 1000.0),
            "channel": media["channel"],
            "length": length,
            "date": media["date_added"],
            # Placeholder image when the segment has no thumbnail of its own.
            "thumbnail": seg.get("thumbnail_image", "/static/images/blank.jpg"),
            "media_id": str(media["_id"]),
            "segment_index": index,
            "vector": vector})
    return segs_vectors

def get_all_segments(since_hour):
    """Fetch the usable story segments of news media in a 24-hour window.

    The window covers media whose "date_added" lies between
    ``since_hour + 24`` and ``since_hour`` hours before now (both exclusive).
    Only documents flagged "is_news" that have "story_segments" are read.

    Args:
        since_hour: how many hours before now the window ends.

    Returns:
        A flat list of the segment dicts produced by get_vectors for every
        matching media document.
    """
    query = {
        "date_added": {"$gt": millis_since_hours(since_hour+24), "$lt": millis_since_hours(since_hour)},
        "story_segments": {"$exists": True},
        "is_news": {"$eq": True},
    }
    # Count while iterating instead of calling Cursor.count(): that method is
    # deprecated in recent PyMongo releases and costs a second query whose
    # result may not match what the cursor actually yields.
    num_of_videos = 0
    all_segments = []
    for media in db['media'].find(query):
        num_of_videos += 1
        all_segments.extend(get_vectors(media))
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("%d videos" % num_of_videos)
    print("%d total segments" % len(all_segments))
    return all_segments

In [30]:
def get_topic_words(topic_word, n_top_words):
    """Summarise each topic as its highest-weighted vocabulary words.

    Args:
        topic_word: iterable of per-topic weight rows, one weight per entry
            of the module-level ``vocab`` (run_lda passes
            ``lda_model.topic_word_``).
        n_top_words: number of words to keep per topic.

    Returns:
        A list with one string per topic: the ``n_top_words`` heaviest words,
        heaviest first, joined with '-'.
    """
    # Convert the vocabulary once rather than once per topic.
    vocab_array = np.array(vocab)
    topic_summaries = []
    for topic_dist in topic_word:
        # argsort is ascending, so read it backwards to get heaviest first.
        top_indices = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        topic_summaries.append('-'.join(vocab_array[top_indices]))
    return topic_summaries

def run_lda(all_segments):
    """Cluster story segments into topics with LDA and describe each topic.

    Each segment's "vector" (index -> count mapping over ``vocab``) becomes
    one row of a sparse document-term matrix. The "vector" key is removed
    from every segment in the process, so the returned segments do not carry
    it. Every segment is assigned to its most probable topic.

    Args:
        all_segments: list of segment dicts as produced by get_vectors.
            The dicts are modified in place (their "vector" key is popped).

    Returns:
        ``{'all_clusters': [...]}`` with one dict per topic holding:
        'id' (topic index), 'summary' ('-'-joined top words), 'value'
        (number of segments), 'segments' (list of
        ``{"channel", "videos"}`` groups, largest group first, videos sorted
        by "date"), 'words' (``{"text", "size"}`` per top word, size being
        the word's total count over all segments) and 'ratio' (share of all
        segments). An empty input yields ``{'all_clusters': []}``.
    """
    n_segments = len(all_segments)
    if n_segments == 0:
        # Nothing to cluster; the LDA fit cannot run on an empty matrix.
        return {'all_clusters': []}

    # Sparse document-term matrix: one row per segment, one column per word.
    vectors = dok_matrix((n_segments, len(vocab)), dtype=np.int32)
    for row, seg in enumerate(all_segments):
        # pop: the raw vector must not be stored along with the results.
        for ind, count in seg.pop("vector").items():
            vectors[row, int(ind)] = int(count)

    # Roughly one topic per nine segments. Floor division keeps this an int
    # on Python 3 as well, and at least one topic is always requested so a
    # small batch does not ask the model for zero topics.
    n_topics = max(1, n_segments // 9)
    n_iter = 1000
    lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
    lda_model.fit_transform(vectors)

    n_words_long = 40
    long_topic_summaries = get_topic_words(lda_model.topic_word_, n_words_long)
    # Same ranking as get_topic_words, but kept as vocabulary indices. The
    # words are NOT recovered by splitting the '-'-joined summary: that breaks
    # for vocabulary entries that themselves contain '-' and needs a linear
    # vocab.index() scan per word.
    top_word_indices = [np.argsort(topic_dist)[:-(n_words_long + 1):-1]
                        for topic_dist in lda_model.topic_word_]

    # Assign every segment to its most probable topic.
    clusters = [[] for _ in range(n_topics)]
    for seg, topic_id in zip(all_segments, lda_model.doc_topic_.argmax(axis=1)):
        clusters[topic_id].append(seg)

    print("")
    # Total count of every vocabulary word over all segments.
    word_totals = np.asarray(vectors.sum(axis=0)).ravel()

    long_results = []
    for topic_id, summary in enumerate(long_topic_summaries):
        cluster = clusters[topic_id]

        # Group the cluster's segments by channel, keeping cluster order.
        videos_by_channel = {}
        for seg in cluster:
            videos_by_channel.setdefault(seg["channel"], []).append(seg)
        segs_by_channel = [
            {"channel": channel, "videos": sorted(videos, key=lambda x: x["date"])}
            for channel, videos in videos_by_channel.items()]
        sorted_segs_by_channel = sorted(
            segs_by_channel, key=lambda x: len(x["videos"]), reverse=True)

        long_results.append({
            'id': topic_id,
            'summary': summary,
            'value': len(cluster),
            'segments': sorted_segs_by_channel,
            # int(): plain Python ints, so the document can be stored
            # regardless of how the driver treats numpy scalar types.
            'words': [{"text": vocab[int(j)], "size": int(word_totals[j])}
                      for j in top_word_indices[topic_id]],
            'ratio': float(len(cluster)) / n_segments,
        })
    return {'all_clusters': long_results}
    

In [43]:
# Destination collection for the computed cluster snapshots. Reuse the client
# that already backs `db` instead of opening a second MongoClient (and a
# second connection pool) for the same MONGO_URL.
clusters_collection = db.client['perspectives']['clusters_full']
def add_data(since_hour):
    """Cluster the 24-hour window ending ``since_hour`` hours ago and store it.

    The result of run_lda is tagged with a "timestamp" (Unix time in whole
    seconds of the moment ``since_hour`` hours before now, derived from local
    time via time.mktime) and inserted into ``clusters_collection``.

    Args:
        since_hour: how many hours before now the analysed window ends.
    """
    # Take the timestamp before fetching and fitting: the LDA run is slow,
    # and stamping afterwards would shift the stored time away from the end
    # of the queried window by however long the fit took.
    window_end = datetime.now() - timedelta(hours=since_hour)
    timestamp = int(time.mktime(window_end.timetuple()))

    clus = run_lda(get_all_segments(since_hour))
    clus["timestamp"] = timestamp
    clusters_collection.insert_one(clus)
    print("added %d" % since_hour)

In [48]:
# Back-fill cluster snapshots, walking towards the present in 3-hour steps
# for as long as the window end is more than 3 hours ago.
# NOTE(review): the name says "one month ago" (and is misspelled), but the
# value is 10 hours, so this run covers since_hour = 10, 7 and 4 only.
hours_one_moth_ago = 10
since_hour = hours_one_moth_ago
while True:
    if since_hour <= 3:
        break
    add_data(since_hour)
    since_hour = since_hour - 3

114 videos
480 total segments


INFO:lda:n_documents: 480
INFO:lda:vocab_size: 33701
INFO:lda:n_words: 95083
INFO:lda:n_topics: 53
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -1320262
INFO:lda:<10> log likelihood: -894486
INFO:lda:<20> log likelihood: -860731
INFO:lda:<30> log likelihood: -846019
INFO:lda:<40> log likelihood: -837625
INFO:lda:<50> log likelihood: -832141
INFO:lda:<60> log likelihood: -827858
INFO:lda:<70> log likelihood: -823814
INFO:lda:<80> log likelihood: -819879
INFO:lda:<90> log likelihood: -817217
INFO:lda:<100> log likelihood: -814887
INFO:lda:<110> log likelihood: -812190
INFO:lda:<120> log likelihood: -809619
INFO:lda:<130> log likelihood: -808003
INFO:lda:<140> log likelihood: -806727
INFO:lda:<150> log likelihood: -804664
INFO:lda:<160> log likelihood: -803239
INFO:lda:<170> log likelihood: -802713
INFO:lda:<180> log likelihood: -801719
INFO:lda:<190> log likelihood: -800825
INFO:lda:<200> log likelihood: -798737
INFO:lda:<210> log likelihood: -798626
INFO:lda:<220> log likelihood: 


added 10
114 videos
546 total segments


INFO:lda:n_documents: 546
INFO:lda:vocab_size: 33701
INFO:lda:n_words: 97032
INFO:lda:n_topics: 60
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -1371573
INFO:lda:<10> log likelihood: -924649
INFO:lda:<20> log likelihood: -888586
INFO:lda:<30> log likelihood: -872632
INFO:lda:<40> log likelihood: -861765
INFO:lda:<50> log likelihood: -853767
INFO:lda:<60> log likelihood: -848213
INFO:lda:<70> log likelihood: -843073
INFO:lda:<80> log likelihood: -839534
INFO:lda:<90> log likelihood: -836184
INFO:lda:<100> log likelihood: -833045
INFO:lda:<110> log likelihood: -830489
INFO:lda:<120> log likelihood: -829858
INFO:lda:<130> log likelihood: -827466
INFO:lda:<140> log likelihood: -825305
INFO:lda:<150> log likelihood: -825153
INFO:lda:<160> log likelihood: -823378
INFO:lda:<170> log likelihood: -822588
INFO:lda:<180> log likelihood: -821560
INFO:lda:<190> log likelihood: -820270
INFO:lda:<200> log likelihood: -819028
INFO:lda:<210> log likelihood: -818484
INFO:lda:<220> log likelihood: 


added 7
118 videos
556 total segments


INFO:lda:n_documents: 556
INFO:lda:vocab_size: 33701
INFO:lda:n_words: 98408
INFO:lda:n_topics: 61
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -1393870
INFO:lda:<10> log likelihood: -936970
INFO:lda:<20> log likelihood: -901481
INFO:lda:<30> log likelihood: -886079
INFO:lda:<40> log likelihood: -877012
INFO:lda:<50> log likelihood: -870243
INFO:lda:<60> log likelihood: -864402
INFO:lda:<70> log likelihood: -859475
INFO:lda:<80> log likelihood: -855406
INFO:lda:<90> log likelihood: -851917
INFO:lda:<100> log likelihood: -849299
INFO:lda:<110> log likelihood: -846391
INFO:lda:<120> log likelihood: -845988
INFO:lda:<130> log likelihood: -843165
INFO:lda:<140> log likelihood: -841082
INFO:lda:<150> log likelihood: -839805
INFO:lda:<160> log likelihood: -837797
INFO:lda:<170> log likelihood: -837657
INFO:lda:<180> log likelihood: -836192
INFO:lda:<190> log likelihood: -834799
INFO:lda:<200> log likelihood: -833872
INFO:lda:<210> log likelihood: -833243
INFO:lda:<220> log likelihood: 


added 4
