In [33]:
import os
import time
import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
import segments

In [34]:
DAY = 86400000
HOUR = 3600000
def millis():
    return int(round(time.time() * 1000))

def millis_since(num_days='2'):
    days = int(os.environ.get('TIME_FRAME_DAYS', num_days))
    return millis() - days*DAY

def get_all_segments():
    MONGO_URL = 'mongodb://um.media.mit.edu:27017/super-glue'
    collection = MongoClient(MONGO_URL).get_default_database()['media']

    #"date_added": {"$gt": millis_since('1')},
    all_media_has_segments = collection.find({"story_segments":{"$exists": True},"is_news":{"$eq": True}})
    num_of_videos = all_media_has_segments.count()
    print "%d videos"%num_of_videos
    # total_segments = 0
    all_segments = []
    for media in all_media_has_segments:
        segs = get_texts(media)
        all_segments.extend(segs)
    print "%d total segments"%len(all_segments)
    return all_segments

In [35]:
def get_texts(media):
    segs = media["story_segments"]
    # com_caps = media["commercials_captions"]
    texts = []
    file_name = lambda x:''.join(x.split('.')[4:])
    for i in range (len(segs)):
        text = ""
        if "text" in segs[i]:
            text = segs[i]["text"]
        length = float(segs[i]["end"])-float(segs[i]["start"])
        if len(text.strip())>200 and length>4000:
            texts.append({
                "text":text})
    return texts

In [75]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer

def process_texts(all_segments):
    stemmer = SnowballStemmer("english")
    print ("starting to precess texts")
    # load nltk's English stopwords as variable called 'stopwords'
    stopwords = nltk.corpus.stopwords.words('english')
    print ("got stopwords!")
    pattern = re.compile('[\W_]+')
    #nouns = ['NN', 'NNS', 'NNPS', 'NNP', 'JJ']
    for seg in all_segments:
        parts = [pattern.sub('', token).lower() for token in word_tokenize(seg["text"])]
        clean = [i for i in parts if i not in stopwords and len(re.findall('[a-zA-Z]', i))>2]
        seg["processed"] = clean

    seg_texts = [seg["text"] for seg in all_segments] # list of all tweet texts
    seg_texts_processed = [str.join(" ", seg["processed"]) for seg in all_segments] # list of pre-processed tweet texts
    for seg in all_segments:
        seg.pop("processed")
    return seg_texts_processed

In [76]:
all_segments = get_all_segments()
processed_segments = process_texts(all_segments)
stopwords = nltk.corpus.stopwords.words('english')
cvectorizer = CountVectorizer(max_df=1.0, min_df=3, max_features=150000, stop_words=stopwords, ngram_range=(1,1))
cvz = cvectorizer.fit(processed_segments)

7553 videos
24698 total segments
starting to precess texts
got stopwords!


In [77]:
non_stemmed_vocab_arr = cvectorizer.get_feature_names()
non_stemmed_vocab_dict = cvectorizer.vocabulary_

In [79]:
import json
with open('non_stemmed_vocab_arr.json', 'w') as outfile:
    json.dump({"vocab":non_stemmed_vocab_arr}, outfile)
with open('non_stemmed_vocab_dict.json', 'w') as outfile:
    json.dump(non_stemmed_vocab_dict, outfile)

In [78]:
print len(non_stemmed_vocab_arr)

33639


In [67]:
print vocab_arr[0:200]

[u'000', u'10', u'100', u'104', u'108', u'10ths', u'11', u'12', u'120', u'126', u'13', u'130', u'14', u'15', u'16', u'164', u'17', u'18', u'180', u'19', u'1980reagan', u'20', u'200', u'2006', u'21', u'22', u'23', u'24', u'25', u'26', u'27', u'270', u'28', u'29', u'30', u'300', u'31', u'32', u'33', u'34', u'35', u'350', u'36', u'38', u'39', u'40', u'400', u'41', u'42', u'43', u'44', u'45', u'46', u'47', u'48', u'49', u'49er', u'50', u'500', u'51', u'52', u'53', u'538', u'54', u'55', u'57', u'58', u'59', u'5ths', u'60', u'61', u'62', u'63', u'65', u'66', u'67', u'68', u'69', u'70', u'700', u'71', u'73', u'74', u'743', u'75', u'77', u'780', u'79', u'80', u'800', u'83', u'86', u'87', u'888', u'90', u'94', u'95', u'97', u'99', u'aaa', u'aagre', u'aall', u'aand', u'aandf', u'aar', u'aaron', u'aarp', u'aassum', u'aation', u'aattack', u'ab', u'aback', u'abadi', u'abandon', u'abat', u'abbey', u'abbi', u'abbott', u'abc', u'abd', u'abdean', u'abdic', u'abdomen', u'abduct', u'abdul', u'abe', u'abe

In [48]:
test = cvectorizer.transform(["hello my name is jasmin im 12 years old trump man city caesarea isreal trump trump cat jasmin"])

In [65]:
arr_test = test.toarray()
print arr_test
print arr_test.tocsr()

[[0 0 0 ..., 0 0 0]]


AttributeError: 'numpy.ndarray' object has no attribute 'tocsr'

years
