In [1]:
import os
import time
import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
import segments

In [2]:
DAY = 86400000
HOUR = 3600000
def millis():
    return int(round(time.time() * 1000))

def millis_since(num_days='2'):
    days = int(os.environ.get('TIME_FRAME_DAYS', num_days))
    return millis() - days*DAY

def get_all_segments():
    MONGO_URL = 'mongodb://um.media.mit.edu:27017/super-glue'
    collection = MongoClient(MONGO_URL).get_default_database()['media']

    #"date_added": {"$gt": millis_since('1')},
    all_media_has_segments = collection.find({"story_segments":{"$exists": True},"is_news":{"$eq": True}})
    num_of_videos = all_media_has_segments.count()
    print "%d videos"%num_of_videos
    # total_segments = 0
    all_segments = []
    for media in all_media_has_segments:
        segs = get_texts(media)
        all_segments.extend(segs)
    print "%d total segments"%len(all_segments)
    return all_segments

In [3]:
def get_texts(media):
    segs = media["story_segments"]
    # com_caps = media["commercials_captions"]
    texts = []
    file_name = lambda x:''.join(x.split('.')[4:])
    for i in range (len(segs)):
        text = ""
        if "text" in segs[i]:
            text = segs[i]["text"]
        length = float(segs[i]["end"])-float(segs[i]["start"])
        if len(text.strip())>200 and length>4000:
            texts.append({
                "text":text})
    return texts

In [17]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer

def process_texts(all_segments):
    stemmer = SnowballStemmer("english")
    print ("starting to precess texts")
    # load nltk's English stopwords as variable called 'stopwords'
    stopwords = nltk.corpus.stopwords.words('english')
    print ("got stopwords!")
    pattern = re.compile('[\W_]+')
    #nouns = ['NN', 'NNS', 'NNPS', 'NNP', 'JJ']
    for seg in all_segments:
        parts = [pattern.sub('', token).lower() for token in word_tokenize(seg["text"])]
        clean = [i for i in parts if i not in stopwords and len(re.findall('[a-zA-Z]', i))>2]
        seg["processed"] = clean

    seg_texts = [seg["text"] for seg in all_segments] # list of all tweet texts
    seg_texts_processed = [str.join(" ", seg["processed"]) for seg in all_segments] # list of pre-processed tweet texts
    for seg in all_segments:
        seg.pop("processed")
    return seg_texts_processed

In [51]:
all_segments = get_all_segments()
processed_segments = process_texts(all_segments)
stopwords = nltk.corpus.stopwords.words('english')
cvectorizer = CountVectorizer(max_df=1.0, min_df=1, max_features=150000, stop_words=stopwords, ngram_range=(1,1))
cvz = cvectorizer.fit(processed_segments)

7644 videos
25237 total segments
starting to precess texts
got stopwords!


In [52]:
vocab_arr = cvectorizer.get_feature_names()
vocab_dict = cvectorizer.vocabulary_

In [53]:
print len(vocab_arr)

78732


In [20]:
# save vocabulary to files
import json
with open('non_stemmed_vocab_arr.json', 'w') as outfile:
    json.dump({"vocab":no_stemmed_vocab_arr}, outfile)
with open('non_stemmed_vocab_dict.json', 'w') as outfile:
    json.dump(no_stemmed_vocab_dict, outfile)

In [None]:
# save stopwords
with open('stopwords.json', 'w') as outfile:
    json.dump({"stopwords":stopwords}, outfile)

In [21]:
test = cvectorizer.transform(["hello my name is jasmin im 12 years old trump man city caesarea isreal trump trump cat jasmin"])

print test
print "---------"
arr = test.toarray()
print {i:arr[0][i] for i in np.nonzero(arr[0])[0]}

In [23]:
with open('vocab_arr.json', 'r') as outfile:
    vocab_stemmed = json.load(outfile)["vocab"]
with open('vocab_dict.json', 'r') as outfile:
    vocab_dict_stemmed = json.load(outfile)
with open('non_stemmed_vocab_arr.json', 'r') as outfile:
    vocab_non_stemmed = json.load(outfile)["vocab"]
with open('non_stemmed_vocab_dict.json', 'r') as outfile:
    vocab_dict_non_stemmed = json.load(outfile)
with open('stopwords.json', 'r') as outfile:
    stopwords_to_save = json.load (outfile)["stopwords"]

In [50]:
# Don't run this!! 
# should only be run once to add vocabulary and stopwords to UM mongo

# MONGO_URL = 'mongodb://um.media.mit.edu:27017/super-glue'
MONGO_URL = 'mongodb://jasmin:viral@ds019058.mlab.com:19058/super-glue-debug'
collection = MongoClient(MONGO_URL).get_default_database()['nlp_data']
collection.insert_one({
        "vocab_stemmed":vocab_stemmed,
        "vocab_stemmed_dict":vocab_dict_stemmed,
        "vocab_non_stemmed": vocab_non_stemmed,
        "vocab_non_stemmed_dict": vocab_dict_non_stemmed,
        "stopwords":stopwords_to_save
    })

<pymongo.results.InsertOneResult at 0x11acbe690>