In [11]:
from __future__ import print_function

import os
from time import time

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [12]:
# Topic-model settings: n_topics sizes both models fitted further down,
# n_top_words is how many terms print_top_words lists per topic.
n_top_words = 10
n_topics = 50

# Data limits: n_samples slices the documents that get vectorized,
# n_features is handed to the vectorizers as max_features.
n_features = 1000
n_samples = 2000

In [13]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every row in ``model.components_``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``; each row is iterated as one topic
        and must support ``argsort()``.
    feature_names : sequence
        Term labels, indexed by the positions returned from ``argsort()``.
    n_top_words : int
        Number of terms listed per topic, largest weight first.

    A ``Topic #<index>:`` header and one space-joined line of terms are
    printed per topic, followed by a single blank line after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending: reverse it, then keep the leading entries.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_idx] for term_idx in strongest_first]
        print(" ".join(top_terms))
    print()


In [14]:
# Location of the exported tweet table. The original machine-specific path
# remains the default so existing runs are unchanged; set the
# TWITTER_DATA_CSV environment variable to run the notebook elsewhere.
data_path = os.environ.get("TWITTER_DATA_CSV",
                           "/Users/rituc/twitter_data/stream_table1.csv")
data = pd.read_csv(data_path)

In [15]:
# Load the 20 newsgroups corpus, keep its first n_samples posts, and report
# how long loading took.
# NOTE(review): data_samples bound here is reassigned from the tweet CSV in
# the following cell before any vectorizer reads it -- confirm this fetch is
# still needed.
print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    random_state=1,
    shuffle=True,
)
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 3.351s.


In [16]:
# Documents to model: the "Tweet content" column restricted to the first
# n_samples rows, materialized as a plain Python list.
data_samples = list(data["Tweet content"][:n_samples])

In [17]:
# Preview only the first few tweets: a bare `data_samples` echoes every
# entry of the list into the saved notebook output.
data_samples[:10]

['Wind 3.2 mph NNE. Barometer 30.20 in, Rising slowly. Temperature 49.3 \xc2\xb0F. Rain today 0.00 in. Humidity 32%',
 'Pausa pro caf\xc3\xa9 antes de embarcar no pr\xc3\xb3ximo v\xc3\xb4o. #trippolisontheroad #danipolisviaja \n\nPause for\xe2\x80\xa6 https://t.co/PhcJ4oYktP',
 'Good. Morning. #morning #Saturday #diner #VT #breakfast #nucorpsofcadetsring #ring #college\xe2\x80\xa6 https://t.co/dBZ7dbwX6f',
 '@gratefuldead recordstoredayus \xf0\x9f\x8c\xb9\xf0\x9f\x8c\xb9\xf0\x9f\x8c\xb9 @ TOMS MUSIC TRADE https://t.co/CURRmn6iJo',
 'Egg in a muffin!!! (@ Rocket Baby Bakery - @rocketbabybaker in Wauwatosa, WI) https://t.co/mwfhrcxtRp',
 "@lyricwaters should've gave the neighbor  a buzz. Iv got ice cream and moms baked goodies ",
 'On the way to CT! (@ Mamaroneck, NY in Mamaroneck, NY) https://t.co/6rpe6MXDkB',
 "We're #hiring! Read about our latest #job opening here: Retail Sales Consultant [CWA MOB] Bryn Mawr PA - https://t.co/bBwxSPsL4f #Retail",
 'Me... @ Montgomery Scrap Corporation

In [18]:
# Build the tf-idf vectorizer whose features feed the NMF model. It is
# configured with document-frequency bounds (max_df / min_df), a
# max_features cap of n_features, and the built-in 'english' stop-word list.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)

Extracting tf-idf features for NMF...


In [19]:
# Fit the tf-idf vectorizer on data_samples, keep the transformed result in
# `tfidf` (consumed by the NMF fit), and print the elapsed wall-clock time.
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


done in 0.122s.


In [20]:
# Raw term counts (tf) are the input for LDA. The count vectorizer gets the
# same settings as the tf-idf one: max_df / min_df bounds, a max_features
# cap of n_features, and the 'english' stop-word list.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
# Only the fit/transform step is timed, not the constructor above.
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf features for LDA...
done in 0.107s.


In [21]:
# Fit the NMF model on the tf-idf matrix and report the elapsed time.
# The banner prints the configured n_samples / n_features values.
# NOTE(review): newer scikit-learn releases replace NMF's `alpha` with
# `alpha_W` / `alpha_H`, which appear to be scaled differently -- confirm
# against the installed version before upgrading rather than swapping the
# keyword mechanically.
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(
    n_components=n_topics,
    l1_ratio=.5,
    alpha=.1,
    random_state=1,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# Header for the topic listing printed by the next cell.
print("\nTopics in NMF model:")

Fitting the NMF model with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.914s.

Topics in NMF model:


In [22]:
# Vocabulary lookup for the NMF topics. Newer scikit-learn releases provide
# get_feature_names_out() and no longer ship get_feature_names(), so prefer
# the new accessor and fall back to the old one on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0:
https en ca park run california hill way beach best
Topic #1:
jobs careerarc hiring job https hospitality alert businessmgmt engineering shift
Topic #2:
great fit interested near job hiring https careerarc columbus manufacturing
Topic #3:
opening latest read hiring job view https technician management maintenance
Topic #4:
work want details view click jobs opening hiring job latest
Topic #5:
retail store positions associate maurices consultant cashier crew careerarc leader
Topic #6:
morning good https early charlotte god like doing says airport
Topic #7:
nurse rn registered nursing practical licensed lpn staffing intensive accountable
Topic #8:
just posted photo https california francisco country san video monkey
Topic #9:
day https game record lol way rsd16 gym eat stadium
Topic #10:
apply click latest job hiring https details engineer manufacturing operator
Topic #11:
recommend job hiring careerarc https nursing manufacturing ky louisville customerservice
Topic #12:
wind ra

In [23]:
# Announce and configure (but do not yet fit) the LDA model.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# The topic count is passed positionally on purpose: its keyword was renamed
# from `n_topics` to `n_components` in later scikit-learn releases (and
# `n_topics` was eventually removed), while it has remained the first
# constructor argument, so this form works on both old and new versions.
lda = LatentDirichletAllocation(n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...


In [24]:
# Fit the LDA model on the raw term-count matrix `tf` and print the elapsed
# wall-clock time.
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 2.761s.


In [25]:
print("\nTopics in LDA model:")
# Vocabulary lookup for the LDA topics. Newer scikit-learn releases provide
# get_feature_names_out() and no longer ship get_feature_names(), so prefer
# the new accessor and fall back to the old one on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
quality shopdreamcollection iamdreamhair virgin dreamhaircollection highest dixie stores tonight head
Topic #1:
careerarc job hiring jobs https sb engineering analyst ga south
Topic #2:
chicago grand morning https jobs ky officeteam managed massachusetts air
Topic #3:
https job hiring latest opening veterans hospitality jobs careerarc join
Topic #4:
world disney star https espn wide heart wars sports marathon
Topic #5:
wake trying на fayetteville nc practical job care licensed lpn
Topic #6:
ri newport hall museum international https road work latest opening
Topic #7:
children treats street mt support wv donnie nc tavern favorite
Topic #8:
https great hiring fit job interested near fl careerarc good
Topic #9:
que tus te el check like records mechanic surgical support
Topic #10:
pharmaceutical project nj manager clinical recommend job hiring https uniform
Topic #11:
resort palm island comes said https fl great job country
Topic #12:
orlando 20 16 st houseb