In [23]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pandas as pd


In [2]:
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

In [24]:
dataframes = {
    "cooking": pd.read_csv("/home/neo/ml1/data/light/cooking_light.csv"),
    "crypto": pd.read_csv("/home/neo/ml1/data/light/crypto_light.csv"),
    "robotics": pd.read_csv("/home/neo/ml1/data/light/robotics_light.csv"),
    "biology": pd.read_csv("/home/neo/ml1/data/light/biology_light.csv"),
    "travel": pd.read_csv("/home/neo/ml1/data/light/travel_light.csv"),
    "diy": pd.read_csv("/home/neo/ml1/data/light/diy_light.csv"),
    "physics": pd.read_csv("/home/neo/ml1/data/light/physics_light.csv"),
}

In [25]:
new_df = pd.DataFrame()
train_df = pd.DataFrame()
for df in dataframes:
    new_df = new_df.append(dataframes[df])
    new_df.fillna("NAN")
train_df["content"] = new_df["title"] +" "+ new_df["content"]
train_df = train_df.reset_index()

In [3]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [47]:
print("Loading dataset...")
t0 = time()
# dataset = fetch_20newsgroups(shuffle=True, random_state=1,remove=('headers', 'footers', 'quotes'))
dataset = train_df["content"]

Loading dataset...


In [49]:
print(dataset[:2])

0    chewy chocolate chip cookies chocolate chips c...
1    cook bacon oven heard people cooking bacon ove...
Name: content, dtype: object


In [57]:
data_samples = []
for row in train_df["content"]:
    data_samples.append(str(row))

In [58]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,stop_words='english')

Extracting tf-idf features for NMF...


In [59]:
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

done in 10.463s.


In [60]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,max_features=n_features,stop_words='english')

Extracting tf features for LDA...


In [61]:
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

done in 10.430s.



In [62]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...


In [63]:
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

done in 22.923s.


In [64]:
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (Frobenius norm):
Topic #0: equation quantum state theory function particle wave mechanics space question operator states physics understand particles spin momentum hamiltonian equations classical
Topic #1: visa passport uk schengen travel transit visit days citizen country apply entry airport stay flight tourist valid usa india travelling
Topic #2: water pressure hot temperature heater cold air pipe tank heat valve shower pump drain gas flow house sink liquid surface
Topic #3: light speed switch wave waves photons photon wavelength black faster lights source frequency travel fixture universe beam red laser visible
Topic #4: energy potential kinetic electron mass heat temperature photon electrons conservation work matter atom particle photons total particles radiation momentum universe
Topic #5: field magnetic electric charge current fields potential charges charged wire electrons conductor law electromagnetic surface magnet direction point flux sphere
Topic #6: wa

In [65]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=2000 and n_features=1000...


In [None]:
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

In [None]:
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

In [None]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

In [None]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [18]:
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 4.152s.


In [19]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1: don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2: christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3: drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4: hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5: god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6: 55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7: car year just cars new engine like bike good oil insurance better tires 000 thing speed model brake driving performance
Topic #8: people said