In [23]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pandas as pd


In [24]:
n_samples = 2000
n_features = 1000
n_components = 7
n_top_words = 1

In [25]:
dataframes = {
    "cooking": pd.read_csv("/home/neo/ml1/data/light/cooking_light.csv"),
    "crypto": pd.read_csv("/home/neo/ml1/data/light/crypto_light.csv"),
    "robotics": pd.read_csv("/home/neo/ml1/data/light/robotics_light.csv"),
    "biology": pd.read_csv("/home/neo/ml1/data/light/biology_light.csv"),
    "travel": pd.read_csv("/home/neo/ml1/data/light/travel_light.csv"),
    "diy": pd.read_csv("/home/neo/ml1/data/light/diy_light.csv"),
    "physics": pd.read_csv("/home/neo/ml1/data/light/physics_light.csv"),
}

In [26]:
new_df = pd.DataFrame()
train_df = pd.DataFrame()
for df in dataframes:
    new_df = new_df.append(dataframes[df])
    new_df.fillna("NAN")
train_df["content"] = new_df["title"] +" "+ new_df["content"]
train_df = train_df.reset_index()

In [27]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [28]:
print("Loading dataset...")
t0 = time()
# dataset = fetch_20newsgroups(shuffle=True, random_state=1,remove=('headers', 'footers', 'quotes'))
dataset = train_df["content"]

Loading dataset...


In [29]:
print(dataset[:2])

0    chewy chocolate chip cookies chocolate chips c...
1    cook bacon oven heard people cooking bacon ove...
Name: content, dtype: object


In [30]:
data_samples = []
for row in train_df["content"]:
    data_samples.append(str(row))

In [31]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,stop_words='english')

Extracting tf-idf features for NMF...


In [32]:
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

done in 10.883s.


In [33]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,max_features=n_features,stop_words='english')

Extracting tf features for LDA...


In [34]:
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

done in 10.937s.



In [35]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...


In [36]:
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

done in 15.731s.


In [37]:
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (Frobenius norm):
Topic #0: equation
Topic #1: visa
Topic #2: water
Topic #3: light
Topic #4: energy
Topic #5: field
Topic #6: key



In [38]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=2000 and n_features=1000...


In [39]:
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

done in 92.978s.


In [40]:
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: question
Topic #1: visa
Topic #2: water
Topic #3: time
Topic #4: light
Topic #5: field
Topic #6: work



In [41]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...


In [42]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [43]:
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 168.477s.


In [44]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: quantum
Topic #1: electric
Topic #2: field
Topic #3: mass
Topic #4: energy
Topic #5: equation
Topic #6: light

