## Website: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx),
              " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10

print('NMF')
display_topics(nmf, tfidf_feature_names, no_top_words)
print('\n')
print('LDA')
display_topics(lda, tf_feature_names, no_top_words)



NMF
Topic 0: don just people think like know time good right ve
Topic 1: card video monitor drivers cards bus vga driver color ram
Topic 2: god jesus bible christ faith believe christian christians church sin
Topic 3: game team year games season players play hockey win player
Topic 4: car new 00 sale 10 price offer condition shipping 20
Topic 5: thanks does know advance mail hi anybody info looking help
Topic 6: windows file use files dos window program using problem running
Topic 7: edu soon cs university com email internet article ftp send
Topic 8: key chip encryption clipper keys government escrow public use algorithm
Topic 9: drive scsi drives hard disk ide controller floppy cd mac


LDA
Topic 0: people gun armenian armenians war turkish states israel said children
Topic 1: government people law mr use president don think right public
Topic 2: space program output entry data nasa use science research build
Topic 3: key car chip used keys bike use bit clipper number
Topic 4: edu fil

## Bleach Report articles

In [26]:
import pickle, codecs, json

docs_raw_filename = "docs_raw_saturday2"
with open (docs_raw_filename, 'rb') as fp:
    raw_docs = pickle.load(fp)

docs_cleaned_filename = "docs_cleaned_saturday2"
with open (docs_cleaned_filename, 'rb') as fp:
    cleaned_docs = pickle.load(fp)

# titles_filename = 'article_dicts_saturday2'
# with open (titles_filename, 'rb') as fp:
#     titles = pickle.load(fp)
    
article_dicts_filename = "article_dicts_saturday2"
all_articles = []
with codecs.open(article_dicts_filename,'rU','utf-8') as f:
    for line in f:
        all_articles.append(json.loads(line))

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx),
              " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
# documents = dataset.data
documents = raw_docs

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10

print('NMF')
display_topics(nmf, tfidf_feature_names, no_top_words)
print('\n')
print('LDA')
display_topics(lda, tf_feature_names, no_top_words)



NMF
Topic 0: points game series celtics rebounds season games rockets assists jazz
Topic 1: nba triple mj 30 way best liangelo chanceraptors classmikal clowning
Topic 2: draft nfl browns lb landry football app wnba journey st
Topic 3: year season said team nba players league like knicks million
Topic 4: james cleveland cavaliers lebron cavs thomas lue trade love nance
Topic 5: warriors curry durant golden state kerr thompson green stephen klay
Topic 6: betting oddsshark odds cover picks games check nba point total
Topic 7: embiid philadelphia 76ers sixers heat miami fultz simmons joel game
Topic 8: tournament ncaa michigan villanova kansas duke state wildcats sg sf
Topic 9: leonard spurs antonio san popovich kawhi aldridge wright wojnarowski meeting


LDA
Topic 0: percent season points year game point shooting nba just ball
Topic 1: game games nba vs conference state warriors round series rockets
Topic 2: million free trade year season team lakers pick deal contract
Topic 3: draft tour

In [34]:
raw_nmf, raw_lda = nmf.transform(tfidf), lda.transform(tf) 
print(len(raw_nmf), len(raw_lda))

3316 3316


# 2-norm

In [81]:
from random import randint

nn_nmf, nn_lda = {}, {}

for _ in range(20):
    i = randint(0, 3315) # assume won't pick twice the same i, over the 20 iterations
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if sum((raw_nmf[i]-raw_nmf[j])**2) < sum((raw_nmf[i]-raw_nmf[n_nmf])**2) and sum((raw_nmf[i]-raw_nmf[j])**2) > 0:
            n_nmf = j
        if sum((raw_lda[i]-raw_lda[j])**2) < sum((raw_lda[i]-raw_lda[n_lda])**2) and sum((raw_lda[i]-raw_lda[j])**2) > 0:
            n_lda = j
    nn_nmf[i], nn_lda[i] = n_nmf, n_lda

print('Check 20:', len(nn_nmf.keys())==20, '\n')

print('NMF:', nn_nmf, '\n')
print('LDA:', nn_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

Check 20: True 

NMF: {1124: 1020, 2289: 1550, 2370: 1692, 2133: 2077, 90: 30, 638: 1454, 2555: 701, 2759: 2958, 2997: 2743, 3124: 1885, 2036: 2334, 1635: 655, 3002: 3003, 1707: 1935, 2218: 3082, 2741: 3181, 1806: 1325, 1041: 1045, 3194: 2190, 1533: 1391} 

LDA: {1124: 452, 2289: 2766, 2370: 902, 2133: 1832, 90: 1372, 638: 1963, 2555: 2359, 2759: 2214, 2997: 149, 3124: 2313, 2036: 2870, 1635: 644, 3002: 2215, 1707: 1313, 2218: 3188, 2741: 2792, 1806: 2619, 1041: 1048, 3194: 2348, 1533: 1493} 

***** NMF ***** 

1124 	 LeBron James on 46-Point Outburst vs. Pacers: 'I Was in a Really Good Rhythm'
1020 	 Kevin Durant, Warriors Bounce Spurs Amid Rumors of Kawhi Leonard's Future 

2289 	 Dirk Nowitzki Becomes 6th Player in NBA History to Record 50,000 Career Minutes
1550 	 A History of Jimmy Butler's Bench Antics 

2370 	 Darren Collison to Undergo Knee Surgery, Miss 2-3 Weeks with Injury
1692 	 2018 NBA Playoff MVP Rankings: Can Anyone Catch LeBron James? 

2133 	 Kyrie Irving Reportedly W

# 1-norm

In [82]:
from random import randint

nn_nmf, nn_lda = {}, {}

for _ in range(20):
    i = randint(0, 3315) # assume won't pick twice the same i, over the 20 iterations
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if sum(abs(raw_nmf[i]-raw_nmf[j])) < sum(abs(raw_nmf[i]-raw_nmf[n_nmf])) and sum(abs(raw_nmf[i]-raw_nmf[j])) > 0:
            n_nmf = j
        if sum(abs(raw_lda[i]-raw_lda[j])) < sum(abs(raw_lda[i]-raw_lda[n_lda])) and sum(abs(raw_lda[i]-raw_lda[j])) > 0:
            n_lda = j
    nn_nmf[i], nn_lda[i] = n_nmf, n_lda

print('Check 20:', len(nn_nmf.keys())==20, '\n')

print('NMF:', nn_nmf, '\n')
print('LDA:', nn_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

Check 20: True 

NMF: {1231: 2707, 485: 431, 2038: 2874, 300: 1366, 101: 50, 2700: 2763, 479: 167, 2943: 2241, 1: 248, 3261: 3189, 757: 1998, 3138: 3033, 475: 718, 2063: 1573, 2090: 2096, 2727: 2244, 228: 217, 2766: 1706, 1638: 2485, 330: 138} 

LDA: {1231: 3088, 485: 207, 2038: 310, 300: 277, 101: 786, 2700: 3019, 479: 229, 2943: 1326, 1: 1020, 3261: 3062, 757: 2549, 3138: 565, 475: 240, 2063: 2993, 2090: 2243, 2727: 2990, 228: 669, 2766: 3115, 1638: 2271, 330: 458} 

***** NMF ***** 

1231 	 DeMarcus Cousins Offers to Pay for Stephon Clark's Funeral After Police Shooting
2707 	 Manchester United Transfer News: Latest Rumours on Paul Pogba, Jean Michael Seri 

485 	 Kevin Durant Says There Is 100 Percent Chance He Returns to Warriors Next Season
431 	 Stephen Curry Reportedly Will Return from Ankle Injury for Hawks vs. Warriors 

2038 	 JR Smith Suspended 1 Game by Cavaliers for Detrimental Conduct
2874 	 NCAA Tournament 2018: Projecting Bracket-Busters Before Conference Tournaments 


# max_norm

In [90]:
from random import randint

nn_nmf, nn_lda = {}, {}

for _ in range(20):
    i = randint(0, 3315) # assume won't pick twice the same i, over the 20 iterations
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if abs(max(raw_nmf[i])-max(raw_nmf[j])) < abs(max(raw_nmf[i])-max(raw_nmf[n_nmf])) and sum(abs(raw_nmf[i]-raw_nmf[j])) > 0:
            n_nmf = j
        if abs(max(raw_lda[i])-max(raw_lda[j])) < abs(max(raw_lda[i])-max(raw_lda[n_lda])) and sum(abs(raw_lda[i]-raw_lda[j])) > 0:
            n_lda = j
    nn_nmf[i], nn_lda[i] = n_nmf, n_lda

print('Check 20:', len(nn_nmf.keys())==20, '\n')

print('NMF:', nn_nmf, '\n')
print('LDA:', nn_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

Check 20: True 

NMF: {646: 1842, 1300: 1460, 457: 2122, 2520: 359, 661: 2878, 1219: 2901, 2394: 2122, 2583: 598, 822: 541, 1018: 298, 2665: 2089, 404: 710, 2226: 996, 1631: 3107, 2185: 2394, 1969: 2373, 1105: 2253, 20: 2659, 2488: 1288, 87: 599} 

LDA: {646: 1588, 1300: 2682, 457: 2430, 2520: 241, 661: 204, 1219: 755, 2394: 1641, 2583: 1845, 822: 59, 1018: 2060, 2665: 3074, 404: 3262, 2226: 2254, 1631: 961, 2185: 1986, 1969: 1125, 1105: 2613, 20: 3133, 2488: 2399, 87: 992} 

***** NMF ***** 

646 	 LeBron James: 'Give Me Damian Lillard; I'll Show You How Appreciated He'll Be'
1842 	 Cavaliers Rumors: Tyronn Lue Returning from Absence to Coach vs. Wizards 

1300 	 2018 NBA China Games Schedule of Dates, Matchups Announced
1460 	 Joel Embiid Travels with 76ers After Post Requesting Not to Be 'F-cking Babied' 

457 	 Adam Silver Says NBA Not Ready for 1-16 Playoff Seeding
2122 	 LeBron James Talks with ESPN About Training with Ex-Navy SEAL to Fix Back Injury 

2520 	 Kyrie Irving 'Proud'

# Annex

In [25]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 1.102s.
Extracting tf-idf features for NMF...
done in 0.245s.
Extracting tf features for LDA...
done in 0.250s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.167s.

Topics in NMF model (Frobenius norm):
Topic #0: just people don think like know time good make way really say right ve want did ll new use years
Topic #1: windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3: thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4: car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5: edu soon com send university internet mit ftp m