## Website: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx),
              " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10

print('NMF')
display_topics(nmf, tfidf_feature_names, no_top_words)
print('\n')
print('LDA')
display_topics(lda, tf_feature_names, no_top_words)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


NMF
Topic 0: don just people think like know time good right ve
Topic 1: card video monitor drivers cards bus vga driver color ram
Topic 2: god jesus bible christ faith believe christian christians church sin
Topic 3: game team year games season players play hockey win player
Topic 4: car new 00 sale 10 price offer condition shipping 20
Topic 5: thanks does know advance mail hi anybody info looking help
Topic 6: windows file use files dos window program using problem running
Topic 7: edu soon cs university com email internet article ftp send
Topic 8: key chip encryption clipper keys government escrow public use algorithm
Topic 9: drive scsi drives hard disk ide controller floppy cd mac


LDA
Topic 0: people gun armenian armenians war turkish states israel said children
Topic 1: government people law mr use president don think right public
Topic 2: space program output entry data nasa use science research build
Topic 3: key car chip used keys bike use bit clipper number
Topic 4: edu fil

## Bleach Report articles

In [1]:
import pickle, codecs, json

docs_raw_filename = "docs_raw_saturday2"
with open (docs_raw_filename, 'rb') as fp:
    raw_docs = pickle.load(fp)

docs_cleaned_filename = "docs_cleaned_saturday2"
with open (docs_cleaned_filename, 'rb') as fp:
    cleaned_docs = pickle.load(fp)

# titles_filename = 'article_dicts_saturday2'
# with open (titles_filename, 'rb') as fp:
#     titles = pickle.load(fp)
    
article_dicts_filename = "article_dicts_saturday2"
all_articles = []
with codecs.open(article_dicts_filename,'rU','utf-8') as f:
    for line in f:
        all_articles.append(json.loads(line))

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx),
              " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
# documents = dataset.data
documents = raw_docs

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 13

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10

print('NMF')
display_topics(nmf, tfidf_feature_names, no_top_words)
print('\n')
print('LDA')
display_topics(lda, tf_feature_names, no_top_words)



NMF
Topic 0: points game rebounds series rockets season jazz thunder games assists
Topic 1: nba mj triple 30 way best liangelo possiblethe clowning doublesthe
Topic 2: draft nfl browns lb landry app football wnba journey st
Topic 3: year said like players league million nba team season player
Topic 4: james cleveland cavaliers lebron cavs lue thomas love trade hill
Topic 5: warriors curry durant golden state kerr thompson green stephen klay
Topic 6: knicks coach york new porzingis head coaching assistant jackson seasons
Topic 7: embiid philadelphia 76ers sixers heat miami simmons fultz joel game
Topic 8: tournament ncaa michigan kansas villanova wildcats vs state duke loyola
Topic 9: leonard spurs antonio san popovich kawhi aldridge wright wojnarowski return
Topic 10: betting oddsshark odds cover picks games check nba point total
Topic 11: celtics irving boston smart hayward kyrie knee brown surgery marcus
Topic 12: sg sf pg pf draft freshman ayton doncic suns bagley


LDA
Topic 0: per

In [3]:
raw_nmf, raw_lda = nmf.transform(tfidf), lda.transform(tf)

print(len(raw_nmf), len(raw_lda))

3316 3316


In [20]:
import numpy as np


def dist2_nmf(i, j):
    return np.sqrt(sum((raw_nmf[i]-raw_nmf[j])**2))

def dist2_lda(i, j):
    return np.sqrt(sum((raw_lda[i]-raw_lda[j])**2))

def dist1_nmf(i, j):
    return sum(abs(raw_nmf[i]-raw_nmf[j]))

def dist1_lda(i, j):
    return sum(abs(raw_lda[i]-raw_lda[j]))

def distinf_nmf(i, j):
    return abs(max(raw_nmf[i])-max(raw_nmf[j]))

def distinf_lda(i, j):
    return abs(max(raw_lda[i])-max(raw_lda[j]))


def dist2_nmf_2(i, j):
    return np.sqrt(sum((raw_nmf[i]-raw_nmf[j])**2))+np.sqrt(sum((raw_nmf[nn1_nmf[i]]-raw_nmf[j])**2))

def dist2_lda_2(i, j):
    return np.sqrt(sum((raw_lda[i]-raw_lda[j])**2))+np.sqrt(sum((raw_lda[nn1_lda[i]]-raw_lda[j])**2))

def dist1_nmf_2(i, j):
    return sum(abs(raw_nmf[i]-raw_nmf[j]))+sum(abs(raw_nmf[nn1_nmf[i]]-raw_nmf[j]))

def dist1_lda_2(i, j):
    return sum(abs(raw_lda[i]-raw_lda[j]))+sum(abs(raw_lda[nn1_lda[i]]-raw_lda[j]))

def distinf_nmf_2(i, j):
    return abs(max(raw_nmf[i])-max(raw_nmf[j]))+abs(max(raw_nmf[nn1_nmf[i]])-max(raw_nmf[j]))

def distinf_lda_2(i, j):
    return abs(max(raw_lda[i])-max(raw_lda[j]))+abs(max(raw_lda[nn1_lda[i]])-max(raw_lda[j]))

Train Word2Vec Here

In [3]:
import pickle
with open("docs_word2vec_saturday2", 'rb') as fp:
    word2vec_docs = pickle.load(fp)

In [None]:
from gensim.models import Word2Vec

### 2-norm

In [19]:
from random import randint

print('*'*110)
print('************************************ Recommendation after the 1st article ************************************')
print('*'*110, '\n')

nn1_nmf, nn1_lda = {}, {}

for _ in range(20):
    i = randint(0, 3315) # assume won't pick twice the same i, over the 20 iterations
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if dist2_nmf(i, j) < dist2_nmf(i, n_nmf) and dist2_nmf(i, j) > 0:
            n_nmf = j
        if dist2_lda(i, j) < dist2_lda(i, n_lda) and dist2_lda(i, j) > 0:
            n_lda = j
    nn1_nmf[i], nn1_lda[i] = n_nmf, n_lda

print('Check 20:', len(nn1_nmf.keys())==20, '\n')

print('NMF:', nn1_nmf, '\n')
print('LDA:', nn1_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn1_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn1_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('*'*110)
print('************************************ Recommendation after the 2nd article ************************************')
print('*'*110, '\n')

nn2_nmf, nn2_lda = {}, {}

for i in nn1_nmf.keys():
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if dist2_nmf_2(i, j) < dist2_nmf_2(i, n_nmf) and dist2_nmf(i, j) > 0 and dist2_nmf(nn1_nmf[i], j) > 0:
            n_nmf = j
        if dist2_lda_2(i, j) < dist2_lda_2(i, n_lda) and dist2_lda(i, j) > 0 and dist2_lda(nn1_lda[i], j) > 0:
            n_lda = j
    nn2_nmf[i], nn2_lda[i] = n_nmf, n_lda

print('NMF:', nn2_nmf, '\n')
print('LDA:', nn2_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn2_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(nn1_nmf[i], '\t', all_articles[nn1_nmf[i]]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn2_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(nn1_lda[i], '\t', all_articles[nn1_lda[i]]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

**************************************************************************************************************
************************************ Recommendation after the 1st article ************************************
************************************************************************************************************** 

Check 20: True 

NMF: {1480: 980, 1271: 375, 2197: 2077, 1647: 1977, 1833: 2646, 1769: 2464, 524: 925, 1282: 1393, 2407: 2806, 1186: 2287, 3052: 2540, 11: 6, 1936: 1687, 1471: 2187, 2566: 1305, 1050: 1971, 963: 1104, 2217: 2122, 816: 916, 197: 2766} 

LDA: {1480: 2265, 1271: 1505, 2197: 2457, 1647: 2252, 1833: 3028, 1769: 2508, 524: 1396, 1282: 1255, 2407: 1870, 1186: 1611, 3052: 3027, 11: 3, 1936: 626, 1471: 611, 2566: 2162, 1050: 1159, 963: 1406, 2217: 625, 816: 217, 197: 2070} 

***** NMF ***** 

1480 	 Why Did Knicks Trade for Emmanuel Mudiay When They Have Frank Ntilikina?
980 	 George Hill Trade Rumors: Rockets, Cavaliers Interested in Kings PG 

1271

### 1-norm

In [21]:
from random import randint

print('*'*110)
print('************************************ Recommendation after the 1st article ************************************')
print('*'*110, '\n')

nn1_nmf, nn1_lda = {}, {}

for _ in range(20):
    i = randint(0, 3315) # assume won't pick twice the same i, over the 20 iterations
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if dist1_nmf(i, j) < dist1_nmf(i, n_nmf) and dist1_nmf(i, j) > 0:
            n_nmf = j
        if dist1_lda(i, j) < dist1_lda(i, n_lda) and dist1_lda(i, j) > 0:
            n_lda = j
    nn1_nmf[i], nn1_lda[i] = n_nmf, n_lda

print('Check 20:', len(nn1_nmf.keys())==20, '\n')

print('NMF:', nn1_nmf, '\n')
print('LDA:', nn1_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn1_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn1_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('*'*110)
print('************************************ Recommendation after the 2nd article ************************************')
print('*'*110, '\n')

nn2_nmf, nn2_lda = {}, {}

for i in nn1_nmf.keys():
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if dist1_nmf_2(i, j) < dist1_nmf_2(i, n_nmf) and dist1_nmf(i, j) > 0 and dist1_nmf(nn1_nmf[i], j) > 0:
            n_nmf = j
        if dist1_lda_2(i, j) < dist1_lda_2(i, n_lda) and dist1_lda(i, j) > 0 and dist1_lda(nn1_lda[i], j) > 0:
            n_lda = j
    nn2_nmf[i], nn2_lda[i] = n_nmf, n_lda

print('NMF:', nn2_nmf, '\n')
print('LDA:', nn2_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn2_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(nn1_nmf[i], '\t', all_articles[nn1_nmf[i]]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn2_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(nn1_lda[i], '\t', all_articles[nn1_lda[i]]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

**************************************************************************************************************
************************************ Recommendation after the 1st article ************************************
************************************************************************************************************** 

Check 20: True 

NMF: {3042: 2708, 392: 665, 2849: 2857, 1689: 381, 1258: 3160, 2609: 3103, 1072: 2505, 697: 449, 2005: 936, 2029: 1767, 1459: 1687, 426: 135, 3287: 2708, 3220: 2825, 2130: 865, 2227: 2581, 1568: 2556, 1344: 2016, 3267: 2719, 2198: 2482} 

LDA: {3042: 2755, 392: 930, 2849: 2855, 1689: 1694, 1258: 487, 2609: 2924, 1072: 3087, 697: 1233, 2005: 56, 2029: 1767, 1459: 2139, 426: 135, 3287: 2623, 3220: 3153, 2130: 610, 2227: 2091, 1568: 2928, 1344: 548, 3267: 3214, 2198: 2638} 

***** NMF ***** 

3042 	 Michele Roberts Reportedly Will Seek New Contract as NBPA Executive
2708 	 HS Star Anfernee Simons Decides CBB Isn't for Him, but Is He Ready for 

### inf-norm

In [23]:
from random import randint

print('*'*110)
print('************************************ Recommendation after the 1st article ************************************')
print('*'*110, '\n')

nn1_nmf, nn1_lda = {}, {}

for _ in range(20):
    i = randint(0, 3315) # assume won't pick twice the same i, over the 20 iterations
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if dist2_nmf(i, j) < dist2_nmf(i, n_nmf) and dist2_nmf(i, j) > 0:
            n_nmf = j
        if dist2_lda(i, j) < dist2_lda(i, n_lda) and dist2_lda(i, j) > 0:
            n_lda = j
    nn1_nmf[i], nn1_lda[i] = n_nmf, n_lda

print('Check 20:', len(nn1_nmf.keys())==20, '\n')

print('NMF:', nn1_nmf, '\n')
print('LDA:', nn1_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn1_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn1_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('*'*110)
print('************************************ Recommendation after the 2nd article ************************************')
print('*'*110, '\n')

nn2_nmf, nn2_lda = {}, {}

for i in nn1_nmf.keys():
    n_nmf, n_lda = 0, 0
    for j in range(3316):
        if distinf_nmf_2(i, j) < distinf_nmf_2(i, n_nmf) and distinf_nmf(i, j) > 0 and distinf_nmf(nn1_nmf[i], j) > 0:
            n_nmf = j
        if distinf_lda_2(i, j) < distinf_lda_2(i, n_lda) and distinf_lda(i, j) > 0 and distinf_lda(nn1_lda[i], j) > 0:
            n_lda = j
    nn2_nmf[i], nn2_lda[i] = n_nmf, n_lda

print('NMF:', nn2_nmf, '\n')
print('LDA:', nn2_lda, '\n')

print('***** NMF *****', '\n')
for i,j in nn2_nmf.items():
    print(i, '\t', all_articles[i]['title'])
    print(nn1_nmf[i], '\t', all_articles[nn1_nmf[i]]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

print('***** LDA *****', '\n')
for i,j in nn2_lda.items():
    print(i, '\t', all_articles[i]['title'])
    print(nn1_lda[i], '\t', all_articles[nn1_lda[i]]['title'])
    print(j, '\t', all_articles[j]['title'], '\n')

**************************************************************************************************************
************************************ Recommendation after the 1st article ************************************
************************************************************************************************************** 

Check 20: False 

NMF: {2804: 3071, 1381: 3033, 1839: 1079, 2324: 1915, 2467: 1940, 1994: 581, 2132: 2187, 596: 1217, 780: 1993, 3181: 2741, 1748: 1241, 65: 128, 404: 1299, 3008: 3137, 508: 435, 3297: 3309, 3246: 2876, 1581: 1221, 2157: 789} 

LDA: {2804: 1551, 1381: 2698, 1839: 2087, 2324: 1134, 2467: 124, 1994: 1622, 2132: 1631, 596: 740, 780: 3165, 3181: 2882, 1748: 620, 65: 70, 404: 532, 3008: 3136, 508: 21, 3297: 3309, 3246: 2540, 1581: 1141, 2157: 1983} 

***** NMF ***** 

2804 	 Kemba Walker Remains on Hornets After NBA Trade Deadline Passes
3071 	 Survivor of Two Plane Crashes, Austin Hatch Inspires Michigan's Final Four Run 

1381 	 LeBron James An

# Annex

In [25]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 1.102s.
Extracting tf-idf features for NMF...
done in 0.245s.
Extracting tf features for LDA...
done in 0.250s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.167s.

Topics in NMF model (Frobenius norm):
Topic #0: just people don think like know time good make way really say right ve want did ll new use years
Topic #1: windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3: thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4: car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5: edu soon com send university internet mit ftp m