## 1 - Latent Dirichlet Allocation

In [46]:
# We will be using articles from NPR (National Public Radio), obtained from their website www.npr.org

# Importing the Libraries and the Dataset

import pandas as pd
npr = pd.read_csv('npr.csv')
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [2]:
# Text Preprocessing - Unstructured Text

from sklearn.feature_extraction.text import CountVectorizer

**`max_df`**` : float in range [0.0, 1.0] or int, default=1.0`<br>
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

**`min_df`**` : float in range [0.0, 1.0] or int, default=1`<br>
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

In [3]:
# Fit CV to the Dataset and Transform it
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(npr['Article'])

# Get the shaper of Document Term Matrix
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [4]:
# LDA

from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=7,random_state=42)

# This can take awhile, as we're dealing with a large amount of documents!
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [8]:
# The Length of Column represents no of Unique Words
dtm.shape[1]

54777

In [13]:
# Show the Words/Features - The words will be stored as Features

# Get the Length of Column
print(len(cv.get_feature_names()))

# Get some Random 10 words
import random

for i in range(0,11):
    random_word_id = random.randint(0,dtm.shape[1])
    print(random_word_id, '--->', cv.get_feature_names()[random_word_id])

54777
8671 ---> cents
53070 ---> warp
20944 ---> glockenspiel
33287 ---> nephew
28542 ---> libel
43114 ---> schatz
5674 ---> bianchi
30287 ---> marxist
27794 ---> lakisha
2892 ---> andreessen
45662 ---> somber


### Top Word Occurances in a Topic

In [14]:
# n_components / No of Topics
print(len(LDA.components_))

7


In [15]:
# Each Document is in Array format, with 7 Topic probablities
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [16]:
# Check the Features/Words for 1st topic
len(LDA.components_[0])

54777

In [20]:
# Chech for a Single Topic

single_topic = LDA.components_[0]

# Returns the indices that would sort this array.
print(single_topic.argsort())

# Get the Word least representative of this topic - Random Index
single_topic[18302]

# Get the Word most representative of this topic - Random Index
single_topic[42993]

[ 2475 18302 35285 ... 22673 42561 42993]


6247.245510521071

In [21]:
# Get the Top 10 Words in the Topic

# Top 10 words for this topic:
top_word_indices = single_topic.argsort()[-10:]

# Get the Names of these Top Words
for index in top_word_indices:
    print(cv.get_feature_names()[index])

new
percent
government
company
million
care
people
health
said
says


In [22]:
# Look for Top 15 words in All the Topics

for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


THE TOP 15 WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


THE TOP 15 WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


THE TOP 15 WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


THE TOP 15 WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think',

In [23]:
# Attaching Discovered Topic Labels to Original Articles

print(dtm.shape)
len(npr)

(11992, 54777)


11992

In [24]:
# USE .transform() to transform the Matrix to Topics
topic_results = LDA.transform(dtm)

In [25]:
topic_results.shape

(11992, 7)

In [27]:
# Check for a Sample
print(topic_results[0])

# Get the Highest probablity of the result across all topics
print(topic_results[0].round(2))

# Get the Top Result - Which Topic does this array belongs
topic_results[0].argmax()

[1.61040465e-02 6.83341493e-01 2.25376318e-04 2.25369288e-04
 2.99652737e-01 2.25479379e-04 2.25497980e-04]
[0.02 0.68 0.   0.   0.3  0.   0.  ]


1

In [28]:
# Apply the results for all the Topics in the Dataset

# .argmax(axis=1) - Applies the Max Topic Index across all rows
npr['Topic'] = topic_results.argmax(axis=1)

In [29]:
# Show the DataFrame
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2


## 2 - Non-Negative Matrix Factorization

Let's repeat the Topic modeling task, but this time, we will use NMF instead of LDA.

In [47]:
# Same Setup Above Till Creating Document Term Matrix (dtm), But we apply TFIDF instead of CountVectorizer

# Data Import
npr = pd.read_csv('npr.csv')

# Import Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating the Object
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

dtm = tfidf.fit_transform(npr['Article'])

### Applying NMF

In [48]:
from sklearn.decomposition import NMF

# FIT NMF to Data

nmf_model = NMF(n_components=7,random_state=42)

# This can take awhile, we're dealing with a large amount of documents!
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [49]:
# NMF is Much Faster than LDA as it works on with numpy arrays

# Get the Topics

import random

# Print Random Features

for i in range(10):
    random_word_id = random.randint(0,54776)
    print(random_word_id, '--->', tfidf.get_feature_names()[random_word_id])

35841 ---> parts
37697 ---> postscript
5568 ---> berthing
30043 ---> marais
34692 ---> optimizing
47026 ---> strangling
22350 ---> hardening
39092 ---> quadruple
22513 ---> hasty
28234 ---> lecturing


In [50]:
# Check the No of Topics

len(nmf_model.components_)

7

In [51]:
# get the Length for Topic 1

len(nmf_model.components_[0])

54777

In [52]:
# Read through a Single Topic and get the Words for that topic

single_topic = nmf_model.components_[0]

# Top 10 words for this topic:
top_word_indices = single_topic.argsort()[-10:]

# Print the Feature Names

for index in top_word_indices:
    print(tfidf.get_feature_names()[index])

disease
percent
women
virus
study
water
food
people
zika
says


In [53]:
# Get the Topic and the Words for All the Topics

for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 15 WORDS FOR TOPIC #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 15 WORDS FOR TOPIC #5
['love', 've', 'don', 'al

In [54]:
# Adding the Original Labels to the Documents

print(dtm.shape)
print(len(npr))

(11992, 54777)
11992


In [55]:
# Transform to get the Topics of the Document Term Matrix

topic_results = nmf_model.transform(dtm)

In [56]:
topic_results.shape

(11992, 7)

In [57]:
# Check out the First Data

print(topic_results[0])
print(topic_results[0].round(2))
print(topic_results[0].argmax())

# This means that our model thinks that the first article belongs to topic #1.

[0.         0.12075603 0.00140297 0.05919954 0.01518909 0.
 0.        ]
[0.   0.12 0.   0.06 0.02 0.   0.  ]
1


In [58]:
# Combining the Tokens with Original Data

npr['Topic'] = topic_results.argmax(axis=1)

npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [59]:
# To Apply Labels to the Topic, You can use

topic_dict = {0:'HealthResearch',1:'CampaignPolitics',2:'Tax-Legal',3:'Security-Police',4:'Election',5:'Music-Ent',6:'Education'}

npr['Topic_Label'] = npr["Topic"].map(topic_dict)

In [60]:
# Display Top 10 values
npr.head(10)

Unnamed: 0,Article,Topic,Topic_Label
0,"In the Washington of 2016, even when the polic...",1,CampaignPolitics
1,Donald Trump has used Twitter — his prefe...,1,CampaignPolitics
2,Donald Trump is unabashedly praising Russian...,1,CampaignPolitics
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Security-Police
4,"From photography, illustration and video, to d...",6,Education
5,I did not want to join yoga class. I hated tho...,5,Music-Ent
6,With a who has publicly supported the debunk...,0,HealthResearch
7,"I was standing by the airport exit, debating w...",0,HealthResearch
8,"If movies were trying to be more realistic, pe...",0,HealthResearch
9,"Eighteen years ago, on New Year’s Eve, David F...",5,Music-Ent
