In [1]:
import pandas as pd

# Read the articles CSV from the working directory and preview the first rows.
npr = pd.read_csv(filepath_or_buffer='npr.csv')
npr.head(n=5)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary pruning: skip English stop words, terms found in fewer than
# 2 documents, and terms found in more than 95% of documents.
tfidf = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)

# NOTE: the name `dtm` (document-term matrix) is kept for the later cells,
# but the values stored here are TF-IDF weights, not raw term counts.
dtm = tfidf.fit_transform(npr['Article'])
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [3]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF matrix into 7 topics; random_state is pinned so the
# factorisation is repeatable between runs.
nmf_model = NMF(random_state=42, n_components=7)

# Left as the final expression so the fitted estimator's repr is displayed.
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [4]:
# Peek at one vocabulary entry to sanity-check the fitted vectorizer.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever accessor this installation provides.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
feature_names[25000]

'infiltrated'

In [5]:
# In LDA we deal with words with highest probability values for a topic
# Here in NMF, we deal with words with highest coefficient values for a topic

# Fetch the vocabulary once, outside the loop, instead of asking the
# vectorizer for the full feature list again for every printed word.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever accessor this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC # {index}")
    # argsort() is ascending, so the last 15 indices are the 15 largest
    # coefficients; the strongest word is printed last.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')
    print('\n')

THE TOP 15 WORDS FOR TOPIC # 0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']




THE TOP 15 WORDS FOR TOPIC # 1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']




THE TOP 15 WORDS FOR TOPIC # 2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']




THE TOP 15 WORDS FOR TOPIC # 3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']




THE TOP 15 WORDS FOR TOPIC # 4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']




THE TOP 15 WORDS FOR TOPIC # 5
['love', 

In [6]:
# Express every article in terms of the fitted topics.
topic_results = nmf_model.transform(dtm)

# Per-topic weights of the first article.
topic_results[0, :]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [7]:
# Index of the topic with the largest weight for the first article.
topic_results[0, :].argmax()

1

In [8]:
# For each article (row), pick the topic (column) with the largest weight.
dominant_topic = topic_results.argmax(axis=1)
npr['Topic'] = dominant_topic
npr.head(n=5)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [9]:
# Hand-assigned, human-readable names for the numeric topic ids.
topic_dict = {
    0: 'Healthcare',
    1: 'Election',
    2: 'Legislation',
    3: 'Current Affairs',
    4: 'Campaign',
    5: 'Music',
    6: 'Education',
}

# Translate each article's numeric topic id into its label.
npr['Topic Label'] = npr['Topic'].map(topic_dict)
npr.head(n=5)

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,Election
1,Donald Trump has used Twitter — his prefe...,1,Election
2,Donald Trump is unabashedly praising Russian...,1,Election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Current Affairs
4,"From photography, illustration and video, to d...",6,Education
