In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load npr.csv from the current working directory into a DataFrame.
npr = pd.read_csv('./npr.csv')

In [3]:
# Preview the first five rows to confirm the file parsed as expected.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# Summarise row count, column dtypes and non-null counts (checks for missing articles).
npr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11992 entries, 0 to 11991
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  11992 non-null  object
dtypes: object(1)
memory usage: 93.8+ KB


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Bag-of-words vectorizer configuration:
#   max_df=0.95          -> ignore terms that appear in more than 95% of the
#                           documents (too common to help separate topics)
#   min_df=2             -> ignore terms that appear in fewer than 2 documents
#   stop_words='english' -> drop scikit-learn's built-in English stop words
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')


In [7]:
# Learn the vocabulary from the article text and build the document-term
# matrix of word counts (one row per article, one column per retained term).
dtm = cv.fit_transform(npr['Article'])

In [8]:
# (number of articles, number of vocabulary terms)
dtm.shape

(11992, 54777)

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
# n_components=7 sets the number of topics to extract (a hand-picked value);
# random_state is pinned so that re-running the notebook yields the same topics.
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [11]:
# Fit the topic model to the word counts; the fitted estimator is echoed as
# the cell output.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [12]:
# Vocabulary size. cv.vocabulary_ is the fitted term -> column-index mapping,
# so its length equals the number of features. Counting it directly avoids
# building the full sorted feature list just to measure it, and does not rely
# on get_feature_names(), which newer scikit-learn releases removed in favour
# of get_feature_names_out().
len(cv.vocabulary_)

54777

In [13]:
# Look up the term stored at column index 5000 of the document-term matrix.
# get_feature_names() was superseded by get_feature_names_out() in newer
# scikit-learn releases, so use whichever accessor this installation provides.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
feature_names[5000]

'bask'

In [14]:
# One row per topic, one column per vocabulary term: (n_topics, n_terms).
LDA.components_.shape

(7, 54777)

In [15]:
# Pull out the term-weight vector of the first topic (row 0) to inspect on its own.
single_topic = LDA.components_[0]

In [16]:
# Term indices ordered from the lowest to the highest weight within this topic,
# so the most characteristic terms sit at the end of the array.
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993])

In [17]:
# Tiny example array used to illustrate what argsort returns.
arr = np.array([10, 200, 1])

In [18]:
# argsort returns the indices that would sort the array in ascending order:
# index 2 (value 1), then index 0 (value 10), then index 1 (value 200).
arr.argsort()  # indices, not the sorted values themselves

array([2, 0, 1])

In [19]:
# Print the 20 highest-weighted words of the first topic, weakest first
# (argsort is ascending, so the last 20 indices are the top 20 terms).
#
# The vocabulary is fetched once before the loop: the accessor rebuilds the
# whole term list on every call, so calling it inside the loop repeated that
# work for each word printed. get_feature_names_out() is preferred when the
# installed scikit-learn provides it, because get_feature_names() was removed
# in newer releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
for index in single_topic.argsort()[-20:]:
    print(feature_names[index])

president
state
tax
insurance
trump
companies
money
year
federal
000
new
percent
government
company
million
care
people
health
said
says


In [20]:
# Show the 15 highest-weighted words of every topic (weakest first within each
# list, because argsort is ascending).
#
# Headings are numbered from 1: heading N describes row N-1 of LDA.components_.
#
# The vocabulary is fetched once up front instead of once per word: the
# accessor rebuilds the full term list on every call, and the original
# comprehension invoked it 15 times per topic. get_feature_names_out() is used
# when available because newer scikit-learn releases removed get_feature_names().
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
for index, topic in enumerate(LDA.components_):
    print(f"The top 15 words for topic #{index+1}")
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print()
    print()

The top 15 words for topic #1
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


The top 15 words for topic #2
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


The top 15 words for topic #3
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


The top 15 words for topic #4
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


The top 15 words for topic #5
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


The top 15 words for topic #6
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think',

In [21]:
# Per-article topic distribution: one row per article, one column per topic.
topic_results = LDA.transform(dtm)

In [22]:
# topic_results[0] is the first article's topic distribution; argmax returns the
# 0-based index of its most probable topic.
topic_results[0].argmax()

1

In [23]:
# Label every article with its most probable topic (argmax across the topic axis).
# The label is the 0-based row index into LDA.components_, so Topic == k
# corresponds to the "topic #k+1" heading in the top-words listing printed earlier.
npr['Topic'] = topic_results.argmax(axis=1)

In [24]:
# Check the new Topic column alongside the article text.
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
