# Latent Dirichlet Allocation
A method for topic modeling
## Data

We will be using articles scraped from NPR (National Public Radio), obtained from their website [www.npr.org](http://www.npr.org)

In [1]:
import pandas as pd
import numpy as np

## Loading data and performing some EDA

In [2]:
# Load the scraped NPR articles; the relative path means 'npr.csv' must sit in
# the notebook's working directory.
data = pd.read_csv('npr.csv')

In [3]:
# Preview the first five rows to confirm the file loaded as expected.
data.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# Peek at the first 300 characters of the first article to see what the raw text looks like.
data['Article'][0][:300]

'In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russia over its alleged interference in the U. S. election just concluded, some Republicans who had long'

In [5]:
# Report the frame's dimensions followed by a blank line, then the per-column summary.
print(f"{data.shape}\n")
data.info()

(11992, 1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11992 entries, 0 to 11991
Data columns (total 1 columns):
Article    11992 non-null object
dtypes: object(1)
memory usage: 93.8+ KB


## Creating a document-term matrix for our articles using CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Build the document-term matrix: one row per article, one column per vocabulary term.
cv = CountVectorizer(
    max_df=0.9,            # ignore terms appearing in more than 90% of the articles
    min_df=2,              # ignore terms appearing in fewer than 2 articles
    stop_words='english',  # drop common English stop words
)
dtm = cv.fit_transform(data['Article'])
dtm.shape

(11992, 54777)

## Using Latent Dirichlet Allocation for Topic Modeling

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

In [9]:
# Ask the model to discover 7 topics in our documents; the fixed random_state
# makes repeated runs produce the same topics.
LDA = LatentDirichletAllocation(
    n_components=7,
    random_state=42,
)

In [10]:
# Fit the topic model on the document-term matrix; the echoed repr lists the
# hyperparameters that were used.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [11]:
# One row per topic, one column per vocabulary term from the vectorizer.
LDA.components_.shape

(7, 54777)

In [12]:
# Topic-word weight matrix: entry [k, j] is the weight of vocabulary term j in topic k.
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

## Getting the top 10 words assigned to each topic after the LDA process

In [13]:
# Resolve the vocabulary once, outside the loop: calling the accessor per word
# would rebuild the entire term list for every single printed word.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back to the old name on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

for index, topic in enumerate(LDA.components_):
    print(f"Top 10 words for the Topic #{index} :")
    # argsort() is ascending, so the last 10 indices are the 10 heaviest words,
    # listed from least to most important.
    print([feature_names[i] for i in topic.argsort()[-10:]])

Top 10 words for the Topic #0 :
['new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']
Top 10 words for the Topic #1 :
['npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']
Top 10 words for the Topic #2 :
['time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']
Top 10 words for the Topic #3 :
['disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']
Top 10 words for the Topic #4 :
['obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']
Top 10 words for the Topic #5 :
['new', 'way', 'music', 'really', 'time', 'know', 'think', 'people', 'just', 'like']
Top 10 words for the Topic #6 :
['people', 'time', 'schools', 'just', 'education', 'new', 'like', 'students', 'school', 'says']


In [14]:
# Per-document topic scores: one row per article, one column per topic.
topic_results = LDA.transform(dtm)

In [15]:
# Sanity check: expect (number of articles, number of topics).
topic_results.shape

(11992, 7)

## Assigning each document in our dataset the calculated topic number

In [16]:
# Label every article with the index of its highest-scoring topic, then preview the result.
data['Topic'] = np.argmax(topic_results, axis=1)
data.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2


# Great job!!!