## NLP Topic Modeling Exercise

In [1]:
# import TfidfVectorizer and CountVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

# import NMF and LatentDirichletAllocation from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

* create a variable called `'no_features'` and set its value to 100.

In [3]:
no_features = 100

* create a variable `'no_topics'` and set its value to 100

In [4]:
no_topics = 100

## NMF

* instantiate a TfidfVectorizer with the following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [5]:
# Import the NLTK package and download the necessary data
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# view the stopwords
stopwords.words()

#view english stopwords
engstopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bevli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
tfidfv = TfidfVectorizer(max_df = 0.95, min_df = 7, max_features = no_features,
                         stop_words = stopwords.words('english'))

* use fit_transform method of TfidfVectorizer to transform the documents

In [7]:
processed_features = tfidfv.fit_transform(documents)

* get the features names from TfidfVectorizer

In [8]:
feature_names = tfidfv.get_feature_names()
feature_names

['00',
 '10',
 '15',
 '20',
 'also',
 'another',
 'anyone',
 'around',
 'available',
 'ax',
 'b8f',
 'back',
 'believe',
 'better',
 'bit',
 'case',
 'com',
 'come',
 'could',
 'data',
 'drive',
 'edu',
 'etc',
 'even',
 'file',
 'find',
 'first',
 'g9v',
 'get',
 'go',
 'god',
 'going',
 'good',
 'got',
 'government',
 'help',
 'however',
 'information',
 'key',
 'know',
 'last',
 'law',
 'let',
 'like',
 'long',
 'look',
 'made',
 'mail',
 'make',
 'many',
 'max',
 'may',
 'might',
 'much',
 'must',
 'need',
 'never',
 'new',
 'number',
 'one',
 'part',
 'people',
 'please',
 'point',
 'power',
 'problem',
 'program',
 'question',
 'read',
 'really',
 'right',
 'said',
 'say',
 'see',
 'since',
 'something',
 'space',
 'still',
 'sure',
 'system',
 'take',
 'thanks',
 'things',
 'think',
 'time',
 'two',
 'us',
 'use',
 'used',
 'using',
 'want',
 'way',
 'well',
 'windows',
 'without',
 'work',
 'world',
 'would',
 'year',
 'years']

* instantiate NMF and fit transformed data

In [9]:
nmf = NMF(n_components=no_topics)
nmfd = nmf.fit_transform(processed_features)



## LDA w/ Sklearn

* instantiate a CountVectorizer with following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [10]:
CountVectorizer = CountVectorizer( max_df = 0.95, min_df = 2
, max_features = no_features
, stop_words = 'english')

* use fit_transform method of CountVectorizer to transform documents

In [11]:
processed_features_ = CountVectorizer.fit_transform(documents)

* get the features names from TfidfVectorizer

In [12]:
feature_names_ = tfidfv.get_feature_names()
feature_names_

['00',
 '10',
 '15',
 '20',
 'also',
 'another',
 'anyone',
 'around',
 'available',
 'ax',
 'b8f',
 'back',
 'believe',
 'better',
 'bit',
 'case',
 'com',
 'come',
 'could',
 'data',
 'drive',
 'edu',
 'etc',
 'even',
 'file',
 'find',
 'first',
 'g9v',
 'get',
 'go',
 'god',
 'going',
 'good',
 'got',
 'government',
 'help',
 'however',
 'information',
 'key',
 'know',
 'last',
 'law',
 'let',
 'like',
 'long',
 'look',
 'made',
 'mail',
 'make',
 'many',
 'max',
 'may',
 'might',
 'much',
 'must',
 'need',
 'never',
 'new',
 'number',
 'one',
 'part',
 'people',
 'please',
 'point',
 'power',
 'problem',
 'program',
 'question',
 'read',
 'really',
 'right',
 'said',
 'say',
 'see',
 'since',
 'something',
 'space',
 'still',
 'sure',
 'system',
 'take',
 'thanks',
 'things',
 'think',
 'time',
 'two',
 'us',
 'use',
 'used',
 'using',
 'want',
 'way',
 'well',
 'windows',
 'without',
 'work',
 'world',
 'would',
 'year',
 'years']

* instantiate LatentDirichletAllocation and fit transformed data 

In [13]:
lda = LatentDirichletAllocation()
lda.fit_transform(processed_features_)

array([[0.01000207, 0.01000069, 0.01000184, ..., 0.01      , 0.2032331 ,
        0.01000132],
       [0.01111273, 0.01111323, 0.0111145 , ..., 0.01111111, 0.01111207,
        0.01111833],
       [0.01111117, 0.01111226, 0.14303224, ..., 0.01111111, 0.01111412,
        0.01111182],
       ...,
       [0.03333341, 0.03333526, 0.03333721, ..., 0.03333333, 0.0333397 ,
        0.03333579],
       [0.02000008, 0.02000303, 0.0200029 , ..., 0.02      , 0.02000189,
        0.02000136],
       [0.00400025, 0.00400036, 0.0040007 , ..., 0.004     , 0.43308564,
        0.00400016]])

* create a function `display_topics` that is able to display the top words in a topic for different models

In [39]:
def display_topics(model, vectorizer, top = 10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])for i in topic.argsort()[:-top - 1:-1]])

* display top 10 words from each topic from NMF model

In [37]:
display_topics(nmf, tfidfv)
print("=" * 20)

Topic 0:
[('say', 3.2426363496488153), ('people', 7.10126671430922e-05), ('god', 3.343159669262139e-09), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0), ('got', 0.0)]
Topic 1:
[('please', 11.03591781287428), ('god', 1.2234308492061488e-09), ('years', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0), ('got', 0.0)]
Topic 2:
[('would', 3.844860814838243), ('god', 6.737144547111459e-10), ('years', 0.0), ('however', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0)]
Topic 3:
[('one', 4.104580607084754), ('god', 8.718251460666214e-10), ('years', 0.0), ('however', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0)]
Topic 4:
[('know', 6.71944240417515), ('god', 3.6232100627735275e-10), ('years', 0.0), ('however', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0)]
Topic 5:
[('edu', 4.46

In [23]:
display_topics2(nmf, tfidfv)
print("=" * 20)

Topic 0:
[('say', 3.2426363496488153), ('people', 7.10126671430922e-05), ('god', 3.343159669262139e-09), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0), ('got', 0.0)]
Topic 1:
[('please', 11.03591781287428), ('god', 1.2234308492061488e-09), ('years', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0), ('got', 0.0)]
Topic 2:
[('would', 3.844860814838243), ('god', 6.737144547111459e-10), ('years', 0.0), ('however', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0)]
Topic 3:
[('one', 4.104580607084754), ('god', 8.718251460666214e-10), ('years', 0.0), ('however', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0)]
Topic 4:
[('know', 6.71944240417515), ('god', 3.6232100627735275e-10), ('years', 0.0), ('however', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('going', 0.0), ('good', 0.0)]
Topic 5:
[('edu', 4.46

[('using', 8.11351059361515), ('max', 0.0038639270021702658), ('part', 1.7703283736012907e-07), ('however', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('god', 0.0), ('going', 0.0)]
Topic 50:
[('said', 5.39993341065245), ('max', 0.0015152812658633268), ('last', 1.2262112088600783e-06), ('part', 9.377693185086467e-07), ('believe', 1.1647591846816342e-08), ('help', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('god', 0.0)]
Topic 51:
[('mail', 6.802966195183858), ('please', 0.0005327679103452707), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('god', 0.0), ('going', 0.0), ('good', 0.0), ('got', 0.0)]
Topic 52:
[('still', 6.698351341934194), ('last', 1.995538588561952e-05), ('believe', 1.3429447785500577e-08), ('help', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('god', 0.0), ('going', 0.0)]
Topic 53:
[('something', 5.181606365590817), ('help', 0.0), ('find', 0.0), ('first', 0.0), ('g9v', 0.0), ('get', 0.0), ('go', 0.0), ('god', 0.0), ('go

* display top 10 words from each topic from LDA model

### Stretch: Use LDA w/ Gensim to do the same thing.