# ***Latent-Dirichlet-Allocation***

In [17]:
import pandas as pd
import numpy as np

In [2]:
npr = pd.read_csv('/content/drive/MyDrive/Data Science/NLP/npr.csv')

In [3]:
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')

In [5]:
dtm = cv.fit_transform(npr['Article'])

In [6]:
from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [7]:
LDA.fit(dtm)

In [8]:
# Grab the vocabulary of words

len(cv.get_feature_names_out())

54777

In [12]:
type(cv.get_feature_names_out())

numpy.ndarray

In [22]:
import random

random_word_id = random.randint(0, 54777)

cv.get_feature_names_out()[random_word_id]

'bureaucrats'

In [23]:
# Grab the Topics
single_topic = LDA.components_[0]

In [24]:
single_topic.argsort()[-10:]

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993])

In [25]:
top_ten_words = single_topic.argsort()[-10:]

In [26]:
for word in top_ten_words:
  print(cv.get_feature_names_out()[word])
  print('\n')

new


percent


government


company


million


care


people


health


said


says




In [27]:
for i, topic in enumerate(LDA.components_):
  print(f"Top 15 Words for Topic #{i}:")
  print([cv.get_feature_names_out()[index] for index in topic.argsort()[-10:]])
  print('\n')

Top 15 Words for Topic #0:
['new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


Top 15 Words for Topic #1:
['npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


Top 15 Words for Topic #2:
['time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


Top 15 Words for Topic #3:
['disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


Top 15 Words for Topic #4:
['obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


Top 15 Words for Topic #5:
['new', 'way', 'music', 'really', 'time', 'know', 'think', 'people', 'just', 'like']


Top 15 Words for Topic #6:
['people', 'time', 'schools', 'just', 'education', 'new', 'like', 'students', 'school', 'says']




In [30]:
topics_results = LDA.transform(dtm)

In [31]:
topics_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [32]:
topics_results[0].argmax()

1

In [33]:
npr['Topic'] = topics_results.argmax(axis=1)

In [34]:
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2


# ***Non-Negative-Matrix-Factorization***

In [35]:
npr2 = pd.read_csv('/content/drive/MyDrive/Data Science/NLP/npr.csv')

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [42]:
dtm = tfidf.fit_transform(npr2['Article'])

In [43]:
from sklearn.decomposition import NMF

In [44]:
nmf_model = NMF(n_components=7, random_state=42)

In [45]:
nmf_model.fit(dtm)

In [47]:
for i, topic in enumerate(nmf_model.components_):
  print(f"Top 15 Words for Topic #{i}:")
  print([tfidf.get_feature_names_out()[index] for index in topic.argsort()[-15:]])
  print('\n')

Top 15 Words for Topic #0:
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


Top 15 Words for Topic #1:
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


Top 15 Words for Topic #2:
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


Top 15 Words for Topic #3:
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


Top 15 Words for Topic #4:
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


Top 15 Words for Topic #5:
['love', 've', 'don', 'album', 'way', 'time

In [48]:
topics_results = nmf_model.transform(dtm)

In [49]:
topics_results[0].round(2)

array([0.  , 0.12, 0.  , 0.06, 0.02, 0.  , 0.  ])

In [50]:
topics_results[0].argmax()

1

In [51]:
npr2['Topic'] = topics_results.argmax(axis=1)

In [52]:
npr2.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [53]:
topic_names = [
    "Health & Science",           # Topic #0
    "U.S. Politics & Presidency", # Topic #1
    "Healthcare Legislation",     # Topic #2
    "Security & International Relations", # Topic #3
    "Elections & Voting",         # Topic #4
    "Music & Culture",            # Topic #5
    "Education"                   # Topic #6
]

In [54]:
npr2['Topic_Name'] = npr2['Topic'].map(lambda x: topic_names[x])

In [57]:
npr2.head(10)

Unnamed: 0,Article,Topic,Topic_Name
0,"In the Washington of 2016, even when the polic...",1,U.S. Politics & Presidency
1,Donald Trump has used Twitter — his prefe...,1,U.S. Politics & Presidency
2,Donald Trump is unabashedly praising Russian...,1,U.S. Politics & Presidency
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Security & International Relations
4,"From photography, illustration and video, to d...",6,Education
5,I did not want to join yoga class. I hated tho...,5,Music & Culture
6,With a who has publicly supported the debunk...,0,Health & Science
7,"I was standing by the airport exit, debating w...",0,Health & Science
8,"If movies were trying to be more realistic, pe...",0,Health & Science
9,"Eighteen years ago, on New Year’s Eve, David F...",5,Music & Culture
