In [None]:
"""
Non-negative matrix factorization(NMF)

It is an unsupervised algorithm that simultaneously performs
dimensionality reduction and clustering.
It can be used with TF-IDF to model topics across documents.

NMF factorizes matrix A into W & H such that none of the elements
in W & H are negative.

            A                       =>     W           .        H          : W >= 0, H >= 0
          (nXm)                          (nXk)                (kXm)
    (rows=Features, cols=Objects)      (rows=Features)      (cols=Objects)
    
    A: is a non-negative matrix.
    W: is basis matrix made up of basis vectors(the topics in the data)
    H: is coefficient matrix which contains the membership weights for 
       documents relative to each topic(cluster)

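Each object (column of A) is then approximated by a linear combination of the k
'basis vectors' (the columns of W).
Each basis vector can be interpreted as a cluster, and the membership of objects in these
clusters is encoded by H.

Note: scikit-learn expects the transposed layout (rows=documents, cols=terms), so in the code
below the per-topic term weights live in `components_` and the per-document topic weights
are returned by `transform`.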

Process of NMF:

1. Construct vector space model for documents(after stopword filtering),
   resulting in TDM(term document matrix) A.
2. Apply TF-IDF term weight normalisation to A.
3. Normalize TF-IDF vectors to unit length.
4. Initialize factors using NNDSVD(Nonnegative Double Singular Value Decomposition) on A.
5. Apply projected gradient NMF to A
   (scikit-learn's NMF, used below, solves this with coordinate descent by default).
"""

In [1]:
import os

# Location of the NPR articles CSV. The original machine-specific path is
# kept as the default; set the NPR_CSV_PATH environment variable to run the
# notebook on another machine without editing this cell.
file_path = os.environ.get(
    "NPR_CSV_PATH",
    "/home/viper/Downloads/UPDATED_NLP_COURSE/TextFiles/npr.csv",
)

In [2]:
import pandas as pd

In [3]:
# Load the NPR articles CSV into a DataFrame.
npr = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF vectorizer used to turn the article texts into a weighted
# document-term matrix.
tf_vec = TfidfVectorizer(
    max_df=0.95,           # drop terms that occur in more than 95% of documents
    min_df=2,              # drop terms that occur in fewer than 2 documents
    stop_words='english',  # use the built-in English stop-word list
)

In [6]:
# Learn the vocabulary and IDF weights from the article texts and build the
# TF-IDF document-term matrix in one pass.
dtm = tf_vec.fit_transform(raw_documents=npr['Article'])

In [7]:
# Dimensions of the TF-IDF matrix: (number of articles, vocabulary size).
dtm.shape

(11992, 54777)

In [8]:
from sklearn.decomposition import NMF

In [11]:
# Factorize into seven topics; the fixed seed keeps the result reproducible
# across runs.
nmf_model = NMF(
    n_components=7,
    random_state=42,
    verbose=0,
)

In [12]:
# Fit the factorization on the TF-IDF matrix; the learned per-topic term
# weights are read from nmf_model.components_ further down.
nmf_model.fit(dtm)

In [14]:
# Spot-check the vocabulary: look up the term stored at column index 2000.
tf_vec.get_feature_names_out()[2000]

'africa'

In [16]:
# Print the 15 highest-weighted vocabulary terms of every fitted topic.
# Each row of components_ is one topic; each column is one vocabulary term.
#
# The vocabulary array is fetched once up front: get_feature_names_out()
# rebuilds the full array on every call, so calling it per printed word
# repeats that work for nothing.
feature_names = tf_vec.get_feature_names_out()

for index, topic in enumerate(nmf_model.components_):
    # Headings are 1-based (index+1), whereas the topic labels stored in
    # npr['Topic'] by a later cell are the 0-based row indices.
    print(f"TOP 15 WORDS FOR TOPIC {index+1}")
    # argsort() is ascending, so the last 15 positions are the strongest
    # terms, listed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print()

TOP 15 WORDS FOR TOPIC 1
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']

TOP 15 WORDS FOR TOPIC 2
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']

TOP 15 WORDS FOR TOPIC 3
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']

TOP 15 WORDS FOR TOPIC 4
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']

TOP 15 WORDS FOR TOPIC 5
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']

TOP 15 WORDS FOR TOPIC 6
['love', 've', 'don', 'album', 'way', 'time', 'song', 'life'

In [17]:
# Project every article onto the fitted topics: one row per article,
# one weight per topic.
topic_results = nmf_model.transform(dtm)

In [21]:
# Label each article with its highest-weighted topic. argmax returns the
# 0-based column index, so the labels run 0..6 while the printed
# "TOPIC n" headings run 1..7.
npr['Topic'] = topic_results.argmax(axis=1)

In [22]:
# Preview the first rows, now including the 'Topic' column.
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [23]:
# Number of articles assigned to each topic label.
npr['Topic'].value_counts()

5    3579
3    2808
0    2433
1    1295
4     641
6     635
2     601
Name: Topic, dtype: int64

In [None]:
"""
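# NOTE: npr['Topic'] holds 0-based topic indices (0-6), while the printed
# "TOPIC n" headings are 1-based. The dict needs a key for every value 0-6,
# otherwise the lambda below raises KeyError (e.g. for topic 0).
topic_dict = {1: 'Politics', 2:'Alpha', ....}
npr['Topic'].map(lambda value: topic_dict[value])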
"""