In [11]:
%matplotlib inline


# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation

This is an example of applying `sklearn.decomposition.NMF` and
`sklearn.decomposition.LatentDirichletAllocation` to a corpus
of documents to extract additive models of the topic structure of the
corpus.  The output is a plot of topics, each represented as a bar plot
of its top few words, ranked by weight.

Non-negative Matrix Factorization is applied with two different objective
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.

The default parameters (n_samples / n_features / n_components) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is polynomial in NMF. In LDA, the time complexity is
proportional to (n_samples * iterations).


In [12]:
from time import time
from sklearn.datasets import fetch_20newsgroups

# Problem size. n_features is read by later cells (the CountVectorizer's
# max_features and the progress messages), so it has to be defined here for
# the notebook to run top-to-bottom on a fresh kernel.
n_samples = 2000
n_features = 1000

# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies

print("Loading dataset...")
t0 = time()
# return_X_y=True yields a (data, target) pair; the targets are not needed
# here, so they are discarded.
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)
# Keep only the first n_samples posts.
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 0.817s.


In [14]:
# Show the first post to see what the raw text looks like.
# (A cell displays only its final expression, so a single line is enough.)
data_samples[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

## Understanding vectorization

Following the tutorial at https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [15]:
# Experiment with CountVectorizer on a tiny corpus before applying it to the
# newsgroup posts.
from sklearn.feature_extraction.text import CountVectorizer

# Default settings; the trailing bare name displays the estimator's repr.
vectorizer = CountVectorizer()
vectorizer

CountVectorizer()

In [17]:
# Four tiny documents; the first and last share the same words in a different order.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
]

# Learn the vocabulary from the corpus and count term occurrences in one step.
X = vectorizer.fit_transform(corpus)
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [20]:
# Fitting the vectorizer tokenized the corpus and built its vocabulary.

# List the vocabulary terms; their order matches the columns of X.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [22]:
# X holds one row per document and one column per vocabulary term.
# We passed 4 documents, so X has 4 rows;
# the vocabulary has 9 distinct terms, so every row has 9 entries.
# Each entry (0, 1 or 2 here) is how many times that term occurs in that document.
X.toarray()

# Reading the first row against the vocabulary order:
# "and" occurs 0 times, "document" occurs once, "first" occurs once, ...

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [24]:
# The vocabulary built above is made of individual words,
# so it ignores the relative positions of the words.
#
# We can also put phrases in the vocabulary by using n-grams:
# for example, ngram_range=(1, 2) builds a vocabulary of all
# individual words plus all pairs of consecutive words.

In [25]:
# we didn't use stop words in the above

### Understanding tf-idf

Based on readings and examples from https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting

In [26]:
# The vectorization above turned raw text into a matrix of numbers:
# each row of the matrix represents one document.
#
# Every row has the same number of elements,
# one per word in the vocabulary built from all documents,
# and each element is the number of times that vocabulary word occurs in the document.
#
# This count matrix (bag-of-words) is the input for further analysis.
#
# tf-idf is a method that gives each term in each document a weight;
# we use TfidfTransformer to convert the count matrix into a weight matrix.
#
# The count matrix -> weight matrix transformation uses the formula:
#   term weight = term frequency * inverse document frequency
#   idf(t) = ln(n_documents / number of documents containing t) + 1   (with smooth_idf=False)
# i.e. a term's weight grows with its frequency in the document
# but shrinks the more documents of the corpus contain it.
# Each row is then scaled to unit Euclidean length, so the final weights lie between 0 and 1.

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer

# smooth_idf=False: per the scikit-learn tf-idf documentation this disables
# the add-one smoothing of document frequencies.
transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(smooth_idf=False)

In [30]:
# Re-weight the bag-of-words counts in X into tf-idf values.
tfidf = transformer.fit_transform(X)
# Densify for display only; this toy matrix is just 4x9.
tfidf.toarray()

array([[0.        , 0.43306685, 0.56943086, 0.43306685, 0.        ,
        0.        , 0.33631504, 0.        , 0.43306685],
       [0.        , 0.24014568, 0.        , 0.24014568, 0.        ,
        0.89006176, 0.18649454, 0.        , 0.24014568],
       [0.56115953, 0.        , 0.        , 0.        , 0.56115953,
        0.        , 0.23515939, 0.56115953, 0.        ],
       [0.        , 0.43306685, 0.56943086, 0.43306685, 0.        ,
        0.        , 0.33631504, 0.        , 0.43306685]])

## Returning to our code

In [33]:
# With the sample posts loaded, turn them into a tf-idf matrix.

from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary filters:
#   max_df       - drop words that appear in more than 95% of the documents
#   min_df       - drop words that appear in fewer than 2 documents
#   max_features - keep only the 1000 most frequent remaining words
#   stop_words   - drop common English words such as "the"
print("Extracting tf-idf features for NMF...")
tfidf_settings = {
    "max_df": 0.95,
    "min_df": 2,
    "max_features": 1000,
    "stop_words": "english",
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Time the combined vocabulary fit and transformation.
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Extracting tf-idf features for NMF...
done in 0.173s.


In [35]:
# Vocabulary terms kept by the fitted TfidfVectorizer.
tfidf_vectorizer.get_feature_names_out()

array(['00', '000', '10', '100', '11', '12', '128', '13', '130', '14',
       '15', '16', '17', '18', '19', '1992', '1993', '20', '200', '21',
       '22', '23', '24', '25', '250', '26', '27', '28', '29', '2nd', '30',
       '300', '31', '32', '33', '34', '35', '36', '37', '38', '3d', '40',
       '42', '43', '44', '45', '48', '49', '50', '500', '51', '55', '60',
       '66', '70', '72', '75', '80', '800', '86', '90', '92', '93', '__',
       'able', 'ac', 'accept', 'access', 'according', 'act', 'action',
       'actually', 'add', 'added', 'addition', 'address',
       'administration', 'advance', 'age', 'ago', 'agree', 'aids', 'air',
       'al', 'allow', 'allowed', 'alt', 'america', 'american', 'amiga',
       'analysis', 'anonymous', 'answer', 'answers', 'anti', 'anybody',
       'apartment', 'appears', 'apple', 'application', 'applications',
       'apply', 'appreciated', 'approach', 'appropriate', 'apr', 'april',
       'archive', 'area', 'areas', 'aren', 'argument', 'armenia',
  

In [34]:
# Dense view of the tf-idf matrix, for inspection only.
tfidf.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.08365563, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.12574832, 0.04605022,
        0.06032677],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [36]:
# LDA is fitted on raw term counts (tf) rather than tf-idf weights, so build a
# plain CountVectorizer as well.
# NOTE: relies on n_features having been defined by an earlier cell.
print("Extracting tf features for LDA...")
tf_settings = {
    "max_df": 0.95,
    "min_df": 2,
    "max_features": n_features,
    "stop_words": "english",
}
tf_vectorizer = CountVectorizer(**tf_settings)

# Time the combined vocabulary fit and counting.
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

tf.toarray()

Extracting tf features for LDA...
done in 0.187s.


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 3, 1, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:

from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_components = 10  # passed as n_components to every NMF / LDA model below
n_top_words = 20  # number of highest-weighted words plotted per topic


def plot_top_words(model, feature_names, n_top_words, title):
    """Plot the highest-weighted words of every topic as horizontal bar charts.

    Parameters
    ----------
    model : fitted estimator exposing ``components_``
        Each row of ``model.components_`` is drawn as one topic; each column
        index is looked up in ``feature_names``.
    feature_names : sequence of str
        Labels for the bars, indexed by column position.
    n_top_words : int
        Number of highest-weighted words drawn per topic.
    title : str
        Figure-level title.

    The grid is five panels wide and grows by rows, so ten topics give the
    usual 2x5 layout while other topic counts no longer overflow the grid.
    """
    n_cols = 5
    n_topics = len(model.components_)
    # Ceiling division; keep at least one row so plt.subplots stays valid.
    n_rows = max(1, -(-n_topics // n_cols))

    # 7.5 inches per row reproduces the (30, 15) figure for two rows.
    # squeeze=False guarantees a 2-D axes array even for a single row.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        # Put the heaviest word at the top of each panel.
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    # Hide panels of the last row that have no topic to show.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # The title belongs to the figure, so set it once rather than per topic.
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()






# Fit the NMF model (Frobenius norm) on the tf-idf matrix.
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
)

# The single `alpha` keyword was deprecated in scikit-learn 1.0 and later
# removed, so passing it raises TypeError on current releases. Its
# replacements, alpha_W and alpha_H, are documented as being multiplied by
# n_features and n_samples respectively; dividing the former alpha=0.1 by the
# matrix dimensions is meant to keep the same penalty strength.
# NOTE(review): confirm the scaling against the installed version's NMF docs.
n_docs, n_terms = tfidf.shape

t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    alpha_W=0.1 / n_terms,
    alpha_H=0.1 / n_docs,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


# Label the bars with the vocabulary of the vectorizer that produced tfidf.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
)

# Fit the NMF model again, this time minimizing the generalized
# Kullback-Leibler divergence (requires the multiplicative-update solver).
print(
    "\n" * 2,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)

# The single `alpha` keyword was deprecated in scikit-learn 1.0 and later
# removed, so passing it raises TypeError on current releases. Its
# replacements, alpha_W and alpha_H, are documented as being multiplied by
# n_features and n_samples respectively; dividing the former alpha=0.1 by the
# matrix dimensions is meant to keep the same penalty strength.
# NOTE(review): confirm the scaling against the installed version's NMF docs.
n_docs, n_terms = tfidf.shape

t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss="kullback-leibler",
    solver="mu",
    max_iter=1000,
    alpha_W=0.1 / n_terms,
    alpha_H=0.1 / n_docs,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# Label the bars with the vocabulary of the vectorizer that produced tfidf.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)

# Announce the LDA run; the two leading newlines separate it from the
# previous model's output.
lda_banner = (
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features)
)
print("\n" * 2, lda_banner)

# Online variational LDA, fitted on the raw term-count matrix.
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)

# Time the fit on its own.
t0 = time()
lda.fit(tf)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

# Label the bars with the vocabulary of the count vectorizer that produced tf.
tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")