In [2]:
%matplotlib inline


# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation

This is an example of applying :class:`~sklearn.decomposition.NMF` and
:class:`~sklearn.decomposition.LatentDirichletAllocation` on a corpus
of documents and extract additive models of the topic structure of the
corpus.  The output is a plot of topics, each represented as bar plot
using top few words based on weights.

Non-negative Matrix Factorization is applied with two different objective
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.

The default parameters (n_samples / n_features / n_components) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is polynomial in NMF. In LDA, the time complexity is
proportional to (n_samples * iterations).


In [3]:
from time import time
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000

# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies

print("Loading dataset...")
t0 = time()
bunch = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
)

data_samples = bunch.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 0.731s.


In [4]:
data_samples
data_samples[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [5]:
bunch.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
labels_samples = bunch.target[:n_samples]
labels_samples

array([17,  0, 17, ...,  3, 15,  9])

## Understanding vectorization

Following tutorial from https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [7]:
# playing with countvector
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer

CountVectorizer()

In [8]:
corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',
]

X = vectorizer.fit_transform(corpus)
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [9]:
# the vectorizer has tokenized all the words (vocabullary)

# this will show the actual names
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [10]:
# the X represents presence of each word in each document
# We passed 4 documents, therefore X has 4 rows
# each row contains 9 elements because there are 9 total words
# the value 0,1,2 show the count of number of times that word is present in that document
X.toarray()

# in first document:
# and is present 0 times, document is present 1 time, first is present 1 time

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [11]:
# we created a vocabulary of individual words in the above
# it ignored relative positions of the words
#
# we can create a vocabulary of phrases as well by using n_grams
# for example, an n_gram of 2 will create a dictionary of all
# individual words as well as all 2 consecutive words

In [12]:
# we didn't use stop words in the above

### Understanding tf-idf

Based on readings and examples from https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting

In [13]:
# we used the above vectorisation strategy to convert a raw text
# into a matrix of numbers
# each row in the matrix represents the document row.
#
# each row has equal number of elements
# the number of elements is the dictionary of all words
# the number in each row represents the count of number of times the dictionary word is used in document
#
# now we can use this matrix of numbers (bag-of-words) to do further analysis
#
# tf-idf is a method to give weighted information for each term in each document
# we use tfidf-transformer to convert the count matrix into weight matrix
#
# this transformation from count matrix -> to weighted matrix is done using formula:
# term weight = term-frequency * 1 / uniqueness of term
# i.e. terms-weight is directly proportional to its frequency
# but inversely proportional to how frequently it is found in OTHER documents
# this final term-weight is a value between 0 to 1.

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(smooth_idf=False)

In [15]:
# convert X bag-of-words into weighted average
tfidf = transformer.fit_transform(X)
tfidf.toarray()

array([[0.        , 0.43306685, 0.56943086, 0.43306685, 0.        ,
        0.        , 0.33631504, 0.        , 0.43306685],
       [0.        , 0.24014568, 0.        , 0.24014568, 0.        ,
        0.89006176, 0.18649454, 0.        , 0.24014568],
       [0.56115953, 0.        , 0.        , 0.        , 0.56115953,
        0.        , 0.23515939, 0.56115953, 0.        ],
       [0.        , 0.43306685, 0.56943086, 0.43306685, 0.        ,
        0.        , 0.33631504, 0.        , 0.43306685]])

## Using k-means for clustering

In [None]:
true_k = np.unique(labels).shape[0]

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer


t0 = time()
vectorizer = TfidfVectorizer(
        max_df=0.5,
#         max_features=10000,
        min_df=2,
        stop_words="english",
        use_idf=True,
    )
X = vectorizer.fit_transform(data_samples)

print("done in %fs" % (time() - t0))
print("samples, features =", X.shape)

done in 0.171120s
samples, features = (2000, 12230)


In [85]:
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# reduce number of features (dimensions) using SVD (latent semantic analysis)

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD()
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print(
    "Explained variance of the SVD step: {}%".format(int(explained_variance * 100))
)

print('samples, features =', X.shape)

done in 0.087831s
Explained variance of the SVD step: 6%
samples, features = (2000, 2)


In [17]:
X

array([[ 0.86710376, -0.49812756],
       [ 0.97153074, -0.23691355],
       [ 0.933823  , -0.35773538],
       ...,
       [ 0.74689405,  0.66494307],
       [ 0.88778965, -0.46024943],
       [ 0.92892824, -0.37025982]])

## Trying topic extraction on announcements

Generating dump

```python
from announcements.models import BseAnnouncement
import datetime as dt

announcements = (
    BseAnnouncement.objects.filter(date__gt=dt.date.today() - dt.timedelta(days=365))
    .values('company_code__name', 'date', 'subject', 'headline', 'more')
)
print(len(announcements))

import json
from django.core.serializers.json import DjangoJSONEncoder
qs_json = json.dumps(list(announcements), indent=4, sort_keys=True, cls=DjangoJSONEncoder)

with open('/home/ubuntu/anns.json', 'w') as f:
    f.write(qs_json)
```

In [16]:
import json

with open('anns.json') as f: 
    announcements = json.load(f)
announcements[:3]

[{'company_code__name': 'Arvind Fashions Ltd',
  'date': '2021-03-14T12:57:41.197',
  'headline': 'Pursuant to Regulation 39(3) of the SEBI (Listing Obligations and Disclosure Requirements) Regulations, 2015, we wish to inform you that the Company has received requests for issue of duplicate share certificates in lieu of the old certificates which have been reported lost/misplaced by the shareholders.',
  'more': '',
  'subject': 'Compliances-Reg. 39 (3) - Details of Loss of Certificate / Duplicate Certificate'},
 {'company_code__name': 'Confidence Petroleum India Ltd',
  'date': '2021-03-14T13:09:18.373',
  'headline': 'With reference to above and further to our intimation date 27th February, 2021, this is to inform you that warrant holders (i.e. Promoter Group entities) have exercised their right for conversion of the warrants into equity shares of the Company. Consequently, the Board of Directors of the Company at their meeting held today i.e. on 14th March, 2021 has allotted 1,01,7

In [107]:
def _get_ann_content(ann):
    headline = ann['headline'] if len(ann['headline']) > len(ann['more']) else ann['more']
    return f"{ann['subject']}\n\n{headline}"

data_raw = [_get_ann_content(ann) for ann in announcements]
data_raw[:3]

['Compliances-Reg. 39 (3) - Details of Loss of Certificate / Duplicate Certificate\n\nPursuant to Regulation 39(3) of the SEBI (Listing Obligations and Disclosure Requirements) Regulations, 2015, we wish to inform you that the Company has received requests for issue of duplicate share certificates in lieu of the old certificates which have been reported lost/misplaced by the shareholders.',
 'Announcement under Regulation 30 (LODR)-Allotment\n\nWith reference to above and further to our intimation date 27th February, 2021, this is to inform you that warrant holders (i.e. Promoter Group entities) have exercised their right for conversion of the warrants into equity shares of the Company. Consequently, the Board of Directors of the Company at their meeting held today i.e. on 14th March, 2021 has allotted 1,01,76,923 equity shares of face value of Re. 1/- each (with a premium of Rs. 51/- per equity share) pursuant to exercise of 1,01,76,923 convertible warrants.<BR> <BR> These equity shar

In [153]:
import lxml.html
import re

replacements = {
    # lodr
    'listing obligations and disclosure requirements': 'lodr',
    '(lodr)': 'lodr',
    'regulations, 2015': 'regulations 2015',
    'sebi lodr regulations 2015': 'lodr',
    # sebi
    'Securities And Exchange Board Of India': 'sebi',
    'Substantial Acquisition of Shares & Takeovers': 'sast',
    'Insider Trading Prohibition Code': 'itpc',
}

re_replacements = {re.compile(re.escape(k), re.IGNORECASE): v for k,v in replacements.items()}

def _clean_doc(text):
    # remove html tags
    text = lxml.html.fromstring(text).text_content()

    # handle abbreviations
    for phrase, replacement in re_replacements.items():
        text = phrase.sub(replacement, text)
    return text
    
t0 = time()
data_clean = [_clean_doc(row) for row in data_raw]
print('time taken %fs' % (time() - t0))

data_clean[:3]
data = data_clean

time taken 9.620337s


## Define some good defaults

In [154]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

## Find most common phrases

In [155]:
from sklearn.feature_extraction.text import CountVectorizer

t0 = time()
count_vectorizer = CountVectorizer(max_df=.5, min_df=50, ngram_range=(2, 3), stop_words=stop_words)
X = count_vectorizer.fit_transform(data)

print('time taken %fs' % (time() - t0))

time taken 13.734051s


In [156]:
print('shape', X.shape)

shape (288726, 29453)


In [157]:
count_phrases = X.sum(axis=0)
count_phrases

matrix([[ 598,  511, 1664, ...,  153,  153,   51]])

In [158]:
phrase_freq = [(phrase, count_phrases[0, idx]) for phrase, idx in count_vectorizer.vocabulary_.items()]

In [160]:
phrase_freq = sorted(phrase_freq, key = lambda x: (x[0].count(' '), -x[1]), reverse=True)
phrase_freq

[('sub press release', 50),
 ('pm concluded 45', 50),
 ('regard attached kindly', 50),
 ('visual means transact', 50),
 ('directors company based', 50),
 ('read relevant rules', 50),
 ('bank ltdhas informed', 50),
 ('2021 approved adopted', 50),
 ('said shares rank', 50),
 ('copy disclosure required', 50),
 ('certificate shall issued', 50),
 ('displaying details request', 50),
 ('english editions jansatta', 50),
 ('meeting information records', 50),
 ('intimation separate meeting', 50),
 ('share certificate also', 50),
 ('2021 wherein intimated', 50),
 ('company continue remain', 50),
 ('list stop transfer', 50),
 ('auditors company fill', 50),
 ('held consideration approval', 50),
 ('intimation towards loss', 50),
 ('would like reiterate', 50),
 ('wish inform registrar', 50),
 ('vide circular sebi', 50),
 ('pm consider approve', 50),
 ('agents link intime', 50),
 ('open offer updates', 50),
 ('due exigencies information', 50),
 ('may kindly taken', 50),
 ('name depository registered',

In [161]:
def _print_samples(data, phrase):
    print([row for row in data if phrase in row.lower()][:3])
    
_print_samples(data[:10000], 'trading prohibitio')

["Closure of Trading Window\n\nThis is to inform you that in compliance with the sebi (Prohibition of Insider Trading) regulations 2015, and in terms of PCBL's 'Insider Trading Prohibition Code', the trading window for trading in the securities of the Company will remain closed for all the Designated Employees of the Company and their Immediate Relatives from 1st April, 2021 till 48 hours after the declaration of the audited financial results of the Company for the quarter and financial year ending 31st March, 2021. Request you to take the afore-mentioned information in record and oblige.      ", "Closure of Trading Window\n\nPursuant to Securities & Exchange Board of India (Prohibition of Insider Trading) regulations 2015 and Insider Trading Prohibition Code of the Company, the Trading Window shall be closed for all the Designated Persons of the Company as defined in the Code for dealing in securities of the Company with effect from April 1, 2021 till expiry of 48 hours after the decl

## Extract clusters

In [29]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/pratyush/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [71]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from functools import lru_cache

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords

stemmer = WordNetLemmatizer()

@lru_cache(maxsize=1024)
def _get_stem(word):
    return stemmer.lemmatize(word)

class LemmaTokenizer:

    def __call__(self, doc):
        return [
            _get_stem(t)
            for t in wordpunct_tokenize(doc)
            if t not in stop_words
        ]

t0 = time()
vectorizer = TfidfVectorizer(
        max_df=0.5,
        min_df=50,
        use_idf=True,
        tokenizer=LemmaTokenizer(),
    )
X = vectorizer.fit_transform(data)

print("done in %fs" % (time() - t0))
print("samples, features =", X.shape)

done in 36.797934s
samples, features = (288726, 4023)


In [73]:
print(data[0])
vectorizer.inverse_transform(X[0])

Compliances-Reg. 39 (3) - Details of Loss of Certificate / Duplicate Certificate

Pursuant to Regulation 39(3) of the SEBI (Listing Obligations and Disclosure Requirements) Regulations, 2015, we wish to inform you that the Company has received requests for issue of duplicate share certificates in lieu of the old certificates which have been reported lost/misplaced by the shareholders.


[array(['shareholder', 'misplaced', 'lost', 'reported', 'old', 'lieu',
        'share', 'issue', 'request', 'received', 'inform', 'wish', '2015',
        'requirement', 'disclosure', 'obligation', 'listing', 'sebi',
        'pursuant', 'duplicate', 'certificate', 'loss', 'detail', '3',
        '39', 'reg', 'compliance'], dtype='<U31')]

In [74]:
def _inverse_transform_weight(vectorizer, row):
    names = vectorizer.get_feature_names_out()
    row = row.toarray()[0]
    name_vals = sorted(zip(row, names), reverse=True)
    return name_vals[:8]

_inverse_transform_weight(vectorizer, X[0])

[(0.45853668214260324, 'certificate'),
 (0.3091277638144188, 'duplicate'),
 (0.30784764785630025, '39'),
 (0.2917411448251722, 'old'),
 (0.2523860704775366, 'misplaced'),
 (0.23791789025335514, 'lieu'),
 (0.2349729333255911, 'reported'),
 (0.2322382775654531, 'lost')]

In [75]:
print(data[1])
_inverse_transform_weight(vectorizer, X[1])

Announcement under Regulation 30 (LODR)-Allotment

With reference to above and further to our intimation date 27th February, 2021, this is to inform you that warrant holders (i.e. Promoter Group entities) have exercised their right for conversion of the warrants into equity shares of the Company. Consequently, the Board of Directors of the Company at their meeting held today i.e. on 14th March, 2021 has allotted 1,01,76,923 equity shares of face value of Re. 1/- each (with a premium of Rs. 51/- per equity share) pursuant to exercise of 1,01,76,923 convertible warrants.<BR> <BR> These equity shares allotted on conversion of the warrants, shall rank pari passu, in all respects with the existing equity shares.<BR> <BR> Post the allotment of equity shares, the paid up equity shares of the Company has increased from INR 27,38,35,000/- (Rupees Twenty Seven Crore Thirty Eight Lakhs Thirty Five Thousand only) to INR 28,40,11,923/- (Rupees Twenty Eight Crore Forty Lakhs Eleven Thousand Nine Hun

[(0.3408888867106639, 'equity'),
 (0.33820467409049354, 'warrant'),
 (0.25158660904341285, 'twenty'),
 (0.22031012587188634, 'share'),
 (0.20522785789878936, '76'),
 (0.18753339942112654, 'conversion'),
 (0.1771640479800399, 'thirty'),
 (0.17380316552026637, 'thousand')]

In [77]:
print(data[2])
_inverse_transform_weight(vectorizer, X[2])

General Disclosure Under The Securities And Exchange Board Of India (Listing Obligations And Disclosure Requirements) Regulations, 2015 - 3B Binani Glassfibre SARL

General Disclosure under the Securities and Exchange Board of India (Listing Obligations and Disclosure Requirements) Regulations, 2015 - 3B Binani Glassfibre SARL


[(0.4591053022305802, 'disclosure'),
 (0.37457003306144, 'general'),
 (0.3423587474754493, 'india'),
 (0.3394602489975474, 'security'),
 (0.311154541512066, 'exchange'),
 (0.26345918850079786, 'obligation'),
 (0.26207377809477694, 'requirement'),
 (0.25654820106826776, 'listing')]

In [78]:
print(data[3])
_inverse_transform_weight(vectorizer, X[3])

Revised Result For Quarter And Nine Months Ended On 31.12.2020 In Format Asked Through Mail

Dear Sir,<BR> <BR> Please find enclosed herewith the revised Results for the quarter and nine months ended on 31.12.2020 as asked by you through mail and proforma approved through mail.<BR> <BR> Thanking You with Regards<BR> Yours Faithfully <BR> For  Rishab Special Yarns Limited<BR> <BR> <BR> <BR> Managing Director<BR> DIN: 00349697<BR>


[(0.49861799900604004, 'br'),
 (0.41198135239862016, '>'),
 (0.3649949605910017, 'mail'),
 (0.36056216133124364, '<'),
 (0.203550675320016, 'revised'),
 (0.16668978920390606, 'nine'),
 (0.16301968047322657, 'yarn'),
 (0.15921490242243477, 'month')]

## Returning to our code

In [33]:
# now data we have a sample data,
# lets convert it to a tf-idf vector

from sklearn.feature_extraction.text import TfidfVectorizer

# exclude common English words
# also exclude words only one document
# or in at least 95% of the documents
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, # ignore words which are present in over 95% documents
    min_df=2, # ignore words which are present in less than 2 documents
    max_features=1000, # consider only the top 1000 words by frequency
    stop_words="english", # ignore common english words such as the
)

t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 0.173s.


In [35]:
tfidf_vectorizer.get_feature_names_out()

array(['00', '000', '10', '100', '11', '12', '128', '13', '130', '14',
       '15', '16', '17', '18', '19', '1992', '1993', '20', '200', '21',
       '22', '23', '24', '25', '250', '26', '27', '28', '29', '2nd', '30',
       '300', '31', '32', '33', '34', '35', '36', '37', '38', '3d', '40',
       '42', '43', '44', '45', '48', '49', '50', '500', '51', '55', '60',
       '66', '70', '72', '75', '80', '800', '86', '90', '92', '93', '__',
       'able', 'ac', 'accept', 'access', 'according', 'act', 'action',
       'actually', 'add', 'added', 'addition', 'address',
       'administration', 'advance', 'age', 'ago', 'agree', 'aids', 'air',
       'al', 'allow', 'allowed', 'alt', 'america', 'american', 'amiga',
       'analysis', 'anonymous', 'answer', 'answers', 'anti', 'anybody',
       'apartment', 'appears', 'apple', 'application', 'applications',
       'apply', 'appreciated', 'approach', 'appropriate', 'apr', 'april',
       'archive', 'area', 'areas', 'aren', 'argument', 'armenia',
  

In [34]:
tfidf.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.08365563, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.12574832, 0.04605022,
        0.06032677],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [36]:
# we might need simple (unweighted) countVector as well
#
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

tf.toarray()

Extracting tf features for LDA...
done in 0.187s.


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 3, 1, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:

from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_components = 10
n_top_words = 20


def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()






# Fit the NMF model
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features)
)
t0 = time()
nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, l1_ratio=0.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))


tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)"
)

# Fit the NMF model
print(
    "\n" * 2,
    "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss="kullback-leibler",
    solver="mu",
    max_iter=1000,
    alpha=0.1,
    l1_ratio=0.5,
).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf,
    tfidf_feature_names,
    n_top_words,
    "Topics in NMF model (generalized Kullback-Leibler divergence)",
)

print(
    "\n" * 2,
    "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
    % (n_samples, n_features),
)
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model")