In [None]:
!pip install gensim nltk



In [None]:
from gensim.models import LdaModel, CoherenceModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
import nltk, re

In [None]:
# Fetch NLTK's stop-word corpus, then keep the English list as a set so that
# membership checks during tokenisation are fast.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups posts, asking scikit-learn to strip headers, footers and quotes.
newsgroups = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
)
documents = newsgroups.data  # the raw texts that the rest of the notebook tokenises

In [None]:
def clean_text(text):
  """Lower-case `text` and split it into filtered word tokens.

  Every run of non-word characters (regex ``\\W+``) is replaced by a single
  space before splitting on whitespace. A token is kept only when it is longer
  than three characters and is not in the module-level ``stop_words`` set.

  Returns the surviving tokens as a list, in their original order.
  """
  normalized = re.sub(r'\W+', ' ', text.lower())
  kept = []
  for token in normalized.split():
    if len(token) > 3 and token not in stop_words:
      kept.append(token)
  return kept

In [None]:
# Run every raw document through clean_text, giving one token list per document.
tokenized_docs = list(map(clean_text, documents))

In [None]:
# Build the vocabulary from the token lists.
dictionary = Dictionary(tokenized_docs)

# Prune the vocabulary in place: drop tokens found in fewer than 10 documents
# or in more than 90% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.9)

# Convert each token list into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, tokenized_docs))

In [None]:
# Fit an 8-topic LDA model on the bag-of-words corpus.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,   # lets the model report words instead of integer ids
    num_topics=8,
    alpha='auto',
    per_word_topics=True,
    random_state=42,      # fixed seed so repeated runs use the same random state
)



In [None]:
# Print the top words of every topic, numbering topics from 1.
print("\nTOP WORDS PER TOPIC:\n")
for topic_id, word_probs in lda_model.show_topics(formatted=False):
    # Each entry is a (word, weight) pair; only the word is shown.
    top_words = ' | '.join(term for term, _ in word_probs)
    print(f"Topic: {topic_id + 1}:{top_words}")


TOP WORDS PER TOPIC:

Topic: 1:system | used | information | data | computer | also | would | need | space | high
Topic: 2:jesus | would | like | also | bible | people | time | church | christ | book
Topic: 3:like | people | would | right | team | time | well | could | year | know
Topic: 4:file | windows | thanks | program | like | also | card | available | files | using
Topic: 5:people | president | know | said | would | government | time | think | armenian | first
Topic: 6:would | know | drive | like | could | also | time | think | anyone | even
Topic: 7:space | would | 1993 | year | april | national | games | nasa | also | first
Topic: 8:would | people | think | good | like | many | even | well | make | believe


In [None]:
# Score the fitted topics with the c_v coherence measure, using the tokenised
# documents and the same dictionary the model was trained with.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score:.4f}")
# Rule of thumb used in this notebook: a coherence score above 0.3 is considered good.

Coherence Score: 0.4904


In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [None]:
import pyLDAvis.gensim_models
import pyLDAvis

In [None]:
# Prepare the pyLDAvis data from the fitted model, its corpus and its dictionary.
# sort_topics=False asks pyLDAvis not to reorder the topics.
vis_Data = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    dictionary,
    sort_topics=False,
)

In [None]:
# Switch pyLDAvis into notebook mode first, then display the prepared visualisation.
# The display call is the cell's last expression, so its result becomes the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_Data)