In [None]:
# Lab Assignment 5: Topic Modeling using Gensim
# - Use the Gensim library to implement Latent Dirichlet Allocation (LDA).
# - Train an LDA model on a dataset of news articles.
# - Extract and visualize the top words from each topic.

In [None]:
# Install only necessary packages (without downgrades)
!pip install gensim pyLDAvis nltk spacy --quiet
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import nltk
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import spacy
import pandas as pd
import re
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
# Fetch the NLTK stop-word corpus that `stopwords.words(...)` reads from.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Sample dataset (you can replace it with your own news article data).
# Ten short, single-sentence snippets; every string in this list is treated
# as one separate document by the code that iterates over `documents`.
documents = [
    "The economy is showing signs of recovery after the financial crisis.",
    "Football teams are preparing for the upcoming world cup season.",
    "The government announced a new policy on renewable energy.",
    "Scientists have discovered a new species in the Amazon rainforest.",
    "A major earthquake has struck the city, causing massive damage.",
    "The stock market has hit a new high with tech companies leading.",
    "New studies suggest eating vegetables can improve health.",
    "Political tensions are rising between neighboring countries.",
    "SpaceX launched another satellite into orbit successfully.",
    "Healthcare workers are demanding better working conditions."
]

In [None]:
# Load the spaCy English model. Only tokenization and lemmatization are used
# in this notebook, so the dependency parser and the named-entity recognizer
# are disabled to avoid running components whose output is never read.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# English stop words from NLTK, stored as a set for O(1) membership tests.
stop_words = set(stopwords.words("english"))

def preprocess(text, *, pipeline=None, stopword_set=None):
    """Turn one raw document into a list of cleaned lemmas.

    The text is lower-cased, run through the spaCy pipeline, and each token's
    lemma is kept only if the token is purely alphabetic and neither its
    surface form nor its lemma is a stop word or shorter than three characters.

    Filtering on the lemma as well as the surface form matters because the
    lemma is what gets returned: a word such as "done" is not itself in the
    stop-word list, but its lemma "do" is, and would otherwise leak through.

    Args:
        text: The raw document string.
        pipeline: Optional callable mapping a string to an iterable of tokens
            exposing ``text``, ``lemma_`` and ``is_alpha``. Defaults to the
            module-level ``nlp`` spaCy pipeline.
        stopword_set: Optional container of words to drop. Defaults to the
            module-level ``stop_words`` set.

    Returns:
        A list of lemma strings, in document order.
    """
    run_pipeline = nlp if pipeline is None else pipeline
    blocked = stop_words if stopword_set is None else stopword_set

    lemmas = []
    for token in run_pipeline(text.lower()):  # Lowercase + tokenize with spacy
        if not token.is_alpha:
            continue
        lemma = token.lemma_
        # Check both forms: the surface form (as before) and the lemma we emit.
        if token.text in blocked or lemma in blocked:
            continue
        if len(token.text) <= 2 or len(lemma) <= 2:
            continue
        lemmas.append(lemma)
    return lemmas

# Run every raw document through `preprocess`, keeping the original order.
processed_docs = list(map(preprocess, documents))

In [None]:
# Build the token <-> integer-id vocabulary from the preprocessed documents.
dictionary = corpora.Dictionary(processed_docs)

# Convert each tokenized document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [None]:
# Fit a 3-topic LDA model on the bag-of-words corpus (10 passes, fixed
# random_state so repeated runs use the same seed).
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=42,
    passes=10,
)

# Show each topic id together with its weighted top-word summary string.
for topic_id, topic_terms in lda_model.print_topics():
    print(f"Topic #{topic_id}: {topic_terms}")

Topic #0: 0.059*"new" + 0.034*"announce" + 0.034*"energy" + 0.034*"renewable" + 0.034*"policy" + 0.034*"government" + 0.034*"show" + 0.034*"economy" + 0.034*"crisis" + 0.034*"sign"
Topic #1: 0.059*"new" + 0.034*"stock" + 0.034*"discover" + 0.034*"scientist" + 0.034*"specie" + 0.034*"market" + 0.034*"amazon" + 0.034*"rainforest" + 0.034*"high" + 0.034*"hit"
Topic #2: 0.029*"football" + 0.029*"season" + 0.029*"world" + 0.029*"strike" + 0.029*"city" + 0.029*"cup" + 0.029*"cause" + 0.029*"prepare" + 0.029*"team" + 0.029*"massive"


In [None]:
# Render the interactive pyLDAvis view of the trained topics inline.
pyLDAvis.enable_notebook()

# Prepare the visualization data from the model, corpus and vocabulary.
vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
vis