In [5]:
!pip install --upgrade gensim scipy

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import nltk
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd

# Download necessary resources
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Sample diverse corpus with different topics
documents = [
    "Python programming is used for web development and data analysis.",
    "Data science involves statistics, machine learning, and data mining.",
    "Soccer is a popular sport played worldwide, with teams in many countries.",
    "The stock market fluctuates with the economy and government policies.",
    "Healthy eating includes vegetables, fruits, and whole grains.",
    "The tech industry is constantly evolving with new advancements in AI and cloud computing.",
    "Running and swimming are good cardiovascular exercises.",
    "Apple's iPhone is one of the leading smartphones in the world.",
    "Football matches attract millions of viewers on television.",
    "AI algorithms are being use to automate tasks in various industries."
]

In [3]:
# Preprocessing the documents
stop_words = set(stopwords.words('english'))

def preprocessing(text):
  # Tokenizing and converting to lower case
  tokens = word_tokenize(text.lower())

  # Removing punctuation and stopwords
  tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

  return tokens

In [4]:
# Preprocess all documents
processed_docs = [preprocessing(doc) for doc in documents]

processed_docs

[['python', 'programming', 'used', 'web', 'development', 'data', 'analysis'],
 ['data',
  'science',
  'involves',
  'statistics',
  'machine',
  'learning',
  'data',
  'mining'],
 ['soccer',
  'popular',
  'sport',
  'played',
  'worldwide',
  'teams',
  'many',
  'countries'],
 ['stock', 'market', 'fluctuates', 'economy', 'government', 'policies'],
 ['healthy', 'eating', 'includes', 'vegetables', 'fruits', 'whole', 'grains'],
 ['tech',
  'industry',
  'constantly',
  'evolving',
  'new',
  'advancements',
  'ai',
  'cloud',
  'computing'],
 ['running', 'swimming', 'good', 'cardiovascular', 'exercises'],
 ['apple', "'s", 'iphone', 'one', 'leading', 'smartphones', 'world'],
 ['football', 'matches', 'attract', 'millions', 'viewers', 'television'],
 ['ai', 'algorithms', 'use', 'automate', 'tasks', 'various', 'industries']]

In [5]:
# Create dictionary
dictionary = corpora.Dictionary(processed_docs)

In [6]:
# Create a corpus: a list of Bag of Words (BoW)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [7]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(1, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)],
 [(27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)],
 [(34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1)],
 [(43, 1), (44, 1), (45, 1), (46, 1), (47, 1)],
 [(48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)],
 [(55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1)],
 [(35, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1)]]

In [8]:
# Create the LDA model
lda_model = gensim.models.LdaMulticore(corpus, num_topics=3, id2word=dictionary, passes=15)

In [9]:
# Get the topics discovered by LDA and make them more readable
topics = lda_model.print_topics(num_words=5)   # You can change num_words to show more or fewer words words per topic

In [10]:
topics

[(0,
  '0.027*"ai" + 0.026*"tech" + 0.026*"advancements" + 0.026*"cloud" + 0.026*"new"'),
 (1,
  '0.064*"data" + 0.025*"machine" + 0.025*"mining" + 0.025*"countries" + 0.025*"sport"'),
 (2,
  '0.039*"stock" + 0.039*"millions" + 0.039*"viewers" + 0.039*"football" + 0.039*"matches"')]

In [11]:
# Organize the topics into a more readable table
topic_words = []
for topic in topics:
  topic_num, words = topic
  words = [word.split("*") for word in words.split(" + ")]
  words = [(float(weight), word.strip().strip('"')) for weight, word in words]
  topic_words.append({"Topic": topic_num, "Words": words})

In [12]:
topic_words

[{'Topic': 0,
  'Words': [(0.027, 'ai'),
   (0.026, 'tech'),
   (0.026, 'advancements'),
   (0.026, 'cloud'),
   (0.026, 'new')]},
 {'Topic': 1,
  'Words': [(0.064, 'data'),
   (0.025, 'machine'),
   (0.025, 'mining'),
   (0.025, 'countries'),
   (0.025, 'sport')]},
 {'Topic': 2,
  'Words': [(0.039, 'stock'),
   (0.039, 'millions'),
   (0.039, 'viewers'),
   (0.039, 'football'),
   (0.039, 'matches')]}]

In [13]:
# Display topics as a DataFrame for better readability
topics_df = pd.DataFrame(topic_words)

In [14]:
topics_df

Unnamed: 0,Topic,Words
0,0,"[(0.027, ai), (0.026, tech), (0.026, advanceme..."
1,1,"[(0.064, data), (0.025, machine), (0.025, mini..."
2,2,"[(0.039, stock), (0.039, millions), (0.039, vi..."


In [15]:
# Assign topic titles based on the most frequent words in the topic
def get_topic_title(topic_words):
  topic_keywords = [word for _, word in topic_words]
  if 'python' in topic_keywords or 'programming' in topic_keywords:
    return "Programming & Data Science"
  elif 'soccer' in topic_keywords or 'football' in topic_keywords:
    return "Sports (Football/Soccer)"
  elif 'ai' in topic_keywords or 'cloud' in topic_keywords:
    return "Technology & AI"
  else:
    return "General"

In [16]:
# Add titles for each topic
topics_df['Topic Title'] = topics_df['Words'].apply(get_topic_title)

In [17]:
# Print the topics and their most important words
for idx, row in topics_df.iterrows():
  print(f"Topic {row['Topic']} - {row['Topic Title']}:")
  for weight, word in row['Words']:
    print(f"  - {word} (weight: {weight:.4f})")
  print("\n")

Topic 0 - Technology & AI:
  - ai (weight: 0.0270)
  - tech (weight: 0.0260)
  - advancements (weight: 0.0260)
  - cloud (weight: 0.0260)
  - new (weight: 0.0260)


Topic 1 - General:
  - data (weight: 0.0640)
  - machine (weight: 0.0250)
  - mining (weight: 0.0250)
  - countries (weight: 0.0250)
  - sport (weight: 0.0250)


Topic 2 - Sports (Football/Soccer):
  - stock (weight: 0.0390)
  - millions (weight: 0.0390)
  - viewers (weight: 0.0390)
  - football (weight: 0.0390)
  - matches (weight: 0.0390)




In [18]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [19]:
!pip install --upgrade pyLDAvis gensim



In [20]:
import pyLDAvis.gensim     # Import directly from pyLDAis
import pyLDAvis

In [21]:
# vis = pyLDAis. gensim_models.prepare(lda_model, corpus, dictionary)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, 'lda_visualization.html')

In [22]:
# Vizualization with pyLDAvis
from google.colab import files
files.download('lda_visualization.html')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>