Paragraphs 

In [1]:

data_mining = """Data mining is the process of discovering patterns, 
trends, and relationships in large datasets by analyzing data from multiple perspectives. 
It uses techniques from statistics, machine learning, and database systems to extract valuable 
insights that can inform decision-making. Often used in fields such as marketing, healthcare, 
and finance, data mining identifies anomalies, clusters, and associations that might otherwise go unnoticed. 
Tools like decision trees, clustering algorithms, and association rule mining enable businesses to improve strategies,
 such as predicting customer behavior or detecting fraud."""

machine_learning = """Machine learning is a subset of artificial intelligence that allows systems to 
learn and improve from experience without being explicitly programmed. By utilizing algorithms that process 
and analyze data, machine learning models can perform tasks like classification, regression, and clustering. 
Applications range from personalized recommendations on streaming platforms to real-time language translation. 
The key to successful machine learning lies in providing high-quality data and selecting appropriate algorithms, 
such as neural networks for deep learning or support vector machines for classification tasks."""

meta_heuristics = """Metaheuristics are high-level strategies designed to solve complex optimization 
problems that are difficult to address with traditional methods. These techniques, such as genetic algorithms, 
simulated annealing, and particle swarm optimization, provide approximate solutions by exploring the search 
space efficiently. Metaheuristics are particularly useful for problems in logistics, engineering design, 
and scheduling, where the objective is to maximize or minimize certain criteria. Their ability to avoid local 
optima and handle large problem sizes makes them indispensable in fields requiring innovative problem-solving approaches."""

vision = """In the context of artificial intelligence, vision refers to the capability of machines to 
interpret and understand visual data from the world. Computer vision, a prominent area of study, enables systems 
to analyze images and videos to perform tasks like object detection, facial recognition, and image segmentation. 
Using deep learning models, such as convolutional neural networks (CNNs), vision systems are applied in industries 
like healthcare for medical imaging, autonomous vehicles for navigation, and retail for inventory management. 
Advances in vision continue to push the boundaries of machine perception, making systems increasingly adept 
at mimicking human sight."""


Preprocessing

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")
from gensim import corpora

In [50]:
paragraphs = [data_mining, machine_learning, meta_heuristics, vision]
texts = []
for p in paragraphs:
    text = []
    doc = nlp(p.lower().replace("\n", " "))
    for w in doc:
        if not w.is_stop and not w.is_punct and not w.like_num and len(w.text) > 2:
            text.append(w.lemma_)
    texts.append(text)

dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

corpus = [dictionary.doc2bow(text) for text in texts]

{'algorithm': 0, 'analyze': 1, 'anomaly': 2, 'association': 3, 'behavior': 4, 'business': 5, 'cluster': 6, 'clustering': 7, 'customer': 8, 'data': 9, 'database': 10, 'dataset': 11, 'datum': 12, 'decision': 13, 'detect': 14, 'discover': 15, 'enable': 16, 'extract': 17, 'field': 18, 'finance': 19, 'fraud': 20, 'healthcare': 21, 'identifie': 22, 'improve': 23, 'inform': 24, 'insight': 25, 'large': 26, 'learning': 27, 'like': 28, 'machine': 29, 'making': 30, 'marketing': 31, 'mining': 32, 'multiple': 33, 'pattern': 34, 'perspective': 35, 'predict': 36, 'process': 37, 'relationship': 38, 'rule': 39, 'statistic': 40, 'strategy': 41, 'system': 42, 'technique': 43, 'tool': 44, 'tree': 45, 'trend': 46, 'unnoticed': 47, 'use': 48, 'valuable': 49, 'allow': 50, 'application': 51, 'appropriate': 52, 'artificial': 53, 'classification': 54, 'deep': 55, 'experience': 56, 'explicitly': 57, 'high': 58, 'intelligence': 59, 'key': 60, 'language': 61, 'learn': 62, 'lie': 63, 'model': 64, 'network': 65, 'ne

Latent Dirichlet allocation (LDA)

In [None]:
from gensim.models import CoherenceModel
from gensim.models import LdaModel
import numpy as np

alpha_values = list(np.arange(0.01, 1.01, 0.3)) + ['symmetric', 'asymmetric']
eta_values = list(np.arange(0.01, 1.01, 0.3)) + ['symmetric']
coherence_scores = []

for alpha in alpha_values:
    for eta in eta_values:
        ldamodel = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=4,
            random_state=42,
            alpha=alpha,
            eta=eta,
            passes=10
        )
        coherence_model_lda = CoherenceModel(
            model=ldamodel,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_score = coherence_model_lda.get_coherence()
        coherence_scores.append((alpha, eta, coherence_score))


best_params = max(coherence_scores, key=lambda x: x[2])
print(f"Best Alpha: {best_params[0]}, Best Eta: {best_params[1]}, Coherence Score: {best_params[2]}")


Best Alpha: asymmetric, Best Eta: 0.01, Coherence Score: 0.9001317374368984


In [33]:
'''
---------------------------------
num_topics=3: three main topics and machine learning overlap with all of them 
---------------------------------
'''

ldamodel = LdaModel(corpus=corpus, num_topics=3, id2word=dictionary, eta=0.01,alpha='asymmetric')
ldamodel.show_topics()


[(0,
  '0.031*"problem" + 0.020*"system" + 0.019*"machine" + 0.018*"vision" + 0.015*"learning" + 0.015*"like" + 0.014*"solve" + 0.014*"metaheuristic" + 0.013*"algorithm" + 0.013*"design"'),
 (1,
  '0.044*"machine" + 0.040*"learning" + 0.025*"task" + 0.024*"datum" + 0.023*"system" + 0.020*"algorithm" + 0.020*"classification" + 0.018*"like" + 0.017*"model" + 0.015*"vision"'),
 (2,
  '0.028*"machine" + 0.027*"datum" + 0.023*"learning" + 0.022*"mining" + 0.020*"system" + 0.018*"like" + 0.017*"algorithm" + 0.017*"vision" + 0.016*"analyze" + 0.015*"association"')]

In [34]:

'''
---------------------------------
num_topics=4:Topics overlap significantly, especially between topics 2 and 3.
The results lack strong separation between the four distinct input concepts.
---------------------------------
'''

ldamodel = LdaModel(corpus=corpus, num_topics=4, id2word=dictionary,eta=0.01,alpha='asymmetric')
ldamodel.show_topics()


[(0,
  '0.042*"problem" + 0.025*"solve" + 0.021*"optimization" + 0.020*"metaheuristic" + 0.018*"design" + 0.016*"machine" + 0.015*"make" + 0.014*"algorithm" + 0.014*"local" + 0.014*"high"'),
 (1,
  '0.025*"datum" + 0.023*"algorithm" + 0.023*"mining" + 0.023*"problem" + 0.022*"machine" + 0.021*"learning" + 0.017*"association" + 0.017*"decision" + 0.016*"large" + 0.015*"technique"'),
 (2,
  '0.055*"vision" + 0.040*"system" + 0.026*"image" + 0.026*"like" + 0.026*"machine" + 0.016*"learning" + 0.015*"analyze" + 0.015*"datum" + 0.015*"enable" + 0.014*"healthcare"'),
 (3,
  '0.052*"machine" + 0.049*"learning" + 0.029*"datum" + 0.025*"like" + 0.024*"algorithm" + 0.020*"task" + 0.019*"classification" + 0.019*"system" + 0.016*"analyze" + 0.015*"mining"')]

In [35]:

'''

---------------------------------
num_topics=6: Topics still overlap
Topic 1 and Topic 3 focus on problem-solving and optimization, heavily associated with metaheuristics.
Topic 0 and Topic 4 include terms from both data mining and machine learning, showing blurred boundaries.
Topic 5 highlights machine learning concepts but overlaps with vision-related terms.
------------------------------------

'''

ldamodel = LdaModel(corpus=corpus, num_topics=6, id2word=dictionary, eta=0.01,alpha='asymmetric')
ldamodel.show_topics()

[(0,
  '0.040*"learning" + 0.038*"machine" + 0.028*"datum" + 0.022*"system" + 0.020*"vision" + 0.019*"like" + 0.018*"task" + 0.017*"algorithm" + 0.017*"analyze" + 0.013*"neural"'),
 (1,
  '0.032*"vision" + 0.030*"system" + 0.024*"like" + 0.020*"image" + 0.020*"machine" + 0.016*"mining" + 0.014*"make" + 0.014*"learning" + 0.014*"analyze" + 0.012*"datum"'),
 (2,
  '0.043*"problem" + 0.024*"solve" + 0.022*"optimization" + 0.019*"design" + 0.018*"metaheuristic" + 0.018*"system" + 0.015*"algorithm" + 0.014*"make" + 0.014*"vision" + 0.013*"minimize"'),
 (3,
  '0.031*"mining" + 0.030*"datum" + 0.028*"machine" + 0.026*"system" + 0.026*"like" + 0.020*"association" + 0.019*"enable" + 0.018*"decision" + 0.018*"healthcare" + 0.016*"analyze"'),
 (4,
  '0.075*"machine" + 0.059*"learning" + 0.040*"classification" + 0.039*"algorithm" + 0.034*"task" + 0.030*"datum" + 0.022*"application" + 0.021*"stream" + 0.021*"utilize" + 0.020*"system"'),
 (5,
  '0.071*"problem" + 0.041*"metaheuristic" + 0.037*"desig

Latent semantic indexing (LSA)

In [None]:
from gensim.models import LsiModel
from gensim.models import CoherenceModel
import numpy as np


num_topics_values = [3, 5, 10, 15]
chunksize_values = [200, 500, 1000]
decay_values = [0.5, 0.75, 1.0]
power_iters_values = [2, 5, 10]

best_coherence = -1
best_params = {}


for num_topics in num_topics_values:
    for chunksize in chunksize_values:
        for decay in decay_values:
            for power_iters in power_iters_values:
                lsi_model = LsiModel(
                    corpus=corpus,
                    num_topics=num_topics,
                    id2word=dictionary,
                    chunksize=chunksize,
                    decay=decay,
                    power_iters=power_iters
                )

                
                coherence_model = CoherenceModel(
                    model=lsi_model,
                    texts=texts,  
                    dictionary=dictionary,
                    coherence='c_v'
                )
                coherence_score = coherence_model.get_coherence()

                
                if coherence_score > best_coherence:
                    best_coherence = coherence_score
                    best_params = {
                        'num_topics': num_topics,
                        'chunksize': chunksize,
                        'decay': decay,
                        'power_iters': power_iters
                    }

print("Best Parameters:", best_params)
print("Best Coherence Score:", best_coherence)


Best Parameters: {'num_topics': 3, 'chunksize': 200, 'decay': 0.5, 'power_iters': 2}
Best Coherence Score: 0.9462417049114343


In [39]:

lsimodel = LsiModel(corpus=corpus, num_topics=3, id2word=dictionary, chunksize=200,decay=0.5,power_iters=2)
lsimodel.show_topics() 


[(0,
  '-0.382*"machine" + -0.326*"learning" + -0.261*"system" + -0.239*"datum" + -0.226*"vision" + -0.204*"like" + -0.175*"task" + -0.160*"algorithm" + -0.148*"analyze" + -0.119*"classification"'),
 (1,
  '-0.435*"problem" + -0.218*"design" + -0.218*"solve" + -0.218*"optimization" + -0.218*"metaheuristic" + -0.127*"field" + -0.127*"strategy" + -0.127*"large" + -0.127*"technique" + -0.126*"algorithm"'),
 (2,
  '0.350*"vision" + -0.197*"mining" + -0.189*"learning" + 0.175*"image" + -0.149*"datum" + 0.144*"system" + -0.138*"algorithm" + -0.131*"association" + -0.131*"decision" + 0.131*"problem"')]

In [40]:
lsimodel = LsiModel(corpus=corpus, num_topics=4, id2word=dictionary, chunksize=200,decay=0.5,power_iters=2)
lsimodel.show_topics() 

[(0,
  '-0.382*"machine" + -0.326*"learning" + -0.261*"system" + -0.239*"datum" + -0.226*"vision" + -0.204*"like" + -0.175*"task" + -0.160*"algorithm" + -0.148*"analyze" + -0.119*"classification"'),
 (1,
  '0.435*"problem" + 0.218*"design" + 0.218*"metaheuristic" + 0.218*"optimization" + 0.218*"solve" + 0.127*"large" + 0.127*"technique" + 0.127*"strategy" + 0.127*"field" + 0.126*"algorithm"'),
 (2,
  '0.350*"vision" + -0.197*"mining" + -0.189*"learning" + 0.175*"image" + -0.149*"datum" + 0.144*"system" + -0.138*"algorithm" + -0.131*"decision" + -0.131*"association" + 0.131*"problem"'),
 (3,
  '0.308*"mining" + 0.205*"decision" + 0.205*"association" + -0.200*"learning" + -0.169*"machine" + -0.166*"classification" + -0.136*"task" + 0.133*"enable" + 0.133*"healthcare" + 0.121*"vision"')]

In [45]:

'''
the model gave only 4 topics might be due to :
Lack of Distinct Topics
Sparse or Limited Data
Overlap in Topics
Not enough variability in the document-term matrix.
'''

lsimodel = LsiModel(corpus=corpus, num_topics=6, id2word=dictionary, chunksize=200,decay=0.5,power_iters=2)
lsimodel.show_topics() 


[(0,
  '0.382*"machine" + 0.326*"learning" + 0.261*"system" + 0.239*"datum" + 0.226*"vision" + 0.204*"like" + 0.175*"task" + 0.160*"algorithm" + 0.148*"analyze" + 0.119*"classification"'),
 (1,
  '0.435*"problem" + 0.218*"design" + 0.218*"metaheuristic" + 0.218*"solve" + 0.218*"optimization" + 0.127*"large" + 0.127*"technique" + 0.127*"field" + 0.127*"strategy" + 0.126*"algorithm"'),
 (2,
  '0.350*"vision" + -0.197*"mining" + -0.189*"learning" + 0.175*"image" + -0.149*"datum" + 0.144*"system" + -0.138*"algorithm" + -0.131*"decision" + -0.131*"association" + 0.131*"problem"'),
 (3,
  '0.308*"mining" + 0.205*"decision" + 0.205*"association" + -0.200*"learning" + -0.169*"machine" + -0.166*"classification" + -0.136*"task" + 0.133*"healthcare" + 0.133*"enable" + 0.121*"vision"')]

Hierarchical Dirichlet process (HDP)

In [53]:
from gensim.models import HdpModel

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdpmodel.show_topics()

[(0,
  '0.023*enable + 0.020*analyze + 0.019*prominent + 0.018*lie + 0.018*technique + 0.017*minimize + 0.017*decision + 0.017*world + 0.016*facial + 0.015*pattern + 0.014*healthcare + 0.014*datum + 0.014*mining + 0.014*sight + 0.013*recommendation + 0.013*fraud + 0.013*difficult + 0.013*increasingly + 0.013*apply + 0.012*human'),
 (1,
  '0.025*computer + 0.024*like + 0.020*decision + 0.020*system + 0.020*facial + 0.019*vision + 0.018*multiple + 0.016*model + 0.016*minimize + 0.016*subset + 0.015*continue + 0.015*task + 0.014*study + 0.013*size + 0.013*vehicle + 0.013*indispensable + 0.013*visual + 0.012*autonomous + 0.012*refer + 0.012*detect'),
 (2,
  '0.026*platform + 0.025*learning + 0.021*machine + 0.019*algorithm + 0.019*classification + 0.017*cluster + 0.017*tree + 0.016*system + 0.015*local + 0.015*lie + 0.014*objective + 0.014*artificial + 0.013*successful + 0.013*behavior + 0.013*appropriate + 0.012*vision + 0.012*improve + 0.012*regression + 0.012*making + 0.012*explicitly')

In [None]:
'''
Topic 1: Computer Vision and Machine Learning Systems
Topic 2: Machine Learning and Algorithms
Topic 3: Natural Language Processing and Text Mining
Topic 4: Data Science and Big Data Analytics
Topic 5: Healthcare and Predictive Analytics
Topic 6: Social Media and Sentiment Analysis
Topic 7: Artificial Intelligence and Robotics
Topic 8: Financial Analysis and Risk Management
'''