# Topic Modeling

### import libraries

In [4]:
#pip install pyLDAvis

In [5]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis.gensim as gensimvis
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pyLDAvis
pyLDAvis.enable_notebook()
import warnings
warnings.filterwarnings('ignore')


In [6]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

### Loading data

In [7]:
df = pd.read_csv('https://github.com/saeed-saffari/LLM_education_topic_modeling/raw/refs/heads/main/Tastic%20AI%20-%20Opensource%20-%20AI%20tools.csv')
print('Shape:', df.shape)
print(df.columns)
df.head()

Shape: (4665, 7)
Index(['title', 'minidesc', 'desc', 'cat', 'new_image', 'url-href', 'vote'], dtype='object')


Unnamed: 0,title,minidesc,desc,cat,new_image,url-href,vote
0,ChatGPT,ChatGPT optimizes language models for dialogue,ChatGPT FeaturesChatGPT is an AI-powered conve...,AI Productivity tools,https://tasticai.com/wp-content/uploads/aitool...,https://chat.openai.com/chat,4813.0
1,Namelix,"Generate short, brandable business names & dom...",Namelix FeaturesNamelix is an AI-powered busin...,AI startup tools,https://tasticai.com/wp-content/uploads/aitool...,https://namelix.com,4012.0
2,Tome,Unlock your best work with AI-powered generati...,Tome FeaturesThe future of generative storytel...,AI presentation tools,https://tasticai.com/wp-content/uploads/aitool...,https://tome.app,3593.0
3,Fliki,Create videos from blog posts in 2 mins,Fliki FeaturesFliki is an AI-powered text-to-v...,AI video Generator tools,https://tasticai.com/wp-content/uploads/aitool...,https://fliki.ai,3580.0
4,Midjourney,AI lab exploring new mediums to expand human i...,Midjourney FeaturesMidjourney is an independen...,AI tools for art,https://tasticai.com/wp-content/uploads/aitool...,https://www.midjourney.com/home,3000.0


In [8]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

### clean data general

In [9]:
def preprocess(text):
    if isinstance(text, str):
        tokens = word_tokenize(text.lower())
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
        return tokens
    return []


descriptions = df['desc'].tolist()
processed_descriptions = [preprocess(desc) for desc in descriptions]

# Create a dictionary from the processed descriptions
dictionary = corpora.Dictionary(processed_descriptions)

# Create a corpus (bag of words for each document)
corpus = [dictionary.doc2bow(text) for text in processed_descriptions]

### Training model general


In [10]:
# Train the LDA model
lda_model = LdaModel(corpus, num_topics=7, id2word=dictionary, passes=30) # passes: number of whole text reviews / num_topics: number of topics

In [11]:
# Print the topics found by the LDA model
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.020*"user" + 0.015*"personalized" + 0.013*"learning" + 0.011*"music" + 0.010*"feature" + 0.009*"ai" + 0.009*"plan" + 0.009*"individual" + 0.008*"offer" + 0.008*"travel"
Topic 1: 0.051*"content" + 0.019*"video" + 0.014*"tool" + 0.014*"create" + 0.014*"ai" + 0.012*"creation" + 0.012*"feature" + 0.012*"generate" + 0.011*"user" + 0.011*"medium"
Topic 2: 0.025*"user" + 0.018*"data" + 0.017*"tool" + 0.012*"feature" + 0.012*"ai" + 0.011*"email" + 0.011*"information" + 0.010*"writing" + 0.010*"language" + 0.009*"text"
Topic 3: 0.016*"job" + 0.016*"insight" + 0.011*"analysis" + 0.010*"ai" + 0.010*"tool" + 0.009*"data" + 0.009*"interview" + 0.007*"health" + 0.007*"feature" + 0.007*"resume"
Topic 4: 0.036*"image" + 0.020*"user" + 0.019*"ai" + 0.015*"design" + 0.014*"create" + 0.014*"tool" + 0.014*"photo" + 0.012*"unique" + 0.012*"feature" + 0.011*"creative"
Topic 5: 0.042*"ai" + 0.035*"user" + 0.012*"model" + 0.012*"experience" + 0.011*"feature" + 0.010*"conversation" + 0.009*"tool" + 

In [None]:
#doc_topics = lda_model.get_document_topics(corpus)

#document_topic_mapping = []

#for doc in doc_topics:
    # Find the topic with the highest probability for the current document
#    dominant_topic = max(doc, key=lambda x: x[1])[0]  # Get topic index with highest probability
#    document_topic_mapping.append(dominant_topic)

# Assuming 'df' is your original dataframe and 'descriptions' is the column containing LLM descriptions
#df['Dominant_Topic'] = document_topic_mapping

# This will show the first few rows with the assigned topic for each LLM
#print(df[['desc', 'Dominant_Topic']].head(10))

In [None]:
# Assuming 'df' is your original dataframe and 'descriptions' is the column containing LLM descriptions
#df['Dominant_Topic'] = document_topic_mapping

# This will show the first few rows with the assigned topic for each LLM
#print(df[['desc', 'Dominant_Topic']].head(10))

In [None]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_general.html')

## Education filter only

In [12]:
education_keywords = ['teach', 'teaching', 'learn', 'learning', 'classroom', 'student', 'rephrase', 'plagiarism', 'instructor', 'curriculum', 'assessment', 'homework', 'school', 'educate', 'education', 'tutor']

In [13]:
def contains_education_terms(text, education_keywords):
    return any(kw in text for kw in education_keywords)

### Filter data

In [14]:
# Filter descriptions that mention any educational keywords
filtered_df = df[df['desc'].apply(lambda x: contains_education_terms(str(x).lower(), education_keywords))]
descriptions = filtered_df['desc'].tolist()
processed_descriptions = [preprocess(desc) for desc in descriptions]

# Create dictionary and corpus
dictionary = corpora.Dictionary(processed_descriptions)
corpus = [dictionary.doc2bow(text) for text in processed_descriptions]

### Training model education


#### in 5 topics

In [None]:
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=50)
# Print the topics
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.017*"writing" + 0.016*"tool" + 0.014*"user" + 0.014*"content" + 0.012*"feature" + 0.011*"ai" + 0.009*"email" + 0.008*"job" + 0.008*"case" + 0.007*"professional"
Topic 1: 0.031*"content" + 0.018*"create" + 0.016*"ai" + 0.014*"user" + 0.013*"tool" + 0.012*"feature" + 0.010*"generate" + 0.009*"image" + 0.009*"creation" + 0.009*"story"
Topic 2: 0.026*"ai" + 0.021*"user" + 0.018*"learning" + 0.010*"feature" + 0.010*"personalized" + 0.009*"tool" + 0.009*"language" + 0.008*"experience" + 0.008*"offer" + 0.008*"conversation"
Topic 3: 0.022*"ai" + 0.020*"video" + 0.016*"data" + 0.013*"user" + 0.012*"tool" + 0.011*"customer" + 0.009*"learning" + 0.009*"business" + 0.008*"feature" + 0.008*"model"
Topic 4: 0.022*"user" + 0.012*"tool" + 0.011*"information" + 0.010*"feature" + 0.009*"ai" + 0.008*"summary" + 0.008*"research" + 0.007*"case" + 0.007*"time" + 0.007*"document"


In [None]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_5_group.html')

##### **5 Group Topics**

- **Topic 0: Personalized Learning and Creativity for Children**
  - ai, story, user, create, tool, personalized, feature, creative, unique, child

- **Topic 1: Data-Driven Learning Platforms and Chatbots**
  - ai, data, user, customer, model, learning, tool, feature, chatbot, platform

- **Topic 2: Language Learning and Skill Development**
  - user, ai, learning, language, tool, personalized, offer, experience, skill

- **Topic 3: Writing Assistance and Educational Content Creation**
  - user, tool, writing, feature, ai, content, audio, case, offer, email

- **Topic 4: Video and Multimedia Content Creation for Education**
  - content, video, ai, tool, create, feature, user, case, various, creation

#### with 7 topics

In [None]:
lda_model7 = LdaModel(corpus, num_topics=7, id2word=dictionary, passes=50)
# Print the topics
topics7 = lda_model7.print_topics(num_words=10)
for idx, topic in topics7:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.026*"user" + 0.021*"ai" + 0.013*"learning" + 0.013*"conversation" + 0.012*"personalized" + 0.012*"language" + 0.010*"feature" + 0.009*"offer" + 0.008*"skill" + 0.007*"email"
Topic 1: 0.026*"learning" + 0.023*"ai" + 0.017*"story" + 0.011*"platform" + 0.011*"feature" + 0.010*"tool" + 0.010*"model" + 0.009*"personalized" + 0.009*"create" + 0.009*"experience"
Topic 2: 0.039*"content" + 0.023*"ai" + 0.022*"writing" + 0.015*"tool" + 0.011*"language" + 0.011*"feature" + 0.010*"text" + 0.009*"create" + 0.008*"generate" + 0.008*"various"
Topic 3: 0.016*"code" + 0.013*"tool" + 0.011*"quiz" + 0.010*"process" + 0.010*"ai" + 0.009*"question" + 0.009*"assessment" + 0.008*"coding" + 0.008*"lesson" + 0.008*"student"
Topic 4: 0.018*"ai" + 0.017*"data" + 0.016*"user" + 0.014*"customer" + 0.011*"tool" + 0.010*"feature" + 0.009*"support" + 0.009*"answer" + 0.008*"business" + 0.008*"case"
Topic 5: 0.036*"video" + 0.020*"user" + 0.013*"audio" + 0.012*"tool" + 0.012*"ai" + 0.012*"feature" + 0.010*

In [None]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model7, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_7_group.html')

##### **7 Group Topics**

- **Topic 0: Customer Support and Business Insights**
  - user, ai, customer, tool, feature, insight, data, support, business, chatbot

- **Topic 1: Writing and Student Tools**
  - writing, ai, user, tool, student, feature, case, key, various, offer

- **Topic 2: Job Applications and AI Assistance**
  - data, job, ai, user, interview, email, feature, tool, resume, model

- **Topic 3: AI Coding Platforms and Developer Tools**
  - ai, code, coding, feature, user, tool, developer, platform, case, various

- **Topic 4: Learning and Quiz-Based Assessments**
  - quiz, ai, user, tool, question, learning, skill, candidate, feature, process

- **Topic 5: Multimedia Content Creation**
  - content, tool, video, create, user, ai, feature, creation, image, generate

- **Topic 6: Personalized Learning and Interactive AI**
  - user, learning, ai, personalized, language, experience, story, feature, conversation, interactive

#### with 10 topics

In [15]:
lda_model10 = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=30)
# Print the topics
topics10 = lda_model10.print_topics(num_words=20)
for idx, topic in topics10:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.010*"page" + 0.009*"meeting" + 0.008*"formula" + 0.006*"landing" + 0.006*"user" + 0.006*"mentat" + 0.005*"writing" + 0.005*"team" + 0.005*"excel" + 0.005*"font" + 0.005*"feature" + 0.005*"devgpt" + 0.004*"coding" + 0.004*"tool" + 0.004*"test" + 0.004*"key" + 0.004*"generation" + 0.004*"specific" + 0.004*"like" + 0.004*"grammar"
Topic 1: 0.025*"video" + 0.023*"user" + 0.019*"email" + 0.014*"tool" + 0.010*"music" + 0.010*"feature" + 0.008*"youtube" + 0.007*"search" + 0.006*"audio" + 0.006*"song" + 0.006*"ai" + 0.006*"content" + 0.005*"case" + 0.005*"interface" + 0.005*"map" + 0.004*"time" + 0.004*"professional" + 0.004*"support" + 0.004*"photo" + 0.004*"learning"
Topic 2: 0.017*"user" + 0.015*"tool" + 0.012*"quiz" + 0.011*"feature" + 0.010*"student" + 0.010*"learning" + 0.010*"lesson" + 0.010*"question" + 0.008*"case" + 0.008*"create" + 0.007*"task" + 0.007*"plan" + 0.007*"ai" + 0.006*"offer" + 0.006*"various" + 0.006*"platform" + 0.006*"designed" + 0.006*"educator" + 0.005*"a

In [None]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model10, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_10_group.html')

##### **10 Group Topics**


- **Topic 0: AI Tools for Data, Search, and Learning**
  - user, ai, data, tool, feature, information, case, search, code, insight

- **Topic 1: AI-Enhanced Content Creation and Job Tools**
  - content, ai, job, tool, user, interview, feature, offer, writing, creation

- **Topic 2: Customer Support and Communication Tools**
  - customer, user, chatbot, support, email, ai, business, communication, feature, personalized

- **Topic 3: Writing Assistance and Academic Tools**
  - writing, tool, user, ai, feature, summary, key, content, text, student

- **Topic 4: Personalized Learning and Language Experience**
  - ai, learning, user, language, feature, personalized, experience, story, conversation, tool

- **Topic 5: Data Analysis and Business Insights**
  - data, book, ai, insight, analysis, tool, model, compliance, business, user

- **Topic 6: Document Processing and Legal AI Tools**
  - ai, user, document, learning, tool, feature, model, machine, case, various

- **Topic 7: Image and Media Generation Tools**
  - content, create, image, ai, tool, user, generate, feature, medium, creation

- **Topic 8: Video Editing and Transcription Tools**
  - video, user, tool, audio, ai, transcription, feature, editing, youtube, candidate

- **Topic 9: Audio and Music Content Creation**
  - content, audio, voice, course, ai, user, music, feature, create, tool

In [22]:
doc_topics = lda_model10.get_document_topics(corpus)
document_topic_mapping = []

for doc in doc_topics:
    # Find the topic with the highest probability for the current document
    dominant_topic = max(doc, key=lambda x: x[1])[0]  # Get topic index with highest probability
    document_topic_mapping.append(dominant_topic)

In [17]:
topic_name = ['Topic 0: AI Tools for Data, Search, and Learning',
              'Topic 1: AI-Enhanced Content Creation and Job Tools',
              'Topic 2: Customer Support and Communication Tools',
              'Topic 3: Writing Assistance and Academic Tools',
              'Topic 4: Personalized Learning and Language Experience',
              'Topic 5: Data Analysis and Business Insights',
              'Topic 6: Document Processing and Legal AI Tools',
              'Topic 7: Image and Media Generation Tools',
              'Topic 8: Video Editing and Transcription Tools',
              'Topic 9: Audio and Music Content Creation'
              ]

In [18]:
df.columns

Index(['title', 'minidesc', 'desc', 'cat', 'new_image', 'url-href', 'vote'], dtype='object')

In [21]:
len(document_topic_mapping)

1153

In [29]:

edu_with_topic = filtered_df#df[df['cat'] == 'AI tools for education']
edu_with_topic['Topic Number'] = document_topic_mapping

edu_with_topic['Topic Name'] = edu_with_topic['Topic Number'].apply(lambda x: topic_name[x])
print('shape:', edu_with_topic.shape)
edu_with_topic[['title', 'desc', 'Topic Number', 'Topic Name']].head(10)
#edu_with_topic.to_csv('edu_with_topic.csv', index=False)

shape: (1153, 9)


Unnamed: 0,title,desc,Topic Number,Topic Name
0,ChatGPT,ChatGPT FeaturesChatGPT is an AI-powered conve...,6,Topic 6: Document Processing and Legal AI Tools
1,Namelix,Namelix FeaturesNamelix is an AI-powered busin...,8,Topic 8: Video Editing and Transcription Tools
8,Piggy Magic,Piggy Magic FeaturesPiggy Magic is an AI-power...,5,Topic 5: Data Analysis and Business Insights
9,Writesonic,Writesonic FeaturesWritesonic is a unique AI-b...,5,Topic 5: Data Analysis and Business Insights
16,Humata AI,Humata AI FeaturesHumata is an AI-powered chat...,6,Topic 6: Document Processing and Legal AI Tools
23,TutorAI,TutorAI FeaturesTutor AI is an AI-powered lear...,4,Topic 4: Personalized Learning and Language Ex...
29,Stable Diffusion,Stable Diffusion FeaturesStable Diffusion exce...,5,Topic 5: Data Analysis and Business Insights
33,Jasper,Jasper FeaturesJasper is an AI-powered copywri...,5,Topic 5: Data Analysis and Business Insights
35,Autodraw,Autodraw FeaturesAutoDraw is a machine learnin...,6,Topic 6: Document Processing and Legal AI Tools
37,Fy! Studio,Fy! Studio FeaturesFy! Studio is an AI-driven ...,5,Topic 5: Data Analysis and Business Insights


IndexError: list index out of range

In [45]:
# Get the topic distribution for each document
doc_topics = lda_model10.get_document_topics(corpus)

# Initialize lists to store the top 3 topics for each document
topic_1_mapping = []
topic_2_mapping = []
topic_3_mapping = []

for doc in doc_topics:
    # Sort the topics for each document by probability in descending order
    sorted_topics = sorted(doc, key=lambda x: x[1], reverse=True)

    # Check if there are fewer than 3 topics for a document (edge case)
    if len(sorted_topics) >= 3:
        # Extract the top 3 topics
        top_3_topics = [topic[0] for topic in sorted_topics[:3]]  # Get topic index
    else:
        # Handle documents with fewer than 3 topics
        top_3_topics = [topic[0] for topic in sorted_topics] + [None] * (3 - len(sorted_topics))

    # Store the topics in separate lists
    topic_1_mapping.append(top_3_topics[0])
    topic_2_mapping.append(top_3_topics[1])
    topic_3_mapping.append(top_3_topics[2])

# Define the topic names corresponding to the topic numbers
topic_name = [
    'Topic 0: AI Tools for Data, Search, and Learning',
    'Topic 1: AI-Enhanced Content Creation and Job Tools',
    'Topic 2: Customer Support and Communication Tools',
    'Topic 3: Writing Assistance and Academic Tools',
    'Topic 4: Personalized Learning and Language Experience',
    'Topic 5: Data Analysis and Business Insights',
    'Topic 6: Document Processing and Legal AI Tools',
    'Topic 7: Image and Media Generation Tools',
    'Topic 8: Video Editing and Transcription Tools',
    'Topic 9: Audio and Music Content Creation'
]

# Filter for 'AI tools for education' category
edu_with_topic_3 = filtered_df  # Assuming 'filtered_df' is already filtered for 'AI tools for education'

# Add the top 3 topics as new columns with their corresponding numbers
edu_with_topic_3['Topic 1 Number'] = topic_1_mapping
edu_with_topic_3['Topic 2 Number'] = topic_2_mapping
edu_with_topic_3['Topic 3 Number'] = topic_3_mapping

# Map the topic numbers to their corresponding topic names, handling None and NaN values
edu_with_topic_3['Topic 1 Name'] = edu_with_topic_3['Topic 1 Number'].apply(lambda x: topic_name[int(x)] if pd.notna(x) else 'N/A')
edu_with_topic_3['Topic 2 Name'] = edu_with_topic_3['Topic 2 Number'].apply(lambda x: topic_name[int(x)] if pd.notna(x) else 'N/A')
edu_with_topic_3['Topic 3 Name'] = edu_with_topic_3['Topic 3 Number'].apply(lambda x: topic_name[int(x)] if pd.notna(x) else 'N/A')


# Show the resulting dataframe with descriptions and the top 3 topics
print('shape:', edu_with_topic_3.shape)
edu_with_topic_3[['title', 'desc', 'Topic 1 Name', 'Topic 2 Name', 'Topic 3 Name']].head(10)
#edu_with_topic_3.head()
# Optional: Save to CSV if needed
#edu_with_topic_3.to_csv('edu_with_top_3_topics.csv', index=False)

shape: (1153, 15)


Unnamed: 0,title,desc,Topic 1 Name,Topic 2 Name,Topic 3 Name
0,ChatGPT,ChatGPT FeaturesChatGPT is an AI-powered conve...,Topic 6: Document Processing and Legal AI Tools,Topic 5: Data Analysis and Business Insights,
1,Namelix,Namelix FeaturesNamelix is an AI-powered busin...,Topic 8: Video Editing and Transcription Tools,Topic 5: Data Analysis and Business Insights,Topic 6: Document Processing and Legal AI Tools
8,Piggy Magic,Piggy Magic FeaturesPiggy Magic is an AI-power...,Topic 5: Data Analysis and Business Insights,Topic 8: Video Editing and Transcription Tools,Topic 2: Customer Support and Communication Tools
9,Writesonic,Writesonic FeaturesWritesonic is a unique AI-b...,Topic 5: Data Analysis and Business Insights,,
16,Humata AI,Humata AI FeaturesHumata is an AI-powered chat...,Topic 6: Document Processing and Legal AI Tools,Topic 3: Writing Assistance and Academic Tools,
23,TutorAI,TutorAI FeaturesTutor AI is an AI-powered lear...,Topic 4: Personalized Learning and Language Ex...,,
29,Stable Diffusion,Stable Diffusion FeaturesStable Diffusion exce...,Topic 5: Data Analysis and Business Insights,,
33,Jasper,Jasper FeaturesJasper is an AI-powered copywri...,Topic 5: Data Analysis and Business Insights,Topic 8: Video Editing and Transcription Tools,
35,Autodraw,Autodraw FeaturesAutoDraw is a machine learnin...,Topic 6: Document Processing and Legal AI Tools,,
37,Fy! Studio,Fy! Studio FeaturesFy! Studio is an AI-driven ...,Topic 5: Data Analysis and Business Insights,Topic 8: Video Editing and Transcription Tools,


#### with 20 topics

In [33]:
len(topics30)

20

In [34]:
lda_model30 = LdaModel(corpus, num_topics=20, id2word=dictionary, passes=30)
# Print the topics
topics30 = lda_model30.print_topics(num_words=20)
for idx, topic in topics30:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.035*"user" + 0.019*"ai" + 0.011*"conversation" + 0.010*"quiz" + 0.010*"feature" + 0.009*"engage" + 0.009*"offer" + 0.009*"tool" + 0.009*"chat" + 0.009*"learning" + 0.008*"knowledge" + 0.008*"insight" + 0.008*"language" + 0.007*"question" + 0.007*"interactive" + 0.007*"guidance" + 0.007*"case" + 0.007*"book" + 0.006*"experience" + 0.006*"topic"
Topic 1: 0.025*"customer" + 0.016*"data" + 0.015*"user" + 0.014*"chatbot" + 0.013*"ai" + 0.013*"business" + 0.013*"support" + 0.011*"personalized" + 0.009*"feature" + 0.009*"insight" + 0.009*"tool" + 0.009*"team" + 0.008*"platform" + 0.008*"provide" + 0.008*"case" + 0.007*"learning" + 0.007*"website" + 0.006*"experience" + 0.006*"offer" + 0.006*"machine"
Topic 2: 0.012*"car" + 0.009*"writer" + 0.007*"homehelper" + 0.007*"home" + 0.007*"resoomer" + 0.006*"improvement" + 0.006*"social" + 0.006*"writeplus" + 0.006*"podcast" + 0.005*"professional" + 0.005*"cost" + 0.005*"purchasing" + 0.005*"buying" + 0.005*"ritebot" + 0.005*"idea" + 0.005

In [35]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model30, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [36]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_20_group.html')

##### **20 Group Topics**

1. **AI for Conversations and Learning Quizzes**
   - Keywords: user, ai, conversation, quiz, engage, chat, learning, interactive

2. **Customer Support and Business AI Solutions**
   - Keywords: customer, data, chatbot, ai, support, business, personalized, team

3. **Tools for Professional and Home Assistance**
   - Keywords: car, writer, homehelper, resoomer, professional, social, podcast

4. **Language Learning and Skill Development Tools**
   - Keywords: language, learning, skill, audio, practice, student, feedback

5. **Coding and Developer Assistance Tools**
   - Keywords: code, professional, developer, ai, language, medical, productivity

6. **AI for Content Creation and Social Media**
   - Keywords: content, ai, creation, generate, video, social, marketing, creator

7. **Storytelling and Personalized AI Tools**
   - Keywords: story, storytelling, character, ai, child, personalized, engaging

8. **Image Editing and AI Photo Tools**
   - Keywords: image, tool, photo, ai, background, editing, business, enhance

9. **AI Learning Platforms and Interactive Courses**
   - Keywords: learning, course, ai, interactive, platform, video, team, personalized

10. **AI for Research and Information Retrieval**
    - Keywords: tool, information, summary, ai, search, research, document, content

11. **Email and Communication AI Tools**
    - Keywords: email, tool, ai, support, communication, transcription, plan, student

12. **Writing and Video Creation Tools**
    - Keywords: writing, tool, video, ai, content, prompt, generate, essay, text

13. **Job Applications and Resume Building Tools**
    - Keywords: job, resume, letter, cover, mental health, fitness, workout, seeker

14. **Interview and Career Preparation Tools**
    - Keywords: interview, ai, tool, sql, query, database, career, individual, support

15. **AI for Data Detection and Content Analysis**
    - Keywords: content, data, tool, text, ai, plagiarism, detection, business

16. **AI for Personalized Learning and Models**
    - Keywords: ai, user, learning, personalized, experience, model, book, machine

17. **Creative AI Tools for Music and Art**
    - Keywords: music, face, map, ai, tool, creative, technology, art, inspiration

18. **AI for Recruitment and Skill Assessment**
    - Keywords: skill, ai, interview, hiring, recruitment, talent, candidate, platform

19. **Art and Design AI Tools**
    - Keywords: design, art, style, ai, project, artistic, creativity, artist, font

20. **AI for Investment and Crypto Analysis**
    - Keywords: child, investment, crypto, learning, ai, trading, stock, personalized


## NEW Filter data on columns

In [55]:
freq = df.cat.value_counts()
freq = pd.DataFrame(freq)
freq['percentage'] = round(freq / len(df) * 100,2)
print('total:', len(df))
freq.head(10)

total: 4665


Unnamed: 0_level_0,count,percentage
cat,Unnamed: 1_level_1,Unnamed: 2_level_1
AI writing tools,361,7.74
AI Productivity tools,262,5.62
AI day-to-day assistant,217,4.65
AI image generator,191,4.09
AI tools for developper,189,4.05
AI tools for education,172,3.69
AI Customer Service tools,162,3.47
Social media AI tools,153,3.28
AI startup tools,150,3.22
AI image editing tool,136,2.92


In [57]:
edu_df = df[df['cat'] == 'AI tools for education']
print('shape:', edu_df.shape)
edu_df.head()

shape: (172, 7)


Unnamed: 0,title,minidesc,desc,cat,new_image,url-href,vote
23,TutorAI,"AI-powered learning platform; enter topic, get...",TutorAI FeaturesTutor AI is an AI-powered lear...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://www.tutorai.me,1638.0
41,PaperBrain,Platform to access & understand research paper...,PaperBrain FeaturesA platform for you to acces...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://www.paperbrain.study,1310.0
47,Wisdolia,AI generated flashcards for any article / PDF,Wisdolia FeaturesA Chrome Extension that uses ...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://wisdolia.com,1233.0
52,CheckForAI,Detect AI text in essays/emails w/ Open AI & p...,CheckForAI FeaturesCheckforAi is an AI detecti...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://checkforai.com,1152.0
77,WolframAlpha,"Compute expert-level answers in Math, Science,...",WolframAlpha FeaturesCompute expert-level answ...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://www.wolframalpha.com,848.0


In [58]:
descriptions = edu_df['desc'].tolist()
processed_descriptions = [preprocess(desc) for desc in descriptions]

# Create dictionary and corpus
dictionary = corpora.Dictionary(processed_descriptions)
corpus = [dictionary.doc2bow(text) for text in processed_descriptions]

#### NEW with 20 topics

In [60]:
lda_model_new_20 = LdaModel(corpus, num_topics=20, id2word=dictionary, passes=50)
# Print the topics
topics = lda_model_new_20.print_topics(num_words=20)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.027*"student" + 0.026*"learning" + 0.022*"lesson" + 0.019*"platform" + 0.016*"academic" + 0.016*"tool" + 0.011*"feature" + 0.011*"designed" + 0.011*"educator" + 0.011*"plan" + 0.010*"various" + 0.009*"include" + 0.009*"case" + 0.009*"grading" + 0.008*"writing" + 0.008*"ai" + 0.008*"feedback" + 0.008*"offer" + 0.008*"education" + 0.008*"advantage"
Topic 1: 0.063*"lesson" + 0.027*"langmob" + 0.024*"plan" + 0.019*"ai" + 0.017*"resource" + 0.015*"reference" + 0.012*"generative" + 0.012*"homework" + 0.012*"assistance" + 0.011*"ensuring" + 0.009*"chatbot" + 0.009*"subject" + 0.009*"upload" + 0.009*"beginner" + 0.008*"style" + 0.008*"targeted" + 0.008*"support" + 0.008*"format" + 0.007*"teaching" + 0.007*"instant"
Topic 2: 0.037*"news" + 0.023*"lesson" + 0.020*"student" + 0.019*"create" + 0.019*"tool" + 0.018*"teacher" + 0.014*"content" + 0.013*"quiz" + 0.012*"feature" + 0.012*"generate" + 0.011*"various" + 0.011*"plan" + 0.010*"planning" + 0.009*"article" + 0.009*"case" + 0.009*"e

In [61]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model_new_20, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [62]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_20_group_new.html')

##### New 20 topics

1. **Student Learning and Educational Platforms**
   - Keywords: student, learning, lesson, platform, academic, tool, educator, grading, feedback, education

2. **Lesson Planning and Homework Assistance**
   - Keywords: lesson, langmob, plan, ai, resource, reference, homework, assistance, support, teaching

3. **News and Quiz-Based Learning Tools**
   - Keywords: news, lesson, student, teacher, content, quiz, generate, plan, article, engaging

4. **AI-Powered Academic Tools and Assignments**
   - Keywords: ai, user, content, technology, assignment, learning, academic, conversation, algorithm, student

5. **Coding and Educational Journey Tools**
   - Keywords: grant, coding, learning, student, educational, tool, mastery, python, performance, academic

6. **AI-Enhanced Insights for Learning**
   - Keywords: lesson, insight, student, assistant, ai, data, analysis, generate, personal, interface

7. **Course and Quiz Creation Tools**
   - Keywords: question, course, user, assessment, quiz, generate, creation, content, feature, learning

8. **AI Coaching and Career Guidance**
   - Keywords: coaching, ai, career, learning, platform, personalized, expert, guidance, communication, advice

9. **Language Learning and Speech Detection Tools**
   - Keywords: content, language, emma, skill, video, english, speech, conversational, detection, communication

10. **AI-Driven Learning Courses**
    - Keywords: learning, ai, course, feature, skill, experience, personalized, knowledge, interactive, tool

11. **Math and Personalized Learning Tools**
    - Keywords: book, math, summary, ai, problem, personalized, key, professional, learning, efficient

12. **AI for Personalized Student Support**
    - Keywords: ai, user, personalized, essay, teacher, language, recommendation, feedback, computer vision, report

13. **AI for Learning and Content Creation**
    - Keywords: ai, user, learning, interview, knowledge, insight, analysis, blog, chatgpt, understanding

14. **AI for Reading and Grammar Tools**
    - Keywords: ai, kindle, reading, grammar, basmo, book, experience, tool, checker, provide

15. **Language Learning and Conversational AI**
    - Keywords: language, learning, practice, skill, ai, conversation, personalized, experience, grammar, feedback

16. **AI-Powered Study and Quiz Tools**
    - Keywords: study, tool, quiz, feature, test, personalized, create, educator, knowledge, student

17. **Vocabulary and Math Learning for Children**
    - Keywords: learning, vocabulary, math, sentence, child, skill, map, mind, interactive, engaging

18. **Lesson Planning and Educational Resources**
    - Keywords: lesson, plan, resource, educator, ai, assessment, trivia, create, time, teaching

19. **AI-Powered Learning Platforms**
    - Keywords: learning, user, platform, buddy, brain, feedback, snackz, history, library, experience

20. **AI for Video and Animated Content Creation**
    - Keywords: ai, video, animated, content, art, create, audience, editing, caption, transform

#### NEW with 10 topics

In [63]:
lda_model_new_10 = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=50)
# Print the topics
topics = lda_model_new_10.print_topics(num_words=20)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.020*"ai" + 0.014*"learning" + 0.010*"expert" + 0.010*"coaching" + 0.009*"tool" + 0.008*"feature" + 0.007*"access" + 0.007*"insight" + 0.007*"student" + 0.007*"kid" + 0.007*"grant" + 0.006*"enhance" + 0.006*"video" + 0.006*"luminary" + 0.006*"chat" + 0.006*"case" + 0.006*"question" + 0.006*"personalized" + 0.005*"educational" + 0.005*"history"
Topic 1: 0.025*"ai" + 0.022*"video" + 0.020*"news" + 0.013*"content" + 0.013*"animated" + 0.013*"art" + 0.010*"grammar" + 0.010*"create" + 0.009*"user" + 0.009*"tool" + 0.008*"feature" + 0.007*"audience" + 0.007*"online" + 0.007*"platform" + 0.006*"adding" + 0.006*"writing" + 0.006*"idea" + 0.006*"test" + 0.005*"experience" + 0.005*"checker"
Topic 2: 0.027*"learning" + 0.020*"career" + 0.018*"skill" + 0.018*"ai" + 0.011*"personalized" + 0.010*"learn" + 0.010*"designed" + 0.010*"offer" + 0.009*"platform" + 0.009*"individual" + 0.008*"insight" + 0.008*"various" + 0.008*"development" + 0.007*"course" + 0.007*"data" + 0.007*"case" + 0.007*"

In [64]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model_new_10, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [65]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_10_group_new.html')

##### New 10 topics

1. **AI Coaching and Educational Tools for Kids**
   - Keywords: ai, learning, expert, coaching, tool, student, video, grant, personalized, educational

2. **AI-Powered Video Creation and Animated Content**
   - Keywords: ai, video, news, animated, content, art, grammar, tool, platform, audience

3. **Career Development and Personalized Learning Platforms**
   - Keywords: learning, career, skill, ai, personalized, platform, insight, development, course

4. **AI for Quiz and Lesson Creation in Education**
   - Keywords: learning, lesson, student, ai, quiz, tool, assessment, educator, generate, support

5. **Online Course Creation and Educational Content Platforms**
   - Keywords: course, user, ai, creation, coursebox, online, tool, learning, feature, structure

6. **Language Learning and Math Skill Development Tools**
   - Keywords: learning, language, tool, ai, skill, personalized, math, practice, learner, interactive

7. **AI for Learning Path and Content Creation**
   - Keywords: ai, learning, course, buddy, brain, content, creator, student, personalized, knowledge

8. **AI for Personalized Reading and Language Learning**
   - Keywords: learning, ai, user, book, language, personalized, reading, platform, knowledge, recommendation

9. **Grammar and Vocabulary Practice Tools**
   - Keywords: practice, user, language, grammar, skill, vocabulary, english, coding, tool, feedback

10. **AI for Language Detection and Productivity Tools**
    - Keywords: language, tool, ai, content, student, conversation, platform, detection, plagiarism, search