# Topic Modeling

### import libraries

In [24]:
#pip install pyLDAvis

In [25]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis.gensim as gensimvis
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pyLDAvis
pyLDAvis.enable_notebook()
import warnings
warnings.filterwarnings('ignore')


In [26]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Loading data

In [27]:
df = pd.read_csv('https://github.com/saeed-saffari/LLM_education_topic_modeling/raw/refs/heads/main/Tastic%20AI%20-%20Opensource%20-%20AI%20tools.csv')
print('Shape:', df.shape)
print(df.columns)
df.head()

Shape: (4665, 7)
Index(['title', 'minidesc', 'desc', 'cat', 'new_image', 'url-href', 'vote'], dtype='object')


Unnamed: 0,title,minidesc,desc,cat,new_image,url-href,vote
0,ChatGPT,ChatGPT optimizes language models for dialogue,ChatGPT FeaturesChatGPT is an AI-powered conve...,AI Productivity tools,https://tasticai.com/wp-content/uploads/aitool...,https://chat.openai.com/chat,4813.0
1,Namelix,"Generate short, brandable business names & dom...",Namelix FeaturesNamelix is an AI-powered busin...,AI startup tools,https://tasticai.com/wp-content/uploads/aitool...,https://namelix.com,4012.0
2,Tome,Unlock your best work with AI-powered generati...,Tome FeaturesThe future of generative storytel...,AI presentation tools,https://tasticai.com/wp-content/uploads/aitool...,https://tome.app,3593.0
3,Fliki,Create videos from blog posts in 2 mins,Fliki FeaturesFliki is an AI-powered text-to-v...,AI video Generator tools,https://tasticai.com/wp-content/uploads/aitool...,https://fliki.ai,3580.0
4,Midjourney,AI lab exploring new mediums to expand human i...,Midjourney FeaturesMidjourney is an independen...,AI tools for art,https://tasticai.com/wp-content/uploads/aitool...,https://www.midjourney.com/home,3000.0


In [28]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

### clean data general

In [29]:
def preprocess(text):
    if isinstance(text, str):
        tokens = word_tokenize(text.lower())
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
        return tokens
    return []


descriptions = df['desc'].tolist()
processed_descriptions = [preprocess(desc) for desc in descriptions]

# Create a dictionary from the processed descriptions
dictionary = corpora.Dictionary(processed_descriptions)

# Create a corpus (bag of words for each document)
corpus = [dictionary.doc2bow(text) for text in processed_descriptions]

### Training model general


In [30]:
# Train the LDA model
lda_model = LdaModel(corpus, num_topics=7, id2word=dictionary, passes=30) # passes: number of whole text reviews / num_topics: number of topics

In [31]:
# Print the topics found by the LDA model
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.041*"content" + 0.017*"user" + 0.016*"ai" + 0.015*"tool" + 0.012*"feature" + 0.010*"business" + 0.010*"email" + 0.010*"writing" + 0.009*"generate" + 0.009*"customer"
Topic 1: 0.022*"team" + 0.015*"sale" + 0.014*"meeting" + 0.011*"feature" + 0.011*"management" + 0.010*"task" + 0.010*"personalized" + 0.009*"customer" + 0.009*"communication" + 0.008*"message"
Topic 2: 0.032*"image" + 0.023*"ai" + 0.021*"user" + 0.015*"create" + 0.014*"design" + 0.014*"tool" + 0.012*"feature" + 0.012*"photo" + 0.010*"generate" + 0.010*"unique"
Topic 3: 0.036*"data" + 0.016*"user" + 0.015*"insight" + 0.013*"tool" + 0.013*"ai" + 0.010*"analysis" + 0.009*"feature" + 0.009*"customer" + 0.009*"business" + 0.008*"information"
Topic 4: 0.028*"ai" + 0.018*"video" + 0.016*"user" + 0.013*"code" + 0.012*"tool" + 0.012*"feature" + 0.009*"audio" + 0.008*"developer" + 0.007*"model" + 0.007*"case"
Topic 5: 0.029*"user" + 0.026*"ai" + 0.016*"conversation" + 0.012*"experience" + 0.012*"learning" + 0.011*"chat" +

In [32]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [33]:
pyLDAvis.save_html(lda_display, 'lda_visualization_general.html')

## Education filter only

In [34]:
education_keywords = ['teach', 'teaching', 'learn', 'learning', 'classroom', 'student', 'rephrase', 'plagiarism', 'instructor', 'curriculum', 'assessment', 'homework', 'school', 'educate', 'education', 'tutor']

In [35]:
def contains_education_terms(text, education_keywords):
    return any(kw in text for kw in education_keywords)

### Filter data

In [36]:
# Filter descriptions that mention any educational keywords
filtered_df = df[df['desc'].apply(lambda x: contains_education_terms(str(x).lower(), education_keywords))]
descriptions = filtered_df['desc'].tolist()
processed_descriptions = [preprocess(desc) for desc in descriptions]

# Create dictionary and corpus
dictionary = corpora.Dictionary(processed_descriptions)
corpus = [dictionary.doc2bow(text) for text in processed_descriptions]

### Training model education


#### in 5 topics

In [37]:
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=50)
# Print the topics
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.026*"ai" + 0.015*"user" + 0.013*"video" + 0.013*"tool" + 0.013*"create" + 0.012*"feature" + 0.010*"image" + 0.009*"learning" + 0.008*"story" + 0.007*"case"
Topic 1: 0.017*"ai" + 0.015*"user" + 0.012*"data" + 0.011*"tool" + 0.010*"learning" + 0.009*"feature" + 0.008*"language" + 0.008*"insight" + 0.007*"skill" + 0.007*"case"
Topic 2: 0.026*"user" + 0.016*"ai" + 0.011*"tool" + 0.010*"learning" + 0.010*"feature" + 0.009*"content" + 0.008*"video" + 0.008*"information" + 0.008*"case" + 0.007*"language"
Topic 3: 0.013*"ai" + 0.011*"feature" + 0.010*"various" + 0.008*"case" + 0.007*"include" + 0.007*"key" + 0.007*"user" + 0.007*"question" + 0.007*"advantage" + 0.007*"generate"
Topic 4: 0.038*"content" + 0.023*"writing" + 0.017*"tool" + 0.012*"ai" + 0.012*"user" + 0.011*"feature" + 0.008*"generate" + 0.008*"create" + 0.008*"case" + 0.008*"text"


In [38]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [39]:
pyLDAvis.save_html(lda_display, 'lda_visualization_5_group.html')

##### **5 Group Topics**

- **Topic 0: Personalized Learning and Creativity for Children**
  - ai, story, user, create, tool, personalized, feature, creative, unique, child

- **Topic 1: Data-Driven Learning Platforms and Chatbots**
  - ai, data, user, customer, model, learning, tool, feature, chatbot, platform

- **Topic 2: Language Learning and Skill Development**
  - user, ai, learning, language, tool, personalized, offer, experience, skill

- **Topic 3: Writing Assistance and Educational Content Creation**
  - user, tool, writing, feature, ai, content, audio, case, offer, email

- **Topic 4: Video and Multimedia Content Creation for Education**
  - content, video, ai, tool, create, feature, user, case, various, creation

#### with 7 topics

In [40]:
lda_model7 = LdaModel(corpus, num_topics=7, id2word=dictionary, passes=50)
# Print the topics
topics7 = lda_model7.print_topics(num_words=10)
for idx, topic in topics7:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.014*"ai" + 0.013*"code" + 0.012*"tool" + 0.009*"feature" + 0.009*"process" + 0.007*"case" + 0.007*"assessment" + 0.006*"quiz" + 0.006*"developer" + 0.006*"question"
Topic 1: 0.038*"content" + 0.018*"ai" + 0.017*"create" + 0.016*"video" + 0.014*"user" + 0.014*"tool" + 0.013*"image" + 0.011*"feature" + 0.010*"generate" + 0.009*"creation"
Topic 2: 0.023*"user" + 0.020*"ai" + 0.016*"personalized" + 0.015*"learning" + 0.012*"story" + 0.011*"experience" + 0.010*"feature" + 0.010*"conversation" + 0.008*"interactive" + 0.008*"offer"
Topic 3: 0.033*"writing" + 0.023*"ai" + 0.014*"tool" + 0.012*"feature" + 0.012*"user" + 0.012*"email" + 0.009*"learning" + 0.009*"essay" + 0.009*"content" + 0.009*"case"
Topic 4: 0.028*"job" + 0.015*"resume" + 0.010*"user" + 0.010*"tool" + 0.010*"feature" + 0.008*"interview" + 0.008*"letter" + 0.008*"professional" + 0.008*"cover" + 0.007*"offer"
Topic 5: 0.026*"user" + 0.015*"tool" + 0.012*"ai" + 0.011*"feature" + 0.011*"language" + 0.011*"video" + 0.010

In [41]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model7, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [42]:
pyLDAvis.save_html(lda_display, 'lda_visualization_7_group.html')

##### **7 Group Topics**

- **Topic 0: Customer Support and Business Insights**
  - user, ai, customer, tool, feature, insight, data, support, business, chatbot

- **Topic 1: Writing and Student Tools**
  - writing, ai, user, tool, student, feature, case, key, various, offer

- **Topic 2: Job Applications and AI Assistance**
  - data, job, ai, user, interview, email, feature, tool, resume, model

- **Topic 3: AI Coding Platforms and Developer Tools**
  - ai, code, coding, feature, user, tool, developer, platform, case, various

- **Topic 4: Learning and Quiz-Based Assessments**
  - quiz, ai, user, tool, question, learning, skill, candidate, feature, process

- **Topic 5: Multimedia Content Creation**
  - content, tool, video, create, user, ai, feature, creation, image, generate

- **Topic 6: Personalized Learning and Interactive AI**
  - user, learning, ai, personalized, language, experience, story, feature, conversation, interactive

#### with 10 topics

In [43]:
lda_model10 = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=30)
# Print the topics
topics10 = lda_model10.print_topics(num_words=20)
for idx, topic in topics10:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.025*"user" + 0.014*"ai" + 0.013*"tool" + 0.011*"information" + 0.010*"feature" + 0.009*"question" + 0.009*"content" + 0.009*"learning" + 0.009*"summary" + 0.008*"case" + 0.008*"video" + 0.008*"various" + 0.008*"key" + 0.007*"student" + 0.007*"answer" + 0.006*"text" + 0.006*"offer" + 0.006*"research" + 0.006*"time" + 0.006*"access"
Topic 1: 0.055*"content" + 0.017*"tool" + 0.015*"ai" + 0.013*"user" + 0.012*"feature" + 0.011*"writing" + 0.011*"audio" + 0.009*"language" + 0.009*"text" + 0.009*"create" + 0.009*"creation" + 0.008*"generate" + 0.007*"case" + 0.007*"creator" + 0.006*"various" + 0.006*"voice" + 0.006*"medium" + 0.006*"time" + 0.006*"offer" + 0.006*"process"
Topic 2: 0.024*"user" + 0.018*"ai" + 0.016*"writing" + 0.016*"story" + 0.013*"tool" + 0.012*"feature" + 0.009*"create" + 0.008*"case" + 0.007*"child" + 0.007*"image" + 0.007*"offer" + 0.007*"personalized" + 0.007*"enhance" + 0.006*"unique" + 0.006*"experience" + 0.006*"storytelling" + 0.006*"creative" + 0.006*"va

In [44]:
# Prepare the visualization
lda_display = gensimvis.prepare(lda_model10, corpus, dictionary)

# Show the visualization in a browser
pyLDAvis.display(lda_display)

In [45]:
pyLDAvis.save_html(lda_display, 'lda_visualization_10_group.html')

##### **10 Group Topics**


- **Topic 0: AI Tools for Data, Search, and Learning**
  - user, ai, data, tool, feature, information, case, search, code, insight

- **Topic 1: AI-Enhanced Content Creation and Job Tools**
  - content, ai, job, tool, user, interview, feature, offer, writing, creation

- **Topic 2: Customer Support and Communication Tools**
  - customer, user, chatbot, support, email, ai, business, communication, feature, personalized

- **Topic 3: Writing Assistance and Academic Tools**
  - writing, tool, user, ai, feature, summary, key, content, text, student

- **Topic 4: Personalized Learning and Language Experience**
  - ai, learning, user, language, feature, personalized, experience, story, conversation, tool

- **Topic 5: Data Analysis and Business Insights**
  - data, book, ai, insight, analysis, tool, model, compliance, business, user

- **Topic 6: Document Processing and Legal AI Tools**
  - ai, user, document, learning, tool, feature, model, machine, case, various

- **Topic 7: Image and Media Generation Tools**
  - content, create, image, ai, tool, user, generate, feature, medium, creation

- **Topic 8: Video Editing and Transcription Tools**
  - video, user, tool, audio, ai, transcription, feature, editing, youtube, candidate

- **Topic 9: Audio and Music Content Creation**
  - content, audio, voice, course, ai, user, music, feature, create, tool