# Topic Modeling

### import libraries

In [None]:
#pip install pyLDAvis

In [None]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis.gensim as gensimvis
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pyLDAvis
pyLDAvis.enable_notebook()
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Fetch the NLTK resources used by the preprocessing steps in this notebook.
nltk.download('stopwords')  # English stop-word list read via stopwords.words('english')
nltk.download('punkt')      # tokenizer data for word_tokenize
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer
nltk.download('punkt_tab')  # presumably the punkt tables required by newer NLTK releases - TODO confirm
nltk.download('omw-1.4')    # presumably the Open Multilingual Wordnet data used with wordnet - TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

### Loading data

In [None]:
# Read the Tastic AI tools CSV directly from the project's GitHub repository.
DATA_URL = (
    'https://github.com/saeed-saffari/LLM_education_topic_modeling/raw/refs/heads/main/'
    'Tastic%20AI%20-%20Opensource%20-%20AI%20tools.csv'
)
df = pd.read_csv(DATA_URL)

# Quick sanity check: dimensions, column names, and the first rows.
print('Shape:', df.shape)
print(df.columns)
df.head()

Shape: (4665, 7)
Index(['title', 'minidesc', 'desc', 'cat', 'new_image', 'url-href', 'vote'], dtype='object')


Unnamed: 0,title,minidesc,desc,cat,new_image,url-href,vote
0,ChatGPT,ChatGPT optimizes language models for dialogue,ChatGPT FeaturesChatGPT is an AI-powered conve...,AI Productivity tools,https://tasticai.com/wp-content/uploads/aitool...,https://chat.openai.com/chat,4813.0
1,Namelix,"Generate short, brandable business names & dom...",Namelix FeaturesNamelix is an AI-powered busin...,AI startup tools,https://tasticai.com/wp-content/uploads/aitool...,https://namelix.com,4012.0
2,Tome,Unlock your best work with AI-powered generati...,Tome FeaturesThe future of generative storytel...,AI presentation tools,https://tasticai.com/wp-content/uploads/aitool...,https://tome.app,3593.0
3,Fliki,Create videos from blog posts in 2 mins,Fliki FeaturesFliki is an AI-powered text-to-v...,AI video Generator tools,https://tasticai.com/wp-content/uploads/aitool...,https://fliki.ai,3580.0
4,Midjourney,AI lab exploring new mediums to expand human i...,Midjourney FeaturesMidjourney is an independen...,AI tools for art,https://tasticai.com/wp-content/uploads/aitool...,https://www.midjourney.com/home,3000.0


In [None]:
# Build the preprocessing helpers once so every document reuses them.
lemmatizer = WordNetLemmatizer()  # reduces each token to its lemma
stop_words = set(stopwords.words('english'))  # set for fast membership tests

In [None]:
def preprocess(text):
    """Turn a raw description into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that appear in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized. Any non-string input (e.g. a missing
    value) yields an empty list.
    """
    if not isinstance(text, str):
        return []

    cleaned = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and token not in stop_words:
            cleaned.append(lemmatizer.lemmatize(token))
    return cleaned

### clean data general

In [None]:
def preprocess(text):
    """Turn a raw description into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that appear in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized. Any non-string input (e.g. a missing
    value) yields an empty list.
    """
    if not isinstance(text, str):
        return []

    cleaned = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and token not in stop_words:
            cleaned.append(lemmatizer.lemmatize(token))
    return cleaned


# Tokenize and lemmatize every description in the full dataset.
descriptions = df['desc'].tolist()
processed_descriptions = list(map(preprocess, descriptions))

# Assign an integer id to every distinct token.
dictionary = corpora.Dictionary(processed_descriptions)

# Bag-of-words corpus: one doc2bow result per document.
corpus = list(map(dictionary.doc2bow, processed_descriptions))

### Training model general


In [None]:
# Train the LDA model on the full corpus.
#   num_topics:   number of latent topics to extract
#   passes:       number of full passes over the corpus during training
#   random_state: fixed seed so the topics (and their indices) are the same on every run
lda_model = LdaModel(corpus, num_topics=7, id2word=dictionary, passes=30, random_state=42)

In [None]:
# Show the ten highest-weighted words of every topic in the general model.
topics = lda_model.print_topics(num_words=10)
for topic_id, topic_terms in topics:
    print(f"Topic {topic_id}: {topic_terms}")

Topic 0: 0.020*"user" + 0.015*"personalized" + 0.013*"learning" + 0.011*"music" + 0.010*"feature" + 0.009*"ai" + 0.009*"plan" + 0.009*"individual" + 0.008*"offer" + 0.008*"travel"
Topic 1: 0.051*"content" + 0.019*"video" + 0.014*"tool" + 0.014*"create" + 0.014*"ai" + 0.012*"creation" + 0.012*"feature" + 0.012*"generate" + 0.011*"user" + 0.011*"medium"
Topic 2: 0.025*"user" + 0.018*"data" + 0.017*"tool" + 0.012*"feature" + 0.012*"ai" + 0.011*"email" + 0.011*"information" + 0.010*"writing" + 0.010*"language" + 0.009*"text"
Topic 3: 0.016*"job" + 0.016*"insight" + 0.011*"analysis" + 0.010*"ai" + 0.010*"tool" + 0.009*"data" + 0.009*"interview" + 0.007*"health" + 0.007*"feature" + 0.007*"resume"
Topic 4: 0.036*"image" + 0.020*"user" + 0.019*"ai" + 0.015*"design" + 0.014*"create" + 0.014*"tool" + 0.014*"photo" + 0.012*"unique" + 0.012*"feature" + 0.011*"creative"
Topic 5: 0.042*"ai" + 0.035*"user" + 0.012*"model" + 0.012*"experience" + 0.011*"feature" + 0.010*"conversation" + 0.009*"tool" + 

In [None]:
#doc_topics = lda_model.get_document_topics(corpus)

#document_topic_mapping = []

#for doc in doc_topics:
    # Find the topic with the highest probability for the current document
#    dominant_topic = max(doc, key=lambda x: x[1])[0]  # Get topic index with highest probability
#    document_topic_mapping.append(dominant_topic)

# Assuming 'df' is your original dataframe and 'descriptions' is the column containing LLM descriptions
#df['Dominant_Topic'] = document_topic_mapping

# This will show the first few rows with the assigned topic for each LLM
#print(df[['desc', 'Dominant_Topic']].head(10))

In [None]:
# Assuming 'df' is your original dataframe and 'descriptions' is the column containing LLM descriptions
#df['Dominant_Topic'] = document_topic_mapping

# This will show the first few rows with the assigned topic for each LLM
#print(df[['desc', 'Dominant_Topic']].head(10))

In [None]:
# Build the pyLDAvis data from the general model, its corpus and its dictionary
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

# Render the interactive visualization as this cell's output (inline in the notebook)
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_general.html')

## Education filter only

In [None]:
# Terms whose presence marks a description as education-related.
education_keywords = [
    'teach', 'teaching', 'learn', 'learning',
    'classroom', 'student', 'rephrase', 'plagiarism',
    'instructor', 'curriculum', 'assessment', 'homework',
    'school', 'educate', 'education', 'tutor',
]

In [None]:
def contains_education_terms(text, education_keywords):
    """Return True when at least one keyword occurs in ``text``.

    Matching uses the ``in`` operator, so for string input a keyword also
    matches inside a longer word (e.g. 'learn' matches 'learning').
    Matching is case-sensitive; callers are expected to normalize case.
    """
    for keyword in education_keywords:
        if keyword in text:
            return True
    return False

In [None]:
# Business/marketing terms used to flag descriptions that are not about education.
# All entries are lower-case because they are compared against lower-cased
# descriptions; upper-case entries such as 'CRM' or 'SEO' could never match.
exclusion_keywords = [
    'advertising', 'branding', 'budgeting', 'client', 'crm', 'customer',
    'e-commerce', 'expense', 'finance', 'investment', 'logo', 'marketing',
    'operations', 'pipeline', 'product', 'profit', 'resource allocation',
    'sales', 'seo'
]

In [None]:
def contains_exclusion_terms(text, exclusion_keywords):
    """Return True when ``text`` mentions any exclusion keyword, ignoring case.

    Both the text and the keywords are lower-cased before comparison, so
    keywords written in upper case (e.g. 'CRM', 'SEO') still match
    descriptions that were lower-cased by the caller. Matching is by
    substring, so a keyword also matches inside a longer word.

    Args:
        text: The description to inspect (a string).
        exclusion_keywords: Iterable of keyword strings.

    Returns:
        True if at least one keyword occurs in ``text``, otherwise False.
    """
    haystack = text.lower()
    return any(kw.lower() in haystack for kw in exclusion_keywords)

### Filter data

In [None]:
# Flag descriptions that mention at least one education keyword.
is_education = df['desc'].apply(lambda x: contains_education_terms(str(x).lower(), education_keywords))

# Flag descriptions that mention a business/marketing exclusion keyword.
is_excluded = df['desc'].apply(lambda x: contains_exclusion_terms(str(x).lower(), exclusion_keywords))

# Keep education-related rows and DROP the ones matching an exclusion term
# (note the ~: without it only the rows containing exclusion terms would survive).
# .copy() yields an independent frame, so columns added later are not written
# onto a slice of df.
filtered_df = df[is_education & ~is_excluded].copy()

descriptions = filtered_df['desc'].tolist()
processed_descriptions = [preprocess(desc) for desc in descriptions]

# Create dictionary and corpus
dictionary = corpora.Dictionary(processed_descriptions)
corpus = [dictionary.doc2bow(text) for text in processed_descriptions]

In [None]:
# (rows, columns) of the education-filtered dataframe
filtered_df.shape

(478, 7)

In [None]:
# (rows, columns) of the education-filtered dataframe
filtered_df.shape

(1153, 7)

### Training model education


#### in 5 topics

In [None]:
# Fit a 5-topic model on the education-filtered corpus.
# random_state fixes the seed so topic indices stay stable between runs; the
# hand-written topic summaries in this notebook are only valid for a fixed seed.
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=50, random_state=42)
# Print the ten highest-weighted words of each topic
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.017*"writing" + 0.016*"tool" + 0.014*"user" + 0.014*"content" + 0.012*"feature" + 0.011*"ai" + 0.009*"email" + 0.008*"job" + 0.008*"case" + 0.007*"professional"
Topic 1: 0.031*"content" + 0.018*"create" + 0.016*"ai" + 0.014*"user" + 0.013*"tool" + 0.012*"feature" + 0.010*"generate" + 0.009*"image" + 0.009*"creation" + 0.009*"story"
Topic 2: 0.026*"ai" + 0.021*"user" + 0.018*"learning" + 0.010*"feature" + 0.010*"personalized" + 0.009*"tool" + 0.009*"language" + 0.008*"experience" + 0.008*"offer" + 0.008*"conversation"
Topic 3: 0.022*"ai" + 0.020*"video" + 0.016*"data" + 0.013*"user" + 0.012*"tool" + 0.011*"customer" + 0.009*"learning" + 0.009*"business" + 0.008*"feature" + 0.008*"model"
Topic 4: 0.022*"user" + 0.012*"tool" + 0.011*"information" + 0.010*"feature" + 0.009*"ai" + 0.008*"summary" + 0.008*"research" + 0.007*"case" + 0.007*"time" + 0.007*"document"


In [None]:
# Build the pyLDAvis data from the 5-topic model, its corpus and its dictionary
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

# Render the interactive visualization as this cell's output (inline in the notebook)
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_5_group.html')

##### **5 Group Topics**

- **Topic 0: Personalized Learning and Creativity for Children**
  - ai, story, user, create, tool, personalized, feature, creative, unique, child

- **Topic 1: Data-Driven Learning Platforms and Chatbots**
  - ai, data, user, customer, model, learning, tool, feature, chatbot, platform

- **Topic 2: Language Learning and Skill Development**
  - user, ai, learning, language, tool, personalized, offer, experience, skill

- **Topic 3: Writing Assistance and Educational Content Creation**
  - user, tool, writing, feature, ai, content, audio, case, offer, email

- **Topic 4: Video and Multimedia Content Creation for Education**
  - content, video, ai, tool, create, feature, user, case, various, creation

#### with 7 topics

In [None]:
# Fit a 7-topic model on the education-filtered corpus.
# random_state fixes the seed so topic indices stay stable between runs; the
# hand-written topic summaries in this notebook are only valid for a fixed seed.
lda_model7 = LdaModel(corpus, num_topics=7, id2word=dictionary, passes=50, random_state=42)
# Print the ten highest-weighted words of each topic
topics7 = lda_model7.print_topics(num_words=10)
for idx, topic in topics7:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.026*"user" + 0.021*"ai" + 0.013*"learning" + 0.013*"conversation" + 0.012*"personalized" + 0.012*"language" + 0.010*"feature" + 0.009*"offer" + 0.008*"skill" + 0.007*"email"
Topic 1: 0.026*"learning" + 0.023*"ai" + 0.017*"story" + 0.011*"platform" + 0.011*"feature" + 0.010*"tool" + 0.010*"model" + 0.009*"personalized" + 0.009*"create" + 0.009*"experience"
Topic 2: 0.039*"content" + 0.023*"ai" + 0.022*"writing" + 0.015*"tool" + 0.011*"language" + 0.011*"feature" + 0.010*"text" + 0.009*"create" + 0.008*"generate" + 0.008*"various"
Topic 3: 0.016*"code" + 0.013*"tool" + 0.011*"quiz" + 0.010*"process" + 0.010*"ai" + 0.009*"question" + 0.009*"assessment" + 0.008*"coding" + 0.008*"lesson" + 0.008*"student"
Topic 4: 0.018*"ai" + 0.017*"data" + 0.016*"user" + 0.014*"customer" + 0.011*"tool" + 0.010*"feature" + 0.009*"support" + 0.009*"answer" + 0.008*"business" + 0.008*"case"
Topic 5: 0.036*"video" + 0.020*"user" + 0.013*"audio" + 0.012*"tool" + 0.012*"ai" + 0.012*"feature" + 0.010*

In [None]:
# Build the pyLDAvis data from the 7-topic model, its corpus and its dictionary
lda_display = gensimvis.prepare(lda_model7, corpus, dictionary)

# Render the interactive visualization as this cell's output (inline in the notebook)
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_7_group.html')

##### **7 Group Topics**

- **Topic 0: Customer Support and Business Insights**
  - user, ai, customer, tool, feature, insight, data, support, business, chatbot

- **Topic 1: Writing and Student Tools**
  - writing, ai, user, tool, student, feature, case, key, various, offer

- **Topic 2: Job Applications and AI Assistance**
  - data, job, ai, user, interview, email, feature, tool, resume, model

- **Topic 3: AI Coding Platforms and Developer Tools**
  - ai, code, coding, feature, user, tool, developer, platform, case, various

- **Topic 4: Learning and Quiz-Based Assessments**
  - quiz, ai, user, tool, question, learning, skill, candidate, feature, process

- **Topic 5: Multimedia Content Creation**
  - content, tool, video, create, user, ai, feature, creation, image, generate

- **Topic 6: Personalized Learning and Interactive AI**
  - user, learning, ai, personalized, language, experience, story, feature, conversation, interactive

#### with 10 topics

In [None]:
# Fit a 10-topic model on the education-filtered corpus.
# random_state fixes the seed so topic indices stay stable between runs; the
# topic_name labels defined further down are indexed by topic id and would
# point at the wrong topics if the model changed from run to run.
lda_model10 = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=30, random_state=42)
# Print the twenty highest-weighted words of each topic
topics10 = lda_model10.print_topics(num_words=20)
for idx, topic in topics10:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.032*"writing" + 0.019*"video" + 0.018*"tool" + 0.016*"content" + 0.016*"ai" + 0.013*"user" + 0.012*"feature" + 0.010*"text" + 0.009*"essay" + 0.009*"writer" + 0.009*"case" + 0.008*"enhance" + 0.008*"professional" + 0.008*"various" + 0.007*"offer" + 0.007*"student" + 0.007*"improve" + 0.007*"create" + 0.007*"academic" + 0.006*"help"
Topic 1: 0.021*"image" + 0.020*"quiz" + 0.014*"question" + 0.012*"tool" + 0.011*"user" + 0.010*"background" + 0.009*"feature" + 0.008*"ai" + 0.008*"create" + 0.007*"test" + 0.007*"case" + 0.006*"team" + 0.006*"assessment" + 0.006*"various" + 0.006*"communication" + 0.005*"editing" + 0.005*"generate" + 0.005*"audio" + 0.005*"algorithm" + 0.005*"content"
Topic 2: 0.031*"ai" + 0.009*"learning" + 0.008*"agent" + 0.008*"developer" + 0.007*"feature" + 0.006*"user" + 0.006*"tool" + 0.006*"platform" + 0.006*"code" + 0.006*"data" + 0.005*"case" + 0.005*"various" + 0.005*"database" + 0.005*"professional" + 0.004*"machine" + 0.004*"process" + 0.004*"designed

In [None]:
# Build the pyLDAvis data from the 10-topic model, its corpus and its dictionary
lda_display = gensimvis.prepare(lda_model10, corpus, dictionary)

# Render the interactive visualization as this cell's output (inline in the notebook)
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_10_group.html')

##### **10 Group Topics**


- **Topic 0: AI Tools for Data, Search, and Learning**
  - user, ai, data, tool, feature, information, case, search, code, insight

- **Topic 1: AI-Enhanced Content Creation and Job Tools**
  - content, ai, job, tool, user, interview, feature, offer, writing, creation

- **Topic 2: Customer Support and Communication Tools**
  - customer, user, chatbot, support, email, ai, business, communication, feature, personalized

- **Topic 3: Writing Assistance and Academic Tools**
  - writing, tool, user, ai, feature, summary, key, content, text, student

- **Topic 4: Personalized Learning and Language Experience**
  - ai, learning, user, language, feature, personalized, experience, story, conversation, tool

- **Topic 5: Data Analysis and Business Insights**
  - data, book, ai, insight, analysis, tool, model, compliance, business, user

- **Topic 6: Document Processing and Legal AI Tools**
  - ai, user, document, learning, tool, feature, model, machine, case, various

- **Topic 7: Image and Media Generation Tools**
  - content, create, image, ai, tool, user, generate, feature, medium, creation

- **Topic 8: Video Editing and Transcription Tools**
  - video, user, tool, audio, ai, transcription, feature, editing, youtube, candidate

- **Topic 9: Audio and Music Content Creation**
  - content, audio, voice, course, ai, user, music, feature, create, tool

In [None]:
# For every document in the corpus, record the id of its most probable topic.
doc_topics = lda_model10.get_document_topics(corpus)
document_topic_mapping = [
    max(topic_probs, key=lambda pair: pair[1])[0]  # pair[0] is the topic id, pair[1] its probability
    for topic_probs in doc_topics
]

In [None]:
# Human-readable label for each topic id of the 10-topic model;
# the position in the list is the topic id it describes.
topic_name = [
    f'Topic {topic_id}: {label}'
    for topic_id, label in enumerate([
        'AI Tools for Data, Search, and Learning',
        'AI-Enhanced Content Creation and Job Tools',
        'Customer Support and Communication Tools',
        'Writing Assistance and Academic Tools',
        'Personalized Learning and Language Experience',
        'Data Analysis and Business Insights',
        'Document Processing and Legal AI Tools',
        'Image and Media Generation Tools',
        'Video Editing and Transcription Tools',
        'Audio and Music Content Creation',
    ])
]

In [None]:
# Work on a copy: a plain assignment would only alias filtered_df, so the
# columns added below would also appear on filtered_df (and on every other
# name bound to it).
edu_with_topic = filtered_df.copy()  # alternative source: df[df['cat'] == 'AI tools for education']

# Attach each document's dominant topic id and its human-readable label.
edu_with_topic['Topic Number'] = document_topic_mapping
edu_with_topic['Topic Name'] = edu_with_topic['Topic Number'].apply(lambda x: topic_name[x])

print('shape:', edu_with_topic.shape)
edu_with_topic[['title', 'desc', 'Topic Number', 'Topic Name']].head(10)
#edu_with_topic.to_csv('edu_with_topic.csv', index=False)

shape: (1153, 9)


Unnamed: 0,title,desc,Topic Number,Topic Name
0,ChatGPT,ChatGPT FeaturesChatGPT is an AI-powered conve...,6,Topic 6: Document Processing and Legal AI Tools
1,Namelix,Namelix FeaturesNamelix is an AI-powered busin...,8,Topic 8: Video Editing and Transcription Tools
8,Piggy Magic,Piggy Magic FeaturesPiggy Magic is an AI-power...,5,Topic 5: Data Analysis and Business Insights
9,Writesonic,Writesonic FeaturesWritesonic is a unique AI-b...,5,Topic 5: Data Analysis and Business Insights
16,Humata AI,Humata AI FeaturesHumata is an AI-powered chat...,6,Topic 6: Document Processing and Legal AI Tools
23,TutorAI,TutorAI FeaturesTutor AI is an AI-powered lear...,4,Topic 4: Personalized Learning and Language Ex...
29,Stable Diffusion,Stable Diffusion FeaturesStable Diffusion exce...,5,Topic 5: Data Analysis and Business Insights
33,Jasper,Jasper FeaturesJasper is an AI-powered copywri...,5,Topic 5: Data Analysis and Business Insights
35,Autodraw,Autodraw FeaturesAutoDraw is a machine learnin...,6,Topic 6: Document Processing and Legal AI Tools
37,Fy! Studio,Fy! Studio FeaturesFy! Studio is an AI-driven ...,5,Topic 5: Data Analysis and Business Insights


In [None]:
# Display the first rows of the full dataset
df.head()

Unnamed: 0,title,minidesc,desc,cat,new_image,url-href,vote
0,ChatGPT,ChatGPT optimizes language models for dialogue,ChatGPT FeaturesChatGPT is an AI-powered conve...,AI Productivity tools,https://tasticai.com/wp-content/uploads/aitool...,https://chat.openai.com/chat,4813.0
1,Namelix,"Generate short, brandable business names & dom...",Namelix FeaturesNamelix is an AI-powered busin...,AI startup tools,https://tasticai.com/wp-content/uploads/aitool...,https://namelix.com,4012.0
2,Tome,Unlock your best work with AI-powered generati...,Tome FeaturesThe future of generative storytel...,AI presentation tools,https://tasticai.com/wp-content/uploads/aitool...,https://tome.app,3593.0
3,Fliki,Create videos from blog posts in 2 mins,Fliki FeaturesFliki is an AI-powered text-to-v...,AI video Generator tools,https://tasticai.com/wp-content/uploads/aitool...,https://fliki.ai,3580.0
4,Midjourney,AI lab exploring new mediums to expand human i...,Midjourney FeaturesMidjourney is an independen...,AI tools for art,https://tasticai.com/wp-content/uploads/aitool...,https://www.midjourney.com/home,3000.0


In [None]:
# Collect, per document, the ids of its three most probable topics.
doc_topics = lda_model10.get_document_topics(corpus)

topic_1_mapping = []
topic_2_mapping = []
topic_3_mapping = []

for doc in doc_topics:
    # Topic ids ordered from most to least probable.
    ranked_ids = [pair[0] for pair in sorted(doc, key=lambda pair: pair[1], reverse=True)]

    # Keep the first three; pad with None when fewer than three topics were returned.
    top_3_topics = (ranked_ids + [None] * 3)[:3]

    topic_1_mapping.append(top_3_topics[0])
    topic_2_mapping.append(top_3_topics[1])
    topic_3_mapping.append(top_3_topics[2])

# Human-readable label for each topic id of the 10-topic model;
# the position in the list is the topic id it describes.
topic_name = [
    f'Topic {topic_id}: {label}'
    for topic_id, label in enumerate([
        'AI Tools for Data, Search, and Learning',
        'AI-Enhanced Content Creation and Job Tools',
        'Customer Support and Communication Tools',
        'Writing Assistance and Academic Tools',
        'Personalized Learning and Language Experience',
        'Data Analysis and Business Insights',
        'Document Processing and Legal AI Tools',
        'Image and Media Generation Tools',
        'Video Editing and Transcription Tools',
        'Audio and Music Content Creation',
    ])
]

# Work on a copy: a plain assignment would only alias filtered_df, so the six
# columns added below would be written onto filtered_df itself.
edu_with_topic_3 = filtered_df.copy()

# First the three topic-id columns (ranked 1 = most probable) ...
for rank, mapping in enumerate((topic_1_mapping, topic_2_mapping, topic_3_mapping), start=1):
    edu_with_topic_3[f'Topic {rank} Number'] = mapping

# ... then the matching label columns. A missing id (document with fewer than
# three returned topics) is shown as 'N/A'; int() is needed because a column
# containing missing values stores the ids as floats.
for rank in (1, 2, 3):
    edu_with_topic_3[f'Topic {rank} Name'] = edu_with_topic_3[f'Topic {rank} Number'].apply(
        lambda x: topic_name[int(x)] if pd.notna(x) else 'N/A'
    )


print('shape:', edu_with_topic_3.shape)
edu_with_topic_3[['title', 'desc', 'Topic 1 Name', 'Topic 2 Name', 'Topic 3 Name']].head(10)
#edu_with_topic_3.head()
#edu_with_topic_3.to_csv('edu_with_top_3_topics.csv', index=False)

shape: (1153, 15)


Unnamed: 0,title,desc,Topic 1 Name,Topic 2 Name,Topic 3 Name
0,ChatGPT,ChatGPT FeaturesChatGPT is an AI-powered conve...,Topic 6: Document Processing and Legal AI Tools,Topic 5: Data Analysis and Business Insights,
1,Namelix,Namelix FeaturesNamelix is an AI-powered busin...,Topic 8: Video Editing and Transcription Tools,Topic 5: Data Analysis and Business Insights,Topic 6: Document Processing and Legal AI Tools
8,Piggy Magic,Piggy Magic FeaturesPiggy Magic is an AI-power...,Topic 5: Data Analysis and Business Insights,Topic 8: Video Editing and Transcription Tools,Topic 2: Customer Support and Communication Tools
9,Writesonic,Writesonic FeaturesWritesonic is a unique AI-b...,Topic 5: Data Analysis and Business Insights,,
16,Humata AI,Humata AI FeaturesHumata is an AI-powered chat...,Topic 6: Document Processing and Legal AI Tools,Topic 3: Writing Assistance and Academic Tools,
23,TutorAI,TutorAI FeaturesTutor AI is an AI-powered lear...,Topic 4: Personalized Learning and Language Ex...,,
29,Stable Diffusion,Stable Diffusion FeaturesStable Diffusion exce...,Topic 5: Data Analysis and Business Insights,,
33,Jasper,Jasper FeaturesJasper is an AI-powered copywri...,Topic 5: Data Analysis and Business Insights,Topic 8: Video Editing and Transcription Tools,
35,Autodraw,Autodraw FeaturesAutoDraw is a machine learnin...,Topic 6: Document Processing and Legal AI Tools,,
37,Fy! Studio,Fy! Studio FeaturesFy! Studio is an AI-driven ...,Topic 5: Data Analysis and Business Insights,Topic 8: Video Editing and Transcription Tools,


#### with 15 topics

In [None]:
# Fit a 15-topic model on the education-filtered corpus.
# random_state fixes the seed so topic indices stay stable between runs; the
# hand-written topic summaries in this notebook are only valid for a fixed seed.
lda_model15 = LdaModel(corpus, num_topics=15, id2word=dictionary, passes=30, random_state=42)
# Print the twenty highest-weighted words of each topic
topics15 = lda_model15.print_topics(num_words=20)
for idx, topic in topics15:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.011*"user" + 0.010*"background" + 0.009*"character" + 0.009*"sale" + 0.009*"ai" + 0.007*"feature" + 0.007*"various" + 0.007*"ad" + 0.007*"learning" + 0.006*"case" + 0.006*"trend" + 0.006*"food" + 0.006*"brand" + 0.006*"email" + 0.005*"conversation" + 0.005*"personalized" + 0.005*"tool" + 0.005*"offer" + 0.005*"performance" + 0.005*"solution"
Topic 1: 0.016*"business" + 0.015*"website" + 0.012*"image" + 0.012*"web" + 0.009*"model" + 0.008*"quickpenai" + 0.007*"page" + 0.007*"owner" + 0.007*"legal" + 0.007*"writeplus" + 0.007*"customer" + 0.006*"text" + 0.006*"language" + 0.006*"entrepreneur" + 0.006*"one" + 0.006*"user" + 0.005*"document" + 0.005*"optimization" + 0.005*"engine" + 0.005*"search"
Topic 2: 0.026*"ai" + 0.017*"writing" + 0.017*"user" + 0.017*"tool" + 0.013*"enhance" + 0.011*"feature" + 0.011*"prompt" + 0.010*"access" + 0.009*"productivity" + 0.008*"case" + 0.008*"seeking" + 0.008*"improve" + 0.008*"offer" + 0.007*"looking" + 0.007*"various" + 0.006*"coding" + 0.0

In [None]:
# Build the pyLDAvis data from the 15-topic model, its corpus and its dictionary
lda_display = gensimvis.prepare(lda_model15, corpus, dictionary)

# Render the interactive visualization as this cell's output (inline in the notebook)
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_15_group.html')

##### **15 Group Topics**

The complete list from **Topic 0 to Topic 14** follows; list item *n* corresponds to Topic *n − 1*:

1. **Ad and Brand Management Tools**
   - Keywords: user, background, character, sale, ai, trend, brand, food, conversation, performance

2. **Business and Website Optimization Tools**
   - Keywords: business, website, image, web, optimization, legal, search engine

3. **Productivity and Writing Assistance**
   - Keywords: ai, writing, productivity, tool, enhance, prompt, assistant, coding

4. **AI Platforms for Personalized Learning and Development**
   - Keywords: ai, platform, chatbots, create, model, personalized, grant, business

5. **AI-Driven Design and Interior Planning Tools**
   - Keywords: design, ai, tool, query, interior, platform, strategy, homeowner

6. **Investment and Crypto Insights**
   - Keywords: ai, user, investment, insight, crypto, voice, study, investor

7. **Team Productivity and Data Management**
   - Keywords: team, insight, data, management, tool, task, customer feedback

8. **Communication and Security Tools**
   - Keywords: team, security, communication, chatgpt, empathy, health, work

9. **Video and Media Creation Tools**
   - Keywords: video, tool, create, ai, transcription, background, professional

10. **Customer Support and Chatbot Solutions**
    - Keywords: customer, support, chatbot, response, service, ai, team

11. **Professional Document and Email Management**
    - Keywords: email, data, professional, communication, document, project analysis

12. **Sales and Content Verification**
    - Keywords: sale, tool, language, news, checker, article, product, grammar

13. **Content Creation and Social Media Marketing**
    - Keywords: content, user, ai, create, social media, marketing, engaging

14. **SQL and Essay Writing Assistance**
    - Keywords: essay, sql, voice, query, generator, citation, coaching

15. **Language and Grammar Practice Tools**
    - Keywords: practice, user, language, grammar, skill, vocabulary, writing, support



## NEW Filter data on columns

In [None]:
# Tally how many tools fall into each category and each category's share of all rows.
category_counts = df['cat'].value_counts()
freq = pd.DataFrame(category_counts)
freq['percentage'] = round(freq / len(df) * 100, 2)

print('total:', len(df))
freq.to_csv('freq.csv')  # persist the full table; only the top 10 are displayed below
freq.head(10)

total: 4665


Unnamed: 0_level_0,count,percentage
cat,Unnamed: 1_level_1,Unnamed: 2_level_1
AI writing tools,361,7.74
AI Productivity tools,262,5.62
AI day-to-day assistant,217,4.65
AI image generator,191,4.09
AI tools for developper,189,4.05
AI tools for education,172,3.69
AI Customer Service tools,162,3.47
Social media AI tools,153,3.28
AI startup tools,150,3.22
AI image editing tool,136,2.92


In [None]:
# Restrict the dataset to the rows whose category is 'AI tools for education'.
in_education_category = df['cat'] == 'AI tools for education'
edu_df = df[in_education_category]
print('shape:', edu_df.shape)
edu_df.head()

shape: (172, 7)


Unnamed: 0,title,minidesc,desc,cat,new_image,url-href,vote
23,TutorAI,"AI-powered learning platform; enter topic, get...",TutorAI FeaturesTutor AI is an AI-powered lear...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://www.tutorai.me,1638.0
41,PaperBrain,Platform to access & understand research paper...,PaperBrain FeaturesA platform for you to acces...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://www.paperbrain.study,1310.0
47,Wisdolia,AI generated flashcards for any article / PDF,Wisdolia FeaturesA Chrome Extension that uses ...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://wisdolia.com,1233.0
52,CheckForAI,Detect AI text in essays/emails w/ Open AI & p...,CheckForAI FeaturesCheckforAi is an AI detecti...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://checkforai.com,1152.0
77,WolframAlpha,"Compute expert-level answers in Math, Science,...",WolframAlpha FeaturesCompute expert-level answ...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://www.wolframalpha.com,848.0


In [None]:
# Tokenize and lemmatize the descriptions of the education-category tools.
descriptions = edu_df['desc'].tolist()
processed_descriptions = list(map(preprocess, descriptions))

# Rebuild the dictionary and bag-of-words corpus for this subset only.
dictionary = corpora.Dictionary(processed_descriptions)
corpus = list(map(dictionary.doc2bow, processed_descriptions))

#### NEW with 20 topics

In [None]:
# Fit a 20-topic model on the education-category corpus.
# random_state fixes the seed so topic indices stay stable between runs; the
# hand-written topic summaries in this notebook are only valid for a fixed seed.
lda_model_new_20 = LdaModel(corpus, num_topics=20, id2word=dictionary, passes=50, random_state=42)
# Print the twenty highest-weighted words of each topic
topics = lda_model_new_20.print_topics(num_words=20)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.027*"student" + 0.026*"learning" + 0.022*"lesson" + 0.019*"platform" + 0.016*"academic" + 0.016*"tool" + 0.011*"feature" + 0.011*"designed" + 0.011*"educator" + 0.011*"plan" + 0.010*"various" + 0.009*"include" + 0.009*"case" + 0.009*"grading" + 0.008*"writing" + 0.008*"ai" + 0.008*"feedback" + 0.008*"offer" + 0.008*"education" + 0.008*"advantage"
Topic 1: 0.063*"lesson" + 0.027*"langmob" + 0.024*"plan" + 0.019*"ai" + 0.017*"resource" + 0.015*"reference" + 0.012*"generative" + 0.012*"homework" + 0.012*"assistance" + 0.011*"ensuring" + 0.009*"chatbot" + 0.009*"subject" + 0.009*"upload" + 0.009*"beginner" + 0.008*"style" + 0.008*"targeted" + 0.008*"support" + 0.008*"format" + 0.007*"teaching" + 0.007*"instant"
Topic 2: 0.037*"news" + 0.023*"lesson" + 0.020*"student" + 0.019*"create" + 0.019*"tool" + 0.018*"teacher" + 0.014*"content" + 0.013*"quiz" + 0.012*"feature" + 0.012*"generate" + 0.011*"various" + 0.011*"plan" + 0.010*"planning" + 0.009*"article" + 0.009*"case" + 0.009*"e

In [None]:
# Build the pyLDAvis data from the 20-topic model, its corpus and its dictionary
lda_display = gensimvis.prepare(lda_model_new_20, corpus, dictionary)

# Render the interactive visualization as this cell's output (inline in the notebook)
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_20_group_new.html')

##### New 20 topics

1. **Student Learning and Educational Platforms**
   - Keywords: student, learning, lesson, platform, academic, tool, educator, grading, feedback, education

2. **Lesson Planning and Homework Assistance**
   - Keywords: lesson, langmob, plan, ai, resource, reference, homework, assistance, support, teaching

3. **News and Quiz-Based Learning Tools**
   - Keywords: news, lesson, student, teacher, content, quiz, generate, plan, article, engaging

4. **AI-Powered Academic Tools and Assignments**
   - Keywords: ai, user, content, technology, assignment, learning, academic, conversation, algorithm, student

5. **Coding and Educational Journey Tools**
   - Keywords: grant, coding, learning, student, educational, tool, mastery, python, performance, academic

6. **AI-Enhanced Insights for Learning**
   - Keywords: lesson, insight, student, assistant, ai, data, analysis, generate, personal, interface

7. **Course and Quiz Creation Tools**
   - Keywords: question, course, user, assessment, quiz, generate, creation, content, feature, learning

8. **AI Coaching and Career Guidance**
   - Keywords: coaching, ai, career, learning, platform, personalized, expert, guidance, communication, advice

9. **Language Learning and Speech Detection Tools**
   - Keywords: content, language, emma, skill, video, english, speech, conversational, detection, communication

10. **AI-Driven Learning Courses**
    - Keywords: learning, ai, course, feature, skill, experience, personalized, knowledge, interactive, tool

11. **Math and Personalized Learning Tools**
    - Keywords: book, math, summary, ai, problem, personalized, key, professional, learning, efficient

12. **AI for Personalized Student Support**
    - Keywords: ai, user, personalized, essay, teacher, language, recommendation, feedback, computer vision, report

13. **AI for Learning and Content Creation**
    - Keywords: ai, user, learning, interview, knowledge, insight, analysis, blog, chatgpt, understanding

14. **AI for Reading and Grammar Tools**
    - Keywords: ai, kindle, reading, grammar, basmo, book, experience, tool, checker, provide

15. **Language Learning and Conversational AI**
    - Keywords: language, learning, practice, skill, ai, conversation, personalized, experience, grammar, feedback

16. **AI-Powered Study and Quiz Tools**
    - Keywords: study, tool, quiz, feature, test, personalized, create, educator, knowledge, student

17. **Vocabulary and Math Learning for Children**
    - Keywords: learning, vocabulary, math, sentence, child, skill, map, mind, interactive, engaging

18. **Lesson Planning and Educational Resources**
    - Keywords: lesson, plan, resource, educator, ai, assessment, trivia, create, time, teaching

19. **AI-Powered Learning Platforms**
    - Keywords: learning, user, platform, buddy, brain, feedback, snackz, history, library, experience

20. **AI for Video and Animated Content Creation**
    - Keywords: ai, video, animated, content, art, create, audience, editing, caption, transform

#### NEW with 8 topics

In [None]:
# Fit an 8-topic model on the education-category corpus.
# random_state fixes the seed so topic indices stay stable between runs; the
# hand-written topic summaries in this notebook are only valid for a fixed seed.
lda_model_new_8 = LdaModel(corpus, num_topics=8, id2word=dictionary, passes=50, random_state=42)
# Print the twenty highest-weighted words of each topic
topics = lda_model_new_8.print_topics(num_words=20)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.034*"language" + 0.028*"learning" + 0.017*"ai" + 0.012*"skill" + 0.012*"tool" + 0.012*"practice" + 0.012*"feature" + 0.011*"user" + 0.009*"feedback" + 0.009*"grammar" + 0.009*"offer" + 0.008*"writing" + 0.008*"student" + 0.008*"personalized" + 0.008*"conversation" + 0.008*"learner" + 0.008*"case" + 0.007*"advanced" + 0.007*"enhance" + 0.007*"platform"
Topic 1: 0.021*"news" + 0.020*"math" + 0.016*"user" + 0.014*"tool" + 0.009*"feature" + 0.009*"personalized" + 0.009*"individual" + 0.008*"problem" + 0.008*"albertbro" + 0.007*"learning" + 0.007*"emma" + 0.007*"various" + 0.007*"student" + 0.007*"case" + 0.007*"ai" + 0.007*"search" + 0.007*"career" + 0.006*"content" + 0.005*"seeking" + 0.005*"coaching"
Topic 2: 0.027*"book" + 0.019*"user" + 0.016*"student" + 0.016*"study" + 0.012*"tool" + 0.010*"personalized" + 0.010*"answer" + 0.009*"feature" + 0.009*"question" + 0.009*"flashcard" + 0.009*"ai" + 0.008*"reading" + 0.008*"case" + 0.007*"summary" + 0.007*"help" + 0.007*"video" + 0

In [None]:
# Build the pyLDAvis data from the 8-topic model, its corpus and its dictionary
lda_display = gensimvis.prepare(lda_model_new_8, corpus, dictionary)

# Render the interactive visualization as this cell's output (inline in the notebook)
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_visualization_8_group_new.html')

**With 8 Topics:**

1. **Language Learning and Practice Tools**
   - Keywords: language, learning, ai, skill, practice, grammar, conversation, personalized

2. **Math and News-Based Problem Solving**
   - Keywords: news, math, problem, user, personalized, coaching, career

3. **Study Aids and Flashcard Tools**
   - Keywords: book, study, flashcard, question, reading, personalized, recommendation, performance

4. **Lesson Planning and Educational Resources**
   - Keywords: news, lesson, plan, resource, school, teacher, quiz, topic

5. **Interactive Language and Coding Practice**
   - Keywords: ai, learning, skill, language, coding, vocabulary, interactive, kid

6. **Personalized Online Learning Courses**
   - Keywords: learning, course, personalized, content, online, experience, knowledge, platform

7. **Video Creation and Art-Based Educational Tools**
   - Keywords: video, art, animated, quiz, knowledge, editing, content, explanation

8. **Lesson Creation and Assessment Tools**
   - Keywords: learning, lesson, question, quiz, create, educator, assessment, educational



#### NEW with 15 topics

In [None]:
# Train a 15-topic LDA model on the current corpus.
# random_state is fixed so that re-running the cell yields the same topics;
# without it every run produces different topics and any hand-written topic
# labels stop matching. NOTE(review): re-check the labels after the first
# seeded run.
lda_model_new_15 = LdaModel(
    corpus,
    num_topics=15,
    id2word=dictionary,
    passes=50,
    random_state=42,
)
# Print the 20 highest-weighted words of every topic
topics = lda_model_new_15.print_topics(num_words=20)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.052*"question" + 0.040*"quiz" + 0.023*"generate" + 0.018*"create" + 0.018*"assessment" + 0.016*"various" + 0.015*"user" + 0.013*"text" + 0.013*"tool" + 0.012*"content" + 0.012*"generation" + 0.009*"case" + 0.009*"engaging" + 0.009*"test" + 0.009*"feature" + 0.009*"teacher" + 0.008*"educator" + 0.008*"educational" + 0.008*"material" + 0.008*"creation"
Topic 1: 0.032*"retinello" + 0.015*"learner" + 0.015*"presence" + 0.012*"chat" + 0.011*"repetition" + 0.010*"question" + 0.010*"focus" + 0.009*"flashcard" + 0.009*"aicheatcheck" + 0.008*"spaced" + 0.008*"student" + 0.008*"content" + 0.007*"future" + 0.007*"thinking" + 0.006*"policy" + 0.006*"authenticity" + 0.006*"accuracy" + 0.006*"university" + 0.006*"maintain" + 0.006*"work"
Topic 2: 0.050*"learning" + 0.042*"ai" + 0.013*"personalized" + 0.013*"experience" + 0.010*"knowledge" + 0.010*"user" + 0.010*"video" + 0.010*"content" + 0.009*"feature" + 0.008*"skill" + 0.008*"enhance" + 0.008*"engaging" + 0.007*"language" + 0.007*"crea

In [None]:
# Build the pyLDAvis data structure for the 15-topic model
lda_display = gensimvis.prepare(lda_model_new_15, corpus, dictionary)

# Render the interactive visualization inline in the notebook output
pyLDAvis.display(lda_display)

**With 15 Topics:**


1. **Educational Quizzes and Assessment Creation**
   - Keywords: question, quiz, generate, create, assessment, educator, engaging

2. **Study Tools with Retention and Authenticity Focus**
   - Keywords: retinello, learner, presence, repetition, flashcard, thinking, accuracy, authenticity

3. **AI-Enhanced Personalized Learning Experiences**
   - Keywords: learning, ai, personalized, experience, skill, engaging, platform, interactive

4. **Coding and Technical Interview Preparation**
   - Keywords: ai, coding, interview, tool, analysis, reading, skill, support

5. **Language Practice and Personalized Lessons**
   - Keywords: learning, language, lesson, practice, vocabulary, skill, platform, english

6. **Grammar and Math Assistance Tools**
   - Keywords: grammar, language, math, writing, question, support, problem-solving, buddy

7. **News Summaries and Expert Coaching Platforms**
   - Keywords: news, luminary, expert, coaching, guidance, educational, article, summary

8. **Language Learning and Conversational Skills Development**
   - Keywords: language, learning, conversational, skill, interactive, academic, feedback

9. **Exam Preparation and Study Planning**
   - Keywords: learning, study, personalized, bloom, exam, performance, content, progress

10. **Video Content Creation and Educational Insights**
    - Keywords: video, content, student, educator, youtube, lesson, insight, summary

11. **Academic Support and Grant Assistance for Students**
    - Keywords: student, grant, assignment, academic, school, assistant, math, high school

12. **Online Course Creation and Interactive Learning**
    - Keywords: course, creation, coursebox, online, flashcard, instructor, interactive

13. **Language Fluency and Conversational Practice**
    - Keywords: language, english, skill, conversation, fluency, grammar, speaking

14. **Book Recommendations and Reading Engagement**
    - Keywords: book, reading, recommendation, personalized, insight, summary, academic

15. **Lesson Planning and Educational Content Development**
    - Keywords: lesson, platform, plan, news, teacher, educator, teaching, assessment


## NEW: Filter the data on more category groups

In [None]:
# Tool categories (values of the 'cat' column) kept for this analysis
categories = [
    "AI tools for education",
    "AI writing tools",
    "AI tools for research",
    "AI summarizer tools",
    "AI coding tools",
    "AI presentation tools",
]

# Keep the rows whose 'cat' is one of the categories above.
# .copy() makes filtered_df an independent frame: later cells add columns to
# it, and doing that on a slice of df is chained assignment
# (SettingWithCopyWarning, hidden here because warnings are silenced).
filtered_df = df[df['cat'].isin(categories)].copy()
filtered_df.head()

Unnamed: 0,title,minidesc,desc,cat,new_image,url-href,vote
2,Tome,Unlock your best work with AI-powered generati...,Tome FeaturesThe future of generative storytel...,AI presentation tools,https://tasticai.com/wp-content/uploads/aitool...,https://tome.app,3593.0
5,Codeium,AI-powered code acceleration toolkit to code s...,Codeium FeaturesCodeium is an AI-powered code ...,AI coding tools,https://tasticai.com/wp-content/uploads/aitool...,https://www.codeium.com,2821.0
11,Copy.ai,AI-powered copywriter for marketing: get start...,Copy.ai FeaturesCopy.ai is an AI-powered conte...,AI writing tools,https://tasticai.com/wp-content/uploads/aitool...,https://www.copy.ai,2274.0
23,TutorAI,"AI-powered learning platform; enter topic, get...",TutorAI FeaturesTutor AI is an AI-powered lear...,AI tools for education,https://tasticai.com/wp-content/uploads/aitool...,https://www.tutorai.me,1638.0
24,Adcreative.ai,Create conversion-focused ads & posts quickly ...,Adcreative.ai FeaturesAdCreative.ai is an AI-p...,AI writing tools,https://tasticai.com/wp-content/uploads/aitool...,https://adcreative.ai,1634.0


In [None]:
# Number of tools per category, as a one-column DataFrame
freq = pd.DataFrame(filtered_df['cat'].value_counts())
print('total:', len(filtered_df))
freq.head(6)

total: 894


Unnamed: 0_level_0,count
cat,Unnamed: 1_level_1
AI writing tools,361
AI tools for education,172
AI tools for research,114
AI summarizer tools,111
AI coding tools,105
AI presentation tools,31


In [None]:
# Tokenize every description; list order follows the row order of filtered_df
descriptions = filtered_df['desc'].tolist()
processed_descriptions = list(map(preprocess, descriptions))

# Rebuild the gensim dictionary and bag-of-words corpus.
# NOTE: this rebinds `dictionary` and `corpus`, replacing any earlier ones.
dictionary = corpora.Dictionary(processed_descriptions)
corpus = list(map(dictionary.doc2bow, processed_descriptions))

#### More groups with 8 topics

In [None]:
# To reuse a previously saved model instead of retraining, uncomment:
#lda_model_group_8 = LdaModel.load("lda_model_group_8.model")

# Train an 8-topic LDA model on the multi-category corpus.
# random_state is fixed so that topic ids are stable across runs: the
# notebook maps topic ids to hand-written names, which is only valid if
# topic i means the same thing on every run.
# NOTE(review): re-check the hand-written topic names after the first seeded run.
lda_model_group_8 = LdaModel(
    corpus,
    num_topics=8,
    id2word=dictionary,
    passes=60,
    random_state=42,
)
# Print the 20 highest-weighted words of every topic
topics = lda_model_group_8.print_topics(num_words=20)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.024*"ai" + 0.014*"essay" + 0.014*"tool" + 0.013*"student" + 0.013*"text" + 0.013*"writing" + 0.011*"content" + 0.010*"lesson" + 0.010*"question" + 0.010*"feature" + 0.009*"create" + 0.008*"quiz" + 0.008*"academic" + 0.007*"course" + 0.007*"plan" + 0.006*"generate" + 0.006*"platform" + 0.006*"case" + 0.006*"user" + 0.006*"process"
Topic 1: 0.042*"code" + 0.017*"ai" + 0.014*"coding" + 0.014*"developer" + 0.012*"tool" + 0.010*"feature" + 0.009*"process" + 0.009*"user" + 0.008*"team" + 0.007*"case" + 0.007*"development" + 0.006*"time" + 0.006*"programming" + 0.005*"help" + 0.005*"summary" + 0.005*"podcast" + 0.005*"language" + 0.005*"insight" + 0.005*"meeting" + 0.004*"customer"
Topic 2: 0.033*"presentation" + 0.015*"newsletter" + 0.014*"create" + 0.012*"feature" + 0.011*"slide" + 0.009*"tool" + 0.009*"user" + 0.008*"case" + 0.008*"professional" + 0.007*"interactive" + 0.007*"various" + 0.007*"visually" + 0.007*"video" + 0.007*"engaging" + 0.006*"offer" + 0.006*"platform" + 0.00

In [None]:
# Save the trained 8-topic model to disk.
# NOTE(review): the file name used here ("lda_model_group_8_4.model") differs
# from the one in the commented-out LdaModel.load(...) calls
# ("lda_model_group_8.model") - confirm which file is meant to be reloaded.
lda_model_group_8.save("lda_model_group_8_4.model")

In [None]:
# Write one "Topic <id>: <top-10 words>" line per topic to a text file
with open("lda_topics.txt", "w") as file:
    file.writelines(
        f"Topic {idx}: {topic}\n"
        for idx, topic in lda_model_group_8.print_topics(num_words=10)
    )

In [None]:
import os

In [None]:
# List the entries of the current working directory
os.listdir()

['.config', 'lda_model_group_8.model', 'sample_data']

In [None]:
#lda_model_group_8 = LdaModel.load("lda_model_group_8.model")

In [None]:
# Prepare the visualization
#lda_display = gensimvis.prepare(lda_model_group_8, corpus, dictionary)

# Show the visualization in a browser
#pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_model_group_8.html')

**With 8 Topics:**

1. **AI Writing and Essay Tools**  
   (Focuses on tools for writing essays, generating content, and supporting various writing tasks.)

2. **Interactive Learning and Educational Videos**  
   (Highlights tools for creating engaging, interactive educational content like videos and animations.)

3. **Language Learning and Personalized Skill Development**  
   (Focuses on tools for learning languages, personalized feedback, and skill practice through AI.)

4. **Data Analysis and Business Insights Tools**  
   (Covers tools for analyzing data, providing insights, and supporting customer-focused business analysis.)

5. **AI Coding and Developer Tools**  
   (Emphasizes tools for coding, programming, and supporting developers in software development tasks.)

6. **AI Summarization and Research Assistance**  
   (Focuses on summarizing information, assisting with research, and delivering concise video summaries.)

7. **AI Tools for Writing and Text Processing**  
   (Combines tools for text processing, learning content generation, and writing enhancement.)

8. **AI Content Creation and Generation Tools**  
   (Highlights tools for creating and generating content, supporting processes, and facilitating creative workflows.)


In [None]:
# Top-5 topics per document, strongest first.
# topic_k_mapping[i] is the id of the k-th strongest topic of document i,
# or None when the model returned fewer than k topics for that document.
doc_topics = lda_model_group_8.get_document_topics(corpus)
topic_1_mapping, topic_2_mapping, topic_3_mapping, topic_4_mapping, topic_5_mapping = (
    [] for _ in range(5)
)
rank_slots = (
    topic_1_mapping,
    topic_2_mapping,
    topic_3_mapping,
    topic_4_mapping,
    topic_5_mapping,
)

for doc in doc_topics:
    # Order the (topic_id, probability) pairs by descending probability
    by_weight = sorted(doc, key=lambda pair: pair[1], reverse=True)

    # Keep at most five ids and pad with None up to exactly five entries
    top_ids = [pair[0] for pair in by_weight[:5]]
    top_ids += [None] * (5 - len(top_ids))

    for slot, topic_id in zip(rank_slots, top_ids):
        slot.append(topic_id)

In [None]:
# Display name for each topic id of the 8-topic model; the list index is the
# topic id. NOTE(review): the labels were written by hand for one particular
# training run - confirm they still match the model in memory.
topic_name = [
    f'Topic {topic_id}: {label}'
    for topic_id, label in enumerate((
        'AI Writing and Essay Tools',
        'Interactive Learning and Educational Videos',
        'Language Learning and Personalized Skill Development',
        'Data Analysis and Business Insights Tools',
        'AI Coding and Developer Tools',
        'AI Summarization and Research Assistance',
        'AI Tools for Writing and Text Processing',
        'AI Content Creation and Generation Tools',
    ))
]

In [None]:
# Work on an independent copy. A plain `= filtered_df` only creates a second
# name for the same object, so every column added below would also appear in
# filtered_df and in everything later derived or exported from it.
edu_with_topic_5 = filtered_df.copy()

topic_mappings = (
    topic_1_mapping,
    topic_2_mapping,
    topic_3_mapping,
    topic_4_mapping,
    topic_5_mapping,
)

# 'Topic k Number': id of the k-th strongest topic (missing when the document
# has fewer than k topics). All Number columns are added before the Name
# columns to keep the original column order.
for rank, mapping in enumerate(topic_mappings, start=1):
    edu_with_topic_5[f'Topic {rank} Number'] = mapping

# 'Topic k Name': readable label looked up in topic_name, 'N/A' when missing.
# int(x) is needed because a column holding None values is stored as float.
for rank in range(1, len(topic_mappings) + 1):
    edu_with_topic_5[f'Topic {rank} Name'] = edu_with_topic_5[f'Topic {rank} Number'].apply(
        lambda x: topic_name[int(x)] if pd.notna(x) else 'N/A'
    )


In [None]:
# (rows, columns) of the top-5 topic table
edu_with_topic_5.shape

(894, 17)

In [None]:
# For k = 1..5, report how many rows have a k-th topic assigned
# (rows whose 'Topic k Number' is not missing).
total_rows = len(edu_with_topic_5)
for i in range(1, 6):
    missing = edu_with_topic_5['Topic {} Number'.format(i)].isna().sum()
    print('We have {} LLMs with {} topic allocated.'.format(total_rows - missing, i))

We have 894 LLMs with 1 topic allocated.
We have 691 LLMs with 2 topic allocated.
We have 347 LLMs with 3 topic allocated.
We have 114 LLMs with 4 topic allocated.
We have 28 LLMs with 5 topic allocated.


In [None]:
#edu_with_topic_5.to_csv('only_edu_with_top_5_topics.csv', index=False)

In [None]:
# Top-3 topics per document, strongest first.
# topic_k_mapping[i] is the id of the k-th strongest topic of document i,
# or None when the model returned fewer than k topics for that document.
doc_topics = lda_model_group_8.get_document_topics(corpus)
topic_1_mapping = []
topic_2_mapping = []
topic_3_mapping = []

for doc in doc_topics:
    # Order the (topic_id, probability) pairs by descending probability
    sorted_topics = sorted(doc, key=lambda x: x[1], reverse=True)

    # Keep at most three ids and pad with None up to exactly three entries
    top_3_topics = [topic[0] for topic in sorted_topics[:3]]
    top_3_topics += [None] * (3 - len(top_3_topics))

    topic_1_mapping.append(top_3_topics[0])
    topic_2_mapping.append(top_3_topics[1])
    topic_3_mapping.append(top_3_topics[2])

# Work on an independent copy. A plain `= filtered_df` only creates a second
# name for the same object, so every column added below would also appear in
# filtered_df and in everything later derived or exported from it.
edu_with_topic_3 = filtered_df.copy()

# All Number columns are added before the Name columns to keep the original
# column order.
for rank, mapping in enumerate((topic_1_mapping, topic_2_mapping, topic_3_mapping), start=1):
    edu_with_topic_3[f'Topic {rank} Number'] = mapping

# Readable label looked up in topic_name, 'N/A' when the topic is missing.
# int(x) is needed because a column holding None values is stored as float.
for rank in range(1, 4):
    edu_with_topic_3[f'Topic {rank} Name'] = edu_with_topic_3[f'Topic {rank} Number'].apply(
        lambda x: topic_name[int(x)] if pd.notna(x) else 'N/A'
    )

In [None]:
# (rows, columns) of the top-3 topic table
edu_with_topic_3.shape

(894, 13)

In [None]:
# For k = 1..3, report how many rows have a k-th topic assigned
# (rows whose 'Topic k Number' is not missing).
total_rows = len(edu_with_topic_3)
for i in range(1, 4):
    missing = edu_with_topic_3['Topic {} Number'.format(i)].isna().sum()
    print('We have {} LLMs with {} topic allocated.'.format(total_rows - missing, i))

We have 894 LLMs with 1 topic allocated.
We have 691 LLMs with 2 topic allocated.
We have 347 LLMs with 3 topic allocated.


In [None]:
#edu_with_topic_3.to_csv('only_edu_with_top_3_topics.csv', index=False)

#### More groups with 10 topics

In [None]:
# Train a 10-topic LDA model on the multi-category corpus.
# random_state is fixed so that re-running the cell yields the same topics;
# without it every run produces different topics and any hand-written topic
# labels stop matching. NOTE(review): re-check the labels after the first
# seeded run.
lda_model_group_10 = LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=50,
    random_state=42,
)
# Print the 20 highest-weighted words of every topic
topics = lda_model_group_10.print_topics(num_words=20)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.063*"content" + 0.023*"ai" + 0.018*"creation" + 0.017*"create" + 0.017*"generate" + 0.015*"tool" + 0.014*"user" + 0.013*"blog" + 0.011*"engaging" + 0.011*"feature" + 0.010*"marketing" + 0.010*"generation" + 0.010*"post" + 0.009*"case" + 0.009*"medium" + 0.009*"social" + 0.009*"process" + 0.009*"various" + 0.008*"time" + 0.008*"copy"
Topic 1: 0.014*"business" + 0.012*"ai" + 0.011*"feature" + 0.010*"tool" + 0.009*"case" + 0.009*"user" + 0.008*"help" + 0.008*"idea" + 0.008*"product" + 0.007*"design" + 0.006*"generate" + 0.006*"various" + 0.006*"name" + 0.005*"key" + 0.005*"offer" + 0.005*"create" + 0.005*"brand" + 0.005*"include" + 0.005*"analysis" + 0.004*"process"
Topic 2: 0.058*"writing" + 0.019*"tool" + 0.017*"user" + 0.017*"content" + 0.016*"text" + 0.015*"ai" + 0.014*"writer" + 0.014*"feature" + 0.013*"essay" + 0.010*"enhance" + 0.009*"grammar" + 0.009*"improve" + 0.008*"assistance" + 0.008*"offer" + 0.007*"case" + 0.007*"help" + 0.007*"professional" + 0.006*"support" + 0

In [None]:
# Build the pyLDAvis data structure for the 10-topic model
lda_display = gensimvis.prepare(lda_model_group_10, corpus, dictionary)

# Render the interactive visualization inline in the notebook output
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_model_group_10.html')

**With 10 Topics:**

1. **Content Creation and Marketing Tools**  
   (Focuses on creating and generating engaging content, with mentions of blogs, marketing, and social media.)

2. **AI Tools for Business and Branding**  
   (Covers business-related applications such as branding, product design, and analysis.)

3. **AI-Powered Writing and Grammar Assistance**  
   (Highlights tools for writing, text enhancement, essay assistance, and grammar improvement.)

4. **Web Content and Academic Tools**  
   (Emphasizes AI tools for website management, academic solutions, and content detection.)

5. **Presentation and Summarization Tools**  
   (Focuses on tools for creating presentations, summarizing articles or documents, and quizzes.)

6. **AI for Collaboration and Task Management**  
   (Includes tools for team collaboration, Slack integration, task handling, and conversation-based AI.)

7. **AI for Data Analysis and Lesson Planning**  
   (Focuses on AI-powered tools for data handling, lesson planning, and education-related processes.)

8. **AI Video Analysis and Research Tools**  
   (Covers AI tools for analyzing videos, generating summaries, and supporting research and knowledge building.)

9. **AI Coding and Programming Tools**  
   (Emphasizes tools for coding, programming practice, skill development, and improving coding proficiency.)

10. **AI for Personalized News and Insights**  
   (Highlights tools for staying informed with personalized news, podcasts, and expert insights.)

#### More groups with 15 topics

In [None]:
# Train a 15-topic LDA model on the multi-category corpus.
# random_state is fixed so that re-running the cell yields the same topics;
# without it every run produces different topics and any hand-written topic
# labels stop matching. NOTE(review): re-check the labels after the first
# seeded run.
lda_model_group_15 = LdaModel(
    corpus,
    num_topics=15,
    id2word=dictionary,
    passes=50,
    random_state=42,
)
# Print the 20 highest-weighted words of every topic
topics = lda_model_group_15.print_topics(num_words=20)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.026*"question" + 0.026*"quiz" + 0.018*"user" + 0.016*"create" + 0.015*"generate" + 0.013*"tool" + 0.011*"feature" + 0.011*"test" + 0.011*"assessment" + 0.011*"various" + 0.011*"generation" + 0.010*"web" + 0.009*"case" + 0.008*"text" + 0.006*"learning" + 0.006*"creation" + 0.006*"process" + 0.005*"based" + 0.005*"educator" + 0.005*"generator"
Topic 1: 0.040*"data" + 0.017*"learning" + 0.013*"language" + 0.009*"tool" + 0.009*"feature" + 0.007*"map" + 0.007*"user" + 0.006*"offer" + 0.005*"plan" + 0.005*"article" + 0.005*"getbotz" + 0.005*"platform" + 0.005*"insight" + 0.005*"intelligence" + 0.005*"explore" + 0.005*"kid" + 0.004*"unriddle" + 0.004*"vocabulary" + 0.004*"journey" + 0.004*"mind"
Topic 2: 0.046*"book" + 0.033*"podcast" + 0.030*"summary" + 0.016*"reading" + 0.012*"ai" + 0.010*"user" + 0.010*"access" + 0.009*"tool" + 0.009*"podcasts" + 0.009*"insight" + 0.009*"personalized" + 0.008*"favorite" + 0.008*"time" + 0.008*"stay" + 0.007*"recommendation" + 0.007*"case" + 0.00

In [None]:
# Build the pyLDAvis data structure for the 15-topic model
lda_display = gensimvis.prepare(lda_model_group_15, corpus, dictionary)

# Render the interactive visualization inline in the notebook output
pyLDAvis.display(lda_display)

In [None]:
#pyLDAvis.save_html(lda_display, 'lda_model_group_15.html')

**With 15 Topics:**


1. **Quiz and Test Generation Tools**  
   (Focuses on generating quizzes, tests, assessments, and tools for educators.)

2. **AI Tools for Data Insights and Exploration**  
   (Covers tools for data-driven learning, mapping, insights, and educational journeys.)

3. **Book Summaries and Personalized Podcast Recommendations**  
   (Focuses on summarizing books, podcasts, and providing personalized recommendations.)

4. **AI Research Tools and Document Summarization**  
   (Highlights tools for research, summarizing documents, and analyzing content.)

5. **AI Writing and Grammar Enhancement Tools**  
   (Includes tools for writing essays, grammar correction, academic assistance, and student help.)

6. **AI Learning Assistants and Math Coaching**  
   (Focuses on conversational AI for personalized learning, math support, and knowledge enhancement.)

7. **Newsletter and Text Generation Tools**  
   (Covers tools for generating newsletters, text, and analysis for various professional needs.)

8. **AI News Aggregators and Insight Platforms**  
   (Highlights tools for staying informed, news aggregation, and data insights.)

9. **Coding and Software Development Support Tools**  
   (Focuses on AI for coding, programming, and streamlining software development.)

10. **Language Learning and Conversational AI Tools**  
   (Includes tools for learning new languages, conversational AI, and improving speaking skills.)

11. **AI for Content Creation and Social Media Management**  
   (Focuses on content generation for blogs, social media, and marketing purposes.)

12. **AI Presentation and Business Collaboration Tools**  
   (Includes tools for creating presentations, team collaboration, and professional insights.)

13. **Creative AI for Modeling and Application Design**  
   (Highlights AI for creating content, generating names, and supporting creative solutions.)

14. **AI Platforms for Personalized Student Learning**  
   (Focuses on personalized learning platforms, courses, and student support.)

15. **Adaptive Learning and Skill Development Tools**  
   (Highlights tools for personalized learning experiences, skill building, and individual development.)


### Metacognitive

In [None]:
# Keyword lists for the three learning dimensions (Cognitive, Metacognitive,
# Affective). NOTE: this rebinds `topics`, which earlier cells used for the
# output of print_topics().
# "understanding" is listed under both Cognitive and Affective.
# NOTE(review): "keyword" in the Metacognitive list looks like a stray column
# header rather than a real search term - confirm.
topics = {
    "Cognitive": [
        "learning", "analysis", "knowledge", "understanding", "intelligence",
        "comprehension", "creativity", "thinking", "attention", "evaluation",
        "synthesis", "memory", "reasoning"
    ],
    "Metacognitive": [
        "keyword", "adapt", "evaluate", "evaluation", "monitor", "plan", "adjustment"
    ],
    "Affective": [
        "appreciation", "balance", "care", "collaboration", "community", "connection",
        "drive", "emotions", "empathy", "engagement", "feeling", "friendliness",
        "gratitude", "inspiration", "interaction", "joy", "love", "mindfulness",
        "mood", "motivation", "positivity", "relationship", "resilience",
        "satisfaction", "sentiment", "social", "stress", "support", "trust",
        "understanding"
    ]
}

In [None]:
# Run every keyword through the lemmatizer so the keywords have the same
# form as the lemmatized description tokens they are compared with
lemmatized_topics = {
    name: list(map(lemmatizer.lemmatize, words))
    for name, words in topics.items()
}

def check_topic_relevance(text_tokens, lemmatized_keywords):
    """Return True if at least one keyword occurs in *text_tokens*.

    Args:
        text_tokens: Container of tokens to search (tested with ``in``).
        lemmatized_keywords: Iterable of keywords to look for.

    Returns:
        True as soon as one keyword is found, otherwise False (also when
        there are no keywords).
    """
    for keyword in lemmatized_keywords:
        if keyword in text_tokens:
            return True
    return False

def calculate_relevance(text_tokens, keywords):
    """Score how strongly a token list relates to a keyword list.

    Every token adds 1 when it is itself a keyword. Otherwise it adds 0.5
    when ``fuzz.partial_ratio`` against at least one keyword is above 80.
    All other tokens add nothing.

    Args:
        text_tokens: Iterable of tokens from one description.
        keywords: Collection of keywords for one topic.

    Returns:
        The accumulated score (0 when nothing matches).
    """
    total = 0
    for tok in text_tokens:
        if tok in keywords:
            total += 1  # exact match
            continue
        # Fuzzy match: stop at the first keyword above the threshold
        for candidate in keywords:
            if fuzz.partial_ratio(tok, candidate) > 80:
                total += 0.5
                break
    return total

# Add one boolean "<topic>_Related" column per topic.
# The descriptions are tokenized once and the tokens reused for every topic;
# tokenizing inside the loop would repeat the same preprocess() work per topic.
desc_tokens = filtered_df['desc'].apply(preprocess)
for topic, keywords in lemmatized_topics.items():
    filtered_df[f"{topic}_Related"] = desc_tokens.apply(
        lambda tokens: check_topic_relevance(tokens, keywords)
    )

In [None]:
# One subset per topic: the rows whose "<topic>_Related" flag is True
cognitive_related = filtered_df.loc[filtered_df["Cognitive_Related"]]
metacognitive_related = filtered_df.loc[filtered_df["Metacognitive_Related"]]
affective_related = filtered_df.loc[filtered_df["Affective_Related"]]

In [None]:
# (rows, columns) of each subset: cognitive, metacognitive, affective
print(
    cognitive_related.shape,
    metacognitive_related.shape,
    affective_related.shape,
    sep="\n",
)

(414, 10)
(143, 10)
(502, 10)


In [None]:
# (rows, columns) of the filtered table
filtered_df.shape

(894, 11)

In [None]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
from fuzzywuzzy import fuzz

In [None]:
# Keyword lists for the three learning dimensions, used for relevance scoring.
# "understanding" is listed under both Cognitive and Affective.
# NOTE(review): "keyword" in the Metacognitive list looks like a stray column
# header rather than a real search term - confirm.
# NOTE(review): these keywords are compared against lemmatized tokens; a
# plural such as "emotions" presumably never matches exactly - confirm whether
# the keywords should be lemmatized too.
keywords = {
    "Cognitive": [
        "learning", "analysis", "knowledge", "understanding", "intelligence",
        "comprehension", "creativity", "thinking", "attention", "evaluation",
        "synthesis", "memory", "reasoning"
    ],
    "Metacognitive": [
        "keyword", "adapt", "evaluate", "evaluation", "monitor", "plan",
        "adjustment", "adaptability", "reflection", "regulating", "review", "planning"
    ],
    "Affective": [
        "appreciation", "balance", "care", "collaboration", "community", "connection",
        "drive", "emotions", "empathy", "engagement", "feeling", "friendliness",
        "gratitude", "inspiration", "interaction", "joy", "love", "mindfulness",
        "mood", "motivation", "positivity", "relationship", "resilience", "satisfaction",
        "sentiment", "social", "stress", "support", "trust", "understanding"
    ]
}

def preprocess(text):
    """Turn a description into a list of lemmatized tokens.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that are in ``stop_words`` are dropped, and the remaining
    ones are lemmatized.

    Args:
        text: The description. Anything that is not a ``str`` (for example a
            missing value) is treated as empty.

    Returns:
        List of lemmatized tokens; ``[]`` for non-string input.
    """
    if not isinstance(text, str):
        return []
    cleaned = []
    for word in word_tokenize(text.lower()):
        if word.isalpha() and word not in stop_words:
            cleaned.append(lemmatizer.lemmatize(word))
    return cleaned

def calculate_relevance(text_tokens, keywords):
    """Score how strongly a token list relates to a keyword list.

    Every token adds 1 when it is itself a keyword. Otherwise it adds 0.5
    when ``fuzz.partial_ratio`` against at least one keyword is above 80.
    All other tokens add nothing.

    Args:
        text_tokens: Iterable of tokens from one description.
        keywords: Collection of keywords for one topic.

    Returns:
        The accumulated score (0 when nothing matches).
    """
    total = 0
    for tok in text_tokens:
        if tok in keywords:
            total += 1  # exact match
            continue
        # Fuzzy match: stop at the first keyword above the threshold
        for candidate in keywords:
            if fuzz.partial_ratio(tok, candidate) > 80:
                total += 0.5
                break
    return total

# Tokenize each description once and keep the tokens next to the row
filtered_df['Processed_Text'] = filtered_df['desc'].apply(preprocess)

# One "<topic>_Score" column per topic, computed from the stored tokens
for topic in keywords:
    column_name = f"{topic}_Score"
    topic_keywords = keywords[topic]
    filtered_df[column_name] = filtered_df['Processed_Text'].apply(
        lambda tokens: calculate_relevance(tokens, topic_keywords)
    )

# Per topic: keep the rows with a positive score, highest score first
cognitive_related = (
    filtered_df.loc[filtered_df['Cognitive_Score'] > 0]
    .sort_values('Cognitive_Score', ascending=False)
)
metacognitive_related = (
    filtered_df.loc[filtered_df['Metacognitive_Score'] > 0]
    .sort_values('Metacognitive_Score', ascending=False)
)
affective_related = (
    filtered_df.loc[filtered_df['Affective_Score'] > 0]
    .sort_values('Affective_Score', ascending=False)
)

# Write each per-topic subset to a CSV file in the working directory,
# without the index column
cognitive_related.to_csv('cognitive_related_llms.csv', index=False)
metacognitive_related.to_csv('metacognitive_related_llms.csv', index=False)
affective_related.to_csv('affective_related_llms.csv', index=False)

In [None]:
# Write the same per-topic subsets as Excel workbooks, without the index column
cognitive_related.to_excel('cognitive_related_llms.xlsx', index=False)
metacognitive_related.to_excel('metacognitive_related_llms.xlsx', index=False)
affective_related.to_excel('affective_related_llms.xlsx', index=False)

In [None]:
# Distinct Cognitive_Score values, in order of first appearance
filtered_df.Cognitive_Score.unique()

array([ 0. ,  0.5,  1.5,  2.5,  3. ,  1. ,  2. ,  3.5,  4.5,  5. ,  4. ,
        5.5,  9.5,  7.5,  6.5,  8.5, 16.5,  6. , 11.5, 18.5,  9. , 10.5,
       17.5,  8. , 11. , 12.5,  7. , 12. , 13.5, 15. , 15.5, 13. , 18. ,
       17. , 14.5, 25.5, 10. ])

In [None]:
# Distinct Affective_Score values, in order of first appearance
filtered_df.Affective_Score.unique()

array([ 0. ,  3.5,  2. ,  0.5,  2.5,  1.5,  3. ,  6. ,  1. ,  6.5,  4.5,
        5.5,  8. , 12. ,  5. , 12.5,  7. ,  9. ,  4. , 10. , 13.5,  8.5,
       11. , 10.5,  7.5, 11.5, 15. ,  9.5, 13. , 16. ])

In [None]:
# Distinct Metacognitive_Score values, in order of first appearance
filtered_df.Metacognitive_Score.unique()

In [None]:
# (rows, columns) of each scored subset: cognitive, affective, metacognitive
print(
    cognitive_related.shape,
    affective_related.shape,
    metacognitive_related.shape,
    sep="\n",
)

(756, 11)
(824, 11)
(689, 11)


In [None]:
# Mount Google Drive at /content/drive (only available on Google Colab).
# NOTE(review): the export in the next cell writes to a relative path, which
# presumably is not on the mounted drive - confirm the intended location.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Export the full filtered table, with every column added so far, to Excel
filtered_df.to_excel('edu_LLMs_related_keyword.xlsx', index=False)