In [21]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import matplotlib.pyplot as plt
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
import re

In [22]:
# Load abstracts from CSV (adjust path as needed)
df = pd.read_csv("scopus.csv")
df = df[df['Abstract']!='[No abstract available]']

In [23]:
abstracts = df['Abstract'].dropna().tolist()  # Remove missing values

# Verify data
print(f"Number of abstracts: {len(abstracts)}")
print("Sample abstract:", abstracts[0][:200])  # Preview first 200 chars

Number of abstracts: 1371
Sample abstract: The idea of sustainable cities has drawn a lot of attention due to the quick expansion of metropolitan areas as well as the growing problems brought on by resource scarcity and climate change. Cities 


In [24]:
print("Loading embedding model...")
embedding_model = SentenceTransformer('all-mpnet-base-v2')
print("Calculating document embeddings...")
document_embeddings = embedding_model.encode(abstracts, show_progress_bar=True)
print("Embeddings calculated.")

Loading embedding model...
Calculating document embeddings...


Batches: 100%|██████████| 43/43 [07:49<00:00, 10.93s/it]

Embeddings calculated.





In [31]:
# Load embedding model
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)
countv = CountVectorizer(stop_words='english')
representation_models = KeyBERTInspired()

# Initialize BERTopic with NMF
topic_model = BERTopic(
    embedding_model=embedding_model,
    representation_model=representation_models,
    vectorizer_model=countv,
    ctfidf_model=ctfidf_model,
    verbose=True
)
print("Fitting BERTopic model using pre-calculated embeddings...")
topic_model.fit(abstracts, embeddings=document_embeddings)
print("BERTopic model fitted.")

print("Transforming documents to get topic assignments and probabilities...")
topics, probabilities = topic_model.transform(abstracts, embeddings=document_embeddings)
print("Transformation complete.")

topic_model.reduce_topics(abstracts)
# View topics
for topic_id, words in topic_model.get_topics().items():
    if topic_id != -1:  # Exclude outlier topic (-1)
        print(f"Topic {topic_id + 1}: {[word for word, _ in words]}")

2025-04-28 11:58:40,185 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Fitting BERTopic model using pre-calculated embeddings...


2025-04-28 11:58:43,820 - BERTopic - Dimensionality - Completed ✓
2025-04-28 11:58:43,822 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-28 11:58:43,913 - BERTopic - Cluster - Completed ✓
2025-04-28 11:58:43,913 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-28 11:59:59,979 - BERTopic - Representation - Completed ✓
2025-04-28 12:00:00,316 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-28 12:00:00,325 - BERTopic - Dimensionality - Completed ✓
2025-04-28 12:00:00,325 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-28 12:00:00,371 - BERTopic - Cluster - Completed ✓
2025-04-28 12:00:00,375 - BERTopic - Topic reduction - Reducing number of topics
2025-04-28 12:00:00,382 - BERTopic - Representation - Fine-tuning topics using representation models.


BERTopic model fitted.
Transforming documents to get topic assignments and probabilities...
Transformation complete.


2025-04-28 12:01:00,817 - BERTopic - Representation - Completed ✓
2025-04-28 12:01:00,825 - BERTopic - Topic reduction - Reduced number of topics from 26 to 20


Topic 1: ['sustainability', 'ecological', 'cities', 'environmental', 'sustainable', 'settlements', 'heritage', 'land', 'city', 'housing']
Topic 2: ['sustainability', 'urbanism', 'cities', 'sustainable', 'iot', 'data', 'environmental', 'sensors', 'analytics', 'intelligent']
Topic 3: ['transportation', 'vehicles', 'logistics', 'transport', 'commuting', 'walkability', 'vehicle', 'emissions', 'freight', 'traffic']
Topic 4: ['streetscape', 'urbanization', 'ecological', 'gardens', 'ecology', 'landscapes', 'garden', 'ecosystem', 'ecosystems', 'vegetation']
Topic 5: ['sustainability', 'pollution', 'climate', 'emissions', 'environmental', 'eco', 'economic', 'vilnius', 'countries', 'competitiveness']
Topic 6: ['urbanization', 'shenzhen', 'city', 'yangzhou', 'governance', 'tianjin', 'zhuhai', 'towns', 'governments', 'eco']
Topic 7: ['agroecology', 'agriculture', 'agricultural', 'agroecological', 'floriculture', 'farmers', 'horticultural', 'farms', 'cultivation', 'gardens']
Topic 8: ['thermal', 'r