In [1]:
import os.path

import nltk
import json
from tqdm import tqdm
import pandas as pd
from umap import UMAP
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from nltk.data import find
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from fuzzywuzzy import fuzz

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration for the notebook.
# Of these keys, the cells visible in this notebook only read "language" and
# "gold_concept_file"; the rest are presumably consumed by other pipeline stages.
# NOTE(review): "refined_concepts_file" holds the string "True" although its name
# suggests a file path — confirm the intended value.
config = dict(
    run_name="test",
    dataset="test",
    relation_definitions_file="test/relation_types.json",
    input_json_file="",
    input_triple_file="",
    model="gpt-3.5-turbo",
    max_resp_tok=200,
    max_input_char=10000,
    prompt_tpextraction="prompts/prompt_tpextraction.txt",
    prompt_fusion="prompts/prompt_fusion.txt",
    gold_concept_file="",
    refined_concepts_file="True",
    annotated_graph_file="data/prerequisite_of_graph.tsv",
    language="english",
)

In [3]:
# Fill in optional settings so the rest of the notebook can read them
# unconditionally; existing values are left untouched.
config.setdefault('language', "english")
config.setdefault('gold_concept_file', "")

# create BERTopic Extractor
# language dependent part
if config['language'] == "english":
    # Candidate phrases are 2- to 4-word n-grams with English stop words removed.
    vectorizer_model = CountVectorizer(ngram_range=(2, 4),
                                        stop_words="english")
    sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
else:
    print(f"Using language {config['language']}.")
    print("Language not yet supported. Exiting.")
    # Raise SystemExit directly instead of calling exit(0): `exit` is an
    # interactive-shell helper (inside a Jupyter kernel it is IPython's own
    # object, which shuts the kernel down), and an unsupported language is an
    # error, so the status is non-zero. Raising also guarantees that no later
    # cell runs with vectorizer_model / sentence_model undefined.
    raise SystemExit(1)

In [None]:
# Load the corpus: every line of every .txt file in test/raw/ becomes one
# document (lines keep their trailing newline, exactly as read).
# NOTE(review): the directory is hard-coded rather than derived from
# config["dataset"] — confirm this is intended.
texts = []
# sorted(): os.listdir returns entries in arbitrary order; iterating in a fixed
# order keeps the document order identical from run to run.
for file in sorted(os.listdir('test/raw/')):
    if file.endswith('.txt'):
        print(f"Loading file: {file}")
        # Explicit encoding so decoding does not depend on the platform locale
        # (the notebook's JSON output is written as UTF-8 as well).
        with open(f'test/raw/{file}', 'r', encoding='utf-8') as f:
            texts.extend(f)

Loading file: Dsa_clean.txt


In [5]:
# language independent part
# Components shared by every language: c-TF-IDF weighting, the KeyBERT-style
# topic representation, and UMAP with a fixed random_state.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)
representation_model = KeyBERTInspired()
umap_model = UMAP(
    n_neighbors=20,
    n_components=50,
    metric="cosine",
    min_dist=0.0,
    random_state=37,
)

# Assemble the topic model from the language-dependent pieces (sentence_model,
# vectorizer_model) and the language-independent ones defined above.
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    nr_topics=50,
    low_memory=True,
    calculate_probabilities=False,
    verbose=True,
)

# Fit on the loaded corpus; the second return value is discarded.
topics, _ = topic_model.fit_transform(texts)
all_topics = topic_model.get_topics()

2025-09-01 13:04:02,606 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 114/114 [00:08<00:00, 13.81it/s]
2025-09-01 13:04:10,904 - BERTopic - Embedding - Completed ✓
2025-09-01 13:04:10,904 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-01 13:04:31,713 - BERTopic - Dimensionality - Completed ✓
2025-09-01 13:04:31,716 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-01 13:04:31,961 - BERTopic - Cluster - Completed ✓
2025-09-01 13:04:31,961 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-09-01 13:04:32,042 - BERTopic - Representation - Completed ✓
2025-09-01 13:04:32,042 - BERTopic - Topic reduction - Reducing number of topics
2025-09-01 13:04:32,052 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-01 13:04:38,378 - BERTopic - Representation - Completed ✓
2025-09-01 13:04:38,381 - BERTopic - Topic reduction - Re

In [6]:
# Flatten the keywords of every topic into one list of candidate concepts,
# dropping the per-keyword score. Topic -1 is skipped (presumably BERTopic's
# outlier topic — verify against the library docs).
extracted_concepts = [
    word
    for topic_num, keywords in all_topics.items()
    if topic_num != -1
    for word, _score in keywords
]

In [7]:
# remove duplicates (case-insensitively)
# sorted() rather than list(set(...)): the iteration order of a set of strings
# can differ between interpreter runs, which made the IDs written below change
# from run to run. Sorting gives every concept a stable ID.
extracted_concepts = sorted({keyword.lower() for keyword in extracted_concepts})

# exist_ok makes this a no-op when the directory is already there
os.makedirs('output', exist_ok=True)

# write extracted concepts to file, one "<id>|<concept>" record per line with
# IDs starting at 1
# NOTE(review): the file carries a .tsv extension but is pipe-delimited —
# confirm what downstream readers expect before changing either.
with open('output/concepts.tsv', "w", encoding="utf-8") as f:
    # concept_id, not `id`: a top-level loop variable named `id` would shadow
    # the builtin for every later cell.
    for concept_id, concept in enumerate(extracted_concepts, 1):
        f.write(f"{concept_id}|{concept}\n")
print("Concepts written to output/concepts.tsv.")

Concepts written to output/concepts.tsv.


In [None]:
# The lemmatizer needs the WordNet corpus on disk. Check for it and download it
# once if missing, so a fresh environment does not fail with LookupError on the
# first lemmatize() call.
try:
    find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def singularize_concept(concept):
    """Return `concept` with each whitespace-separated word lemmatized as a noun.

    The words are re-joined with single spaces, so runs of whitespace in the
    input are collapsed (e.g. "binary trees" -> "binary tree").
    """
    return ' '.join(
        lemmatizer.lemmatize(word, wordnet.NOUN) for word in concept.split()
    )

# singularize concepts
# Note: the result is stored in `extracted_concept` (singular); the original
# `extracted_concepts` list is left as it was.
extracted_concept = list(map(singularize_concept, extracted_concepts))

# One row per singularized concept; every label starts out as 0.
df_concepts = pd.DataFrame(extracted_concept, columns=["concept"]).assign(label=0)

In [None]:
# Singularizing can map distinct concepts onto the same string; keep only the
# first row for each concept (keep="first" is the pandas default).
df_concepts = df_concepts.drop_duplicates(subset=["concept"])

# reduce the text dataset to only texts containing the concepts
def filter_abstracts_by_term(term, abstracts, threshold=70):
    """Return the entries of `abstracts` that fuzzily contain `term`.

    Args:
        term: Concept string to look for; compared case-insensitively.
        abstracts: Iterable of candidate texts. Non-string entries are skipped.
        threshold: Minimum `fuzz.partial_ratio` score for a text to be kept
            (inclusive). Defaults to 70.

    Returns:
        A new list with the matching texts, in their original order and
        unmodified (only the comparison is lower-cased).
    """
    # Lower-case the term once instead of once per abstract.
    needle = term.lower()
    return [
        abstract
        for abstract in abstracts
        if isinstance(abstract, str)
        and fuzz.partial_ratio(needle, abstract.lower()) >= threshold
    ]

# Map each concept to the texts that fuzzily mention it, together with the
# concept's label. The progress bar advances once per concept.
concept_abstracts = {}
concept_rows = tqdm(
    df_concepts.iterrows(),
    desc="Processing concepts",
    total=len(df_concepts),
)
for _idx, concept_row in concept_rows:
    term = concept_row["concept"]
    concept_abstracts[term] = {
        "abstracts": filter_abstracts_by_term(term, texts),
        "label": concept_row["label"],
    }

Processing concepts: 100%|██████████| 450/450 [03:09<00:00,  2.38it/s]


In [10]:
# Persist the concept -> {"abstracts", "label"} mapping as indented UTF-8 JSON;
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open('output/concept_text.json', mode='w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(concept_abstracts, ensure_ascii=False, indent=4))