In [48]:
import nltk
import json
from tqdm import tqdm
import pandas as pd
from umap import UMAP
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [36]:
# Fetch the NLTK data packages "wordnet" and "omw-1.4"
# (presumably required by the WordNetLemmatizer used further down — verify).
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/moritzblum/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/moritzblum/nltk_data...


True

In [4]:
# Load the per-year ACL CSV exports (2017-2023) and stack them into one frame
# with a fresh, continuous index.
acl_files = [f"data/nlp/raw/{year}_ACL.csv" for year in range(2017, 2024)]

yearly_frames = []
for csv_path in acl_files:
    yearly_frames.append(pd.read_csv(csv_path))

acl_data = pd.concat(yearly_frames, ignore_index=True)

In [5]:
# Preview the first rows of the combined ACL dataframe.
acl_data.head()

Unnamed: 0,link,title,abstract
0,https://aclanthology.org/P17-1001/,Adversarial Multi-task Learning for Text Class...,Neural network models have shown their promisi...
1,https://aclanthology.org/P17-1002/,Neural End-to-End Learning for Computational A...,We investigate neural techniques for end-to-en...
2,https://aclanthology.org/P17-1003/,Neural Symbolic Machines: Learning Semantic Pa...,Harnessing the statistical power of neural net...
3,https://aclanthology.org/P17-1004/,Neural Relation Extraction with Multi-lingual ...,Relation extraction has been widely used for f...
4,https://aclanthology.org/P17-1005/,Learning Structured Natural Language Represent...,We introduce a neural semantic parser which is...


In [6]:
# Abstract texts used for topic modelling and concept matching.
# Missing abstracts are replaced with "" before casting: a bare astype("str")
# would turn NaN into the literal text "nan", which would then be treated as a
# real abstract. Filling (rather than dropping) keeps the rows aligned with
# acl_data.
texts = acl_data["abstract"].fillna("").astype("str")
texts.shape

(4605,)

In [9]:
# Configure the individual components handed to BERTopic.
umap_model = UMAP(
    n_neighbors=20,
    n_components=50,
    metric="cosine",
    min_dist=0.0,
    random_state=37,
)
vectorizer_model = CountVectorizer(ngram_range=(2, 4), stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)
sentence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
representation_model = KeyBERTInspired()

# Assemble the topic extractor from the components above.
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    nr_topics=50,
    low_memory=True,
    calculate_probabilities=False,
    verbose=True,
)

In [10]:
# Fit the topic model on all abstracts; only the first returned value (topics)
# is kept, the second one is discarded.
topics, _ = topic_model.fit_transform(texts)

2024-09-09 09:55:00,770 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 144/144 [01:15<00:00,  1.91it/s]
2024-09-09 09:56:16,186 - BERTopic - Embedding - Completed ✓
2024-09-09 09:56:16,188 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-09-09 09:56:31,763 - BERTopic - Dimensionality - Completed ✓
2024-09-09 09:56:31,764 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-09 09:56:31,973 - BERTopic - Cluster - Completed ✓
2024-09-09 09:56:31,973 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-09 09:56:58,538 - BERTopic - Representation - Completed ✓
2024-09-09 09:56:58,563 - BERTopic - Topic reduction - Reducing number of topics
2024-09-09 09:57:11,105 - BERTopic - Topic reduction - Reduced number of topics from 93 to 50


In [11]:
# Mapping of topic number -> sequence of (word, value) pairs, as consumed by
# the keyword-collection loop that follows.
all_topics = topic_model.get_topics()

In [12]:
# Flatten the keywords of every topic into one list; topic -1 is skipped.
concepts = [
    word
    for topic_num, keywords in all_topics.items()
    if topic_num != -1
    for word, _value in keywords
]

In [15]:
# Remove duplicates case-insensitively. The result is sorted because the
# iteration order of a set of strings differs between interpreter runs, which
# would otherwise make the concept order (and the ids assigned when the list
# is written to disk) non-reproducible.
concepts = sorted({keyword.lower() for keyword in concepts})

In [16]:
# Number of unique (lowercased) concepts.
len(concepts)

476

In [26]:
# Persist the concepts as "<id>|<concept>" lines, ids starting at 1.
# - The encoding is pinned so the file is written the same way on every
#   platform (open() otherwise falls back to the locale's default encoding).
# - The loop variable is not called `id`, which would shadow the builtin for
#   every later cell of the notebook.
with open("data/nlp/extracted_concepts.tsv", "w", encoding="utf-8") as f:
    for concept_id, concept in enumerate(concepts, 1):
        f.write(f"{concept_id}|{concept}\n")

In [28]:
# Read the "|"-separated concept files and keep column 1 as a plain list.
# For the extracted file, column 0 is the id and column 1 the concept text;
# the gold file is assumed to share that layout (TODO confirm).
#
# dtype=str and keep_default_na=False keep every concept a string: with the
# pandas defaults, values such as "null", "nan" or "NA" are parsed as float NaN
# and digit-only values as numbers, which would break the string processing
# applied to these lists afterwards.
extracted_concepts = pd.read_csv(
    "data/nlp/extracted_concepts.tsv",
    delimiter="|",
    header=None,
    dtype=str,
    keep_default_na=False,
)[1].tolist()

gold_concepts = pd.read_csv(
    "data/nlp/raw/gold_concepts.tsv",
    delimiter="|",
    header=None,
    dtype=str,
    keep_default_na=False,
)[1].tolist()

In [30]:
# Number of extracted concepts read back from disk.
len(extracted_concepts)

476

In [34]:
# Lemmatizer instance used by singularize_concept.
lemmatizer = WordNetLemmatizer()

def singularize_concept(concept):
    """Return `concept` with every whitespace-separated word lemmatized as a noun.

    The words are re-joined with single spaces, so leading/trailing and
    repeated whitespace in the input is collapsed.
    """
    return " ".join(
        lemmatizer.lemmatize(token, wordnet.NOUN) for token in concept.split()
    )

In [38]:
# Normalise both concept lists: lowercase first, then singularize.
# The order matters: the WordNet lemmatizer looks words up case-sensitively,
# so a capitalised plural such as "Networks" is returned unchanged, whereas
# "networks" is reduced to "network". Lowercasing afterwards (as before) left
# such gold concepts in plural form, so they never matched their extracted
# counterparts during de-duplication.
gold_concept = [singularize_concept(concept.lower()) for concept in gold_concepts]
extracted_concept = [singularize_concept(concept.lower()) for concept in extracted_concepts]

In [39]:
# One frame per source; the label column indicates where a concept came from
# (0 = extracted, 1 = gold).
df_old = pd.DataFrame(extracted_concept, columns=["concept"]).assign(label=0)
df_new = pd.DataFrame(gold_concept, columns=["concept"]).assign(label=1)

# Stack both sources, order by label and keep the first row per concept, so a
# concept present in both sources keeps label 0.
df = (
    pd.concat([df_old, df_new])
    .sort_values(by="label")
    .drop_duplicates(subset="concept", keep="first")
)

In [40]:
# (rows, columns) of the de-duplicated concept frame.
df.shape

(945, 2)

In [44]:
# reduce the text dataset to only texts containing the concepts

def filter_abstracts_by_term(term, abstracts, threshold=70):
    """Return the abstracts that fuzzily contain `term`.

    Args:
        term: Concept string to look for; compared case-insensitively.
        abstracts: Iterable of abstracts. Entries that are not `str` are skipped.
        threshold: Minimum `fuzz.partial_ratio` score (inclusive) for an
            abstract to be kept.

    Returns:
        A list of the matching abstracts, unmodified and in their input order.
    """
    # The lowercased term is the same for every abstract, so compute it once
    # instead of on every iteration.
    needle = term.lower()
    return [
        abstract
        for abstract in abstracts
        if isinstance(abstract, str)
        and fuzz.partial_ratio(needle, abstract.lower()) >= threshold
    ]

# For every concept, collect the abstracts that fuzzily mention it and keep
# the concept's source label alongside.
concept_abstracts = {}
concept_rows = tqdm(df.iterrows(), desc="Processing concepts", total=len(df))
for _row_index, concept_row in concept_rows:
    concept_name = concept_row["concept"]
    concept_abstracts[concept_name] = {
        "abstracts": filter_abstracts_by_term(concept_name, texts),
        "label": concept_row["label"],
    }

Processing concepts: 100%|██████████| 945/945 [09:07<00:00,  1.72it/s]


In [45]:
# Count the concepts whose source label is 0.
label_0_entries = [entry for entry in concept_abstracts.values() if entry['label'] == 0]
label_0_count = len(label_0_entries)
print(f"Number of concepts added through BERTopic: {label_0_count}")

Number of concepts added through BERTopic: 459


In [46]:
# Count the concepts for which no abstract passed the fuzzy filter.
unmatched_entries = [entry for entry in concept_abstracts.values() if not entry['abstracts']]
empty_abstracts_count = len(unmatched_entries)
print(f"Number of concepts with empty filtered_abstracts: {empty_abstracts_count}")

Number of concepts with empty filtered_abstracts: 67


In [49]:
# Write the concept -> {"abstracts", "label"} mapping as indented UTF-8 JSON,
# leaving non-ASCII characters unescaped.
output_file_path = "data/nlp/concept_abstracts.json"
with open(output_file_path, mode="w", encoding="utf-8") as json_file:
    json.dump(concept_abstracts, json_file, indent=4, ensure_ascii=False)