In [None]:
import pandas as pd
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from pathlib import Path
import pyLDAvis
from pyLDAvis.gensim import prepare

In [3]:
# Download the NLTK resources this notebook relies on: the "punkt_tab"
# tokenizer models (used by word_tokenize) and the "stopwords" corpus
# (used by stopwords.words). Without the latter, preprocessing fails on a
# machine where the corpus has never been fetched.
nltk.download('punkt_tab')
nltk.download('stopwords')

data_dir = Path("data")
# Load data
old_df = pd.read_csv(data_dir / "kaggle_arxiv_dataset" / "dataset.csv")
# new_df = pd.read_csv(data_dir / "arxiv_dataset" / "")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\robot\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# Preprocess

# Combine title and abstract
def combine_text(df):
    """Join each row's title and abstract into a single string.

    Args:
        df: DataFrame with "title" and "abstract" columns.

    Returns:
        A list with one string per row. When both fields are present they
        are joined with a single space. When only one is present that field
        is returned on its own (previously the whole row collapsed to ""
        because NaN + str is NaN). When both are missing the result is "".
    """
    title = df["title"]
    abstract = df["abstract"]
    # Only insert the separator when there is something on both sides, so a
    # row with a missing field does not gain a dangling space.
    both_present = title.notna() & abstract.notna()
    separator = both_present.map({True: " ", False: ""})
    return (title.fillna("") + separator + abstract.fillna("")).tolist()

# Preprocessing
def preprocess(texts) -> list:
    """Tokenize every text and strip it down to meaningful words.

    Each text is lower-cased and tokenized with ``word_tokenize``; tokens
    are kept only if they are purely alphabetic and are not English
    stopwords.

    Args:
        texts: Iterable of strings.

    Returns:
        A list of token lists, one per input text, in input order.
    """
    # Build the stopword set once; set membership keeps the filter cheap.
    english_stops = set(stopwords.words("english"))

    def _clean(document):
        # Tokenize the lower-cased text, then drop non-alphabetic tokens
        # and stopwords.
        return [
            token
            for token in word_tokenize(document.lower())
            if token.isalpha() and token not in english_stops
        ]

    return [_clean(document) for document in texts]

# Token lists for the old dataset: title + abstract combined per row, then
# tokenized and filtered by preprocess().
old_texts = preprocess(combine_text(old_df))
# new_texts = preprocess(combine_text(new_df))

In [10]:
# Find topics

# Train LDA model and extract topics
def run_lda(preprocessed_texts, num_topics=20, *, passes=5, random_state=0) -> tuple[models.LdaModel, list, corpora.Dictionary]:
    """Train an LDA model on tokenized documents.

    Args:
        preprocessed_texts: List of token lists, one per document.
        num_topics: Number of topics passed to the LDA model.
        passes: Value forwarded as ``passes`` to ``models.LdaModel``
            (keyword-only; defaults to the previously hard-coded 5).
        random_state: Value forwarded as ``random_state`` to
            ``models.LdaModel`` (keyword-only; defaults to the previously
            hard-coded 0).

    Returns:
        A ``(lda_model, corpus, dictionary)`` tuple: the trained model, the
        bag-of-words corpus it was trained on, and the dictionary built from
        ``preprocessed_texts``.
    """
    # Build the dictionary from the token lists, then encode every document
    # as a bag-of-words using that dictionary.
    dictionary = corpora.Dictionary(preprocessed_texts)
    corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    return lda_model, corpus, dictionary

# Number of topics requested from the LDA model.
# NOTE(review): 1000 is far above run_lda's default of 20 — confirm this is intended.
topics = 1000
# Train on the old dataset; keep the corpus and dictionary for the later cells.
old_lda, old_corpus, old_dict = run_lda(old_texts, num_topics=topics)
# new_lda, new_corpus, new_dict = run_lda(new_texts, num_topics=topics)

In [None]:
# Get topic overlap

# Measure topic diversity
def topic_distribution(lda_model, corpus: list) -> int:
    """Count the distinct topics that are the dominant topic of some document.

    For every document, ``lda_model[doc]`` is expected to yield
    ``(topic_id, weight)`` pairs; the pair with the highest weight is that
    document's dominant topic.

    Args:
        lda_model: Object indexable by a document, returning an iterable of
            ``(topic_id, weight)`` pairs.
        corpus: List of documents in the form ``lda_model`` accepts.

    Returns:
        The number of unique dominant topic ids across ``corpus``.
        Documents for which the model returns no topics are skipped.
    """
    dominant_topics = set()
    for doc in corpus:
        # The model can return an empty list for a document (presumably when
        # every topic weight falls below its reporting threshold, e.g. for an
        # empty bag-of-words). A bare max() would raise ValueError there, so
        # use a default and skip the document.
        best = max(lda_model[doc], key=lambda pair: pair[1], default=None)
        if best is not None:
            dominant_topics.add(best[0])
    return len(dominant_topics)

# Number of distinct dominant topics found across the old corpus.
old_topic_count = topic_distribution(old_lda, old_corpus)
# new_topic_count = topic_distribution(new_lda, new_corpus)

In [11]:
# Visualize
# Put pyLDAvis into notebook mode before building the visualisation.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the trained model, its bag-of-words corpus
# and its dictionary.
# NOTE(review): mds="mmds" and R=30 are passed straight to pyLDAvis
# (presumably the 2-D layout method and the number of terms shown per topic)
# — confirm against the pyLDAvis documentation.
vis = prepare(old_lda, old_corpus, old_dict, mds="mmds", R=30)
# Last expression of the cell, so the notebook displays it.
vis

