### Load the Dataset

In [1]:
%pip install kagglehub pandas

import kagglehub
import pandas as pd
import os

# Download the dataset. The returned path is treated as a directory below.
dataset_path = kagglehub.dataset_download("sumitm004/arxiv-scientific-research-papers-dataset")

# The CSV's filename is not known up front, so look for it among the downloaded
# files. os.listdir returns entries in arbitrary order; sorting makes the choice
# deterministic should the dataset ever contain more than one CSV.
files = sorted(os.listdir(dataset_path))
csv_file = next((name for name in files if name.endswith('.csv')), None)

# Raising a plain string is itself a TypeError in Python 3, which would hide the
# real problem, so raise a proper exception carrying the message instead.
if csv_file is None:
    raise FileNotFoundError("No CSV file found in the downloaded dataset.")

file_path = os.path.join(dataset_path, csv_file)
# Read the dataset into a pandas DataFrame
df = pd.read_csv(file_path)

# Display the first 5 rows
print("First 5 rows of the DataFrame:")
display(df.head())

# Display information about the DataFrame
print("\nInformation about the DataFrame:")
df.info()

Note: you may need to restart the kernel to use updated packages.
First 5 rows of the DataFrame:


Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count
0,cs-9308101v1,Dynamic Backtracking,Artificial Intelligence,cs.AI,8/1/93,8/1/93,['M. L. Ginsberg'],'M. L. Ginsberg',Because of their occasional need to return to ...,79
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Artificial Intelligence,cs.AI,8/1/93,8/1/93,['M. P. Wellman'],'M. P. Wellman',Market price systems constitute a well-underst...,119
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,Artificial Intelligence,cs.AI,9/1/93,9/1/93,"['I. P. Gent', 'T. Walsh']",'I. P. Gent',We describe an extensive study of search in GS...,167
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,Artificial Intelligence,cs.AI,11/1/93,11/1/93,"['F. Bergadano', 'D. Gunetti', 'U. Trinchero']",'F. Bergadano',As real logic programmers normally use cut (!)...,174
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,Artificial Intelligence,cs.AI,11/1/93,11/1/93,"['J. C. Schlimmer', 'L. A. Hermens']",'J. C. Schlimmer',To support the goal of allowing users to recor...,187



Information about the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136238 entries, 0 to 136237
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  136238 non-null  object
 1   title               136238 non-null  object
 2   category            136238 non-null  object
 3   category_code       136238 non-null  object
 4   published_date      136238 non-null  object
 5   updated_date        136238 non-null  object
 6   authors             136238 non-null  object
 7   first_author        136238 non-null  object
 8   summary             136238 non-null  object
 9   summary_word_count  136238 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 10.4+ MB


In [2]:
def parse_date(string: str):
    match list(map(int, string.split("/"))):
        case [month, day, year] if year >= 50:
            return (month, day, 1900 + year)
        case [month, day, year] if year <= 50:
            return (month, day, 2000 + year)
        case v:
            print(f"Unknown format: {v}")
            return None

In [3]:
import pandas as pd
from collections import defaultdict

# Parse every publication date exactly once. parse_date returns None for values
# it cannot interpret, so such rows get a year of None here instead of breaking
# a tuple unpacking.
parsed_dates = [parse_date(raw_date) for raw_date in df["published_date"]]
publication_years = [parsed[2] if parsed else None for parsed in parsed_dates]

# Get all unique years from the dataset
all_years = {year for year in publication_years if isinstance(year, int)}

if not all_years:
    print("No valid years found in the dataset.")
    grouped_documents = {}
else:
    # Inclusive (first_year, last_year) bounds of each bucket: everything before
    # 2000 is lumped together, later years are split into roughly 5-year windows.
    buckets = [
        (1900, 1999),
        (2000, 2004),
        (2005, 2009),
        (2010, 2014),
        (2015, 2019),
        (2020, 2025),
    ]
    # Not used in this cell; kept because it is a notebook-level name.
    latest_year = max(all_years)

    # Collect row positions per bucket rather than materialising one Series per
    # row; the rows are then sliced out of df in a single step per bucket.
    # Bucket names appear in order of first occurrence in df.
    grouped_positions = defaultdict(list)
    for position, year in enumerate(publication_years):
        if year is None:
            continue
        for first_year, last_year in buckets:
            if first_year <= year <= last_year:
                grouped_positions[f"{first_year}-{last_year}"].append(position)
                break  # buckets do not overlap, so the first hit is the only one

    # One DataFrame per bucket, keeping the original row index and column order
    grouped_documents = {
        group_name: df.iloc[positions]
        for group_name, positions in grouped_positions.items()
    }

    # Print the head of the first group to verify
    if grouped_documents:
        first_group_name = sorted(grouped_documents.keys(), reverse=True)[0]
        print(f"Documents from group: {first_group_name}")
        display(grouped_documents[first_group_name].head())
    else:
        print("No documents were grouped.")

Documents from group: 2020-2025


Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count
3464,abs-2002.00429v2,Uncertainty Weighted Causal Graphs,Artificial Intelligence,cs.AI,2/2/20,2/6/20,"['Eduardo C. Garrido-Merchán', 'C. Puente', 'A...",'Eduardo C. Garrido-Merchán',Causality has traditionally been a scientific ...,111
3465,abs-2002.00434v2,Integrating Deep Reinforcement Learning with M...,Artificial Intelligence,cs.AI,2/2/20,5/19/20,"['Ekim Yurtsever', 'Linda Capito', 'Keith Redm...",'Ekim Yurtsever',Automated driving in urban settings is challen...,181
3466,abs-2002.00509v2,A Machine Consciousness architecture based on ...,Artificial Intelligence,cs.AI,2/2/20,3/14/20,"['Eduardo C. Garrido Merchán', 'Martín Molina']",'Eduardo C. Garrido Merchán',Recent developments in machine learning have p...,197
3467,abs-2002.01080v4,Bridging the Gap: Providing Post-Hoc Symbolic ...,Artificial Intelligence,cs.AI,2/4/20,3/19/22,"['Sarath Sreedharan', 'Utkarsh Soni', 'Mudit V...",'Sarath Sreedharan',As increasingly complex AI systems are introdu...,148
3468,abs-2002.01088v1,Neuro-evolutionary Frameworks for Generalized ...,Artificial Intelligence,cs.AI,2/4/20,2/4/20,['Thommen George Karimpanal'],'Thommen George Karimpanal',The recent successes of deep learning and deep...,144


In [4]:
%pip install nltk

import nltk

# Download necessary NLTK data
# Tokenizer models; preprocess_text calls nltk.word_tokenize.
nltk.download('punkt')
# WordNet corpus; preprocess_text lemmatizes with WordNetLemmatizer.
nltk.download('wordnet')
# Stop-word lists; preprocess_text filters with stopwords.words('english').
nltk.download('stopwords')
# POS tagger model. NOTE(review): no tagging call is visible in this notebook -
# confirm this download is still needed.
nltk.download('averaged_perceptron_tagger')
# Presumably the punkt data format expected by newer NLTK releases - TODO confirm.
nltk.download('punkt_tab')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\michael\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\michael\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\michael\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\michael\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\michael\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def _get_preprocessing_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use only."""
    resources = getattr(_get_preprocessing_resources, "_resources", None)
    if resources is None:
        resources = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        # Memoised on the function object so repeated calls (one per document)
        # do not reload the stop-word list or re-create the lemmatizer.
        _get_preprocessing_resources._resources = resources
    return resources


def preprocess_text(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    Steps: lowercase, delete every character that is not ``a``-``z`` or
    whitespace (so digits and punctuation are removed, not replaced by a
    space), tokenize with NLTK, drop English stop words, lemmatize.

    Args:
        text: The text to process.

    Returns:
        List of token strings; an empty list when ``text`` is not a string
        (for example a missing value).
    """
    if not isinstance(text, str):
        return []

    stop_words, lemmatizer = _get_preprocessing_resources()

    # Keep only lowercase letters and whitespace, then tokenize
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    tokens = nltk.word_tokenize(letters_only)

    # Remove stop words, then lemmatize what is left
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

In [6]:
%pip install hyperopt

from collections import Counter
import pickle
from gensim.models import LdaMulticore, CoherenceModel
from hyperopt import fmin, tpe, hp, Trials, space_eval, STATUS_OK
from hyperopt.early_stop import no_progress_loss

class Box[T]:
    def __init__(self, data: T):
        self.data = data

# -------------------------
# Hyperopt search space
# -------------------------
# Joint search space that hyperopt samples from for every LDA trial.
search_space = {
    # Number of topics: one of the integers 4..10 (the range end is exclusive).
    # For hp.choice, fmin reports the *index* of the chosen option, which is why
    # the result is mapped back through space_eval before it is used.
    'num_topics': hp.choice('num_topics', list(range(4, 11))),
    # Sampled uniformly between 0.01 and 0.99; passed to the LDA model as `alpha`.
    'alpha': hp.uniform('alpha', 0.01, 0.99),
    # Sampled uniformly between 0.01 and 0.99; passed to the LDA model as `eta`.
    'beta': hp.uniform('beta', 0.01, 0.99),
}

def objective(corpus, dictionary, texts, params, group_name, counter: Box[int]):
    """Train and score one LDA candidate; used as the hyperopt objective.

    Args:
        corpus: Bag-of-words corpus the model is trained on.
        dictionary: gensim dictionary passed as ``id2word`` and to the
            coherence model.
        texts: Tokenized documents used for the ``c_v`` coherence score.
        params: Sampled hyperparameters with keys ``'num_topics'``, ``'alpha'``
            and ``'beta'``.
        group_name: Label of the document group; names the output folder.
        counter: Shared counter giving the next model file number. It is
            incremented only after a trial finished and was saved.

    Returns:
        ``{'loss': -coherence, 'status': STATUS_OK}`` on success, or
        ``{'loss': 1e6, 'status': 'fail'}`` if anything in the trial raised.

    Side effects:
        Pickles the trained model to
        ``lda_models/lda_<group_name>/<counter>.pkl``.
    """
    try:
        num_topics = int(params['num_topics'])
        alpha = float(params['alpha'])
        beta = float(params['beta'])

        # Fewer training passes for corpora of 10,000 documents or more.
        passes = 10 if len(corpus) >= 10_000 else 15

        lda_model = LdaMulticore(corpus=corpus,
                                 id2word=dictionary,
                                 num_topics=num_topics,
                                 random_state=100,
                                 chunksize=100,
                                 passes=passes,
                                 alpha=alpha,
                                 eta=beta)

        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=texts,
                                             dictionary=dictionary,
                                             coherence='c_v')

        coherence = coherence_model_lda.get_coherence()

        # Make sure the per-group folder exists. exist_ok avoids the
        # check-then-create race of a separate os.path.exists() test.
        model_dir = f'lda_models/lda_{group_name}'
        os.makedirs(model_dir, exist_ok=True)

        # Save this trial's model under the next free number
        model_path = f'{model_dir}/{counter.data}.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(lda_model, f)

        counter.data += 1
        # fmin minimizes, so return negative coherence to maximize it
        return {'loss': -coherence, 'status': STATUS_OK}
    except Exception as e:
        # Best effort: one broken trial must not abort the whole search, but say
        # which group failed and with what error type.
        print(f"LDA trial failed for group {group_name!r}: {e!r}")
        # on failure return a large loss
        return {'loss': 1e6, 'status': 'fail'}


def create_optimized_model(corpus, dictionary, texts, group_name, max_evals=20):
    """Search LDA hyperparameters with hyperopt, then train a final model.

    Runs a TPE search over ``search_space`` for at most ``max_evals`` trials,
    with ``no_progress_loss(5)`` as the early-stop rule, scoring each candidate
    through ``objective``. The best settings are then used to train a fresh
    model with 20 passes.

    Args:
        corpus: Bag-of-words corpus.
        dictionary: gensim dictionary matching ``corpus``.
        texts: Tokenized documents, forwarded to ``objective``.
        group_name: Label of the document group, forwarded to ``objective``.
        max_evals: Upper bound on the number of trials.

    Returns:
        ``(model, num_topics, alpha, beta)`` for the final model.
    """
    trial_counter = Box(0)

    def evaluate(candidate):
        # Bind the fixed inputs so hyperopt only has to supply the sample.
        return objective(corpus, dictionary, texts, candidate, group_name, trial_counter)

    search_history = Trials()
    best_point = fmin(
        fn=evaluate,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=search_history,
        early_stop_fn=no_progress_loss(5),
    )

    # Translate fmin's result back into concrete parameter values.
    winner = space_eval(search_space, best_point)
    num_topics = int(winner['num_topics'])
    alpha = float(winner['alpha'])
    beta = float(winner['beta'])

    # Retrain from scratch with the winning settings.
    final_settings = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=100,
        chunksize=100,
        passes=20,
        alpha=alpha,
        eta=beta,
    )
    best_lda = LdaMulticore(**final_settings)

    return best_lda, num_topics, alpha, beta

Note: you may need to restart the kernel to use updated packages.


  import pkg_resources


In [7]:
%pip install gensim

from pandas import DataFrame
from gensim.corpora import Dictionary

def process_lda(dataframe: DataFrame, group_name: str):
    """Fit a tuned LDA model on the ``summary`` column of one document group.

    Args:
        dataframe: Documents of the group; must have a ``summary`` column.
            The input frame is not modified.
        group_name: Label of the group, forwarded to the model search.

    Returns:
        ``(lda_model, num_topics)`` for the tuned model.
    """
    # Work on a new frame that carries the tokenized summaries
    prepared = dataframe.assign(
        processed_summary=dataframe['summary'].apply(preprocess_text)
    )
    token_lists = prepared['processed_summary']

    # Vocabulary, then the bag-of-words form of every document
    dictionary = Dictionary(token_lists)
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]

    # Hyperparameter search plus final training; alpha and beta are not needed here
    lda_model, num_topics, _alpha, _beta = create_optimized_model(
        corpus, dictionary, token_lists, group_name
    )

    # Show the top words of each learned topic
    print("LDA Topics:")
    display(lda_model.print_topics(num_words=5))

    return lda_model, num_topics

Note: you may need to restart the kernel to use updated packages.


In [8]:
# For each group of years, we want to run the LDA for their own.
from gensim.models import LdaModel
import os
import pickle

# Create the directory that will hold the saved models. exist_ok makes the cell
# safe to re-run and removes the separate existence check.
os.makedirs('lda_models', exist_ok=True)

# Maps each year-group label to the (model, num_topics) pair from process_lda.
ldas: dict[str, tuple[LdaModel, int]] = {}
display(grouped_documents.keys())

dict_keys(['1900-1999', '2000-2004', '2005-2009', '2010-2014', '2015-2019', '2020-2025'])

In [None]:
for year_group, dataframe in grouped_documents.items():
    print(f"Processing group: {year_group}")
    lda_result = process_lda(dataframe, year_group)  # (model, num_topics)
    ldas[year_group] = lda_result
    print(f"Finished processing for group: {year_group}")

    # Checkpoint this group as soon as it is done, so a crash in a later group
    # does not lose it. The per-group file holds only this group's result; the
    # complete dictionary is written once after the loop.
    model_path = f'lda_models/lda_{year_group}.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(lda_result, f)

# Save the entire dictionary of models
model_path = 'lda_models/all_lda_models.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(ldas, f)

print(f"All models saved to {model_path}")

Processing group: 1900-1999
 25%|██▌       | 5/20 [03:00<09:01, 36.08s/trial, best loss: -0.29849822223691985]
LDA Topics:


[(0,
  '0.017*"model" + 0.013*"language" + 0.008*"method" + 0.008*"learning" + 0.007*"problem"'),
 (1,
  '0.014*"word" + 0.011*"text" + 0.011*"method" + 0.009*"paper" + 0.009*"system"'),
 (2,
  '0.005*"system" + 0.005*"grammar" + 0.004*"language" + 0.004*"paper" + 0.004*"temporal"'),
 (3,
  '0.014*"language" + 0.012*"system" + 0.009*"speech" + 0.008*"processing" + 0.008*"paper"'),
 (4,
  '0.007*"algorithm" + 0.006*"information" + 0.006*"grammar" + 0.004*"used" + 0.004*"problem"'),
 (5,
  '0.012*"discourse" + 0.006*"paper" + 0.005*"structure" + 0.005*"interpretation" + 0.005*"problem"'),
 (6,
  '0.014*"grammar" + 0.010*"constraint" + 0.008*"approach" + 0.008*"theory" + 0.008*"paper"')]

Finished processing for group: 1900-1999
Processing group: 2000-2004
 10%|█         | 2/20 [01:20<12:07, 40.40s/trial, best loss: -0.3197621681521153] 