In [11]:
import polars as pl

df = pl.read_parquet('../data/raw/arxiv-abstracts.parquet', columns=['id', 'authors', 'title', 'abstract'])
df = df.sample(n=50_000, with_replacement=False, seed=124)
# df = pl.read_parquet('../data/raw/arxiv-abstracts.parquet', columns=['id', 'authors', 'title', 'abstract'])


In [12]:
df

id,authors,title,abstract
str,str,str,str
"""physics/0107021""","""Adrien Schoof, Jan Gruenert, S…","""Reducing the linewidth of a di…",""" An extended cavity diode las…"
"""1606.02306""","""Shahar Hod""","""The superradiant instability r…",""" Spinning Kerr black holes ar…"
"""hep-ph/9812388""","""W.M. Alberico, M.B. Barbaro, S…","""Strange form factors of the pr…",""" We consider ratios of elasti…"
"""astro-ph/0208171""","""M. Boettcher (Rice Univ. / Ohi…","""Predictions of the High-Energy…",""" Spectral fitting of the radi…"
"""0912.2788""","""Xiaodong Liu, Bo Zhang""","""Inverse scattering by an inhom…",""" This paper is concerned with…"
…,…,…,…
"""1809.09358""","""Alexander Bednyakov and Veroni…","""FCNC decays of the Higgs boson…",""" We consider flavor-changing …"
"""1906.02111""","""Yuyu Zhang, Xinshi Chen, Yuan …","""Can Graph Neural Networks Help…",""" Effectively combining logic …"
"""cs/0504047""","""David Doty, Jared Nichols""","""Pushdown dimension""",""" This paper develops the theo…"
"""2009.12889""","""P. Holicky, M. Zeleny""","""There is no bound on Borel cla…",""" We show that for every ordin…"


In [13]:
import re
import string
import random
import nltk
import spacy
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim import corpora, models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
# Uncomment the following lines if you wish to use SHAP (requires a graphical environment)
# import shap
# shap.initjs()

# Download necessary NLTK data files (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
ps = PorterStemmer()

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /home/dzakirm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dzakirm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dzakirm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Basic Preprocessing

In [14]:
import re
import polars as pl
import nltk
import spacy
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.models.phrases import Phrases, Phraser

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

nlp = spacy.load("en_core_web_sm")
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_arxiv_abstract(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove display math (DOTALL to catch newlines)
    text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL)
    # Remove inline math expressions
    text = re.sub(r'\$.*?\$', '', text)
    # Remove LaTeX commands (e.g., \emph{...}, \textbf{...})
    text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?(?:\{[^}]*\})?', '', text)
    # Remove citation markers like [1] or [1,2,3]
    text = re.sub(r'\[[^\]]*\]', '', text)
    # Collapse extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text: str) -> list:
    """Tokenize a string into words."""
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

def remove_stopwords(tokens: list) -> list:
    """Remove stopwords from a list of tokens."""
    return [token for token in tokens if token not in stop_words]

def lemmatize_tokens(tokens: list) -> list:
    """Lemmatize tokens using spaCy."""
    doc = nlp(" ".join(tokens))
    return [token.lemma_ for token in doc]

def stem_tokens(tokens: list) -> list:
    """Stem tokens using NLTK's PorterStemmer."""
    return [ps.stem(token) for token in tokens]

def named_entity_recognition(text: str) -> list:
    """Extract named entities from text using spaCy."""
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def pos_tagging(text: str) -> list:
    """Extract POS tags from text using spaCy."""
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    return [(token.text, token.pos_) for token in doc]

def build_phraser(token_lists: list) -> Phraser:
    """Build a bigram phraser using Gensim's Phrases."""
    phrases = Phrases(token_lists, min_count=1, threshold=2)
    return Phraser(phrases)

def apply_phrases(tokens: list, phraser: Phraser) -> list:
    """Apply a pre-built phraser to a list of tokens."""
    return phraser[tokens]

[nltk_data] Downloading package punkt to /home/dzakirm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dzakirm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dzakirm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Cleaning

### Title

In [15]:
df = df.with_columns([
    pl.col("title").map_elements(clean_arxiv_abstract).alias("title_clean"),
    pl.col("title").map_elements(lambda x: tokenize_text(clean_arxiv_abstract(x))).alias("title_tokens")
])

df.write_parquet("../data/interim/arxiv-abstracts-cleaned.parquet")




In [None]:
df = df.with_columns([
    pl.col("title_tokens").map_elements(remove_stopwords).alias("title_tokens_no_stop"),
])

df = df.with_columns([
    pl.col("title_tokens").map_elements(remove_stopwords).alias("title_tokens_no_stop"),
    pl.col("title_tokens_no_stop").map_elements(lemmatize_tokens).alias("title_lemmatized"),
    pl.col("title_tokens_no_stop").map_elements(stem_tokens).alias("title_stemmed"),
    pl.col("title_clean").map_elements(named_entity_recognition).alias("title_entities"),
    pl.col("title_clean").map_elements(pos_tagging).alias("title_pos_tags")
])

df.write_parquet("../data/interim/arxiv-abstracts-cleaned.parquet")


  df = df.with_columns([


### Abstract

In [7]:
df = df.with_columns([
    pl.col("abstract").map_elements(clean_arxiv_abstract).alias("abstract_clean"),
    pl.col("abstract").map_elements(lambda x: tokenize_text(clean_arxiv_abstract(x))).alias("abstract_tokens")
])

df.write_parquet("../data/interim/arxiv-abstracts-cleaned.parquet")




In [8]:
df = df.with_columns([
    pl.col("abstract_tokens").map_elements(remove_stopwords).alias("abstract_tokens_no_stop"),
])

df = df.with_columns([
    pl.col("abstract_tokens").map_elements(remove_stopwords).alias("abstract_tokens_no_stop"),
    pl.col("abstract_tokens_no_stop").map_elements(lemmatize_tokens).alias("abstract_lemmatized"),
    pl.col("abstract_tokens_no_stop").map_elements(stem_tokens).alias("abstract_stemmed"),
    pl.col("abstract_clean").map_elements(named_entity_recognition).alias("abstract_entities"),
    pl.col("abstract_clean").map_elements(pos_tagging).alias("abstract_pos_tags")
])

df.write_parquet("../data/interim/arxiv-abstracts-cleaned.parquet")


  df = df.with_columns([


### Phrase Detection

In [9]:
title_token_lists = df["title_tokens"].to_list()
abstract_token_lists = df["abstract_tokens"].to_list()
title_phraser = build_phraser(title_token_lists)
abstract_phraser = build_phraser(abstract_token_lists)

df = df.with_columns([
    pl.col("title_tokens").map_elements(lambda tokens: apply_phrases(tokens, title_phraser)).alias("title_phrases"),
    pl.col("abstract_tokens").map_elements(lambda tokens: apply_phrases(tokens, abstract_phraser)).alias("abstract_phrases")
])

df.write_parquet("../data/interim/arxiv-abstracts-cleaned.parquet")



In [10]:
df

id,authors,title,abstract,title_clean,title_tokens,title_tokens_no_stop,title_lemmatized,title_stemmed,title_entities,title_pos_tags,abstract_clean,abstract_tokens,abstract_tokens_no_stop,abstract_lemmatized,abstract_stemmed,abstract_entities,abstract_pos_tags,title_phrases,abstract_phrases
str,str,str,str,str,list[str],list[str],list[str],list[str],list[list[str]],list[list[str]],str,list[str],list[str],list[str],list[str],list[list[str]],list[list[str]],list[str],list[str]
"""physics/0107021""","""Adrien Schoof, Jan Gruenert, S…","""Reducing the linewidth of a di…",""" An extended cavity diode las…","""reducing the linewidth of a di…","[""reducing"", ""the"", … ""10^5""]","[""reducing"", ""linewidth"", … ""10^5""]","[""reduce"", ""linewidth"", … ""5""]","[""reduc"", ""linewidth"", … ""10^5""]","[[""30"", ""CARDINAL""], [""10"", ""CARDINAL""]]","[[""reducing"", ""VERB""], [""the"", ""DET""], … [""5"", ""NUM""]]","""an extended cavity diode laser…","[""an"", ""extended"", … "".""]","[""extended"", ""cavity"", … "".""]","[""extend"", ""cavity"", … "".""]","[""extend"", ""caviti"", … "".""]","[[""657"", ""CARDINAL""], [""more than 10"", ""CARDINAL""], … [""30"", ""CARDINAL""]]","[[""an"", ""DET""], [""extended"", ""ADJ""], … [""."", ""PUNCT""]]","[""reducing_the"", ""linewidth"", … ""10^5""]","[""an_extended"", ""cavity_diode"", … "".""]"
"""1606.02306""","""Shahar Hod""","""The superradiant instability r…",""" Spinning Kerr black holes ar…","""the superradiant instability r…","[""the"", ""superradiant"", … ""hole""]","[""superradiant"", ""instability"", … ""hole""]","[""superradiant"", ""instability"", … ""hole""]","[""superradi"", ""instabl"", … ""hole""]",[],"[[""the"", ""DET""], [""superradiant"", ""ADJ""], … [""hole"", ""NOUN""]]","""spinning kerr black holes are …","[""spinning"", ""kerr"", … "".""]","[""spinning"", ""kerr"", … "".""]","[""spin"", ""kerr"", … "".""]","[""spin"", ""kerr"", … "".""]","[[""4"", ""CARDINAL""], [""2}}}\{,m\}$"", ""DATE""]]","[[""spinning"", ""VERB""], [""kerr"", ""NOUN""], … [""."", ""PUNCT""]]","[""the"", ""superradiant_instability"", … ""hole""]","[""spinning_kerr"", ""black_holes"", … "".""]"
"""hep-ph/9812388""","""W.M. Alberico, M.B. Barbaro, S…","""Strange form factors of the pr…",""" We consider ratios of elasti…","""strange form factors of the pr…","[""strange"", ""form"", … ""experiment""]","[""strange"", ""form"", … ""experiment""]","[""strange"", ""form"", … ""experiment""]","[""strang"", ""form"", … ""experi""]",[],"[[""strange"", ""ADJ""], [""form"", ""NOUN""], … [""experiment"", ""NOUN""]]","""we consider ratios of elastic …","[""we"", ""consider"", … "".""]","[""consider"", ""ratios"", … "".""]","[""consider"", ""ratio"", … "".""]","[""consid"", ""ratio"", … "".""]",[],"[[""we"", ""PRON""], [""consider"", ""VERB""], … [""."", ""PUNCT""]]","[""strange_form"", ""factors_of"", … ""experiment""]","[""we_consider"", ""ratios"", … "".""]"
"""astro-ph/0208171""","""M. Boettcher (Rice Univ. / Ohi…","""Predictions of the High-Energy…",""" Spectral fitting of the radi…","""predictions of the high-energy…","[""predictions"", ""of"", … ""comae""]","[""predictions"", ""high-energy"", … ""comae""]","[""prediction"", ""high"", … ""comae""]","[""predict"", ""high-energi"", … ""coma""]",[],"[[""predictions"", ""NOUN""], [""of"", ""ADP""], … [""comae"", ""PROPN""]]","""spectral fitting of the radio …","[""spectral"", ""fitting"", … "".""]","[""spectral"", ""fitting"", … "".""]","[""spectral"", ""fit"", … "".""]","[""spectral"", ""fit"", … "".""]","[[""40"", ""CARDINAL""], [""100"", ""CARDINAL""], … [""hadronic"", ""ORG""]]","[[""spectral"", ""ADJ""], [""fitting"", ""NOUN""], … [""."", ""PUNCT""]]","[""predictions"", ""of_the"", … ""comae""]","[""spectral_fitting"", ""of"", … "".""]"
"""0912.2788""","""Xiaodong Liu, Bo Zhang""","""Inverse scattering by an inhom…",""" This paper is concerned with…","""inverse scattering by an inhom…","[""inverse"", ""scattering"", … ""medium""]","[""inverse"", ""scattering"", … ""medium""]","[""inverse"", ""scatter"", … ""medium""]","[""invers"", ""scatter"", … ""medium""]",[],"[[""inverse"", ""NOUN""], [""scattering"", ""NOUN""], … [""medium"", ""NOUN""]]","""this paper is concerned with t…","[""this"", ""paper"", … "".""]","[""paper"", ""concerned"", … "".""]","[""paper"", ""concerned"", … "".""]","[""paper"", ""concern"", … "".""]","[[""two"", ""CARDINAL""], [""one"", ""CARDINAL""]]","[[""this"", ""DET""], [""paper"", ""NOUN""], … [""."", ""PUNCT""]]","[""inverse_scattering"", ""by_an"", … ""medium""]","[""this_paper"", ""is_concerned"", … "".""]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""1809.09358""","""Alexander Bednyakov and Veroni…","""FCNC decays of the Higgs boson…",""" We consider flavor-changing …","""fcnc decays of the higgs boson…","[""fcnc"", ""decays"", … ""model""]","[""fcnc"", ""decays"", … ""model""]","[""fcnc"", ""decays"", … ""model""]","[""fcnc"", ""decay"", … ""model""]",[],"[[""fcnc"", ""PROPN""], [""decays"", ""NOUN""], … [""model"", ""NOUN""]]","""we consider flavor-changing de…","[""we"", ""consider"", … "".""]","[""consider"", ""flavor-changing"", … "".""]","[""consider"", ""flavor"", … "".""]","[""consid"", ""flavor-chang"", … "".""]","[[""2hdm"", ""CARDINAL""], [""3"", ""CARDINAL""], … [""around 350"", ""CARDINAL""]]","[[""we"", ""PRON""], [""consider"", ""VERB""], … [""."", ""PUNCT""]]","[""fcnc"", ""decays"", … ""model""]","[""we_consider"", ""flavor-changing"", … "".""]"
"""1906.02111""","""Yuyu Zhang, Xinshi Chen, Yuan …","""Can Graph Neural Networks Help…",""" Effectively combining logic …","""can graph neural networks help…","[""can"", ""graph"", … ""?""]","[""graph"", ""neural"", … ""?""]","[""graph"", ""neural"", … ""?""]","[""graph"", ""neural"", … ""?""]",[],"[[""can"", ""AUX""], [""graph"", ""VERB""], … [""?"", ""PUNCT""]]","""effectively combining logic re…","[""effectively"", ""combining"", … "".""]","[""effectively"", ""combining"", … "".""]","[""effectively"", ""combine"", … "".""]","[""effect"", ""combin"", … "".""]","[[""noisy data"", ""ORG""], [""gnn"", ""ORG""]]","[[""effectively"", ""ADV""], [""combining"", ""VERB""], … [""."", ""PUNCT""]]","[""can"", ""graph_neural"", … ""reasoning_?""]","[""effectively_combining"", ""logic_reasoning"", … "".""]"
"""cs/0504047""","""David Doty, Jared Nichols""","""Pushdown dimension""",""" This paper develops the theo…","""pushdown dimension""","[""pushdown"", ""dimension""]","[""pushdown"", ""dimension""]","[""pushdown"", ""dimension""]","[""pushdown"", ""dimens""]",[],"[[""pushdown"", ""ADJ""], [""dimension"", ""NOUN""]]","""this paper develops the theory…","[""this"", ""paper"", … "".""]","[""paper"", ""develops"", … "".""]","[""paper"", ""develop"", … "".""]","[""paper"", ""develop"", … "".""]","[[""d/2"", ""PERSON""]]","[[""this"", ""DET""], [""paper"", ""NOUN""], … [""."", ""PUNCT""]]","[""pushdown"", ""dimension""]","[""this_paper"", ""develops"", … "".""]"
"""2009.12889""","""P. Holicky, M. Zeleny""","""There is no bound on Borel cla…",""" We show that for every ordin…","""there is no bound on borel cla…","[""there"", ""is"", … ""theorem""]","[""bound"", ""borel"", … ""theorem""]","[""bind"", ""borel"", … ""theorem""]","[""bound"", ""borel"", … ""theorem""]",[],"[[""there"", ""PRON""], [""is"", ""VERB""], … [""theorem"", ""NOUN""]]","""we show that for every ordinal…","[""we"", ""show"", … "".""]","[""show"", ""every"", … "".""]","[""show"", ""every"", … "".""]","[""show"", ""everi"", … "".""]","[[""two"", ""CARDINAL""], [""b(n"", ""PERSON""], … [""2^$"", ""CARDINAL""]]","[[""we"", ""PRON""], [""show"", ""VERB""], … [""."", ""PUNCT""]]","[""there_is"", ""no"", … ""theorem""]","[""we_show"", ""that"", … "".""]"
