In [1]:
import polars as pl

# Read only the four columns needed downstream and draw a reproducible
# 1M-row sample (without replacement, fixed seed) in a single expression.
# To process the full corpus instead, drop the trailing .sample(...) call.
df = pl.read_parquet(
    '../data/raw/arxiv-abstracts.parquet',
    columns=['id', 'authors', 'title', 'abstract'],
).sample(n=1_000_000, with_replacement=False, seed=124)


In [2]:
# Preview the sampled frame (last expression in the cell, so it is rendered).
df

id,authors,title,abstract
str,str,str,str
"""1606.02306""","""Shahar Hod""","""The superradiant instability r…",""" Spinning Kerr black holes ar…"
"""hep-ph/9812388""","""W.M. Alberico, M.B. Barbaro, S…","""Strange form factors of the pr…",""" We consider ratios of elasti…"
"""astro-ph/0208171""","""M. Boettcher (Rice Univ. / Ohi…","""Predictions of the High-Energy…",""" Spectral fitting of the radi…"
"""0912.2791""","""Dong-Keun Ki, Seung-Geol Nam, …","""Dependence of quantum-Hall con…",""" By using four-terminal confi…"
"""0707.3555""","""A. Russell, Vladimir I. Fal'ko""","""Power Dependence of the Photoc…",""" We propose a kinetic theory …"
…,…,…,…
"""cond-mat/0701113""","""Youjin Deng, Timothy M. Garoni…","""Critical speeding-up in a loca…",""" We study the dynamic critica…"
"""2002.10861""","""Denis Sabani, Cihan Bacaksiz a…","""Ab initio methodology for magn…",""" The recent development in th…"
"""1310.0237""","""Hywel Owen, David Holder, Jose…","""Technologies for Delivery of P…",""" Recent developments for the …"
"""2108.11695""","""Zhuojie Wu, Zijian Wang, Wenxu…","""PAENet: A Progressive Attentio…",""" 3D to 2D retinal vessel segm…"


In [3]:
import re
import string
import random
import nltk
import spacy
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim import corpora, models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
# Uncomment the following lines if you wish to use SHAP (requires a graphical environment)
# import shap
# shap.initjs()

# One-time fetch of the NLTK resources this notebook relies on
# (already-present packages are reported as up-to-date).
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared NLP helpers: spaCy's small English pipeline and NLTK's Porter stemmer
nlp = spacy.load("en_core_web_sm")
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /home/dzakirm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dzakirm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dzakirm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from tqdm.autonotebook import tqdm, trange
2025-02-19 22:26:52.324022: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-19 22:26:52.469976: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild Ten

## Basic Preprocessing

In [4]:
import re
import polars as pl
import nltk
import spacy
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.models.phrases import Phrases, Phraser

# Fetch the NLTK resources used by the helpers below.
# 'punkt_tab' is requested alongside 'punkt': recent NLTK releases (3.9+) load
# the word_tokenize tables from 'punkt_tab', so on a fresh environment
# downloading 'punkt' alone leaves word_tokenize raising LookupError.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Module-level singletons shared by the preprocessing functions.
nlp = spacy.load("en_core_web_sm")
ps = PorterStemmer()
# A set gives O(1) membership checks in remove_stopwords.
stop_words = set(stopwords.words('english'))
import re
import string

def clean_arxiv_abstract(text: str) -> str:
    """Normalise an arXiv title or abstract to lowercase plain text.

    Steps, applied in order:
      1. lowercase the text;
      2. drop ``$$...$$`` display math;
      3. drop ``$...$`` inline math;
      4. drop LaTeX commands together with one optional ``[...]`` and one
         optional ``{...}`` argument (the argument text is removed too);
      5. drop any remaining ``[...]`` span (e.g. citation markers ``[1,2]``);
      6. delete all ASCII punctuation (no space is inserted in its place);
      7. collapse runs of whitespace and strip the ends.

    Args:
        text: Raw text. Any non-``str`` value is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Display math first, so its "$$" delimiters are not consumed as two
    # empty inline-math spans; DOTALL lets the body span several lines.
    text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL)
    # Inline math. "[^$]" (unlike ".") also matches newlines, so a formula
    # broken across the hard-wrapped lines of an abstract is removed as a
    # whole instead of leaking its contents into the token stream.
    text = re.sub(r'\$[^$]*\$', '', text)
    # LaTeX commands such as \emph{...} or \cite[p. 3]{key}
    text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?(?:\{[^}]*\})?', '', text)
    # Bracketed spans such as [1] or [1,2,3]
    text = re.sub(r'\[[^\]]*\]', '', text)
    # Strip every character in string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse whitespace (including newlines) to single spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text: str) -> list:
    """Split ``text`` into word tokens with NLTK's ``word_tokenize``.

    Any value that is not a ``str`` yields an empty list.
    """
    return word_tokenize(text) if isinstance(text, str) else []

def remove_stopwords(tokens: list) -> list:
    """Return the tokens not present in the module-level ``stop_words`` set.

    The relative order of the surviving tokens is preserved.
    """
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

def lemmatize_tokens(tokens: list) -> list:
    """Return the spaCy lemma of every token in the re-joined token sequence.

    The tokens are joined with single spaces and passed through the
    module-level ``nlp`` pipeline; one lemma is emitted per spaCy token.
    """
    sentence = " ".join(tokens)
    lemmas = []
    for word in nlp(sentence):
        lemmas.append(word.lemma_)
    return lemmas

def stem_tokens(tokens: list) -> list:
    """Apply the module-level Porter stemmer ``ps`` to each token, in order."""
    return list(map(ps.stem, tokens))

def named_entity_recognition(text: str) -> list:
    """Return ``(entity_text, entity_label)`` pairs found by spaCy in ``text``.

    Non-string input produces an empty list.
    """
    if isinstance(text, str):
        entities = nlp(text).ents
        return [(span.text, span.label_) for span in entities]
    return []

def pos_tagging(text: str) -> list:
    """Return ``(token_text, coarse_pos_tag)`` pairs for every spaCy token.

    Non-string input produces an empty list.
    """
    if isinstance(text, str):
        tagged = []
        for word in nlp(text):
            tagged.append((word.text, word.pos_))
        return tagged
    return []

def build_phraser(token_lists: list, min_count: int = 1, threshold: float = 2) -> Phraser:
    """Train a Gensim bigram model and return it as a frozen ``Phraser``.

    Args:
        token_lists: Iterable of tokenised documents (one list of str each).
        min_count: Forwarded to ``Phrases``; words and bigrams seen fewer
            times than this are ignored. Defaults to 1, the value this
            notebook has always used.
        threshold: Forwarded to ``Phrases``; score a bigram must exceed to be
            merged (higher means fewer phrases). Defaults to 2, the value
            this notebook has always used.

    Returns:
        A ``Phraser`` wrapping the trained ``Phrases`` model.

    Note:
        The defaults are very permissive (the displayed results contain
        merges such as ``we_consider`` and ``dependence_of``); pass larger
        values for stricter phrase detection.
    """
    phrases = Phrases(token_lists, min_count=min_count, threshold=threshold)
    return Phraser(phrases)

def apply_phrases(tokens: list, phraser: Phraser) -> list:
    """Run ``tokens`` through ``phraser`` and return the phrase-merged result."""
    merged = phraser[tokens]
    return merged

[nltk_data] Downloading package punkt to /home/dzakirm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dzakirm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dzakirm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Cleaning

### Title

In [5]:
# Clean every title exactly once, then tokenize the cleaned column.
# The two steps are separate with_columns calls because expressions inside a
# single call cannot see columns created in that same call.
df = df.with_columns(
    pl.col("title").map_elements(clean_arxiv_abstract).alias("title_clean")
)
df = df.with_columns(
    pl.col("title_clean").map_elements(tokenize_text).alias("title_tokens")
)

# Checkpoint the intermediate result.
df.write_parquet("../data/interim/arxiv-abstracts-cleaned_v3.parquet")




In [6]:
# Stopword removal is materialised in its own step: the lemmatize/stem
# expressions below read "title_tokens_no_stop", and expressions within one
# with_columns call cannot reference a column created in that same call.
df = df.with_columns(
    pl.col("title_tokens").map_elements(remove_stopwords).alias("title_tokens_no_stop"),
)

# "title_tokens_no_stop" already exists at this point, so it is not
# recomputed here (that would repeat the stopword pass over every row).
# NOTE(review): entities and POS tags are computed on the lowercased,
# punctuation-stripped "title_clean" text - confirm this is intended rather
# than the raw "title" column.
df = df.with_columns([
    pl.col("title_tokens_no_stop").map_elements(lemmatize_tokens).alias("title_lemmatized"),
    pl.col("title_tokens_no_stop").map_elements(stem_tokens).alias("title_stemmed"),
    pl.col("title_clean").map_elements(named_entity_recognition).alias("title_entities"),
    pl.col("title_clean").map_elements(pos_tagging).alias("title_pos_tags"),
])

# Checkpoint the intermediate result.
df.write_parquet("../data/interim/arxiv-abstracts-cleaned_v3.parquet")


  df = df.with_columns([


### Abstract

In [7]:
# Clean every abstract exactly once, then tokenize the cleaned column.
# The two steps are separate with_columns calls because expressions inside a
# single call cannot see columns created in that same call.
df = df.with_columns(
    pl.col("abstract").map_elements(clean_arxiv_abstract).alias("abstract_clean")
)
df = df.with_columns(
    pl.col("abstract_clean").map_elements(tokenize_text).alias("abstract_tokens")
)

# Checkpoint the intermediate result.
df.write_parquet("../data/interim/arxiv-abstracts-cleaned_v3.parquet")




In [8]:
# Stopword removal is materialised in its own step: the lemmatize/stem
# expressions below read "abstract_tokens_no_stop", and expressions within
# one with_columns call cannot reference a column created in that same call.
df = df.with_columns(
    pl.col("abstract_tokens").map_elements(remove_stopwords).alias("abstract_tokens_no_stop"),
)

# "abstract_tokens_no_stop" already exists at this point, so it is not
# recomputed here (that would repeat the stopword pass over every row).
# NOTE(review): entities and POS tags are computed on the lowercased,
# punctuation-stripped "abstract_clean" text - confirm this is intended
# rather than the raw "abstract" column.
df = df.with_columns([
    pl.col("abstract_tokens_no_stop").map_elements(lemmatize_tokens).alias("abstract_lemmatized"),
    pl.col("abstract_tokens_no_stop").map_elements(stem_tokens).alias("abstract_stemmed"),
    pl.col("abstract_clean").map_elements(named_entity_recognition).alias("abstract_entities"),
    pl.col("abstract_clean").map_elements(pos_tagging).alias("abstract_pos_tags"),
])

# Checkpoint the intermediate result.
df.write_parquet("../data/interim/arxiv-abstracts-cleaned_v3.parquet")


  df = df.with_columns([


### Phrase Detection

In [9]:
# Collect the training sentences for each field. Null rows are dropped first:
# Phrases iterates over every sentence, so a None entry would raise during
# training. (Applying the phraser below is already null-safe because
# map_elements skips null values by default.)
title_token_lists = df["title_tokens"].drop_nulls().to_list()
abstract_token_lists = df["abstract_tokens"].drop_nulls().to_list()

# One bigram model per field, since titles and abstracts have different
# co-occurrence statistics.
title_phraser = build_phraser(title_token_lists)
abstract_phraser = build_phraser(abstract_token_lists)

df = df.with_columns([
    pl.col("title_tokens").map_elements(lambda tokens: apply_phrases(tokens, title_phraser)).alias("title_phrases"),
    pl.col("abstract_tokens").map_elements(lambda tokens: apply_phrases(tokens, abstract_phraser)).alias("abstract_phrases"),
])

# Checkpoint the final result.
df.write_parquet("../data/interim/arxiv-abstracts-cleaned_v3.parquet")



In [10]:
# Preview the processed frame, including every derived title/abstract column.
df

id,authors,title,abstract,title_clean,title_tokens,title_tokens_no_stop,title_lemmatized,title_stemmed,title_entities,title_pos_tags,abstract_clean,abstract_tokens,abstract_tokens_no_stop,abstract_lemmatized,abstract_stemmed,abstract_entities,abstract_pos_tags,title_phrases,abstract_phrases
str,str,str,str,str,list[str],list[str],list[str],list[str],list[list[str]],list[list[str]],str,list[str],list[str],list[str],list[str],list[list[str]],list[list[str]],list[str],list[str]
"""1606.02306""","""Shahar Hod""","""The superradiant instability r…",""" Spinning Kerr black holes ar…","""the superradiant instability r…","[""the"", ""superradiant"", … ""hole""]","[""superradiant"", ""instability"", … ""hole""]","[""superradiant"", ""instability"", … ""hole""]","[""superradi"", ""instabl"", … ""hole""]",[],"[[""the"", ""DET""], [""superradiant"", ""ADJ""], … [""hole"", ""NOUN""]]","""spinning kerr black holes are …","[""spinning"", ""kerr"", … ""spectrum""]","[""spinning"", ""kerr"", … ""spectrum""]","[""spin"", ""kerr"", … ""spectrum""]","[""spin"", ""kerr"", … ""spectrum""]","[[""2 42"", ""DATE""]]","[[""spinning"", ""VERB""], [""kerr"", ""NOUN""], … [""spectrum"", ""VERB""]]","[""the"", ""superradiant_instability"", … ""black_hole""]","[""spinning_kerr"", ""black_holes"", … ""spectrum""]"
"""hep-ph/9812388""","""W.M. Alberico, M.B. Barbaro, S…","""Strange form factors of the pr…",""" We consider ratios of elasti…","""strange form factors of the pr…","[""strange"", ""form"", … ""experiment""]","[""strange"", ""form"", … ""experiment""]","[""strange"", ""form"", … ""experiment""]","[""strang"", ""form"", … ""experi""]",[],"[[""strange"", ""ADJ""], [""form"", ""NOUN""], … [""experiment"", ""NOUN""]]","""we consider ratios of elastic …","[""we"", ""consider"", … ""mass""]","[""consider"", ""ratios"", … ""mass""]","[""consider"", ""ratio"", … ""mass""]","[""consid"", ""ratio"", … ""mass""]",[],"[[""we"", ""PRON""], [""consider"", ""VERB""], … [""mass"", ""NOUN""]]","[""strange_form"", ""factors"", … ""experiment""]","[""we_consider"", ""ratios"", … ""mass""]"
"""astro-ph/0208171""","""M. Boettcher (Rice Univ. / Ohi…","""Predictions of the High-Energy…",""" Spectral fitting of the radi…","""predictions of the highenergy …","[""predictions"", ""of"", … ""comae""]","[""predictions"", ""highenergy"", … ""comae""]","[""prediction"", ""highenergy"", … ""comae""]","[""predict"", ""highenergi"", … ""coma""]",[],"[[""predictions"", ""NOUN""], [""of"", ""ADP""], … [""comae"", ""PROPN""]]","""spectral fitting of the radio …","[""spectral"", ""fitting"", … ""blazars""]","[""spectral"", ""fitting"", … ""blazars""]","[""spectral"", ""fit"", … ""blazar""]","[""spectral"", ""fit"", … ""blazar""]","[[""40"", ""CARDINAL""], [""100"", ""CARDINAL""], … [""hadronic"", ""ORG""]]","[[""spectral"", ""ADJ""], [""fitting"", ""NOUN""], … [""blazars"", ""NOUN""]]","[""predictions"", ""of"", … ""w_comae""]","[""spectral_fitting"", ""of"", … ""blazars""]"
"""0912.2791""","""Dong-Keun Ki, Seung-Geol Nam, …","""Dependence of quantum-Hall con…",""" By using four-terminal confi…","""dependence of quantumhall cond…","[""dependence"", ""of"", … ""sheet""]","[""dependence"", ""quantumhall"", … ""sheet""]","[""dependence"", ""quantumhall"", … ""sheet""]","[""depend"", ""quantumhal"", … ""sheet""]",[],"[[""dependence"", ""NOUN""], [""of"", ""ADP""], … [""sheet"", ""NOUN""]]","""by using fourterminal configur…","[""by"", ""using"", … ""system""]","[""using"", ""fourterminal"", … ""system""]","[""use"", ""fourterminal"", … ""system""]","[""use"", ""fourtermin"", … ""system""]","[[""zero"", ""CARDINAL""]]","[[""by"", ""ADP""], [""using"", ""VERB""], … [""system"", ""NOUN""]]","[""dependence_of"", ""quantumhall"", … ""sheet""]","[""by"", ""using"", … ""system""]"
"""0707.3555""","""A. Russell, Vladimir I. Fal'ko""","""Power Dependence of the Photoc…",""" We propose a kinetic theory …","""power dependence of the photoc…","[""power"", ""dependence"", … ""dot""]","[""power"", ""dependence"", … ""dot""]","[""power"", ""dependence"", … ""dot""]","[""power"", ""depend"", … ""dot""]",[],"[[""power"", ""NOUN""], [""dependence"", ""NOUN""], … [""dot"", ""NOUN""]]","""we propose a kinetic theory to…","[""we"", ""propose"", … ""light""]","[""propose"", ""kinetic"", … ""light""]","[""propose"", ""kinetic"", … ""light""]","[""propos"", ""kinet"", … ""light""]","[[""zero"", ""CARDINAL""], [""exciton"", ""GPE""]]","[[""we"", ""PRON""], [""propose"", ""VERB""], … [""light"", ""NOUN""]]","[""power"", ""dependence_of"", … ""dot""]","[""we_propose"", ""a"", … ""incident_light""]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""cond-mat/0701113""","""Youjin Deng, Timothy M. Garoni…","""Critical speeding-up in a loca…",""" We study the dynamic critica…","""critical speedingup in a local…","[""critical"", ""speedingup"", … ""model""]","[""critical"", ""speedingup"", … ""model""]","[""critical"", ""speedingup"", … ""model""]","[""critic"", ""speedingup"", … ""model""]",[],"[[""critical"", ""ADJ""], [""speedingup"", ""NOUN""], … [""model"", ""NOUN""]]","""we study the dynamic critical …","[""we"", ""study"", … ""dynamics""]","[""study"", ""dynamic"", … ""dynamics""]","[""study"", ""dynamic"", … ""dynamic""]","[""studi"", ""dynam"", … ""dynam""]","[[""monte carlo"", ""PERSON""], [""less than one"", ""CARDINAL""], … [""the chayesmachtaswendsenwang cluster dynamics"", ""ORG""]]","[[""we"", ""PRON""], [""study"", ""VERB""], … [""dynamics"", ""NOUN""]]","[""critical_speedingup"", ""in"", … ""randomcluster_model""]","[""we_study"", ""the"", … ""dynamics""]"
"""2002.10861""","""Denis Sabani, Cihan Bacaksiz a…","""Ab initio methodology for magn…",""" The recent development in th…","""ab initio methodology for magn…","[""ab"", ""initio"", … ""hamiltonian""]","[""ab"", ""initio"", … ""hamiltonian""]","[""ab"", ""initio"", … ""hamiltonian""]","[""ab"", ""initio"", … ""hamiltonian""]","[[""heisenberg spin"", ""PERSON""]]","[[""ab"", ""PROPN""], [""initio"", ""PROPN""], … [""hamiltonian"", ""NOUN""]]","""the recent development in the …","[""the"", ""recent"", … ""sites""]","[""recent"", ""development"", … ""sites""]","[""recent"", ""development"", … ""site""]","[""recent"", ""develop"", … ""site""]","[[""2d"", ""CARDINAL""], [""jmatrix"", ""PERSON""], [""four"", ""CARDINAL""]]","[[""the"", ""DET""], [""recent"", ""ADJ""], … [""sites"", ""NOUN""]]","[""ab_initio"", ""methodology_for"", … ""hamiltonian""]","[""the"", ""recent_development"", … ""sites""]"
"""1310.0237""","""Hywel Owen, David Holder, Jose…","""Technologies for Delivery of P…",""" Recent developments for the …","""technologies for delivery of p…","[""technologies"", ""for"", … ""radiotherapy""]","[""technologies"", ""delivery"", … ""radiotherapy""]","[""technology"", ""delivery"", … ""radiotherapy""]","[""technolog"", ""deliveri"", … ""radiotherapi""]",[],"[[""technologies"", ""NOUN""], [""for"", ""ADP""], … [""radiotherapy"", ""NOUN""]]","""recent developments for the de…","[""recent"", ""developments"", … ""particles""]","[""recent"", ""developments"", … ""particles""]","[""recent"", ""development"", … ""particle""]","[""recent"", ""develop"", … ""particl""]",[],"[[""recent"", ""ADJ""], [""developments"", ""NOUN""], … [""particles"", ""NOUN""]]","[""technologies_for"", ""delivery"", … ""radiotherapy""]","[""recent_developments"", ""for"", … ""particles""]"
"""2108.11695""","""Zhuojie Wu, Zijian Wang, Wenxu…","""PAENet: A Progressive Attentio…",""" 3D to 2D retinal vessel segm…","""paenet a progressive attention…","[""paenet"", ""a"", … ""segmentation""]","[""paenet"", ""progressive"", … ""segmentation""]","[""paenet"", ""progressive"", … ""segmentation""]","[""paenet"", ""progress"", … ""segment""]","[[""3d"", ""CARDINAL""], [""2d"", ""CARDINAL""]]","[[""paenet"", ""VERB""], [""a"", ""DET""], … [""segmentation"", ""NOUN""]]","""3d to 2d retinal vessel segmen…","[""3d"", ""to"", … ""methods""]","[""3d"", ""2d"", … ""methods""]","[""3d"", ""2d"", … ""method""]","[""3d"", ""2d"", … ""method""]","[[""3d"", ""CARDINAL""], [""2d"", ""CARDINAL""], … [""2d"", ""CARDINAL""]]","[[""3d"", ""NOUN""], [""to"", ""ADP""], … [""methods"", ""NOUN""]]","[""paenet"", ""a"", … ""segmentation""]","[""3d"", ""to"", … ""previous_methods""]"
