In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import save_npz

In [2]:
def create_count_vectorizer(df, text_col="abstract_lemmatized_text", max_features=250_000, min_df=8):
    """Fit a bag-of-words ``CountVectorizer`` on one text column of ``df``.

    Parameters
    ----------
    df : mapping of column name -> iterable of str (typically a DataFrame)
        Container holding the documents; ``df[text_col]`` is passed
        unchanged to ``fit_transform``.
    text_col : str
        Name of the column that holds one document string per row.
    max_features : int or None
        Forwarded to ``CountVectorizer(max_features=...)`` to cap the
        vocabulary size.
    min_df : int or float
        Forwarded to ``CountVectorizer(min_df=...)`` to drop terms that
        appear in too few documents.

    Returns
    -------
    tuple
        ``(count_matrix, count_vect)``: the result of ``fit_transform`` and
        the fitted vectorizer.

    Notes
    -----
    With a very small corpus the default ``min_df=8`` may prune every term,
    in which case scikit-learn is expected to raise ``ValueError``; pass a
    lower ``min_df`` for such inputs.
    """
    # Forward the vocabulary limits explicitly: constructing the vectorizer
    # with no arguments would leave max_features / min_df without any effect.
    count_vect = CountVectorizer(max_features=max_features, min_df=min_df)
    count_matrix = count_vect.fit_transform(df[text_col])
    return count_matrix, count_vect

def create_tfidf_vectorizer(df, text_col="abstract_lemmatized_text", max_features=250_000, min_df=8):
    """Fit a ``TfidfVectorizer`` on one text column of ``df``.

    Parameters
    ----------
    df : mapping of column name -> iterable of str (typically a DataFrame)
        Container holding the documents; ``df[text_col]`` is passed
        unchanged to ``fit_transform``.
    text_col : str
        Name of the column that holds one document string per row.
    max_features : int or None
        Forwarded to ``TfidfVectorizer(max_features=...)`` to cap the
        vocabulary size.
    min_df : int or float
        Forwarded to ``TfidfVectorizer(min_df=...)`` to drop terms that
        appear in too few documents.

    Returns
    -------
    tuple
        ``(tfidf_matrix, tfidf_vect)``: the result of ``fit_transform`` and
        the fitted vectorizer.

    Notes
    -----
    With a very small corpus the default ``min_df=8`` may prune every term,
    in which case scikit-learn is expected to raise ``ValueError``; pass a
    lower ``min_df`` for such inputs.
    """
    # Forward the vocabulary limits explicitly: constructing the vectorizer
    # with no arguments would leave max_features / min_df without any effect.
    tfidf_vect = TfidfVectorizer(max_features=max_features, min_df=min_df)
    tfidf_matrix = tfidf_vect.fit_transform(df[text_col])
    return tfidf_matrix, tfidf_vect


In [3]:
# Read only the two columns this notebook works with.
df = pd.read_parquet(
    "../data/processed/arxiv-abstracts-cleaned_v2.parquet",
    columns=["title", "abstract_lemmatized"],
)

# Each row of "abstract_lemmatized" is a sequence of tokens; glue them into a
# single space-separated string so the vectorizers receive one document per row.
df["abstract_lemmatized_text"] = df["abstract_lemmatized"].map(" ".join)

df

Unnamed: 0,title,abstract_lemmatized,abstract_lemmatized_text
0,The superradiant instability regime of the spi...,"[spin, kerr, black, hole, know, superradiantly...",spin kerr black hole know superradiantly unsta...
1,Strange form factors of the proton: a new anal...,"[consider, ratio, elastic, neutrinoantineutrin...",consider ratio elastic neutrinoantineutrinopro...
2,Predictions of the High-Energy Emission from B...,"[spectral, fit, radio, hard, xray, emission, b...",spectral fit radio hard xray emission bl lac o...
3,Dependence of quantum-Hall conductance on the ...,"[use, fourterminal, configuration, investigate...",use fourterminal configuration investigate dep...
4,Power Dependence of the Photocurrent Lineshape...,"[propose, kinetic, theory, describe, power, de...",propose kinetic theory describe power dependen...
...,...,...,...
499995,Practical distributed quantum information proc...,"[distribute, quantum, information, process, es...",distribute quantum information process essenti...
499996,AFFACT - Alignment-Free Facial Attribute Class...,"[facial, attribute, softbiometric, allow, limi...",facial attribute softbiometric allow limit sea...
499997,"Quasi-polynomials and the singular $[Q,R]=0$ t...","[short, note, revisit, shiftdesingularization,...",short note revisit shiftdesingularization vers...
499998,Quantum transport and momentum conserving deph...,"[study, numerically, influence, momentumconser...",study numerically influence momentumconserve d...


In [4]:

# Fit the bag-of-words model on the joined abstract text.
count_matrix, count_vect = create_count_vectorizer(
    df, text_col="abstract_lemmatized_text"
)

# Persist the sparse matrix in .npz format.
# NOTE(review): only the matrix is written in this cell; the fitted
# vectorizer held in count_vect is not saved here.
save_npz("../data/processed/count_vectorizer.npz", count_matrix)
print("CountVectorizer output saved as 'count_vectorizer.npz'.")

CountVectorizer output saved as 'count_vectorizer.npz'.


In [5]:
# Fit the TF-IDF model on the joined abstract text.
tfidf_matrix, tfidf_vect = create_tfidf_vectorizer(
    df, text_col="abstract_lemmatized_text"
)

# Persist the sparse matrix in .npz format.
# NOTE(review): only the matrix is written in this cell; the fitted
# vectorizer held in tfidf_vect is not saved here.
save_npz("../data/processed/tfidf_vectorizer.npz", tfidf_matrix)
print("TFIDF output saved as 'tfidf_vectorizer.npz'.")


TFIDF output saved as 'tfidf_vectorizer.npz'.
