In [1]:
import pandas as pd
import json
import numpy as np

chunks = []
with open('arxiv-metadata-oai-snapshot.json', 'r') as f:
    for line in f:
        record = json.loads(line)
        chunks.append({
            'title': record.get('title', '').strip(),
            'authors': record.get('authors', '').strip(),
            'abstract': record.get('abstract', '').strip(),
            'year': int(record.get('update_date', '1900')[:4])  # Extract year
        })

# Convert to DataFrame
df = pd.DataFrame(chunks)

In [2]:
# save abstracts to parquet file using polars
import polars as pl
df_polars = pl.DataFrame(df)
df_polars.write_parquet('arxiv_abstracts.parquet')


In [2]:
#load the parquet file back into a DataFrame
import pyarrow.parquet as pq
import polars as pl
df = pl.read_parquet('arxiv_abstracts.parquet')


In [4]:
from sentence_transformers import SentenceTransformer
# Load the SentenceTransformer model for generating embeddings
model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

model.max_seq_length = 512 # Set the maximum sequence length for the model

# Define the task type for the model
task = "clustering"

# Encode the combined texts using the SentenceTransformer model
embeddings = model.encode(
    df["abstract"].to_list(),
    show_progress_bar=True,
    device="cuda",
    convert_to_tensor=True,
    batch_size=64
)
embeddings.shape

Batches:   0%|          | 0/43208 [00:00<?, ?it/s]

torch.Size([2765260, 1024])

In [6]:
import polars as pl
import numpy as np
import os
import glob
from tqdm import tqdm
import torch

# Convert embeddings to NumPy array if they are a torch tensor
if hasattr(embeddings, "cpu"):
    embeddings = embeddings.cpu().to(torch.float32).numpy()

# Add embeddings as separate columns to the Polars dataframe
for j in range(embeddings.shape[1]):
    df = df.with_columns(pl.Series(f"emb_{j}", embeddings[:, j].tolist()))

print("mbeddings added to dataframe.")

chunk_size = 100000 
total_records = len(df)

# Write each chunk to a separate Parquet file
for i in tqdm(range(0, total_records, chunk_size), desc="Writing chunks"):
    chunk = df.slice(i, chunk_size)
    chunk_file = f"arxiv_embeddings_chunk_{i // chunk_size}.parquet"
    chunk.write_parquet(chunk_file)

print("All chunks written.")

# Use glob to find all chunk files
print("Loading chunk files with glob...")

chunk_files = sorted(glob.glob("arxiv_embeddings_chunk_*.parquet"))

# Efficient lazy concatenation and streaming write
print("Starting streaming concatenation and sink...")
# Returns a LazyFrame over all files
final_lazy_frame = pl.scan_parquet("arxiv_embeddings_chunk_*.parquet")

final_file = "arxiv_embeddings_full.parquet"
final_lazy_frame.sink_parquet(final_file)

print(f"Final merged file written with streaming to: {final_file}")

# Clean up chunk files
print("Deleting chunk files...")
for file in chunk_files:
    os.remove(file)

print("All chunk files deleted. Process complete.")


mbeddings added to dataframe.


Writing chunks: 100%|██████████| 28/28 [02:35<00:00,  5.56s/it]


All chunks written.
Loading chunk files with glob...
Starting streaming concatenation and sink...
Final merged file written with streaming to: arxiv_embeddings_full.parquet
Deleting chunk files...
All chunk files deleted. Process complete.


In [7]:
import polars as pl

pl.scan_parquet("arxiv_embeddings_full.parquet").head().collect()

title,authors,abstract,year,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16,emb_17,emb_18,emb_19,emb_20,emb_21,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31,emb_32,…,emb_987,emb_988,emb_989,emb_990,emb_991,emb_992,emb_993,emb_994,emb_995,emb_996,emb_997,emb_998,emb_999,emb_1000,emb_1001,emb_1002,emb_1003,emb_1004,emb_1005,emb_1006,emb_1007,emb_1008,emb_1009,emb_1010,emb_1011,emb_1012,emb_1013,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
str,str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Calculation of prompt diphoton…","""C. Bal\'azs, E. L. Berger, P. …","""A fully differential calculati…",2008,0.071777,0.058838,0.001869,0.036621,0.049316,-0.099609,-0.004089,-0.016968,-0.037354,-0.103516,-0.083984,0.105469,-0.057129,-0.057129,-0.060791,0.125,-0.082031,0.014404,0.024902,-0.172852,0.031494,0.09375,-0.078613,0.167969,-0.033447,0.049316,-0.05542,0.0703125,-0.057617,0.125977,0.017944,-0.02356,0.08252,…,0.000587,-0.026367,-0.013916,-0.009766,-0.003738,-0.033936,0.019653,0.022095,-0.028198,-0.0271,0.001114,0.029419,-0.017456,0.006317,-0.018921,0.009583,0.035889,0.025879,0.001801,-0.012817,0.009583,-0.000234,0.010742,0.008118,0.048828,0.026489,-0.000671,-0.035156,0.007599,-0.019165,-0.004517,-0.042236,-0.020264,-0.011169,0.01355,-0.012451,0.02002
"""Sparsity-certifying Graph Deco…","""Ileana Streinu and Louis Thera…","""We describe a new algorithm, t…",2008,0.060791,-0.129883,-0.008118,-0.034668,0.027222,0.0703125,-0.005585,0.020264,-0.071289,0.03125,-0.056152,-0.028076,-0.090332,0.038086,0.014465,-0.036621,-0.122559,0.1015625,-0.014465,-0.037109,0.002975,0.1875,-0.023193,0.088867,0.083984,-0.008484,-0.103027,-0.057129,-0.155273,0.031494,0.120117,0.104492,0.033936,…,-0.001846,-0.007141,0.001282,0.003296,0.033691,-0.037109,0.021118,0.007294,-0.009583,0.020752,-0.001419,0.022339,0.035889,0.012146,-0.005188,0.021606,-0.011536,-0.051758,0.022583,-0.001297,-0.036133,-0.016113,0.020996,0.017334,-0.049561,0.010803,-0.032715,0.027222,0.036621,0.012695,-0.004456,0.013123,-0.026855,-0.01532,-0.002777,0.011292,0.016846
"""The evolution of the Earth-Moo…","""Hongjun Pan""","""The evolution of Earth-Moon sy…",2008,0.120117,-0.07666,0.083008,-0.08252,0.026489,-0.124023,-0.165039,0.027954,-0.063965,-0.010559,-0.146484,0.051514,0.030273,-0.037598,-0.053467,0.026855,0.055176,0.040039,0.018921,0.032471,0.047852,0.131836,0.016724,0.030029,-0.069336,0.128906,0.054443,0.083984,-0.032715,-0.016968,0.001968,0.018311,0.043701,…,-0.008972,-0.022583,-0.032959,-0.0177,0.029785,0.00206,0.019043,0.044922,0.015625,0.006134,-0.032715,-0.013123,-0.015869,0.01416,0.015747,0.009644,0.005005,0.00769,-0.012756,-0.005249,-0.01123,-0.001534,0.018433,-0.044189,0.043213,0.031982,0.015747,0.012634,0.00351,0.004364,0.002594,0.021606,0.015991,-0.007233,-0.001129,-0.011292,-0.024658
"""A determinant of Stirling cycl…","""David Callan""","""We show that a determinant of …",2007,0.068359,0.027588,-0.016602,0.094727,0.028687,0.00766,-0.036621,0.092773,-0.067383,-0.038818,-0.022339,0.07959,-0.134766,0.143555,-0.04248,-0.034424,-0.093262,0.1015625,-0.100586,-0.022583,0.03418,0.198242,-0.038574,0.064453,0.0271,0.146484,-0.041748,0.051025,-0.085449,0.03125,0.014648,-0.001129,-0.007751,…,0.006622,-0.035156,-0.013,-0.051025,-0.004364,-0.027222,0.031006,-0.03418,-0.029907,0.011475,-0.002975,-0.00766,0.007477,0.005737,-0.035889,0.029053,0.016602,-0.027832,0.005219,-0.006958,-0.033203,0.002014,-0.025879,-0.021729,-0.002838,0.026245,-0.012695,0.014038,0.024048,0.008606,-0.03064,0.00412,-0.039551,-0.021118,-0.024048,-0.019897,0.028076
"""From dyadic $\Lambda_{\alpha}$…","""Wael Abu-Shammala and Alberto …","""In this paper we show how to c…",2013,-0.015747,-0.024658,0.019043,-0.091797,-0.009949,0.031982,-0.000813,-0.032715,-0.030884,-0.07666,-0.077637,0.090332,-0.101074,-0.080566,-0.021851,-0.004639,-0.100098,0.033691,0.011658,-0.056641,-0.058594,0.234375,-0.034424,0.024536,0.026001,0.133789,0.005951,0.111816,-0.093262,0.036133,0.125,0.008789,0.020264,…,0.007721,0.0047,-0.008911,-0.032715,-0.004517,-0.039062,0.032227,-0.006317,0.013123,0.015625,0.029541,0.004089,-0.013367,-0.010132,-0.020752,-0.008179,-0.017334,-0.033691,0.022339,-0.001747,-0.016357,0.013611,0.014099,-0.028931,0.018188,0.007324,-0.03125,-0.044922,0.028564,0.003784,-0.023804,-0.026001,-0.014771,0.021118,0.002594,-0.018677,0.026001
