In [1]:
import pandas as pd

# Read the metadata snapshot as JSON Lines (one JSON record per line of the file).
df = pd.read_json(path_or_buf='arxiv-metadata-oai-snapshot.json', lines=True)
# Print a plain-text preview of the first five rows.
print(df.head(n=5))

          id           submitter  \
0  0704.0001      Pavel Nadolsky   
1  0704.0002        Louis Theran   
2  0704.0003         Hongjun Pan   
3  0704.0004        David Callan   
4  0704.0005  Alberto Torchinsky   

                                             authors  \
0  C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...   
1                    Ileana Streinu and Louis Theran   
2                                        Hongjun Pan   
3                                       David Callan   
4           Wael Abu-Shammala and Alberto Torchinsky   

                                               title  \
0  Calculation of prompt diphoton production cros...   
1           Sparsity-certifying Graph Decompositions   
2  The evolution of the Earth-Moon system based o...   
3  A determinant of Stirling cycle numbers counts...   
4  From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...   

                                  comments  \
0  37 pages, 15 figures; published version   
1    To appear in

In [3]:
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

def split_sentences(abstract):
    """
    Split one abstract into a list of sentences.

    Args:
        abstract: The abstract text, or a missing value (anything for
            which ``pd.isnull`` is true, e.g. ``None`` or NaN).

    Returns:
        The result of ``sent_tokenize(abstract)``, or an empty list when
        the abstract is missing.
    """
    # Missing abstracts never reach the tokenizer.
    return [] if pd.isnull(abstract) else sent_tokenize(abstract)

# Enable tqdm's pandas integration so `progress_apply` is available below.
tqdm.pandas()  
# Add a 'sentences' column: split_sentences is applied to every value of the
# 'abstract' column, with a progress bar reporting rows processed.
df['sentences'] = df['abstract'].progress_apply(split_sentences)


100%|██████████| 2620981/2620981 [04:19<00:00, 10094.66it/s]


In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


# Load the Sentence-BERT model.
# The model is placed on the CUDA device (GPU acceleration).
model = SentenceTransformer(
    model_name_or_path='all-MiniLM-L6-v2',
    device='cuda',
)


def generate_embeddings_in_batches(sentences, batch_size=8192):
    """
    Encode a list of sentences in slices and return a list of embeddings.

    Args:
        sentences: The sentences to encode. A falsy value (``None`` or an
            empty list) short-circuits to an empty result.
        batch_size: Maximum number of sentences passed to each
            ``model.encode`` call.

    Returns:
        A flat list containing the items yielded by each ``model.encode``
        call, in input order; ``[]`` when there is nothing to encode.
    """
    # Nothing to encode: return early without touching the model.
    if not sentences:
        return []

    # Walk the input in steps of `batch_size`, encode each slice with the
    # module-level `model`, and flatten the per-slice results into one list.
    return [
        vector
        for start in range(0, len(sentences), batch_size)
        for vector in model.encode(
            sentences[start:start + batch_size],
            show_progress_bar=False,
        )
    ]


# Enable tqdm's pandas integration so `progress_apply` is available below.
tqdm.pandas()

# Add an 'embeddings' column: each row's sentence list is encoded into a list
# of vectors, with a progress bar reporting rows processed.
df['embeddings'] = df['sentences'].progress_apply(
    lambda row_sentences: generate_embeddings_in_batches(row_sentences, 8192)
)

# Write the augmented frame as JSON Lines (one record per line).
# NOTE(review): every embedding stays in memory until this single call
# serializes the whole frame — confirm RAM and disk are sufficient for the
# full snapshot before re-running.
df.to_json(
    path_or_buf='arxiv-metadata-with-embeddings-df-ver.json',
    orient='records',
    lines=True,
)


 37%|███▋      | 982627/2620981 [1:29:58<2:39:43, 170.96it/s]

: 