In [10]:
import json
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

# Load the sentence-embedding model on the GPU.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

# Input is read as JSON Lines (one paper record per line); the output is
# written in the same layout with two extra fields per record.
input_file = 'arxiv-metadata-oai-snapshot.json'
output_file = 'arxiv-metadata-with-embeddings.json'

# Number of sentences handed to the model per encoding batch.
batch_size = 64

# Stream the input record by record so the whole file is never held in
# memory. The encoding is pinned to UTF-8 so that reading does not depend on
# the platform's locale default.
with open(input_file, 'r', encoding='utf-8') as f_in, \
        open(output_file, 'w', encoding='utf-8') as f_out:
    for line in tqdm(f_in, desc="Processing papers"):
        # Blank lines (e.g. a trailing newline at end of file) are not
        # records; skipping them keeps json.loads from aborting the run.
        if not line.strip():
            continue

        paper = json.loads(line)

        # Records with a missing or empty abstract are copied through
        # unchanged, without the 'sentences' / 'embeddings' fields.
        abstract = paper.get('abstract')
        if abstract:
            # Split the abstract into sentences with NLTK.
            sentences = sent_tokenize(abstract)

            # encode() batches internally, so batch_size is passed straight
            # through instead of slicing the sentence list by hand. The
            # empty case is guarded so the model is never called with no
            # input.
            if sentences:
                embeddings = model.encode(
                    sentences,
                    batch_size=batch_size,
                    show_progress_bar=False,
                ).tolist()
            else:
                embeddings = []

            # Embeddings are stored as plain nested lists because NumPy
            # arrays are not JSON-serializable.
            paper['sentences'] = sentences
            paper['embeddings'] = embeddings

        # Write the (possibly augmented) record as one JSON line.
        f_out.write(json.dumps(paper) + '\n')

print(f"Processed data saved to {output_file}")


Processing papers: 2620981it [5:12:51, 139.62it/s]

Processed data saved to arxiv-metadata-with-embeddings.json





[nltk_data] Downloading package punkt to /home/is1ab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True