In [None]:
import json
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from tqdm import tqdm


# Sentence-embedding model, loaded once and reused for every abstract.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

# The input is read line by line, one JSON object per line; the output is
# written in the same one-object-per-line layout.
input_file = 'arxiv-metadata-oai-snapshot.json'
output_file = 'arxiv-metadata-with-embeddings.json'

# Maximum number of sentences handed to the model per encoding batch.
batch_size = 64

# Stream the input so the whole file is never held in memory. The encoding is
# given explicitly so decoding does not depend on the platform's default
# locale.
with open(input_file, 'r', encoding='utf-8') as f_in, \
        open(output_file, 'w', encoding='utf-8') as f_out:
    for line in tqdm(f_in, desc="Processing papers"):
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        # instead of letting json.loads abort the whole run.
        if not line.strip():
            continue

        paper = json.loads(line)
        abstract = paper.get('abstract')

        # Records with a missing or empty abstract are passed through
        # unchanged, without 'sentences' / 'embeddings' keys.
        if abstract:
            sentences = sent_tokenize(abstract)

            if sentences:
                # encode() does its own batching; with the default settings it
                # returns a NumPy array, which tolist() turns into nested
                # lists of plain floats that json.dumps can serialize.
                embeddings = model.encode(
                    sentences,
                    batch_size=batch_size,
                    show_progress_bar=False,
                ).tolist()
            else:
                # A whitespace-only abstract can yield no sentences; skip the
                # model call and store an empty list.
                embeddings = []

            paper['sentences'] = sentences
            paper['embeddings'] = embeddings

        f_out.write(json.dumps(paper) + '\n')

print(f"Processed data saved to {output_file}")


Processing papers: 2620981it [5:12:51, 139.62it/s]

Processed data saved to arxiv-metadata-with-embeddings.json





[nltk_data] Downloading package punkt to /home/is1ab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True