In [None]:
! pip install sentence-transformers
! pip install transformers

In [None]:
import json

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
tqdm.pandas()

In [3]:
def preprocess_entry(dic):
  """Tidy the whitespace of an entry's 'abstract' and 'title' strings.

  Newline characters are turned into spaces, then runs of spaces are
  collapsed to one and leading/trailing spaces are dropped. Only newline
  and space characters are treated as separators; other whitespace (for
  example tabs) is left as it is.

  Args:
    dic: Mapping with string values under the keys 'abstract' and 'title'.

  Returns:
    The same mapping object, updated in place.
  """
  for field in ('abstract', 'title'):
    flattened = dic[field].replace('\n', ' ')
    # Splitting on a single space yields empty pieces for repeated spaces;
    # dropping them is what collapses the runs.
    pieces = [piece for piece in flattened.split(' ') if piece]
    dic[field] = ' '.join(pieces)
  return dic

In [4]:
def load_data(path):
  """Load a JSON-lines file into a DataFrame, cleaning each record on the way.

  Args:
    path: Location of a text file holding one JSON object per line.

  Returns:
    A pandas DataFrame with one row per record, built from the records
    after each has been passed through preprocess_entry.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
    KeyError: Propagated from preprocess_entry when a record lacks a
      field it needs.
  """
  records = []
  # The context manager closes the handle even when parsing fails part-way.
  # JSON text is UTF-8 by specification, so the platform default encoding
  # is not relied upon.
  with open(path, 'r', encoding='utf-8') as file:
    for line in file:
      if not line.strip():
        # Tolerate blank lines (e.g. a trailing empty line at end of file)
        # instead of failing inside json.loads.
        continue
      records.append(preprocess_entry(json.loads(line)))
  return pd.DataFrame(records)

In [None]:
# Parse the metadata snapshot (one JSON record per line) into a DataFrame;
# the path is relative to the notebook's working directory.
df = load_data('arxiv-metadata-oai-snapshot.json')

In [None]:
# Display only the first row as a quick look at the loaded columns.
df.head(1)

In [17]:
def get_bert_model(bert_type):
  """Load the pretrained model and tokenizer registered under a short alias.

  Args:
    bert_type: One of 'BERT', 'RoBERTa', 'DistilBERT', 'MultiQA',
      'Paraphrase-MiniLM' or 'SBERT'. Any other value silently falls back
      to 'bert-base-uncased'.

  Returns:
    A (model, tokenizer) tuple, in that order, both obtained via
    from_pretrained for the selected checkpoint name.
  """
  fallback = 'bert-base-uncased'
  checkpoints = {
    'BERT': 'bert-base-uncased',
    'RoBERTa': 'roberta-base',
    'DistilBERT': 'distilbert-base-uncased',
    'MultiQA': 'sentence-transformers/multi-qa-mpnet-base-dot-v1',
    'Paraphrase-MiniLM': 'sentence-transformers/paraphrase-MiniLM-L3-v2',
    'SBERT': 'sentence-transformers/all-mpnet-base-v2',
  }
  # Non-string (possibly unhashable) values take the fallback rather than
  # raising from the dictionary lookup.
  if isinstance(bert_type, str):
    checkpoint = checkpoints.get(bert_type, fallback)
  else:
    checkpoint = fallback

  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  model = AutoModel.from_pretrained(checkpoint)
  return model, tokenizer

In [5]:
def get_bert_sentence_vectors(model, tokenizer, documents):
  """Embed one or more documents by mean-pooling the model's last hidden state.

  Args:
    model: Callable accepting the tokenizer's output as keyword arguments
      and returning an object with a `last_hidden_state` tensor.
    tokenizer: Callable that turns `documents` into a mapping of PyTorch
      tensors which includes an 'attention_mask' entry.
    documents: A single string or a list of strings.

  Returns:
    A NumPy array of pooled vectors with size-1 axes squeezed out, so a
    single document yields a 1-D vector and several documents yield one
    row per document.
  """
  encoded = tokenizer(documents, return_tensors='pt', padding=True, truncation=True)
  # Inference only: skip building the autograd graph for the forward pass.
  with torch.no_grad():
    hidden = model(**encoded).last_hidden_state
    # Weight each position by the attention mask so that padding added for
    # shorter documents in a batch does not dilute their average.
    mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    pooled = summed / counts
  return np.squeeze(np.asarray(pooled.detach()))

In [6]:
class Tokenizer:
  """Bundles a pretrained model with its tokenizer to embed documents.

  Despite the name, get_token returns the vectors produced by
  get_bert_sentence_vectors rather than tokens.
  """

  def __init__(self, bert_text):
    """Load the model/tokenizer pair that get_bert_model returns for `bert_text`."""
    model, tokenizer = get_bert_model(bert_text)
    self.bert_text = bert_text
    self.model = model
    self.tokenizer = tokenizer

  def get_token(self, documents):
    """Return the sentence vector(s) for `documents` using the loaded pair."""
    vectors = get_bert_sentence_vectors(self.model, self.tokenizer, documents)
    return vectors
In [None]:
# Embed every abstract with the 'SBERT' alias. The Tokenizer (and the model it
# loads) is constructed once here; its get_token method is then applied to each
# row, with a progress bar.
df['abstract_vector'] = df['abstract'].progress_apply(Tokenizer('SBERT').get_token)

In [None]:
# Persist the DataFrame as a pickle file in the working directory.
df.to_pickle('arxiv_embedded_sbert_full.pkl')