In [1]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [2]:
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import contractions
import re

In [3]:
df = pd.read_parquet("/kaggle/input/dataset-with-embeddings/cs_papers_wo_embeddings.parquet")
# print(df.head())
print(df.shape[0])

90583


In [4]:
df['id'] = df['id'].apply(lambda x: str(x))

df.dtypes

id                object
authors           object
title             object
categories        object
abstract          object
update_date       object
degree             int64
num_citations      int64
num_references     int64
dtype: object

In [5]:
df[df['id']=='1711.06420']

Unnamed: 0,id,authors,title,categories,abstract,update_date,degree,num_citations,num_references
60000,1711.0642,"Jiuxiang Gu, Jianfei Cai, Shafiq Joty, Li Niu,...","Look, Imagine and Match: Improving Textual-Vis...","c, s, ., C, V",Textual-visual cross-modal retrieval has bee...,2018-06-14,5,3,2


In [6]:
df["abstract"].iloc[0]

'  In a quantum mechanical model, Diosi, Feldmann and Kosloff arrived at a conjecture stating that the limit of the entropy of certain mixtures is the relative entropy as system size goes to infinity. The conjecture is proven in this paper for density matrices. The first proof is analytic and uses the quantum law of large numbers. The second one clarifies the relation to channel capacity per unit cost for classical-quantum channels. Both proofs lead to generalization of the conjecture. '

In [7]:
def expand_contractions(sentence):
    contractions_expanded = [contractions.fix(word) for word in sentence.split()]
    return ' '.join(contractions_expanded)

def lower_case(sentence):
    return ' '.join([word.lower() for word in sentence.split()])
def remove_punctuation(sentence):
    return ' '.join([re.sub(r'[^\w\s]', '', word) for word in sentence.split()])

def preprocess(sentence):
    return lower_case(remove_punctuation(expand_contractions(sentence)))

In [8]:
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing abstracts"):
    df.at[idx, "abstract"] = preprocess(row["abstract"])

Processing abstracts:   0%|          | 0/90583 [00:00<?, ?it/s]

In [9]:
def get_embeddings(abstract, model, tokenizer):
    # Tokenize the abstract
    inputs = tokenizer(abstract, return_tensors="pt", padding=True, truncation=True,max_length=512)

    # Forward pass through the model to get embeddings
    with torch.no_grad():
        outputs = model(**inputs.to(device))

    # Get the hidden states (embeddings)
    embeddings = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

    # Aggregate token embeddings to obtain single embedding for the abstract
    abstract_embedding = torch.mean(embeddings, dim=1)  # Mean pooling

    return abstract_embedding

In [10]:
from transformers import AutoTokenizer, AutoModel
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(device)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [11]:
batch_size = 5000
data = []
num_abstracts = len(df)

for i in tqdm(range(0, num_abstracts, batch_size)):
    new_df = df[i:i+batch_size]
    for index, row in tqdm(new_df.iterrows(), total=new_df.shape[0]):
        abstract_embedding = get_embeddings(row['abstract'], model, tokenizer)
        data.append(abstract_embedding.cpu().numpy())

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/583 [00:00<?, ?it/s]

In [12]:
df['embeddings_sci_bert'] = data
df.head(5)

Unnamed: 0,id,authors,title,categories,abstract,update_date,degree,num_citations,num_references,embeddings_sci_bert
0,704.0046,"I. Csiszar, F. Hiai, D. Petz",A limit relation for entropy and channel capac...,"q, u, a, n, t, -, p, h, , c, s, ., I, T, , m...",in a quantum mechanical model diosi feldmann a...,2009-11-13,1,1,0,"[[0.20349798, 0.48734412, -0.05522029, 0.41140..."
1,704.0062,"Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...",On-line Viterbi Algorithm and Its Relationship...,"c, s, ., D, S",in this paper we introduce the online viterbi ...,2010-01-25,2,2,0,"[[0.5757019, 0.06396782, 0.3390965, -0.1335837..."
2,704.0098,"Jack Raymond, David Saad",Sparsely-spread CDMA - a statistical mechanics...,"c, s, ., I, T, , m, a, t, h, ., I, T",sparse code division multiple access cdma a va...,2009-11-13,2,1,1,"[[0.5880471, -0.18360056, 0.31301752, 0.279218..."
3,704.0108,Sergey Gubin,Reducing SAT to 2-SAT,"c, s, ., C, C",description of a polynomial time reduction of ...,2007-05-23,5,1,4,"[[0.63297045, 0.4064216, 0.64490527, -0.073474..."
4,704.0213,Ketan D. Mulmuley Hariharan Narayanan,Geometric Complexity Theory V: On deciding non...,"c, s, ., C, C",this article has been withdrawn because it has...,2012-09-28,1,1,0,"[[0.21691473, 0.41713947, 0.34705916, 0.586101..."


In [13]:
df["embeddings_sci_bert"].iloc[0].shape

(1, 768)

In [14]:
df.to_csv("cs_papers_sci_bert_embeddings.csv", index = False)
df.to_parquet("cs_papers_sci_bert_embeddings.parquet", index = False)

ArrowInvalid: ('Can only convert 1-dimensional array values', 'Conversion failed for column embeddings_sci_bert with type object')