In [2]:
import numpy as np
from os.path import join as j
import pandas as pd
import string
import sys
import torch
import utils
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer
from gensim.models import doc2vec

# Load data

In [3]:
with open("../data_dirs.txt", "r") as f: # your dataset folder should  be in a text file here
    DATA_DIRS = f.readlines()
    DATA_DIRS = [p.strip() for p in DATA_DIRS]

In [4]:
PCT = 50

In [5]:
LP_PATH = j(DATA_DIRS[0], "derived", "journal_citations", "link_prediction", "abstract_aps_all_2010_link_pred_{}-pct_neg-0.gz".format(PCT))
LP_REMOVE_PATH = j(DATA_DIRS[0], "derived", "journal_citations", "link_prediction", "abstract_aps_all_2010_link_pred_{}-pct_neg-0_pos.csv".format(PCT))

In [6]:
lp = pd.read_csv(LP_PATH, dtype="str")
lp_rm = pd.read_csv(LP_REMOVE_PATH, dtype="str")

In [7]:
len(lp) + len(lp_rm)

905529

In [8]:
lp_all = lp.append(lp_rm)
lp_abs = lp_all[["CITING_DOI", "CITING_ABSTRACT"]].rename(columns={"CITING_DOI": "DOI", "CITING_ABSTRACT": "ABSTRACT"})
lp_abs = lp_abs.append(lp_all[["CITED_DOI", "CITED_ABSTRACT"]].rename(columns={"CITED_DOI": "DOI", "CITED_ABSTRACT": "ABSTRACT"}))

In [9]:
lp_abs

Unnamed: 0,DOI,ABSTRACT
0,10.1103/PhysRevLett.101.194503,We use the Onsager principle to derive a two-p...
1,10.1103/PhysRevLett.101.194503,We use the Onsager principle to derive a two-p...
2,10.1103/PhysRevE.55.3143,Simple expressions are given for the Newtonian...
3,10.1103/PhysRevE.75.031201,We present molecular dynamics calculations of ...
4,10.1103/PhysRevE.75.031201,We present molecular dynamics calculations of ...
...,...,...
86124,10.1103/PhysRevE.81.036206,"By means of full exact diagonalization, we stu..."
86125,10.1103/PhysRevLett.102.147001,In several iron-arsenide superconductors there...
86126,10.1103/PhysRevD.65.074021,Chiral quark models with nonlocal covariant se...
86127,10.1103/PhysRevLett.100.013903,By analyzing accurate theoretical results from...


In [10]:
lp_abs = lp_abs.drop_duplicates()

In [11]:
lp_abs = lp_abs.set_index("DOI")

In [12]:
lp_abs

Unnamed: 0_level_0,ABSTRACT
DOI,Unnamed: 1_level_1
10.1103/PhysRevLett.101.194503,We use the Onsager principle to derive a two-p...
10.1103/PhysRevE.55.3143,Simple expressions are given for the Newtonian...
10.1103/PhysRevE.75.031201,We present molecular dynamics calculations of ...
10.1103/PhysRevE.78.051404,The coupled activated dynamics in dense mixtur...
10.1103/PhysRevE.61.2967,We evaluate theoretical expressions for the lo...
...,...
10.1103/PhysRevA.81.053815,The output pulses of a commercial high-power f...
10.1103/PhysRevC.81.024324,We suggest the procedure of direct derivation ...
10.1103/PhysRevB.79.121302,We report on the observation of the circular p...
10.1103/PhysRevLett.102.165005,Turbulence measurements in TORE SUPRA tokamak ...


# SciBERT

In [13]:
# SciBERT embed
scibert_tokenizer = AutoTokenizer.from_pretrained(
    "allenai/scibert_scivocab_uncased", cache_dir="scibert_cache"
)
scibert_model = BertModel.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    cache_dir="scibert_cache",
    output_hidden_states=True,
)
scibert_model = scibert_model.to("cuda:0")


def get_scibert_embedding(text):
    if isinstance(text, str):
        text = [text]
    # Encode the text, adding the (required!) special tokens, and converting to
    # PyTorch tensors.
    input_id = scibert_tokenizer(text, return_tensors="pt", add_special_tokens=True, padding=True)[
        "input_ids"
    ][:, :512]

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers.
    with torch.no_grad():
        outputs = scibert_model(input_id.to("cuda:0"))
        hidden_states = outputs[2]  # Get the hidden states
        token_embeddings = torch.stack(
            hidden_states, dim=0
        )  # Concatenate all hidden layers into a big tensor
        token_embeddings = torch.squeeze(
            token_embeddings, dim=1
        )  # Remove the batch dimension
        if len(text) > 1:
            token_embeddings = token_embeddings.permute(1, 0, 2, 3)  # Permut
            last_layers = token_embeddings[:, -5:-1, :]
            embs = last_layers[:, :, 0, :].sum(axis = 1).cpu().numpy()
            #embs = last_layers.sum(axis=1).mean(axis=1).cpu().numpy()
        else:
            last_layers = token_embeddings[-5:-1, :]
            embs = last_layers[:, 0, :].sum(axis = 0).cpu().numpy().reshape((1, -1))
            #embs = last_layers.sum(axis=0).mean(axis=0).cpu().numpy().reshape((1, -1))
    return input_id.numpy(), embs

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
TEXT_COL = "ABSTRACT"
nchunks = np.ceil(lp_abs.shape[0] / 200)  # split data to fit into GPU memory
prog_bar1 = tqdm(total=nchunks)  # show progress

emb_chunks = []
doi_chunks = []
for chunk in np.array_split(lp_abs, nchunks):
    emb_chunks.append(get_scibert_embedding(chunk[TEXT_COL].values.tolist())[1])
    doi_chunks.append(chunk.index.values)
    prog_bar1.update()

emb = np.vstack(emb_chunks)
doi = np.concatenate(doi_chunks)

100%|██████████| 769/769.0 [26:30<00:00,  2.08s/it]

In [15]:
# Put into a DF
df = pd.DataFrame(index=doi, data=emb)

In [16]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
10.1103/PhysRevLett.101.194503,2.220189,-0.038966,-1.346239,1.325677,1.471345,-4.790093,1.397781,0.713637,0.884818,-2.199072,...,9.905186,6.397893,-7.671278,-2.539862,-0.374945,-3.535321,-4.466987,-0.342328,-2.326478,0.874566
10.1103/PhysRevE.55.3143,1.336326,-2.764805,-1.101324,1.414889,0.021981,-1.317988,-0.219822,-3.976848,0.812859,0.216416,...,0.863583,1.093233,-5.186958,0.389794,0.496394,-1.968900,-0.658606,-0.645864,-0.561217,0.720374
10.1103/PhysRevE.75.031201,2.516039,-0.257893,-0.935225,1.382465,1.943949,-4.705141,2.347713,0.278168,0.840199,-2.392417,...,10.267403,5.936669,-7.277453,-2.150421,0.061968,-3.609573,-4.257217,-0.276468,-2.489944,0.749153
10.1103/PhysRevE.78.051404,2.990185,0.190634,-0.216100,0.720089,2.126396,-4.416872,3.494903,-0.264984,0.836931,-2.768781,...,10.011682,4.529173,-6.912262,-0.749617,0.652850,-4.168040,-4.192029,0.028063,-1.658877,0.171728
10.1103/PhysRevE.61.2967,3.897556,0.917510,-1.557811,0.759431,0.503002,-4.218773,1.108172,0.685760,0.857979,-2.317019,...,9.249008,6.491690,-7.722021,-2.005756,-0.508103,-3.541920,-5.782886,0.739539,-1.668645,0.698210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10.1103/PhysRevA.81.053815,2.264898,-0.138340,0.781959,1.449328,2.090161,-4.244765,3.480632,-1.044569,1.162098,-2.444239,...,9.412158,4.129291,-7.830984,0.142406,0.946790,-3.847188,-3.343171,0.454161,-1.279854,0.073235
10.1103/PhysRevC.81.024324,2.446236,0.396332,0.097410,1.484750,1.898536,-4.893203,3.182283,0.851303,0.558959,-2.157561,...,10.288223,5.365814,-7.427999,-1.465783,0.595176,-4.187515,-3.492922,0.086965,-2.070427,0.499000
10.1103/PhysRevB.79.121302,2.492838,0.388896,-2.035466,1.029683,1.231283,-4.752047,0.974694,0.700384,1.061738,-2.079085,...,9.423745,6.689996,-8.259450,-2.561558,-0.017619,-4.078710,-4.918072,-0.012604,-2.339068,0.657722
10.1103/PhysRevLett.102.165005,3.073672,0.292516,-1.991745,1.281648,0.881275,-4.570620,1.064054,0.859119,1.092499,-2.101201,...,9.531420,6.698943,-7.450677,-2.445735,-0.650331,-3.493343,-4.726419,-0.032522,-2.316936,0.601554


In [17]:
EMB_PROP = "abstract"
EMB_METHOD = "scibert"

In [18]:
EMB_PATH =  j(DATA_DIRS[0], "embedding", "link-prediction", "aps_all_2010_{}_{}_{}-pct_vector.gz".format(EMB_PROP, EMB_METHOD, PCT))

In [20]:
df.to_csv(EMB_PATH, index=True, header=True)

In [21]:
del df

# Sentence-BERT

In [22]:
model = SentenceTransformer('paraphrase-mpnet-base-v2', device='cuda:1')

In [23]:
embeddings = model.encode(lp_abs.ABSTRACT)

In [24]:
embeddings.shape

(153731, 768)

In [26]:
df = pd.DataFrame(data=embeddings, index=lp_abs.index)

In [27]:
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
DOI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.1103/PhysRevLett.101.194503,0.033811,-0.360128,-0.030413,0.006288,-0.022581,0.032182,0.196586,0.016866,-0.143045,0.160737,...,0.010987,0.002574,0.187192,-0.020788,0.033193,-0.020339,-0.169699,-0.076009,0.020315,-0.053710
10.1103/PhysRevE.55.3143,-0.114672,-0.440219,-0.100867,0.098209,-0.020825,0.001359,0.036185,-0.027957,0.142640,0.107093,...,-0.089042,0.078058,0.170470,-0.029088,0.032487,-0.370950,-0.047580,-0.125858,0.021294,-0.098499
10.1103/PhysRevE.75.031201,0.030370,-0.219910,-0.036254,0.017405,0.011727,0.033971,0.106169,0.044619,0.191736,0.051482,...,0.018914,0.011689,0.059526,0.085040,0.061095,-0.310391,-0.079325,-0.105871,-0.181300,0.013482
10.1103/PhysRevE.78.051404,-0.051325,-0.303609,-0.009633,0.028547,-0.007302,-0.019091,0.180183,-0.106969,0.048736,0.062494,...,0.091960,0.039084,0.015583,0.058539,0.048073,-0.384592,0.018351,-0.026521,0.049197,0.052946
10.1103/PhysRevE.61.2967,-0.044166,-0.205885,-0.060528,-0.025844,-0.064279,-0.047138,0.251620,-0.013070,0.081357,0.104247,...,0.043182,-0.056679,0.071530,-0.026357,0.060295,-0.502227,-0.056273,-0.105142,-0.034294,-0.055191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10.1103/PhysRevA.81.053815,-0.047876,0.035222,-0.112620,0.073488,-0.041668,-0.083359,-0.009868,-0.057214,-0.266231,0.086852,...,-0.024357,0.100483,0.153921,-0.039078,-0.082990,-0.316718,-0.055818,-0.077087,-0.194621,-0.027206
10.1103/PhysRevC.81.024324,-0.070170,0.007055,0.022919,0.017989,-0.109422,0.027529,-0.034731,0.044885,0.108636,0.080636,...,-0.027711,-0.031671,0.148436,-0.062103,-0.007145,-0.275690,-0.086925,-0.007866,-0.205981,0.052382
10.1103/PhysRevB.79.121302,-0.039902,-0.071506,-0.058737,0.027131,-0.015007,0.050889,0.117841,-0.026857,-0.075119,0.062782,...,0.086802,0.017527,0.020639,-0.018692,0.005803,-0.128850,-0.138284,0.036457,0.040084,0.066507
10.1103/PhysRevLett.102.165005,-0.032397,-0.144495,-0.010610,0.133042,-0.042824,-0.023626,-0.025771,0.036689,-0.040966,0.051571,...,0.224571,-0.010964,0.037043,-0.051360,0.026408,0.055964,-0.064750,0.040111,-0.147203,0.074758


In [28]:
EMB_METHOD = "sentencebert"

In [29]:
EMB_PATH =  j(DATA_DIRS[0], "embedding", "link-prediction", "aps_all_2010_{}_{}_{}-pct_vector.gz".format(EMB_PROP, EMB_METHOD, PCT))

In [30]:
df.to_csv(EMB_PATH, index=True, header=True)

In [31]:
del df, embeddings

# doc2vec

In [32]:
sentences = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(lp_abs.ABSTRACT.values)]

In [33]:
model_d2v = doc2vec.Doc2Vec(sentences, vector_size=128, window_size=5, workers=8)

In [34]:
model_d2v.docvecs[len(lp_abs)-1]

array([-0.25837514, -0.04180207,  0.22081287, -0.02222839, -0.04914665,
        0.08905195, -0.0468855 ,  0.04249118, -0.07752813,  0.15491524,
        0.12383277, -0.22824891, -0.01456277, -0.02792626, -0.0074306 ,
        0.03380278,  0.00365705,  0.27239963,  0.00496646,  0.06062222,
       -0.01056839,  0.03709349,  0.01783052, -0.06794666, -0.17543067,
        0.08970719, -0.02590165, -0.01786206,  0.06824381,  0.0925628 ,
        0.07621519,  0.0546307 ,  0.21764822,  0.04786328, -0.06664898,
       -0.13311104,  0.02649323,  0.01431488,  0.01050736, -0.13701984,
       -0.05928836, -0.02553607, -0.00255887,  0.10522679, -0.02549553,
        0.0281689 , -0.02794505, -0.01148177, -0.02461423,  0.09447543,
       -0.08440026,  0.08068947,  0.02392111, -0.05922328,  0.04124222,
        0.09712397,  0.1462453 ,  0.01162686, -0.06997424, -0.10787892,
       -0.03408252, -0.09465178,  0.07654863,  0.01976702,  0.17067061,
       -0.0872879 , -0.00067988,  0.05440522, -0.10475834,  0.18

In [36]:
vectors = []
for i in range(len(lp_abs)):
    vectors.append(model_d2v.docvecs[i])

In [37]:
vectors = np.vstack(vectors)

In [38]:
vectors.shape

(153731, 128)

In [39]:
d2v = pd.DataFrame(index=lp_abs.index, data=vectors, columns=range(128))

In [40]:
EMB_METHOD = "doc2vec"

In [41]:
EMB_PATH_2 = j(DATA_DIRS[0], "embedding", "link-prediction", "aps_all_2010_{}_{}_{}-pct_vector.gz".format(EMB_PROP, EMB_METHOD, PCT))

In [42]:
d2v.to_csv(EMB_PATH_2, index=True, header=True)