In [1]:
import pickle
from rdflib import Graph, Literal, BNode, Namespace, RDF, URIRef

# SECURITY: unpickling runs arbitrary code embedded in the file, so only load a
# graph.pkl that you produced yourself or otherwise trust.
with open('graph.pkl', 'rb') as graph_file:
    g = pickle.load(graph_file)


# Namespace for the arXiv vocabulary used by the graph (e.g. ARXIV.Paper, ARXIV.title)
ARXIV = Namespace("http://arxiv.org/")

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
from rdflib import URIRef, Namespace, Graph
from rdflib.namespace import RDF
import json
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from scipy.sparse import lil_matrix
import numpy as np

# Pre-trained BERT checkpoint (tokenizer + encoder) used to embed abstracts
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Prefer the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Keep a dedicated handle on the encoder after moving it to the device:
# the name `model` is rebound to the LogisticRegression classifier further down,
# so embedding code must go through `bert_model`.
bert_model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Embeds text with the encoder held in `bert_model` (not `model`, which is
# rebound to the classifier later in the notebook).
def get_bert_embedding(text):
    """Return the [CLS]-token embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), moved to ``device``,
    and run through ``bert_model``; the hidden state at sequence position 0 is
    used as the sentence embedding.

    Args:
        text: The text to embed (anything the tokenizer accepts as input).

    Returns:
        numpy.ndarray: the embedding with size-1 dimensions squeezed out, so a
        single string yields a 1-D vector.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = inputs.to(device)  # inputs must live on the same device as the model
    # Inference only: without no_grad() every call records an autograd graph
    # that is never used, which wastes memory and time over the whole corpus.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Position 0 of the last hidden state is the [CLS] token representation
    cls_state = outputs.last_hidden_state[:, 0, :]
    # NumPy conversion requires a CPU tensor
    sentence_embedding = cls_state.cpu().numpy()
    return sentence_embedding.squeeze()  # drop the batch dimension for single inputs



In [4]:
ARXIV = Namespace("http://arxiv.org/")
arxiv_prefix = str(ARXIV)

papers = []       # one record per paper, in graph iteration order
paper_index = {}  # paper_id -> row position in `papers` / `embeddings`
embeddings = []   # embeddings[i] is the abstract embedding of papers[i]
for s in g.subjects(RDF.type, ARXIV.Paper):
    # The paper id is the subject URI with the namespace prefix removed
    paper_id = str(s).replace(arxiv_prefix, "")
    title = g.value(s, ARXIV.title)
    abstract = g.value(s, ARXIV.abstract)
    # g.value() returns None when the triple is absent; str(None) would store
    # and embed the literal text "None", so fall back to an empty string.
    title_text = "" if title is None else str(title)
    abstract_text = "" if abstract is None else str(abstract)

    paper_index[paper_id] = len(papers)
    papers.append({
        'paper_id': paper_id,
        'title': title_text,
        'abstract': {'text': abstract_text}
    })
    embeddings.append(get_bert_embedding(abstract_text))


In [5]:
# Square sparse matrix: entry [i, j] is 1 when paper i cites paper j.
# LIL format is used while filling because it supports cheap element assignment.
n_papers = len(papers)
citation_matrix = lil_matrix((n_papers, n_papers))

arxiv_prefix = str(ARXIV)
for citing, cited in g.subject_objects(ARXIV.cites):
    row = paper_index.get(str(citing).replace(arxiv_prefix, ""))
    col = paper_index.get(str(cited).replace(arxiv_prefix, ""))
    # Ignore citations where either endpoint is not one of the indexed papers
    if row is None or col is None:
        continue
    citation_matrix[row, col] = 1

# Switch to CSR once filled: better suited to arithmetic and matrix-vector products
citation_matrix = citation_matrix.tocsr()


In [6]:
n = len(papers)

# Every ordered (i, j) pair in row-major order: row i*n + j of `pairs` is (i, j).
# Built with vectorised NumPy calls instead of an n*n Python loop.
paper_ids = np.arange(n)
pairs = np.column_stack((np.repeat(paper_ids, n), np.tile(paper_ids, n)))

# Pairwise cosine similarity between all abstract embeddings, shape (n, n)
similarities = cosine_similarity(np.array(embeddings))

# One feature per pair. Because `pairs` enumerates (i, j) in row-major order,
# similarities[pairs[:, 0], pairs[:, 1]] is exactly the row-major flattening.
pair_similarities = similarities.reshape(-1, 1)

# Target: 1 where paper i cites paper j, flattened in the same row-major order
# NOTE(review): the diagonal (i == j) is included, so every paper contributes a
# self-pair whose label comes from citation_matrix[i, i]; confirm this is intended.
y = citation_matrix.toarray().flatten()


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Hold out 20% of the (similarity, citation-label) pairs for evaluation;
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    pair_similarities,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the citation classifier on the training split.
# NOTE: this rebinds `model`, which previously referred to the BERT encoder;
# the encoder remains reachable as `bert_model`.
model = LogisticRegression().fit(X_train, y_train)

# Hard labels and positive-class probabilities for the held-out pairs
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# ROC AUC is computed from the probabilities, not from the hard labels
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Report the metric
print(f"ROC AUC Score: {roc_auc}")


ROC AUC Score: 0.866199533997673


In [8]:
def recommend(paper, top_k=5):
    """Return the ``top_k`` corpus papers most likely to be cited by ``paper``.

    The input abstract is embedded, compared by cosine similarity against every
    entry of ``embeddings``, and each similarity is scored with the fitted
    classifier bound to ``model``; candidates are ranked by that score.

    Args:
        paper: Mapping with an ``'abstract'`` entry, either a plain string or a
            ``{'text': ...}`` mapping like the records stored in ``papers``.
        top_k: Maximum number of recommendations to return (default 5).

    Returns:
        list of ``(paper_id, abstract)`` tuples, best first, where ``abstract``
        is the stored abstract record of the recommended paper.

    Note:
        ``model`` must be the classifier fitted in the training cell. Re-running
        the BERT setup cell afterwards rebinds ``model`` to the encoder and
        breaks this function.
    """
    abstract = paper['abstract']
    # Accept the corpus record shape as well as a bare string
    if isinstance(abstract, dict):
        abstract = abstract['text']

    paper_vector = get_bert_embedding(abstract)
    pair_similarities = cosine_similarity(paper_vector.reshape(1, -1), embeddings)
    # predict_proba expects one row per candidate with a single feature column
    features = pair_similarities.reshape(-1, 1)
    scores = model.predict_proba(features)[:, 1]

    ranked = sorted(zip(papers, scores), key=lambda item: -item[1])
    return [(candidate['paper_id'], candidate['abstract']) for candidate, _ in ranked[:top_k]]


In [9]:
def load_input_data(input_file):
    """Read a JSON Lines file and return the parsed records.

    Args:
        input_file: Path of a file holding one JSON document per line.

    Returns:
        list of the parsed values, in file order. Blank lines are ignored;
        lines that are not valid JSON are reported on stdout and skipped.
    """
    input_papers = []
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank separator / trailing lines are not errors
                continue
            try:
                input_papers.append(json.loads(stripped))
            except json.JSONDecodeError:
                print(f"Skipping line as it's not valid JSON: {stripped}")
    return input_papers


# Load the papers to generate recommendations for (one JSON object per line)
input_papers = load_input_data('new_research_papers.jsonl')

# Print the recommendations for each input paper
for paper in input_papers:
    recommendations = recommend(paper)
    print(f"Recommendations for '{paper['title']}' ({paper['discipline']}):")
    for paper_id, abstract in recommendations:
        # recommend() returns the stored abstract record ({'text': ...});
        # print the text itself rather than the dict repr with escaped newlines.
        abstract_text = abstract['text'] if isinstance(abstract, dict) else abstract
        print(f"ID: {paper_id}\nAbstract: {abstract_text}\n")


Recommendations for 'Enhanced Accuracy in Galactic Disc Action Estimates through Perturbed Distribution Functions' (Physics):
ID: 2012.06597
Abstract: {'text': '  In the Gaia era, understanding the effects of the perturbations of the\nGalactic disc is of major importance in the context of dynamical modelling. In\nthis theoretical paper we extend previous work in which, making use of the\nepicyclic approximation, the linearized Boltzmann equation had been used to\nexplicitly compute, away from resonances, the perturbed distribution function\nof a Galactic thin-disc population in the presence of a non-axisymmetric\nperturbation of constant amplitude. Here we improve this theoretical framework\nin two distinct ways in the new code that we present. First, we use better\nestimates for the action-angle variables away from quasi-circular orbits,\ncomputed from the AGAMA software, and we present an efficient routine to\nnumerically re-express any perturbing potential in these coordinates with 