This notebook contains the implementation for training and evaluation on the semi-synthetic dataset. This code runs on a small subset of the actual dataset used in the paper.

In [None]:
import pandas as pd
from datetime import datetime
from contextlib import contextmanager
import os
import attr
import collections
import matplotlib.pyplot as plt
import string
import random
import pickle
import numpy as np
import itertools
import functools
from datetime import datetime
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, losses, optimizers, regularizers, metrics
import math
import pickle
import scipy
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, roc_auc_score, roc_curve
import random

In [None]:
# Directory that holds the pickled dataset, metadata and SciBERT embedding files
# loaded by the cells below.
DATA_DIR = "data/"

In [None]:
# Load the (train, validation, test) splits of the semi-synthetic dataset.
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open(os.path.join(DATA_DIR, "semi_synthetic_data.pkl"), "rb") as data_file:
    training_data, val_data, test_data = pickle.load(data_file)

In [None]:
class PaperMetadata:
    """Record describing one paper: its id, its tokens and its fields of study."""

    def __init__(self, paper_id, tokens, fos):
        self.paper_id, self.tokens, self.fos = paper_id, tokens, fos

    def __str__(self):
        # Summarise the paper with its id, all fields of study and at most the
        # first five tokens.
        fos_text = ",".join(self.fos)
        first_tokens = ",".join(list(self.tokens)[:5])
        return "id=%d, fos=%s, tokens_5=%s" % (self.paper_id, fos_text, first_tokens)

class PaperIdAndIndexMap:
    """Two-way lookup between paper ids and their positions in a node ordering."""

    def __init__(self, topo_sorted_nodes):
        # Position -> paper id, in the order the nodes were given.
        self.idx_to_paper_id = dict(enumerate(topo_sorted_nodes))
        # Paper id -> position; when an id repeats, the last position wins.
        self.paper_id_to_idx = {
            paper_id: idx for idx, paper_id in self.idx_to_paper_id.items()
        }

# Load the dataset metadata and the precomputed SciBERT embeddings.
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
# Each file is opened in a context manager so its handle is closed right after
# loading instead of being left to the garbage collector.
with open(os.path.join(DATA_DIR, "semi_synthetic_data_metadata.pkl"), "rb") as metadata_file:
    scibert_predictor_weights, paper_set, idx_to_paper_id, records = pickle.load(metadata_file)

# The embeddings are split over two files; the second is merged into the first.
with open(os.path.join(DATA_DIR, "nodes_to_scibert_embedding_1.pkl"), "rb") as embedding_file:
    nodes_to_scibert = pickle.load(embedding_file)
with open(os.path.join(DATA_DIR, "nodes_to_scibert_embedding_2.pkl"), "rb") as embedding_file:
    nodes_to_scibert.update(pickle.load(embedding_file))

In [None]:
# Show an example record that represents a node in the citation graph.
# `records` is looked up by paper id here (the printed `id` matches `node_id`).
node_id = 1640247718
print(records[node_id])

id=1640247718, fos=engineering, tokens_5=reality,legged,problem,author,evolution


In [None]:
# The preprocessed scibert embedding generated from the tokens using
# bert-as-a-service [1].
#
# [1] https://github.com/hanxiao/bert-as-service
#
# Look up the embedding of the example node and display its shape.
nodes_to_scibert[node_id].shape

(768,)

In [None]:
# Root fields of study mapped to 1-based integer ids, numbered in the order
# listed here.
root_fos_map = {
    fos_name: fos_id
    for fos_id, fos_name in enumerate(
        (
            'art',
            'biology',
            'business',
            'chemistry',
            'computer science',
            'economics',
            'engineering',
            'environmental science',
            'geography',
            'geology',
            'history',
            'materials science',
            'mathematics',
            'medicine',
            'philosophy',
            'physics',
            'political science',
            'psychology',
            'sociology',
        ),
        start=1,
    )
}
# Field-of-study names in sorted order.
root_fos_list = sorted(root_fos_map)

In [None]:
# Generate the ground-truth propensity scores.

# Fixed seed: the draws below are the same on every run.
NP_RANDOM_SEED = 234981293
np.random.seed(NP_RANDOM_SEED)
# Flat lookup table over (citer FOS, cited FOS) pairs; it is indexed elsewhere
# in this notebook as `fos_citer * len(root_fos_list) + fos_cited`.
vector_size = (len(root_fos_list) + 2)**2
# Each entry is flagged as "low propensity" with probability 0.8.
is_low_propensity = np.random.binomial(1, 0.8, size=vector_size)
# Low entries are drawn uniformly from [0.1, 0.3), the others from [0.7, 1).
propensity_vector = (
    is_low_propensity * np.random.uniform(.1, .3, size=vector_size) + 
    (1 - is_low_propensity) * np.random.uniform(.7, 1, size=vector_size)
)
# Overwrite the entries where citer and cited share the same FOS id
# (index i * len(root_fos_list) + i) with a value drawn from [0.7, 0.9).
for i in range(1, len(root_fos_list) + 1):
    propensity_vector[i * len(root_fos_list) + i] = np.random.uniform(0.7, 0.9)
# Re-seed the global NumPy RNG without a fixed seed so later draws are not tied
# to NP_RANDOM_SEED.
np.random.seed(None)

In [None]:
# Flatten the per-item "paper_citer" / "paper_cited" sequences of each split
# into one flat list of citing ids and one flat list of cited ids.
train_citers = list(itertools.chain.from_iterable(entry["paper_citer"] for entry in training_data))
train_citeds = list(itertools.chain.from_iterable(entry["paper_cited"] for entry in training_data))

val_citers = list(itertools.chain.from_iterable(entry["paper_citer"] for entry in val_data))
val_citeds = list(itertools.chain.from_iterable(entry["paper_cited"] for entry in val_data))

test_citers = list(itertools.chain.from_iterable(entry["paper_citer"] for entry in test_data))
test_citeds = list(itertools.chain.from_iterable(entry["paper_cited"] for entry in test_data))

# Number of (citer, cited) pairs in the train / validation / test splits.
print(len(train_citeds))
print(len(val_citeds))
print(len(test_citeds))

32000
3200
6400


In [None]:
def add_scibert_embeddings(elements):
    """Attach the SciBERT embeddings of the citing and cited papers to a batch.

    Args:
        elements: dict holding the "paper_citer" and "paper_cited" tensors of
            paper indices (keys of `idx_to_paper_id`).

    Returns:
        The same dict with "scibert_citer" and "scibert_cited" added.
    """

    def add_scibert_internal(citer_ids, cited_ids):
        # Runs eagerly inside `tf.py_function`, so `.numpy()` is available.
        # Each index is mapped to its paper id and then to that paper's
        # precomputed embedding.
        scibert_citer = [nodes_to_scibert[idx_to_paper_id[n]] for n in citer_ids.numpy()]
        scibert_cited = [nodes_to_scibert[idx_to_paper_id[n]] for n in cited_ids.numpy()]
        return np.array(scibert_citer), np.array(scibert_cited)

    scibert_citer, scibert_cited = tf.py_function(add_scibert_internal,
                                                  inp=[elements["paper_citer"], elements["paper_cited"]],
                                                  Tout=(tf.float32, tf.float32))

    elements["scibert_citer"] = scibert_citer
    elements["scibert_cited"] = scibert_cited
    return elements

def add_fos(elements):
    """Attach a synthetic field-of-study (FOS) id to both papers of each pair.

    The FOS of a paper is picked pseudo-randomly from `root_fos_list`, with the
    paper index as the seed, so a given paper always gets the same FOS.

    Args:
        elements: dict holding the "paper_citer" and "paper_cited" index tensors.

    Returns:
        The same dict with int32 "fos_citer" and "fos_cited" added (one
        single-element row per paper).
    """

    def fos_ids_for(paper_ids):
        fos_ids = []
        for paper_id in paper_ids:
            # `paper_id` is a numpy integer scalar. `random.seed` only accepts
            # None/int/float/str/bytes/bytearray on Python 3.11+ and raises
            # TypeError otherwise, so convert to a plain int first.
            # NOTE(review): older Pythons seeded such scalars via their hash,
            # which should equal the integer value for non-negative ids;
            # confirm if exact reproduction of earlier runs matters.
            random.seed(int(paper_id))
            fos_ids.append([root_fos_map[random.choice(root_fos_list)]])
        return np.array(fos_ids, np.int32)

    def add_fos_internal(paper_citer, paper_cited):
        fos_citer = fos_ids_for(paper_citer.numpy())
        fos_cited = fos_ids_for(paper_cited.numpy())

        # Re-seed the global `random` module without a fixed seed so later use
        # is not tied to the last paper id.
        random.seed(None)
        return fos_citer, fos_cited

    fos_citer, fos_cited = tf.py_function(add_fos_internal,
                                          inp=[elements["paper_citer"], elements["paper_cited"]],
                                          Tout=(tf.int32, tf.int32))

    elements["fos_citer"] = fos_citer
    elements["fos_cited"] = fos_cited
    return elements

def add_propensity_scores(elements):
    """Attach the ground-truth propensity and a sampled exposure flag per pair.

    The propensity is read from `propensity_vector` using the FOS ids of the
    two papers; the exposure is a Bernoulli draw with that propensity.
    """

    def get_propensities_internal(fos_citer, fos_cited):
        # The global NumPy RNG is reset to the same seed on every call.
        np.random.seed(NP_RANDOM_SEED)

        citer_rows = fos_citer.numpy()
        cited_rows = fos_cited.numpy()

        # Each row holds a single FOS id; the pair selects one table entry.
        propensities = np.array([
            propensity_vector[citer_row[0] * len(root_fos_list) + cited_row[0]]
            for citer_row, cited_row in zip(citer_rows, cited_rows)
        ])
        exposure = np.random.binomial(1, propensities)

        np.random.seed(None)
        return propensities, exposure

    propensities, exposure = tf.py_function(
        get_propensities_internal,
        inp=[elements["fos_citer"], elements["fos_cited"]],
        Tout=(tf.float32, tf.float32))

    # Both outputs take the static shape of the "paper_citer" tensor.
    batch_shape = elements["paper_citer"].get_shape()
    propensities.set_shape(batch_shape)
    exposure.set_shape(batch_shape)

    elements["propensity"] = propensities
    elements["exposure"] = exposure
    return elements

def add_is_citation_gt(elements):
    """Attach the ground-truth citation probability and a sampled citation label.

    The probability is a logistic model applied to the element-wise product of
    the two SciBERT embeddings, using the weights and bias stored in
    `scibert_predictor_weights`.
    """

    def get_is_citation_internal(scibert_citer, scibert_cited):
        # The global NumPy RNG is reset to the same seed on every call.
        np.random.seed(NP_RANDOM_SEED)

        citer_emb = scibert_citer.numpy()
        cited_emb = scibert_cited.numpy()

        weights, bias = scibert_predictor_weights[0], scibert_predictor_weights[1]

        # shape: (batch, 768)
        pair_features = citer_emb * cited_emb
        logits = np.squeeze(pair_features @ weights + bias, axis=-1)

        citation_probs = scipy.special.expit(logits)
        is_citation = np.random.binomial(1, citation_probs)

        np.random.seed(None)
        return citation_probs, is_citation

    citation_probs, is_citation_gt = tf.py_function(
        get_is_citation_internal,
        inp=[elements["scibert_citer"], elements["scibert_cited"]],
        Tout=(tf.float32, tf.float32))

    # Both outputs take the static shape of the "paper_citer" tensor.
    batch_shape = elements["paper_citer"].get_shape()
    citation_probs.set_shape(batch_shape)
    is_citation_gt.set_shape(batch_shape)

    elements["citation_probs"] = citation_probs
    elements["is_citation_gt"] = is_citation_gt
    return elements

def add_citation_using_exposure(elements):
    """Add "is_citation": the ground-truth label masked by the exposure flag."""
    observed = elements["is_citation_gt"] * elements["exposure"]
    elements["is_citation"] = observed
    return elements

In [None]:
def get_dataset(citers, citeds, batch_size=32, is_train=False,
                num_batches=None):
    """Build the batched `tf.data` pipeline for a list of (citer, cited) pairs.

    Each batch is enriched with SciBERT embeddings, FOS ids, propensities,
    exposure and citation labels, and is yielded as
    `(features, (is_citation, is_citation_gt))`. Training datasets are
    shuffled before batching and repeated indefinitely.
    """
    pair_ids = {"paper_citer": citers, "paper_cited": citeds}
    dataset = tf.data.Dataset.from_tensor_slices(pair_ids)

    if is_train:
        dataset = dataset.shuffle(buffer_size=len(citers))
    dataset = dataset.batch(batch_size=batch_size, drop_remainder=False)
    if num_batches is not None:
        dataset = dataset.take(num_batches)

    # Order matters: every step reads keys added by the steps before it.
    enrichment_steps = (
        add_scibert_embeddings,
        add_fos,
        add_propensity_scores,
        add_is_citation_gt,
        add_citation_using_exposure,
    )
    for enrich in enrichment_steps:
        dataset = dataset.map(enrich)

    def split_features_and_labels(e):
        return (e, (e["is_citation"], e["is_citation_gt"]))

    dataset = dataset.map(split_features_and_labels)
    return dataset.repeat() if is_train else dataset

In [None]:
# Test the dataset generation.
#
# Pulls a single unshuffled batch and prints the feature keys plus the shapes
# of the id tensor and of the embedding tensor.
# NOTE: the loop variable `item` stays bound after the loop; `compute_fos_idx`
# below reads it as a global.

for item in get_dataset(train_citers, train_citeds, batch_size=32,
                        num_batches=1, is_train=False):
    print(item[0].keys())
    print(item[0]["paper_citer"].numpy().shape)
    print(item[0]["scibert_citer"].numpy().shape)

dict_keys(['paper_citer', 'paper_cited', 'scibert_citer', 'scibert_cited', 'fos_citer', 'fos_cited', 'propensity', 'exposure', 'citation_probs', 'is_citation_gt', 'is_citation'])
(32,)
(32, 768)


In [None]:
def fos_to_propensity_prob(fos_citer, fos_cited, embedding_fn):
    """Map (citer FOS, cited FOS) id pairs to propensity probabilities.

    Each pair is encoded as `fos_citer * 20 + fos_cited`, passed through
    `embedding_fn`, and the result (with its last axis squeezed away) is put
    through a sigmoid.
    """
    num_fos_plus_one = 20

    citer_as_float = tf.cast(fos_citer, tf.float32)
    cited_as_float = tf.cast(fos_cited, tf.float32)
    pair_index = citer_as_float * num_fos_plus_one + cited_as_float

    logits = tf.squeeze(embedding_fn(pair_index), axis=-1)
    return tf.sigmoid(logits)

def compute_fos_idx(batch=None):
    """Debug helper: print propensity probabilities for one batch.

    A freshly initialised (untrained) embedding is used, so the printed values
    only demonstrate the FOS-pair lookup.

    Args:
        batch: a `(features, labels)` tuple as yielded by `get_dataset`, whose
            features contain "fos_citer" and "fos_cited". Defaults to the
            module-level `item` left behind by the dataset test loop, which
            keeps the original zero-argument call working.
    """
    if batch is None:
        # Backward-compatible fallback to the notebook-global example batch.
        batch = item

    num_fos_plus_one = 20
    embedding_fn = layers.Embedding((num_fos_plus_one + 1)**2, 1)

    features = batch[0]
    fos_citer = features["fos_citer"]
    fos_cited = features["fos_cited"]

    print(fos_to_propensity_prob(fos_citer, fos_cited, embedding_fn))

**Loss functions using the proposed weighting schemes**

The next cell contains the implementation for the loss based on the three weighting schemes proposed in the paper: $\widehat{R}_w, \widehat{R}_{\text{PU}}$, and $\widehat{R}_{\text{AP}}$.

Different weighting schemes can be used by defining a new Keras layer that computes a scalar loss from the observed citation labels, the estimated link probabilities, and the estimated propensities. This layer can then be plugged directly into the training pipeline.


In [None]:
# Loss layers corresponding to the weighting schemes proposed in the paper. New
# weighting schemes can be defined analogously.

class RWWeightingLoss(layers.Layer):
    """Scalar loss layer for the R_W weighting scheme.

    Inputs are `[is_citation, cite_prob, propensities]`. Each example's binary
    cross-entropy is re-weighted: positives by the inverse propensity,
    negatives by `(1 - cite_prob) / (1 - propensity * cite_prob)`.
    """

    def __init__(self, lambda_weighting_scheme_loss, name=None):
        super().__init__(name=name)
        self.lambda_weighting_scheme_loss = lambda_weighting_scheme_loss

    def call(self, inputs):
        labels, cite_prob, propensities = inputs[0], inputs[1], inputs[2]

        # Add a trailing axis to the labels and propensities before combining
        # them with `cite_prob`.
        propensities = tf.expand_dims(propensities, axis=-1)
        labels = tf.expand_dims(labels, axis=-1)

        # The small constant guards against division by zero.
        weight_if_positive = 1 / (propensities + 1e-5)
        weight_if_negative = (1 - cite_prob) / (1 - propensities * cite_prob + 1e-5)
        example_weight = tf.squeeze(
            labels * weight_if_positive + (1 - labels) * weight_if_negative,
            axis=-1)

        example_bce = losses.binary_crossentropy(labels, cite_prob)
        return self.lambda_weighting_scheme_loss * tf.reduce_mean(example_weight * example_bce)

class PUWeightingLoss(layers.Layer):
    """Scalar loss layer for the R_PU weighting scheme.

    Inputs are `[is_citation, cite_prob, propensities]`. Positives are weighted
    by the inverse propensity and additionally contribute a flipped-label term
    with weight `1 - 1 / propensity`; negatives keep weight 1.
    """

    def __init__(self, lambda_weighting_scheme_loss, name=None):
        super().__init__(name=name)
        self.lambda_weighting_scheme_loss = lambda_weighting_scheme_loss

    def call(self, inputs):
        labels, cite_prob, propensities = inputs[0], inputs[1], inputs[2]

        propensities = tf.expand_dims(propensities, axis=-1)
        labels = tf.expand_dims(labels, axis=-1)

        # The small constant guards against division by zero.
        weight_if_positive = 1 / (propensities + 1e-5)
        flipped_weight_if_positive = 1 - (1 / (propensities + 1e-5))
        weight_if_negative = 1

        main_weight = tf.squeeze(
            labels * weight_if_positive + (1 - labels) * weight_if_negative,
            axis=-1)
        flipped_weight = tf.squeeze(labels * flipped_weight_if_positive, axis=-1)

        # Main term on the given labels, plus the positives counted again with
        # the opposite label.
        main_term = tf.reduce_mean(
            main_weight * losses.binary_crossentropy(labels, cite_prob))
        flipped_term = tf.reduce_mean(
            flipped_weight * losses.binary_crossentropy(1 - labels, cite_prob))

        return self.lambda_weighting_scheme_loss * (main_term + flipped_term)

class APWeightingLoss(layers.Layer):
    """Scalar loss layer for the R_AP weighting scheme.

    Inputs are `[is_citation, cite_prob, propensities]`:
      * `is_citation`: observed citation labels, one per example.
      * `cite_prob`: predicted citation probabilities with a trailing axis of 1.
      * `propensities`: estimated propensities, one per example.

    Observed positives keep weight 1. Each unobserved example is split into a
    negative-label part weighted by `(1 - cite_prob) / (1 - p * cite_prob)` and
    a positive-label part weighted by
    `cite_prob * (1 - p) / (1 - p * cite_prob)`.

    Returns:
        `lambda_weighting_scheme_loss` times the sum of the two batch-mean
        weighted cross-entropy terms.
    """

    def __init__(self, lambda_weighting_scheme_loss, name=None):
        super(APWeightingLoss, self).__init__(name=name)
        self.lambda_weighting_scheme_loss = lambda_weighting_scheme_loss

    def call(self, inputs):
        is_citation = inputs[0]
        cite_prob = inputs[1]
        propensities = inputs[2]

        propensities = tf.expand_dims(propensities, axis=-1)
        is_citation = tf.expand_dims(is_citation, axis=-1)

        # The small constant guards against division by zero.
        positive_weights = 1
        negative_weights = (1 - cite_prob) / (1 - propensities * cite_prob + 1e-5)
        added_positive_weights = cite_prob * (1 - propensities) / (1 - propensities * cite_prob + 1e-5)

        weighting = is_citation * positive_weights + (1 - is_citation) * negative_weights
        # Non-zero only for unobserved examples (is_citation == 0).
        added_positive_weighting = (1 - is_citation) * added_positive_weights
        weighting = tf.squeeze(weighting, axis=-1)
        added_positive_weighting = tf.squeeze(added_positive_weighting, axis=-1)

        # The added term must score the unobserved examples against the
        # *positive* label, hence `1 - is_citation` (which is 1 exactly where
        # the weight is non-zero). Using `is_citation` here would apply both
        # weights, which sum to ~1, to the same negative-label loss and reduce
        # the scheme to plain unweighted cross-entropy.
        return (
            self.lambda_weighting_scheme_loss * (
                tf.reduce_mean(weighting * losses.binary_crossentropy(is_citation, cite_prob)) +
                tf.reduce_mean(added_positive_weighting * losses.binary_crossentropy(1 - is_citation, cite_prob))
            )
        )

In [None]:
# Size of the SciBERT text embedding of a paper; used as the input width of the
# model's "scibert_citer" / "scibert_cited" inputs.
PAPER_TEXT_EMB_SIZE = 768

@attr.s
class TrainHParams(object):
    """Hyperparameters read by `CitationPredictor` when building the model."""
    # One of: ("none", "MLE", "R_W", "R_PU", "R_AP").
    # This hyperparameter is used to select the weighting scheme used for
    # training. Each setting corresponds to one of the five methods tested in
    # our paper.
    # "none" trains the citation loss without propensities; every other value
    # multiplies the predicted citation probability by the predicted propensity.
    weighting_scheme = attr.ib(default="R_W")
    # Multiplier applied to the citation (prediction) loss.
    lambda_prediction = attr.ib(default=20.)
    # Multiplier applied to the weighting-scheme loss, when one is used.
    lambda_weighting_scheme_loss = attr.ib(default=1.)
    # Learning rate passed to the Adam optimizer.
    lr = attr.ib(default=1e-3)
    
class IdentityLayer(layers.Layer):
    """Pass-through layer: returns its input unchanged under a new layer name."""

    def __init__(self, name=None):
        super().__init__(name=name)

    def call(self, inputs):
        passthrough = inputs
        return passthrough

class EmbeddingsToPrediction(layers.Layer):
    """Predicts a citation probability from a pair of paper embeddings.

    The two embeddings are multiplied element-wise and fed to a single
    sigmoid-activated dense unit.

    Args:
        kernel_regularizer: optional regularizer for the dense layer's kernel.
            It was previously accepted but ignored; it is now forwarded to the
            `Dense` layer. The default `None` keeps the layer unregularized.
        name: optional layer name.
    """

    def __init__(self, kernel_regularizer=None, name=None):
        super(EmbeddingsToPrediction, self).__init__(name=name)
        self.linear_classifier = layers.Dense(units=1,
                                              activation="sigmoid",
                                              kernel_regularizer=kernel_regularizer)

    def call(self, inputs):
        # inputs: [citer_embedding, cited_embedding]
        elem_multiply = inputs[0] * inputs[1]
        return self.linear_classifier(elem_multiply)

class FosToPropensity(layers.Layer):
    """This layer represents the propensity score model used in the paper.

    In our work, the propensity score only depends on the fields-of-study of
    the two papers.

    Different propensity score models (which potentially depend on more
    features of the node) can be incorporated in the training pipeline by
    redefining this layer.
    """

    def __init__(self, name=None):
        super().__init__(name=name)
        # One scalar logit per encoded (citer FOS, cited FOS) pair.
        self.embeddings_fn = layers.Embedding(25**2, 1)

    def call(self, inputs):
        # inputs: [fos_citer, fos_cited]
        citer_fos, cited_fos = inputs[0], inputs[1]
        pair_probs = fos_to_propensity_prob(citer_fos, cited_fos, self.embeddings_fn)
        # Drop the remaining trailing axis.
        return tf.squeeze(pair_probs, axis=-1)

class CitationLoss(layers.Layer):
    """Binary cross-entropy between the given citation labels and the prediction.

    When `use_propensity` is `True`, the predicted citation probability is
    multiplied by the propensity before the loss is taken; this is the MLE.
    When `use_propensity` is `False`, the predicted probability is used as is,
    which corresponds to the `No_Prop` estimator in the paper.
    """

    def __init__(self, use_propensity, lambda_prediction=1, name=None):
        super().__init__(name=name)
        self.use_propensity = use_propensity
        self.lambda_prediction = lambda_prediction

    def call(self, inputs):
        # inputs: [is_citation, cite_prob] or [is_citation, cite_prob, propensities]
        labels, cite_prob = inputs[0], inputs[1]

        y_pred = cite_prob
        if self.use_propensity:
            y_pred = cite_prob * tf.expand_dims(inputs[2], axis=-1)

        y_true = tf.expand_dims(labels, axis=-1)
        mean_bce = tf.reduce_mean(losses.binary_crossentropy(y_true, y_pred))
        return self.lambda_prediction * mean_bce

class CitationPredictor:
    """Builds and compiles the Keras citation-prediction model.

    After construction:
      * `self.model` maps the dataset features to `[cite_prob, cite_prob_gt]`
        (the same prediction under two output names, so it can be scored
        against both `is_citation` and `is_citation_gt`).
      * `self.propensity_model` maps `[fos_citer, fos_cited]` to the predicted
        propensities.

    Args:
        embedding_size: unused; kept so existing callers keep working.
        hparams: a `TrainHParams` instance. When omitted, a fresh
            `TrainHParams()` is created for this predictor.
    """

    def __init__(self, embedding_size=32, hparams=None):
        # Create the default per instance rather than once at definition time,
        # so predictors never share (and mutate) a single hparams object.
        self.hparams = TrainHParams() if hparams is None else hparams
        self._compile_model()

    def _add_metric(self, model, metric, name):
        model.add_metric(metric, aggregation="mean", name=name)

    @staticmethod
    def _classification_metrics():
        """Return a fresh list of metrics for one model output."""
        return ["accuracy",
                metrics.AUC(name="auc"),
                metrics.TrueNegatives(name="true_negatives"),
                metrics.FalseNegatives(name="false_negatives"),
                metrics.TruePositives(name="true_positives"),
                metrics.FalsePositives(name="false_positives"),
                metrics.Precision(name="precision"),
                metrics.Recall(name="recall")]

    def _compile_model(self):
        # Shapes are written as real tuples; `(N)` is just the int N.
        scibert_citer = layers.Input(shape=(PAPER_TEXT_EMB_SIZE,), name="scibert_citer")
        scibert_cited = layers.Input(shape=(PAPER_TEXT_EMB_SIZE,), name="scibert_cited")
        fos_citer = layers.Input(shape=(1,), name="fos_citer")
        fos_cited = layers.Input(shape=(1,), name="fos_cited")
        propensities = layers.Input(shape=(), name="propensity")
        is_citation = layers.Input(shape=(), name="is_citation")
        is_citation_gt = layers.Input(shape=(), name="is_citation_gt")

        cite_prob = EmbeddingsToPrediction(name="cite_prob")([scibert_citer, scibert_cited])

        cite_prob_gt = IdentityLayer(name="cite_prob_gt")(cite_prob)

        propensities_pred = FosToPropensity(name="propensities_pred")([fos_citer, fos_cited])

        self.model = keras.Model(inputs=[scibert_citer, scibert_cited, propensities, is_citation, is_citation_gt,
                                         fos_citer, fos_cited],
                                 outputs=[cite_prob, cite_prob_gt])

        # Decide between `No_Prop` and `MLE`.
        if self.hparams.weighting_scheme == "none":
            cite_prob_loss = CitationLoss(
                use_propensity=False,
                lambda_prediction=self.hparams.lambda_prediction)([is_citation, cite_prob])
        else:
            cite_prob_loss = CitationLoss(
                use_propensity=True,
                lambda_prediction=self.hparams.lambda_prediction)([is_citation,
                                                                   cite_prob,
                                                                   propensities_pred])
        self.model.add_loss(cite_prob_loss)
        self._add_metric(self.model, cite_prob_loss, "cite_prob_loss")

        # Decide which weighting scheme loss to use (if any). "none" and "MLE"
        # add no extra loss. "R_AP" is now a reachable entry: the third branch
        # previously tested "R_PU" a second time, so "R_AP" silently trained as
        # plain MLE.
        weighting_loss_layers = {
            "R_W": RWWeightingLoss,
            "R_PU": PUWeightingLoss,
            "R_AP": APWeightingLoss,
        }
        weighting_scheme_loss = None
        loss_layer_cls = weighting_loss_layers.get(self.hparams.weighting_scheme)
        if loss_layer_cls is not None:
            weighting_scheme_loss = loss_layer_cls(
                self.hparams.lambda_weighting_scheme_loss)([is_citation,
                                                            cite_prob,
                                                            propensities_pred])

        if weighting_scheme_loss is not None:
            self.model.add_loss(weighting_scheme_loss)
            self._add_metric(self.model, weighting_scheme_loss, "weighting_scheme_loss")

        # Mean absolute error between predicted and ground-truth propensities,
        # tracked as a metric only.
        propensity_abs = tf.reduce_mean(tf.abs(propensities_pred - propensities))
        self._add_metric(self.model, propensity_abs, "propensity_abs")

        # All training losses are attached through `add_loss`, so the compiled
        # per-output loss is a constant zero.
        self.model.compile(loss=lambda yt,yp: 0.,
                           optimizer=optimizers.Adam(learning_rate=self.hparams.lr),
                           metrics={"cite_prob": self._classification_metrics(),
                                    "cite_prob_gt": self._classification_metrics()})

        self.propensity_model = keras.Model(inputs=[fos_citer, fos_cited], outputs=[propensities_pred])

In [None]:
# Reset Keras' global state before building a new model.
tf.keras.backend.clear_session()
# Build the model with the default hyperparameters.
citation_predictor = CitationPredictor()

In [None]:
def train_model(model):
    """Fit `model` for 5 epochs of 1000 training batches and return the history.

    Validation uses up to 100 batches of the validation split per epoch.
    """
    num_batches = 1000
    batch_multiplier = 1
    batch_size = 32 * batch_multiplier

    train_dataset = get_dataset(train_citers, train_citeds,
                                batch_size=batch_size, is_train=True,
                                num_batches=num_batches)
    val_dataset = get_dataset(val_citers, val_citeds,
                              is_train=False,
                              num_batches=100)

    history = model.fit(
        x=train_dataset,
        validation_data=val_dataset,
        verbose=1,
        steps_per_epoch=num_batches,
        initial_epoch=0,
        epochs=5,
    )
    return history

# Test training by running for a few epochs.
train_history = train_model(citation_predictor.model)

Epoch 1/5


  [n for n in tensors.keys() if n not in ref_input_names])


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
def get_predictions(dataset, model):
    """Run `model` over `dataset` and collect labels, predictions and propensities.

    Args:
        dataset: iterable of `(features, labels)` batches as produced by
            `get_dataset`; `features` must contain "is_citation" and
            "propensity".
        model: Keras model whose first output has one column per example.

    Returns:
        Tuple of 1-D arrays `(y_true, y_pred, propensities)`, where `y_true`
        holds the "is_citation" labels.
    """
    y_true = []
    y_pred = []
    propensities = []
    for features, _labels in dataset:
        predictions = model.predict(features, batch_size=2048)
        # First model output, single column -> one score per example.
        y_pred.extend(predictions[0][:, 0])
        # Convert each batch to NumPy once instead of iterating the tensor and
        # collecting one scalar tensor per example.
        y_true.extend(features["is_citation"].numpy())
        propensities.extend(features["propensity"].numpy())

    return np.array(y_true), np.array(y_pred), np.array(propensities)

y_true, y_pred, propensities = get_predictions(get_dataset(test_citers, test_citeds,
                                                           is_train=False,
                                                           num_batches=10),
                                               citation_predictor.model)

  [n for n in tensors.keys() if n not in ref_input_names])


In [None]:
# Compute metrics on the test set.

def compute_test_set_metrics(y_true, y_pred, plot_roc=False):
    """Print classification and ranking metrics for a set of predictions.

    Precision, recall and F1 are computed on predictions thresholded at 0.5.
    Average precision, ROC AUC and the optional ROC curve are ranking metrics
    and are computed on the continuous scores. Computing them on the
    thresholded 0/1 values, as before, reduces each curve to a single
    operating point and misreports them.

    Args:
        y_true: binary labels.
        y_pred: predicted probabilities, one per label.
        plot_roc: when True, also plot the ROC curve.
    """
    y_score = np.array(y_pred)
    # Hard 0/1 decisions at the 0.5 threshold, in the same dtype as the scores.
    y_label = (y_score >= 0.5).astype(y_score.dtype)

    print("precision: %f" % precision_score(y_true, y_label))
    print("recall: %f" % recall_score(y_true, y_label))
    print("f1 score: %f" % f1_score(y_true, y_label))
    print("average precision: %f" % average_precision_score(y_true, y_score))
    print("roc auc: %f" % roc_auc_score(y_true, y_score))

    if plot_roc:
        fpr, tpr, thresholds = roc_curve(y_true, y_score)
        plt.plot(fpr, tpr)
        plt.xlabel("FPR")
        plt.ylabel("TPR")

compute_test_set_metrics(y_true, y_pred)

precision: 0.375000
recall: 0.107143
f1 score: 0.166667
average precision: 0.118304
roc auc: 0.545010
