In [21]:
from statistics import mean
import pandas as pd
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch

## Loading the Data

In [2]:
sentence_df = pd.read_csv('data/arxiv_cleaned.csv', converters={'article': pd.eval, 'abstract': pd.eval})
sentence_df.head()

Unnamed: 0,article_id,article,abstract
0,gr-qc0101015,[there is considerable current interest in stu...,[in this paper we consider the collision of sp...
1,0803.1640,[the first data system requiring dark energy c...,[upcoming weak lensing surveys can be used to ...
2,1510.01821,[quantum key distribution qkd is the first mat...,[the fully symmetric gaussian tripartite entan...
3,1105.2448,[the active galactic nucleus agn unification s...,[x ray unabsorbed seyfert 2 galaxies appear to...
4,1602.04433,[deep neural networks have significantly impro...,[the recent success of deep neural networks re...


## The Model and Functions

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = 'gpt2-large'
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id, sep_token=";")

In [22]:
def f(s,t):
    # Concatenate source and target sentences and encode them
    sentence = f"{s}; {t}"
    encodings = tokenizer(sentence, return_tensors='pt').to(device)
    input_ids = encodings.input_ids.to(device)
    semicolon_idx = (input_ids[0] == tokenizer.sep_token_id).nonzero(as_tuple=True)[0]

    # Run GPT-2 large on the concatenated sentence
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    
    # Chop off the source sentence and the semicolon tokens
    t_input_ids = input_ids[:, semicolon_idx+1:]
    t_logits = outputs.logits[:,semicolon_idx+1:,:]

    # Get the log probability of each token in the target sentence
    # The uncommented bit gets the log softmax of the logits and then selects the log probability of the target token
    # The commented bit gets the logits of the target token and then gets the log softmax
    # Each method is not equivalent but I *think* the first is correct, the end results are similar regardless
    t_log_probs = torch.nn.functional.log_softmax(t_logits, dim=-1)
    t_log_probs = torch.gather(t_log_probs, dim=-1, index=t_input_ids.unsqueeze(-1)).squeeze(-1)
    # t_logits = torch.gather(t_logits, dim=-1, index=t_input_ids.unsqueeze(-1)).squeeze(-1)
    # t_log_probs = torch.nn.functional.log_softmax(t_logits, dim=-1)
    t_log_probs = t_log_probs.view(-1)

    # Get the perplexity of the target sentence
    log_prob_sum = torch.sum(t_log_probs)
    perplexity = torch.pow(1/-log_prob_sum, 1/len(t_input_ids[0])).item()
    return perplexity

def salience_score(s, target_summary):
    # Get the perplexity of the source sentence with respect to each target sentence
    # perplexities = target_summary.apply(lambda t: f(s, t))
    perplexities = [f(s, t) for t in target_summary]

    # Get the average perplexity of the source sentence
    avg_perplexity = mean(perplexities)
    return -avg_perplexity

## Testing on a example document

In [10]:
example = sentence_df.loc[210]
print(example.article_id)
print(len(example.article))
print(len(example.abstract))
example.abstract

astro-ph0011254
106
9


['we report the first wide field mapping of the kinematics and stellar populations in the e3 galaxy ngc4365 .',
 'the velocity maps extend previous long slit work .',
 'they show two independent kinematic subsystems : the central @xmath0 pc rotates about the projected minor axis , and the main body of the galaxy , @xmath1 kpc , rotates almost at right angles to this .',
 'the line strength maps show that the metallicity of the stellar population decreases from a central value greater than solar , to one half solar at a radius of 2 kpc .',
 'the decoupled core and main body of the galaxy have the same luminosity weighted age , of @xmath214 gyr , and the same elevated magnesium iron ratio .',
 'the two kinematically distinct components have thus shared a common star formation history .',
 'we infer that the galaxy underwent a sequence of mergers associated with dissipative star formation that ended @xmath312 gyr ago .',
 'the misalignment between the photometric and kinematic axes of the

#### Single Source Sentence

In [11]:
# Test f(s,t) on an example sentence
source = example['article'][0]
target = example['abstract'][0]
sentence = f"{source}; {target}"
print(sentence.replace('\n', ''))
f(source, target)

the existence of decoupled cores in @xmath230 of the early type galaxies is strong evidence that mergers play an important part in the evolution of these systems @xcite .; we report the first wide field mapping of the kinematics and stellar populations in the e3 galaxy ngc4365 .


0.8044186234474182

#### All source sentences

In [25]:
saliency_scores = []
for s in example['article']:
    saliency_score = salience_score(s, test_doc['abstract'])
    saliency_scores.append({"sentence": s, "saliency_score": saliency_score})

In [None]:
max_saliency_score = max(saliency_scores, key=lambda x: x['saliency_score'])
print(f"Max saliency score: {max_saliency_score['saliency_score']}")
min_saliency_score = min(saliency_scores, key=lambda x: x['saliency_score'])
print(f"Min saliency score: {min_saliency_score['saliency_score']}")
avg_saliency_score = sum([s['saliency_score'] for s in saliency_scores])/len(saliency_scores)
print(f"Average saliency score: {avg_saliency_score}")
# print(saliency_scores)

Max saliency score: -0.7958607739872403
Min saliency score: -0.7999491956498888
Average saliency score: -0.7972548668364559


## Testing on another document

In [None]:
example = sentence_df.loc[125]
print(example.article_id)
print(len(example.article))
print(len(example.abstract))
# example.abstract

0710.5721
188
3


#### Single Source Sentence

In [None]:
# Test f(s,t) on an example sentence
source = example['article'][0]
target = example['abstract'][0]
sentence = f"{source}; {target}"
print(sentence.replace('\n', ''))
f(source, target)

in theoretical cosmology , many insights can already be gained from spatially isotropic friedmann robertson walker models @xmath0 with @xmath1 or @xmath2 .; the equation of state for radiation is derived in a canonical formulation of the electromagnetic field .


0.7509577870368958

#### All Source Sentences

In [13]:
saliency_scores = []
for s in example['article']:
    s.replace('\n', '')
    saliency_score = salience_score(s, test_doc['abstract'])
    saliency_scores.append({"sentence": s, "saliency_score": saliency_score})

KeyboardInterrupt: 

In [None]:
max_saliency_score = max(saliency_scores, key=lambda x: x['saliency_score'])
print(f"Max saliency score: {max_saliency_score['saliency_score']}")
min_saliency_score = min(saliency_scores, key=lambda x: x['saliency_score'])
print(f"Min saliency score: {min_saliency_score['saliency_score']}")
avg_saliency_score = sum([s['saliency_score'] for s in saliency_scores])/len(saliency_scores)
print(f"Average saliency score: {avg_saliency_score}")
# print(saliency_scores)

Max saliency score: -0.7890425486998125
Min saliency score: -0.7942301034927368
Average saliency score: -0.7916782622841124
