In [1]:
import pandas as pd
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch

## Loading the Data

In [2]:
sentence_df = pd.read_csv('data/arxiv_cleaned.csv', converters={'article': pd.eval, 'abstract': pd.eval})
sentence_df.head()

Unnamed: 0,article_id,article,abstract
0,cond-mat9902107,[hope get better understanding strongly intera...,[applied recurrent variational approach two le...
1,1308.2865,[consider network 0 1 denotes set vertices 2 3...,[paper hub refers non terminal vertex degree l...
2,1208.1580,[magnetism fermi gases always received conside...,[magnetic properties charged spin 1 bose gas f...
3,astro-ph0108136,[paper tries understand whether concentrations...,[examine question well physical properties clu...
4,0805.4263,[past decade half two hundred fifty extra sola...,[perturbation caused planet moon binarity time...


## The Model and Functions

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = 'gpt2-large'
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id, sep_token=";")

In [31]:
def f(s,t):
    # Concatenate source and target sentences and encode them
    sentence = f"{s}; {t}"
    encodings = tokenizer(sentence, return_tensors='pt').to(device)
    input_ids = encodings.input_ids.to(device)
    semicolon_idx = (input_ids[0] == tokenizer.sep_token_id).nonzero(as_tuple=True)[0]

    # Run GPT-2 large on the concatenated sentence
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    
    # Chop off the source sentence and the semicolon tokens
    t_input_ids = input_ids[:, semicolon_idx+1:]
    t_logits = outputs.logits[:,semicolon_idx+1:,:]

    # Get the log probability of each token in the target sentence
    # The uncommented bit gets the log softmax of the logits and then selects the log probability of the target token
    # The commented bit gets the logits of the target token and then gets the log softmax
    # Each method is not equivalent but I *think* the first is correct, the end results are similar regardless
    t_log_probs = torch.nn.functional.log_softmax(t_logits, dim=-1)
    t_log_probs = torch.gather(t_log_probs, dim=-1, index=t_input_ids.unsqueeze(-1)).squeeze(-1)
    # t_logits = torch.gather(t_logits, dim=-1, index=t_input_ids.unsqueeze(-1)).squeeze(-1)
    # t_log_probs = torch.nn.functional.log_softmax(t_logits, dim=-1)
    t_log_probs = t_log_probs.view(-1)

    # Get the perplexity of the target sentence
    log_prob_sum = torch.sum(t_log_probs)
    perplexity = torch.pow(1/-log_prob_sum, 1/len(t_input_ids[0])).item()
    return perplexity

def salience_score(s, target_summary):
    total_perplexity = 0
    # Get the perplexity of the source sentence with respect to each target sentence
    for t in target_summary:
        perplexity = f(s, t)
        total_perplexity += perplexity
    # Get the average perplexity of the source sentence
    avg_perplexity = total_perplexity/len(target_summary)
    return -avg_perplexity

## Testing on a example document

In [40]:
example = sentence_df.loc[210]
print(example.article_id)
print(len(example.article))
print(len(example.abstract))
example.abstract

cond-mat0608637
207
5


['temperature dependent infrared reflectivity spectra srfe0sb1 measured',
 'renormalized drude peak heavy effective mass pronounced pseudogap 10 mev develops optical conductivity spectra low temperatures',
 'temperature decreases 100 k effective mass 2 rapidly increases scattering rate 3 quenched',
 'temperature dependence 2 3 indicates hybridization fe 4 spins charge carriers plays important role determining physical properties srfe0sb1 low temperatures',
 'result clear evidence iron based heavy quasiparticles']

In [35]:
# Test f(s,t) on an example sentence
source = example['article'][0]
target = example['abstract'][0]
sentence = f"{source}; {target}"
print(sentence.replace('\n', ''))
f(source, target)

recently heavy quasiparticles heavy fermions normally appearing ce yb based compounds observed transition metal compounds example liv5o0 mnsi zrzn5 name; temperature dependent infrared reflectivity spectra srfe0sb1 measured


0.692134439945221

In [36]:
# Test salience_score(s, target_summary) on an example document
test_doc = sentence_df.loc[210]
sentences = test_doc['article']
# sentences

In [26]:
saliency_scores = []
for s in sentences:
    s.replace('\n', '')
    saliency_score = salience_score(s, test_doc['abstract'])
    saliency_scores.append({"sentence": s, "saliency_score": saliency_score})

In [44]:
max_saliency_score = max(saliency_scores, key=lambda x: x['saliency_score'])
print(f"Max saliency score: {max_saliency_score['saliency_score']}")
min_saliency_score = min(saliency_scores, key=lambda x: x['saliency_score'])
print(f"Min saliency score: {min_saliency_score['saliency_score']}")
avg_saliency_score = sum([s['saliency_score'] for s in saliency_scores])/len(saliency_scores)
print(f"Average saliency score: {avg_saliency_score}")
# print(saliency_scores)

Max saliency score: 0.7669628500938416
Min saliency score: 0.755369758605957
Average saliency score: 0.7622274405138505


## Testing on another document

In [42]:
example = sentence_df.loc[125]
print(example.article_id)
print(len(example.article))
print(len(example.abstract))
# example.abstract

0811.4176
185
11


In [47]:
# Test f(s,t) on an example sentence
source = example['article'][0]
target = example['abstract'][0]
sentence = f"{source}; {target}"
print(sentence.replace('\n', ''))
f(source, target)

detection temperature anisotropies cosmic microwave background cmb provided evidence large scale structure formation universe seeded small density fluctuations generated early times; perform series high resolution n body simulations cosmological structure formation starting gaussian non gaussian initial conditions


0.7688817977905273

In [48]:
# Test salience_score(s, target_summary) on an example document
test_doc = sentence_df.loc[125]
sentences = test_doc['article']
# sentences

In [49]:
saliency_scores = []
for s in sentences:
    s.replace('\n', '')
    saliency_score = salience_score(s, test_doc['abstract'])
    saliency_scores.append({"sentence": s, "saliency_score": saliency_score})

In [50]:
max_saliency_score = max(saliency_scores, key=lambda x: x['saliency_score'])
print(f"Max saliency score: {max_saliency_score['saliency_score']}")
min_saliency_score = min(saliency_scores, key=lambda x: x['saliency_score'])
print(f"Min saliency score: {min_saliency_score['saliency_score']}")
avg_saliency_score = sum([s['saliency_score'] for s in saliency_scores])/len(saliency_scores)
print(f"Average saliency score: {avg_saliency_score}")
# print(saliency_scores)

Max saliency score: -0.7890425486998125
Min saliency score: -0.7942301034927368
Average saliency score: -0.7916782622841124
