In [1]:
from statistics import mean
import pandas as pd
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch
from torch.quantization import quantize_dynamic
from torcheval.metrics.text import Perplexity
from tqdm.notebook import tqdm
import numpy as np
# from multiprocessing import Process, Pool

## Loading the Data

In [2]:
sentence_df = pd.read_csv('data/arxiv_cleaned.csv', converters={'article': pd.eval, 'abstract': pd.eval})
sentence_df.head()

Unnamed: 0,article_id,article,abstract
0,gr-qc0101015,[there is considerable current interest in stu...,[in this paper we consider the collision of sp...
1,0803.1640,[the first data system requiring dark energy c...,[upcoming weak lensing surveys can be used to ...
2,1510.01821,[quantum key distribution qkd is the first mat...,[the fully symmetric gaussian tripartite entan...
3,1105.2448,[the active galactic nucleus agn unification s...,[x ray unabsorbed seyfert 2 galaxies appear to...
4,1602.04433,[deep neural networks have significantly impro...,[the recent success of deep neural networks re...


## The Model and Functions

In [3]:
# Use cuda if available, else use cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clear cuda cache if using cuda (to avoid out of memory errors)
if device == 'cuda':
    torch.cuda.empty_cache()

# Load model and tokenizer
model_id = 'gpt2-large'
tokenizer = GPT2TokenizerFast.from_pretrained(model_id, sep_token=";")
model = GPT2LMHeadModel.from_pretrained(model_id).to_bettertransformer().to(device)

# Quantize the model if on cpu
if device == 'cpu':
    quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8, inplace=True)

# Set model to eval mode
model.eval()
for param in model.parameters():
    param.requires_grad = False



In [4]:
def encode_sentence(s, t):
    try:
        # Concatenate source and target sentences and encode them
        sentence = f"{s}; {t}"
        encodings = tokenizer(sentence, return_tensors='pt')
        input_ids = encodings.input_ids.to(device)
        semicolon_idx = (input_ids[0] == tokenizer.sep_token_id).nonzero(as_tuple=True)[0]
        target_ids = input_ids.clone()
        target_ids[:, :semicolon_idx+1] = -100
        return input_ids, target_ids
    except RuntimeError as e:
        print("OOM encode_sentence")
        raise e
    except TypeError as e:
        print("TypeError encode_sentence")
        raise e


def f(s,t):
    try: 
        input_ids, target_ids = encode_sentence(s, t)

        # Run GPT-2 large on the concatenated sentence
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids, output_hidden_states=False, output_attentions=False)
            neg_log_likelihood = outputs.loss.item()

        del outputs
        del input_ids
        del target_ids

        return neg_log_likelihood
    
    except RuntimeError as e:
        print("OOM f")
        print(e)
        return []
    except TypeError as e:
        print("TypeError f")
        print(e)
        return []
    


def salience_score(s, target_summary):
    try:
        # Get the perplexity of the source sentence with respect to each target sentence
        # perplexities = target_summary.apply(lambda t: f(s, t))
        perplexities = [f(s, t) for t in target_summary]

        if perplexities == []:
            return None
        
        # Get the average perplexity of the source sentence
        avg_perplexity = mean(perplexities)
        return -avg_perplexity
    except TypeError as e:
        print("TypeError salience_score")
        print(e)
        return None

def document_saliency(document):
    salience_scores = []
    for s in document['article']:
        salience_scores.append(salience_score(s, document['abstract']))
    return salience_scores

## Testing on a example document

In [7]:
# Just looking at average number of sentences
avg_abstract_len = sentence_df.abstract.map(lambda x: len(x)).mean()
avg_article_len = sentence_df.article.map(lambda x: len(x)).mean()
print(f"Average abstract length: {avg_abstract_len:.2f}")
print(f"Average article length: {avg_article_len:.2f}")

Average abstract length: 6.32
Average article length: 145.02


In [5]:
example = sentence_df.sample(1).iloc[0]
# example = sentence_df.loc[210]
print(example.name)
print(example.article_id)
print(len(example.article))
print(len(example.abstract))
example.abstract

34
1405.3070
87
8


['the full counting statistics of charge transport is the probability distribution @xmath0 that @xmath1 electrons have flown through the system in measuring time @xmath2 .',
 'the cumulant generating function cgf of this distribution @xmath3 has been well studied in the long time limit @xmath4 , however there are relatively few results on the finite measuring time corrections to this .',
 'in this work , we study the leading finite time corrections to the cgf of interacting fermi systems with a single transmission channel at zero temperature but driven out of equilibrium by a bias voltage .',
 'we conjecture that the leading finite time corrections are logarithmic in @xmath2 with a coefficient universally related to the long time limit .',
 'we provide detailed numerical evidence for this with reference to the self dual interacting resonant level model .',
 'this model further contains a phase transition associated with the fractionalisation of charge at a critical bias voltage .',
 't

#### Single Source Sentence

In [6]:
# Test f(s,t) on an example sentence
source = example['article'][1]
targets = example['abstract']
print(source)
print()
losses = [f(source, target) for target in targets]
for i in range(len(targets)):
    print(f"{losses[i]:.2f}: {targets[i]}")
    print()

print(f"Average loss: {mean(losses):.2f}")

this must be contrasted with experimental work , in which one tends to make measurements on systems of a finite size .

4.73: the full counting statistics of charge transport is the probability distribution @xmath0 that @xmath1 electrons have flown through the system in measuring time @xmath2 .

4.89: the cumulant generating function cgf of this distribution @xmath3 has been well studied in the long time limit @xmath4 , however there are relatively few results on the finite measuring time corrections to this .

4.50: in this work , we study the leading finite time corrections to the cgf of interacting fermi systems with a single transmission channel at zero temperature but driven out of equilibrium by a bias voltage .

5.06: we conjecture that the leading finite time corrections are logarithmic in @xmath2 with a coefficient universally related to the long time limit .

5.26: we provide detailed numerical evidence for this with reference to the self dual interacting resonant level model

#### All source sentences

In [45]:
saliency_scores = []
for idx, s in enumerate(example['article']):
    if idx%10 == 0:
        print(f"{idx}/{len(example['article'])}")
    saliency_score = salience_score(s, example['abstract'])
    saliency_scores.append(saliency_score)
print("done!")

0/35
10/35
20/35
30/35
done!


In [12]:
max_saliency_score = max(saliency_scores)
print(f"Max saliency score: {max_saliency_score}")
min_saliency_score = min(saliency_scores)
print(f"Min saliency score: {min_saliency_score}")
avg_saliency_score = mean(saliency_scores)
print(f"Average saliency score: {avg_saliency_score}")
# print(saliency_scores)

Max saliency score: -2.8345259189605714
Min saliency score: -5.1319221496582035
Average saliency score: -4.31052623820082


In [25]:
# get the most salient sentences in order that they appear in the article
sorted_saliency_scores_idx = np.argsort(-np.array(saliency_scores))
sorted_saliency_sentences = np.array(example.article)[sorted_saliency_scores_idx]
sorted_saliency_scores = np.array(saliency_scores)[sorted_saliency_scores_idx]
list(zip(sorted_saliency_scores, sorted_saliency_sentences))

[(-2.8345259189605714,
  'this homology theory may be extended to a categorification of the bollobs riordan polynomial of the signed fat graphs , from which the khovanov homology of an associated link may be recovered .'),
 (-2.889389896392822,
  'we then prove that both our chromatic homology from section construction , and the khovanov homology of an associated link can be recovered from our fatgraph homology .'),
 (-2.9327403783798216,
  'one of the main results provides a second thistlethwaite type relation which states that our chromatic homology for a plane graph can be recovered from the khovanov homology of an associated link .'),
 (-3.075070357322693,
  'the chromatic homology of a plane graph can be recovered from the khovanov homology of an associated link .'),
 (-3.1877366065979005,
  'in the final section we provide a relation between l. helme guizon and y. rong s categorification of the chromatic polynomial introduced in @xcite and further studied in @xcite , a categorifi

## Running on all documents

In [7]:
# Sample the df to get a smaller df to test on
SAMPLE_SIZE = 50
if SAMPLE_SIZE is not None:
    sentence_df = sentence_df.sample(n=SAMPLE_SIZE, ignore_index=True)
sentence_df.shape

(50, 3)

In [8]:
# Get the salience score of each sentence in the document
salience_scores = sentence_df.apply(document_saliency, axis=1)
sentence_df['salience_scores'] = salience_scores
sentence_df.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (2700 > 1024). Running this sequence through the model will result in indexing errors


OOM f
OOM f
OOM f


TypeError: can't convert type 'list' to numerator/denominator

In [46]:
sentence_df.head()

Unnamed: 0,article_id,article,abstract
0,1609.02098,[for a general metric measure space we give su...,[we give sufficient conditions to show that bo...
1,904.072,[the laser ultrasonics technique is a unique t...,[semi analytical model for calculating acousti...
