In [38]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import evaluate
from multiprocess import set_start_method
import os

## Extracting The Salient Sentences from the data

In [2]:
salience_df = pd.read_csv('data/salience_scores.csv', converters={'article': eval, 'abstract': eval, 'salience_scores': eval})
salience_df = salience_df.loc[~salience_df.loc[:,'salience_scores'].isin([[]])]
print(salience_df.shape)
salience_df.head()

(42, 4)


Unnamed: 0,article_id,article,abstract,salience_scores
0,1007.3454,[top physics is at the heart of the tevatron a...,[the latest progress in calculating electrowea...,"[-5.234752337137858, -5.170439084370931, -5.10..."
1,astro-ph0501438,[celeste was a cherenkov experiment using 53 h...,[the celeste atmospheric cherenkov detector ra...,"[-5.097828229268392, -4.900956511497498, -5.49..."
2,1307.0062,[the dynamics of the magnetic domain wall driv...,[current induced spin transfer torques stts ha...,"[-4.588931481043498, -4.928768475850423, -4.92..."
3,0904.2565,"[with exoplanet detection rates soaring , it i...","[in our previous paper , we evaluated the tran...","[-4.961644921983991, -4.767604351043701, -4.82..."
4,1011.2313,[cognitive radio is a promising approach to ef...,[information about primary transmitter locatio...,"[-4.2171395778656, -4.119485187530517, -3.5939..."


In [3]:
def extract_salient_sents(document, get_least_salient=False):
    # Order the sentences' indexes based on salience
    scores = np.array(document['salience_scores'])
    sents = np.array(document['article'])
    most_salient_idxs = np.argsort(-scores)

    # calculate n, the number of sentences to keep
    n = len(document['abstract']) * 3
    if n > len(sents):
        n = int(document['article']/2)
        
    # return the n most/least salient sentences
    if not get_least_salient:
        return sents[most_salient_idxs[:n]]
    else:
        return sents[most_salient_idxs[-n:]]

In [4]:
salience_df['salient'] = salience_df.apply(lambda x: extract_salient_sents(x, get_least_salient=False), axis=1)
salience_df['not_salient'] = salience_df.apply(lambda x: extract_salient_sents(x, get_least_salient=True), axis=1)
salience_df.head()

Unnamed: 0,article_id,article,abstract,salience_scores,salient,not_salient
0,1007.3454,[top physics is at the heart of the tevatron a...,[the latest progress in calculating electrowea...,"[-5.234752337137858, -5.170439084370931, -5.10...","[@xci g : me stemati ,tit g: ,wid 207 understa...","[@xci g : me stemati ,tit g: ,wid 207 left pan..."
1,astro-ph0501438,[celeste was a cherenkov experiment using 53 h...,[the celeste atmospheric cherenkov detector ra...,"[-5.097828229268392, -4.900956511497498, -5.49...",[right : light curve of mrk 421 zoomed on the ...,"[, as seen in figure fig : simulation ., , as ..."
2,1307.0062,[the dynamics of the magnetic domain wall driv...,[current induced spin transfer torques stts ha...,"[-4.588931481043498, -4.928768475850423, -4.92...","[finally , the dynamics of local spin in fe dw...","[m. hayashi , l. thomas , y. b. bazaliy , c. r..."
3,0904.2565,"[with exoplanet detection rates soaring , it i...","[in our previous paper , we evaluated the tran...","[-4.961644921983991, -4.767604351043701, -4.82...",[@xmath110 although the euler angles can play ...,"[we now consider the total effect ., amplitude..."
4,1011.2313,[cognitive radio is a promising approach to ef...,[information about primary transmitter locatio...,"[-4.2171395778656, -4.119485187530517, -3.5939...","[in particular , information about pu location...",[the variance of the @xmath3th term of @xmath3...


In [5]:
classifier_df = pd.DataFrame(columns=['sentence', 'is_salient'])
for idx, row in salience_df.iterrows():
    for sent in row['salient']:
        classifier_df.loc[len(classifier_df)] = {'sentence': sent, 'is_salient': 1}
    for sent in row['not_salient']:
        classifier_df.loc[len(classifier_df)] = {'sentence': sent, 'is_salient': 0}
classifier_df = classifier_df.sample(frac=1).reset_index(drop=True)
print(classifier_df.shape)
classifier_df.head()

(1554, 2)


Unnamed: 0,sentence,is_salient
0,we gradually increase the number of grid point...,0
1,all basic data reduction was accomplished usin...,0
2,"fig : histogr ,tit g : myr of our highest reso...",0
3,", which constitutes a set of 3@xmath34 coupled...",0
4,section sec : epsk is devoted to the calculati...,0


In [6]:
classifier_df['is_salient'].value_counts()

0    777
1    777
Name: is_salient, dtype: int64

## Train the Model

#### Load the Model

In [7]:
# Use cuda if available, else use cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model_id = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = BertForSequenceClassification.from_pretrained(model_id, num_labels=2).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### Split the data into Test, Train, and Validation sets

In [24]:
# Set the test, train, and validation distributions
TRAIN_PERC = 0.6
VAL_PERC = 0.2
TEST_PERC = 0.2
# Set the random seed
SEED = 42

In [25]:
# Get the test, train, and validation splits
X_train, X_test, y_train, y_test = train_test_split(classifier_df['sentence'], 
                                                    classifier_df['is_salient'], 
                                                    test_size=TEST_PERC+VAL_PERC, 
                                                    random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
                                                  test_size=VAL_PERC/(TEST_PERC+VAL_PERC), 
                                                  random_state=SEED)

In [26]:
# Convert the splits to dataframes
train = pd.DataFrame({'sentence': X_train, 'is_salient': y_train})
val = pd.DataFrame({'sentence': X_val, 'is_salient': y_val})
test = pd.DataFrame({'sentence': X_test, 'is_salient': y_test})

In [27]:
# Convert the dataframes to datasets
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(val)
test_dataset = Dataset.from_pandas(test)

#### Pretokenize the data

In [29]:
def tokenize_function(examples): 
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

In [30]:
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/466 [00:00<?, ? examples/s]

Map:   0%|          | 0/466 [00:00<?, ? examples/s]

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

#### Setup the training

In [10]:
training_args = TrainingArguments(output_dir="test_trainer_bert")

In [12]:
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [35]:
train_dataset.column_names

['sentence', 'is_salient', '__index_level_0__']

In [14]:
training_args = TrainingArguments(output_dir="test_trainer_bert", evaluation_strategy="epoch")

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

In [32]:
trainer.train()



  0%|          | 0/177 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 4.00 GiB total capacity; 3.33 GiB already allocated; 0 bytes free; 3.38 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF