In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%pip install -q -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import json
import pandas as pd
import os
import re
import string
import pickle

from transformers import BertForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
import os
#os.chdir('../..')

from vecsim_app.categories import CATEGORIES
from vecsim_app.data_utils import papers

Define the parameters for fetching the papers dataset:

- Dataset Path
- Year cutoff: Year cut off for the papers.
- Pattern for fetching a given amount of years
- Max Sample size: maximum simple size (if you just want to try out the notebook - if it's too low the model won't perform well)

In [5]:
DATA_PATH = "./arxiv-metadata-oai-snapshot.json"
YEAR_CUTOFF = 2010
YEAR_PATTERN = r"(19|20[0-9]{2})"
MAX_SAMPLE_SIZE = 20000


In [6]:
df = pd.DataFrame(papers(data_path=DATA_PATH, year_cutoff=YEAR_CUTOFF, year_pattern=YEAR_PATTERN))
len(df)

# Take a sample for computing reasons
df = df.sample(MAX_SAMPLE_SIZE)

In [21]:
df

Unnamed: 0,id,title,year,authors,categories,abstract,text
210240,1512.07410,Fragmentation of long-lived hydrocarbons after...,2016,"Seyedreza Larimian, Sonia Erattupuzha, Erik L\...","physics.chem-ph,physics.atm-clus,physics.optics",We experimentally and theoretically investig...,Fragmentation of long-lived hydrocarbons after...
148715,1404.1520,Single spin stochastic optical reconstruction ...,2014,"Matthias Pfender, Nabeel Aslam, Gerald Waldher...","quant-ph,physics.optics",We experimentally demonstrate precision addr...,Single spin stochastic optical reconstruction ...
219516,1603.07790,Weighted Pushdown Systems with Indexed Weight ...,2016,Yasuhiko Minamide,"cs.FL,cs.PL",The reachability analysis of weighted pushdo...,Weighted Pushdown Systems with Indexed Weight ...
370814,1911.02005,Simultaneous spectral estimation of dephasing ...,2020,"Virginia Frey, Leigh M. Norris, Lorenza Viola ...",quant-ph,The fragility of quantum systems makes them ...,Simultaneous spectral estimation of dephasing ...
228464,1606.06192,A Novel Quasi-One-Dimensional Topological Insu...,2016,"Gabriel Aut\`es, Anna Isaeva, Luca Moreschini,...","cond-mat.mtrl-sci,cond-mat.mes-hall",Recent progress in the field of topological ...,A Novel Quasi-One-Dimensional Topological Insu...
...,...,...,...,...,...,...,...
316199,1808.06472,Dark Matter Sommerfeld-enhanced annihilation a...,2018,"Tobias Binder, Laura Covi and Kyohei Mukaida","hep-ph,astro-ph.CO,hep-th",Traditional computations of the dark matter ...,Dark Matter Sommerfeld-enhanced annihilation a...
414998,2010.12385,Resonances in hyperbolic dynamics,2018,St\'ephane Nonnenmacher,"math-ph,math.DS,math.MP,math.SP",The study of wave propagation outside bounde...,Resonances in hyperbolic dynamics The study ...
189313,1506.01307,Control of fixed points and existence and uniq...,2016,George Glauberman and Justin Lynd,"math.GR,math.AT",A. Chermak has recently proved that to each ...,Control of fixed points and existence and uniq...
65915,1108.5137,A Laser System for the Spectroscopy of Highly-...,2012,"S. Albrecht, S. Altenburg, C. Siegel, N. Hersc...","physics.atom-ph,nucl-ex",We present and characterize a laser system f...,A Laser System for the Spectroscopy of Highly-...


In [20]:
CATEGORIES

{'astro-ph': 'Astrophysics',
 'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
 'astro-ph.EP': 'Earth and Planetary Astrophysics',
 'astro-ph.GA': 'Astrophysics of Galaxies',
 'astro-ph.HE': 'High Energy Astrophysical Phenomena',
 'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
 'astro-ph.SR': 'Solar and Stellar Astrophysics',
 'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
 'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
 'cond-mat.mtrl-sci': 'Materials Science',
 'cond-mat.other': 'Other Condensed Matter',
 'cond-mat.quant-gas': 'Quantum Gases',
 'cond-mat.soft': 'Soft Condensed Matter',
 'cond-mat.stat-mech': 'Statistical Mechanics',
 'cond-mat.str-el': 'Strongly Correlated Electrons',
 'cond-mat.supr-con': 'Superconductivity',
 'cs.AI': 'Artificial Intelligence',
 'cs.AR': 'Hardware Architecture',
 'cs.CC': 'Computational Complexity',
 'cs.CE': 'Computational Engineering, Finance, and Science',
 'cs.CG': 'Computational Geometry',
 'cs.CL

In [7]:
df.shape

(5000, 6)

In [8]:
df.head(3)

Unnamed: 0,id,title,year,authors,categories,abstract
210240,1512.0741,Fragmentation of long-lived hydrocarbons after...,2016,"Seyedreza Larimian, Sonia Erattupuzha, Erik L\...","physics.chem-ph,physics.atm-clus,physics.optics",We experimentally and theoretically investig...
148715,1404.152,Single spin stochastic optical reconstruction ...,2014,"Matthias Pfender, Nabeel Aslam, Gerald Waldher...","quant-ph,physics.optics",We experimentally demonstrate precision addr...
219516,1603.0779,Weighted Pushdown Systems with Indexed Weight ...,2016,Yasuhiko Minamide,"cs.FL,cs.PL",The reachability analysis of weighted pushdo...


In [9]:
df['text'] = df['title'] + ' ' + df['abstract']
# df['categories'] = df['categories'].apply(lambda x: x.split(','))

In [10]:
df.iloc[0].categories

'physics.chem-ph,physics.atm-clus,physics.optics'

## Train dataset creation

In [13]:
df_train, df_test = train_test_split(df, train_size=0.8)

df.shape, df_train.shape, df_test.shape

((5000, 7), (4000, 7), (1000, 7))

In [14]:
def get_tokenizer(tokenizer_model):
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
    return tokenize_function, tokenizer

tokenize_function, tokenizer = get_tokenizer('bert-base-uncased')

In [47]:
# Drop samples where there are categories which should not be present according to our predefined categories.py

df['split_categories'] = df['categories'].apply(lambda x: x.split(','))

df = df[
    df['split_categories'].apply(lambda x: len(set(x) - set(CATEGORIES)) == 0)
]

In [48]:
mlb = MultiLabelBinarizer()
# mlb.fit([[(k,v) for k, v in CATEGORIES.items()]]) #df_train['categories'])
mlb.fit([list(CATEGORIES.keys())]) #df_train['categories'])
mlb.classes_[:10]

array(['astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA',
       'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn',
       'cond-mat.mes-hall', 'cond-mat.mtrl-sci'], dtype=object)

In [49]:
def preprocess_data(examples):
    # take a batch of texts
    text = examples["text"]

    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)

    encoded_categories = mlb.transform([c.split(',') for c in examples['categories']]).astype(float)

    encoding["labels"] = encoded_categories

    return encoding

In [50]:
df_train_hf = Dataset.from_pandas(df_train[['text', 'categories']])
tokenized_train = df_train_hf.map(preprocess_data, batched=True)

df_test_hf = Dataset.from_pandas(df_test[['text', 'categories']])
tokenized_test = df_test_hf.map(preprocess_data, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]



In [51]:
# Get inverse transform, as an example

print("Reversed", mlb.inverse_transform(np.asarray(tokenized_test[0]['labels']).reshape(1, -1)))
print("Original categories", tokenized_test[0]['categories'])

Reversed [('cond-mat.str-el', 'hep-th')]
Original categories cond-mat.str-el,hep-th


In [52]:
# Store multilabel binarizer as a pickle file

!rm -rf checkpoint
!mkdir checkpoint
with open('checkpoint/mlb.pkl', 'wb') as handle:
    pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Training multi label class model

In [53]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [34]:
# Adaptation: https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb 
# 

# Define batch size according to your GPU RAM
batch_size = 8
nb_epochs = 1 # DEMONSTRATIVE PURPOSES
metric_name = "f1"

args = TrainingArguments(
    f"paper-multilabel-finetuning",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    num_train_epochs=nb_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    eval_accumulation_steps=1,
)

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    return multi_label_metrics(
        predictions=p.predictions, 
        labels=p.label_ids)

In [35]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [36]:
trainer.train()

***** Running training *****
  Num examples = 48
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6
  Number of trainable parameters = 109599897
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.657666,0.026588,0.502474,0.0


***** Running Evaluation *****
  Num examples = 12
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
Saving model checkpoint to paper-multilabel-finetuning/checkpoint-6
Configuration saved in paper-multilabel-finetuning/checkpoint-6/config.json
Model weights saved in paper-multilabel-finetuning/checkpoint-6/pytorch_model.bin
tokenizer config file saved in paper-multilabel-finetuning/checkpoint-6/tokenizer_config.json
Special tokens file saved in paper-multilabel-finetuning/checkpoint-6/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from paper-multilabel-finetuning/checkpoint-6 (score: 0.026587887740029542).


TrainOutput(global_step=6, training_loss=0.6783577601114908, metrics={'train_runtime': 9.3058, 'train_samples_per_second': 5.158, 'train_steps_per_second': 0.645, 'total_flos': 3161613275136.0, 'train_loss': 0.6783577601114908, 'epoch': 1.0})

In [37]:
eval_res = trainer.evaluate()
eval_res

***** Running Evaluation *****
  Num examples = 12
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.6576664447784424,
 'eval_f1': 0.026587887740029542,
 'eval_roc_auc': 0.5024737713970182,
 'eval_accuracy': 0.0,
 'eval_runtime': 0.6499,
 'eval_samples_per_second': 18.465,
 'eval_steps_per_second': 4.616,
 'epoch': 1.0}

## Perform inference on a given text sample

In [38]:
text = df['text'].iloc[5]
categories = df['categories'].iloc[5]
print(categories)

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

outputs = trainer.model(**encoding)
logits = outputs.logits

astro-ph.HE


In [39]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.3)] = 1

In [40]:
print(text)
print(mlb.inverse_transform(predictions.reshape(1, -1)))

A Search for MeV to TeV Neutrinos from Fast Radio Bursts with IceCube   We present two searches for IceCube neutrino events coincident with 28 fast
radio bursts (FRBs) and one repeating FRB. The first improves upon a previous
IceCube analysis -- searching for spatial and temporal correlation of events
with FRBs at energies greater than roughly 50 GeV -- by increasing the
effective area by an order of magnitude. The second is a search for temporal
correlation of MeV neutrino events with FRBs. No significant correlation is
found in either search, therefore, we set upper limits on the time-integrated
neutrino flux emitted by FRBs for a range of emission timescales less than one
day. These are the first limits on FRB neutrino emission at the MeV scale, and
the limits set at higher energies are an order-of-magnitude improvement over
those set by any neutrino telescope.

[('astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA', 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn',

In [41]:
trainer.save_model(output_dir='./checkpoint')

Saving model checkpoint to ./checkpoint
Configuration saved in ./checkpoint/config.json
Model weights saved in ./checkpoint/pytorch_model.bin
tokenizer config file saved in ./checkpoint/tokenizer_config.json
Special tokens file saved in ./checkpoint/special_tokens_map.json


In [43]:
with open('./checkpoint/model_info.json', 'w') as f:
    f.write(json.dumps(eval_res, indent=4))