<a href="https://colab.research.google.com/github/samhiggs/journal-title-text-classifier/blob/main/journal_title_conference_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! pip install transformers datasets

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b5/d5/c6c23ad75491467a9a84e526ef2364e523d45e2b0fae28a7cbe8689e7e84/transformers-4.8.1-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 7.9MB/s 
[?25hCollecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/08/a2/d4e1024c891506e1cee8f9d719d20831bac31cb5b7416983c4d2f65a6287/datasets-1.8.0-py3-none-any.whl (237kB)
[K     |████████████████████████████████| 245kB 38.6MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 50.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/s

## Load Dataset
Using the labeled and clean dataset that aims to predict the conference a journal will be in based on it's title.

In [3]:
import pandas as pd

paper_url = "https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv"
papers_df = pd.read_csv(paper_url)

In [4]:
papers_df.head()

Unnamed: 0,Title,Conference
0,Innovation in Database Management: Computer Sc...,VLDB
1,High performance prime field multiplication fo...,ISCAS
2,enchanted scissors: a scissor interface for su...,SIGGRAPH
3,Detection of channel degradation attack by Int...,INFOCOM
4,Pinning a Complex Network through the Betweenn...,ISCAS


## Split Dataset

In [5]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(papers_df, test_size=0.3, stratify=papers_df.Conference)

## Transform DataFrame to Huggingface Dataset

In [6]:
from datasets import Dataset, DatasetDict

papers_datasets = DatasetDict({
    "train": Dataset.from_pandas(train),
    "test": Dataset.from_pandas(test)
})

In [7]:
papers_datasets = papers_datasets.rename_column("Conference", "label")
papers_datasets = papers_datasets.rename_column("Title", "text")
papers_datasets = papers_datasets.rename_column("__index_level_0__", "idx")
papers_datasets = papers_datasets.class_encode_column("label")

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [8]:
papers_datasets["train"].features

{'idx': Value(dtype='int64', id=None),
 'label': ClassLabel(num_classes=5, names=['INFOCOM', 'ISCAS', 'SIGGRAPH', 'VLDB', 'WWW'], names_file=None, id=None),
 'text': Value(dtype='string', id=None)}

'VLDB'

In [9]:
papers_datasets["train"][0]

{'idx': 1297,
 'label': 0,
 'text': 'Topology Control in Heterogeneous Wireless Networks: Problems and Solutions.'}

## Tokenize

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v1")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1352.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=718.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798293.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456356.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355881.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=239.0, style=ProgressStyle(description_…




In [11]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = papers_datasets.map(tokenize_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [12]:
# print(f"Max length (should be 512): {max([len(x["input_ids"]) for x in tokenized_datasets["train"]]}")

In [13]:
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

In [14]:
import torch
torch.cuda.is_available()

True

<a id='trainer'></a>

In [15]:
!nvidia-smi

Mon Jun 28 06:51:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    11W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Load Model

In [16]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v2", 
                                                           num_labels=len(train.Conference.unique()), 
                                                           output_attentions = False, 
                                                           output_hidden_states = False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=686.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=328515953.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at sentence-transformers/paraphrase-distilroberta-base-v2 were not used when initializing RobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-distilroberta-base-v2 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments("journal_conference_classifier", 
                                  per_device_train_batch_size=8, 
                                  per_device_eval_batch_size=8)

In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=full_train_dataset, 
    eval_dataset=full_eval_dataset
)

In [19]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, text.
***** Running training *****
  Num examples = 1754
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 660


Step,Training Loss
500,0.4177


Saving model checkpoint to journal_conference_classifier/checkpoint-500
Configuration saved in journal_conference_classifier/checkpoint-500/config.json
Model weights saved in journal_conference_classifier/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=660, training_loss=0.34666322361339225, metrics={'train_runtime': 270.8527, 'train_samples_per_second': 19.428, 'train_steps_per_second': 2.437, 'total_flos': 1327494921799680.0, 'train_loss': 0.34666322361339225, 'epoch': 3.0})

In [20]:
print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [21]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1362.0, style=ProgressStyle(description…




In [22]:
# Set to eval mode to avoid running out of memory
model.eval()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, text.
***** Running Evaluation *****
  Num examples = 753
  Batch size = 8


{'eval_accuracy': 0.851261620185923,
 'eval_loss': 0.6070619225502014,
 'eval_runtime': 12.2993,
 'eval_samples_per_second': 61.223,
 'eval_steps_per_second': 7.724}

88.6% is a pretty strong classifier given the domain knowledge required to understand the model. But it may fall flat where there are misspelt titles, or unseen examples (one shot problem). One approach to this is to do NLP Augmentation.

In [32]:
from transformers import RobertaForSequenceClassification

In [33]:
loaded_model = RobertaForSequenceClassification.from_pretrained("journal_conference_classifier/checkpoint-500")

loading configuration file journal_conference_classifier/checkpoint-500/config.json
Model config RobertaConfig {
  "_name_or_path": "sentence-transformers/paraphrase-distilroberta-base-v2",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transfor

In [68]:
def predict(title):
  with torch.no_grad():
    predict_input = tokenizer(title,
                              truncation=True,
                              padding=True,
                              return_tensors="pt")

    labels = torch.tensor([1]).unsqueeze(0)
    label_map = papers_datasets["train"].features["label"]
    outputs = loaded_model(**predict_input, labels=labels)
    logits = outputs.logits
    return label_map.int2str(int(np.argmax(logits, axis=-1)[0]))

In [69]:
predict("Intermediate Training of BERT for Product Matching")

'VLDB'

In [70]:
predict("On the stability of fine-tuning BERT: misconceptions, explanations, and strong baselines")

'INFOCOM'

## Push to Model HUB

In [35]:
!ls -la journal_conference_classifier/

total 16
drwxr-xr-x 4 root root 4096 Jun 28 05:17 .
drwxr-xr-x 1 root root 4096 Jun 28 05:14 ..
drwxr-xr-x 2 root root 4096 Jun 28 05:17 checkpoint-500
drwxr-xr-x 3 root root 4096 Jun 28 05:14 runs


In [36]:
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,518 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 160772 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [39]:
!transformers-cli login

2021-06-28 05:23:31.432052: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0

        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: higgzy
Password: 
Login successful
Your token: jXlpdLROvbOOGQKDewtVUcIhcPgopFidJjvsVrNNjavXkmNuCPNVvxdyXVoKCyVQttjZFGgJprjPsNeKoXaBBRVJLRkMjabSmbpJZRwkKsaNRmJyYNbQyKbcOTzTHWHT 

Your token has been saved to /root/.huggingface/token


In [None]:
!export TOKEN=$(cat /root/.huggingface/token)

In [54]:
!cd paraphrase-distilroberta-base-v1-finetuned-journal-conference && git clone https://higgzy:$TOKEN@huggingface.co/higgzy/paraphrase-distilroberta-base-v1-finetuned-journal-conference

In [57]:
!cd paraphrase-distilroberta-base-v1-finetuned-journal-conference && git push

Counting objects: 12, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (11/11), done.
^C


In [58]:
!zip -r paraphrase-distilroberta-base-v1-finetuned-journal-conference.zip paraphrase-distilroberta-base-v1-finetuned-journal-conference/

  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/ (stored 0%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/trainer_state.json (deflated 48%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/pytorch_model.bin (deflated 7%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/scheduler.pt (deflated 49%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/.gitattributes (deflated 83%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/training_args.bin (deflated 47%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/config.json (deflated 53%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/optimizer.pt (deflated 46%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/.git/ (stored 0%)
  adding: paraphrase-distilroberta-base-v1-finetuned-journal-conference/.git/refs/ (stored 0%)
  adding: para

In [59]:
from google.colab import files
files.download("/content/paraphrase-distilroberta-base-v1-finetuned-journal-conference.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!mv journal_conference_classifier/checkpoint-500/* paraphrase-distilroberta-base-v1-finetuned-journal-conference/
!cd paraphrase-distilroberta-base-v1-finetuned-journal-conference && git add . && git commit -m "Add model" && git push


In [1]:
!ls

sample_data


<a id='keras'></a>