In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import logging
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner
from pathlib import Path
from box import Box
import torch

In [3]:
fast_topics = pd.read_csv('/Users/jpnelson/2020/sul-dlss/ai-etd/data/topic_uri_label_utf8.csv', names=['URI', 'Label'])

In [4]:
fast_topics.head()

Unnamed: 0,URI,Label
0,http://id.worldcat.org/fast/799409,African American teenagers--Education
1,http://id.worldcat.org/fast/912703,Enthusiasm--Religious aspects--Christianity
2,http://id.worldcat.org/fast/966912,Identity (Psychology) in old age
3,http://id.worldcat.org/fast/817698,Artists' studios
4,http://id.worldcat.org/fast/833340,Bisexual women--Health and hygiene


In [5]:
args = Box({
    "seed": 42,
    "task_name": "etd_fulltext_lm",
    "model_name": "roberta-base",
    "model_type": "roberta",
    "train_batch_size": 16,
    "learning_rate": 4e-5,
    "num_train_epochs": 20,
    "fp16": False,
    "fp16_opt_level": "O2",
    "warmup_steps": 1000,
    "logging_steps": 0,
    "max_seq_length": 512,
    "multi_gpu": False
})
logger = logging.getLogger()

In [6]:
DATA_PATH = Path("../tmp/etds/")
LOG_PATH = Path("logs")
MODEL_PATH = Path(f"lm_models_{args.model_type}")

DATA_PATH.mkdir(exist_ok=True)
MODEL_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)

In [7]:
texts = []
for path in DATA_PATH.iterdir():
    texts.append(path.name)

etd_databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=texts,
    tokenizer=args.model_name,
    batch_size_per_gpu=args.train_batch_size,
    multi_gpu=args.multi_gpu,
    model_type=args.model_type,
    logger=logger)

Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up.


In [8]:
learner = BertLMLearner.from_pretrained_model(
    dataBunch=etd_databunch_lm,
    pretrained_path=args.model_name,
    output_dir=MODEL_PATH,
    metrics=[],
    device=torch.device('cpu'),
    logger=logger,
    multi_gpu=False,
    logging_steps=args.logging_steps,
    is_fp16=False)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
torch.device("cpu")

device(type='cpu')

In [None]:
learner.fit(epochs=3,
            lr=6e-5,
            validate=True,
            schedule_type="warmup_cosine",
            optimizer_type="lamb")

In [9]:
learner.save_model()

In [10]:
from fast_bert.prediction import BertClassificationPredictor

In [17]:
predicator = BertClassificationPredictor(
    model_path='lm_models_roberta/model_out',
    label_path='data/',
    multi_label=True,
    model_type='bert',
    do_lower_case=False)

Some weights of the model checkpoint at lm_models_roberta/model_out were not used when initializing BertForMultiLabelSequenceClassification: ['roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.interme

In [24]:
predicator.predict("incentives in computer science")

[('http://id.worldcat.org/fast/821133', 0.747307300567627),
 ('http://id.worldcat.org/fast/833890', 0.7351551651954651),
 ('http://id.worldcat.org/fast/802012', 0.7306155562400818),
 ('http://id.worldcat.org/fast/878608', 0.7305094003677368),
 ('http://id.worldcat.org/fast/981569', 0.7294960021972656),
 ('http://id.worldcat.org/fast/899270', 0.7287221550941467),
 ('http://id.worldcat.org/fast/1749604', 0.7271628975868225),
 ('http://id.worldcat.org/fast/2000669', 0.724565863609314),
 ('http://id.worldcat.org/fast/1086011', 0.724439263343811),
 ('http://id.worldcat.org/fast/1039350', 0.7240569591522217),
 ('http://id.worldcat.org/fast/1737958', 0.724053680896759),
 ('http://id.worldcat.org/fast/833892', 0.7237916588783264),
 ('http://id.worldcat.org/fast/930076', 0.7234480977058411),
 ('http://id.worldcat.org/fast/913043', 0.7229713797569275),
 ('http://id.worldcat.org/fast/819578', 0.7208371162414551),
 ('http://id.worldcat.org/fast/880327', 0.7207109332084656),
 ('http://id.worldcat.o