In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import logging
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner
from pathlib import Path
from box import Box
import torch

In [3]:
# Load the FAST topic table. Column names are supplied explicitly, so the
# first line of the CSV is read as data rather than as a header.
fast_topics = pd.read_csv(
    '/farmshare/user_data/jpnelson/ai-etd/data/topic_uri_label_utf8.csv',
    header=None,
    names=['URI', 'Label'],
)

In [4]:
# Preview the first five topic rows.
fast_topics.head(n=5)

Unnamed: 0,URI,Label
0,http://id.worldcat.org/fast/799409,African American teenagers--Education
1,http://id.worldcat.org/fast/912703,Enthusiasm--Religious aspects--Christianity
2,http://id.worldcat.org/fast/966912,Identity (Psychology) in old age
3,http://id.worldcat.org/fast/817698,Artists' studios
4,http://id.worldcat.org/fast/833340,Bisexual women--Health and hygiene


In [5]:
# Run configuration: model identity, optimisation settings and hardware flags.
args = Box({
    "seed": 42,
    "task_name": "etd_fulltext_lm",
    "model_name": "roberta-base",
    "model_type": "roberta",
    "train_batch_size": 16,
    "learning_rate": 4e-5,
    "num_train_epochs": 20,
    "fp16": False,
    "fp16_opt_level": "O2",
    "warmup_steps": 1000,
    "logging_steps": 0,
    "max_seq_length": 512,
    "multi_gpu": False
})

# Apply the configured seed so torch-driven randomness is repeatable after a
# kernel restart. Without this call the "seed" entry above has no effect.
# NOTE(review): libraries drawing from Python's `random` or NumPy are not
# covered by this call — seed them too if they are used downstream.
torch.manual_seed(args.seed)

# Root logger; no handlers or level are configured here.
logger = logging.getLogger()

In [6]:
# Working directories: raw documents, log files and model output.
DATA_PATH = Path("../etds/")
LOG_PATH = Path("logs")
MODEL_PATH = Path(f"lm_models_{args.model_type}")

# Create each directory if it is missing; existing directories are left alone.
for directory in (DATA_PATH, MODEL_PATH, LOG_PATH):
    directory.mkdir(exist_ok=True)

In [7]:
# Files that fast-bert itself writes into the data directory.
# NOTE(review): names taken from fast-bert's `from_raw_corpus` — confirm
# against the installed version.
GENERATED_FILES = {"lm_train.txt", "lm_val.txt"}

# `text_list` must contain the document *contents*; passing file names would
# fine-tune the language model on the names themselves. Only regular,
# non-hidden files are read, so cache directories and files generated by a
# previous run are not fed back in as documents. Sorting keeps the document
# order stable between runs.
texts = [
    path.read_text(encoding="utf-8", errors="replace")  # tolerate stray bytes
    for path in sorted(DATA_PATH.iterdir())
    if path.is_file()
    and not path.name.startswith(".")
    and path.name not in GENERATED_FILES
]
if not texts:
    # DATA_PATH may have just been created empty; fail here with a clear
    # message instead of deep inside the corpus builder.
    raise ValueError(f"No documents found in {DATA_PATH.resolve()}")

# Build the language-model data bunch from the raw document texts.
etd_databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=texts,
    tokenizer=args.model_name,
    batch_size_per_gpu=args.train_batch_size,
    multi_gpu=args.multi_gpu,
    model_type=args.model_type,
    logger=logger)

Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up.


In [8]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU so
# this cell does not fail on a CUDA-less machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hardware and precision flags come from `args` so the learner cannot drift
# out of sync with the data bunch, which already reads `args.multi_gpu`.
# NOTE(review): `args.warmup_steps` is not forwarded here — confirm whether
# the learner should receive it.
learner = BertLMLearner.from_pretrained_model(
    dataBunch=etd_databunch_lm,
    pretrained_path=args.model_name,
    output_dir=MODEL_PATH,
    metrics=[],
    device=device,
    logger=logger,
    multi_gpu=args.multi_gpu,
    logging_steps=args.logging_steps,
    is_fp16=args.fp16,
    fp16_opt_level=args.fp16_opt_level)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Report the CUDA device torch will use. The queries below raise when no CUDA
# device is usable, so they are guarded.
# NOTE(review): a "no kernel image is available" error during training usually
# means the installed torch build has no kernels for this GPU's compute
# capability — `torch.cuda.get_device_capability(0)` shows the capability.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    print(torch.cuda.current_device())
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available on this machine.")

0
1
Tesla K40m


In [10]:
# Fine-tune the language model with validation enabled.
# NOTE(review): epochs and lr are literals here and differ from
# args.num_train_epochs (20) / args.learning_rate (4e-5) — confirm which
# values are intended.
learner.fit(
    epochs=3,
    lr=6e-5,
    optimizer_type="lamb",
    schedule_type="warmup_cosine",
    validate=True,
)

RuntimeError: CUDA error: no kernel image is available for execution on the device

In [None]:
# Write the trained model to disk via the learner.
learner.save_model()

In [None]:
from fast_bert.prediction import BertClassificationPredictor

In [None]:
# Load the saved checkpoint for prediction. The model directory and model
# type are derived from the run configuration so they always match what was
# trained (previously the type was 'bert' although the checkpoint is RoBERTa).
# NOTE(review): the checkpoint was fine-tuned as a language model and no
# label file is produced by this notebook — confirm that a classification
# predictor with label_path='data/' is really what is intended here.
predicator = BertClassificationPredictor(
    model_path=str(MODEL_PATH / 'model_out'),
    label_path='data/',
    multi_label=True,
    model_type=args.model_type,
    do_lower_case=False)

In [None]:
# Run the predictor on a single sample phrase; the result is shown as the cell output.
predicator.predict("incentives in computer science")

In [None]:
# Show the name of CUDA device 0.
torch.cuda.get_device_name(device=0)