In [5]:
# Setup: imports, paths, hyperparameters, logging, device selection, and data loading.
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
# import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

# Release unused cached GPU memory held by PyTorch's allocator before starting a new run.
torch.cuda.empty_cache()

# Show full cell contents when displaying DataFrames (tweets are long strings).
# ``None`` is the supported "no limit" value; the legacy ``-1`` sentinel was deprecated in
# pandas 1.0 and is rejected by newer releases, so it is only used as a fallback for
# pandas versions that do not accept ``None`` yet.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Timestamp taken once at startup; used to give each run its own log file name.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

# Filesystem layout for this experiment.
LOG_PATH = Path('/scratch/da2734/twitter/sana/log/')
DATA_PATH = Path('/scratch/da2734/twitter/sana/data')
LABEL_PATH = Path('/scratch/da2734/twitter/sana/data/')
OUTPUT_PATH = Path('/scratch/da2734/twitter/sana/output/')
# No previously fine-tuned weights are loaded in this notebook.
FINETUNED_PATH = None

# Run configuration, accessible both as args["key"] and args.key.
# Every key appears exactly once. A dict literal silently keeps only the last value of a
# repeated key, so duplicates ("task_name", "no_cuda", "seed") are collapsed to the value
# that was actually in effect (e.g. task_name == 'intent').
args = Box({
    # --- run identification -------------------------------------------------
    "run_text": "multilabel toxic comments with freezable layers",  # also part of the log file name
    "train_size": -1,
    "val_size": -1,
    # --- paths ---------------------------------------------------------------
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": 'intent',
    "no_cuda": False,
#     "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    # --- data / tokenization ---------------------------------------------------
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    # --- optimization ----------------------------------------------------------
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 6,
    "warmup_proportion": 0.0,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    # --- logging / checkpoints -------------------------------------------------
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "loss_scale": 128,
    # --- model -------------------------------------------------------------------
    "model_name": 'bert-base-uncased',
    "model_type": 'bert'
})

import logging

# One log file per run, named after the start timestamp and the run description.
logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

# logging.FileHandler opens the file immediately and does not create missing parent
# directories, so make sure the log directory exists before configuring handlers.
LOG_PATH.mkdir(parents=True, exist_ok=True)

# Send every INFO-and-above record both to the run's log file and to the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

# Root logger; it is also handed to the learner further down.
logger = logging.getLogger()

# Record the full configuration at the top of the log for later reference.
logger.info(args)

# Apply the configured seed to every RNG in play so that repeated runs are comparable;
# without this, args.seed is set but has no effect on Python, NumPy or PyTorch.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)  # silently ignored when CUDA is unavailable

# Use the GPU only when one is actually present and the config does not forbid it,
# instead of unconditionally requesting 'cuda'.
use_cuda = torch.cuda.is_available() and not args.no_cuda
device = torch.device('cuda' if use_cuda else 'cpu')

# Multi-GPU mode is enabled only when running on CUDA with more than one visible device.
args.multi_gpu = use_cuda and torch.cuda.device_count() > 1

# Names of the label columns passed to the data bunch (one per target class).
label_cols = [
    "job_loss",
    "is_unemployed",
    "job_search",
    "is_hired",
    "job_offer",
]

# Build the train/validation data from train.csv and dev.csv in the data directory.
databunch = BertDataBunch(
    args.data_dir,
    LABEL_PATH,
    args.model_name,
    train_file='train.csv',
    val_file='dev.csv',
    # test_data='test.csv',
    text_col="text",  # name of the column in the train file that contains the tweet text
    label_col=label_cols,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    multi_label=True,
    model_type=args.model_type,
)

# Report how many labels the data bunch ended up with.
num_labels = len(databunch.labels)
print(f'num_labels {num_labels}')


02/22/2020 18:14:54 - INFO - root -   {'run_text': 'multilabel toxic comments with freezable layers', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/sana/log'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/sana/data'), 'data_dir': PosixPath('/scratch/da2734/twitter/sana/data'), 'task_name': 'intent', 'no_cuda': False, 'output_dir': PosixPath('/scratch/da2734/twitter/sana/output'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 5e-05, 'num_train_epochs': 6, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': False, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'overwrite_cache': False, 'loss_scale': 128, 'model_name': 'b



02/22/2020 18:14:56 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/da2734/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
02/22/2020 18:14:57 - INFO - root -   Writing example 0 of 1303
02/22/2020 18:14:57 - INFO - root -   Saving features into cached file /scratch/da2734/twitter/sana/data/cache/cached_bert_train_multi_label_512_train.csv
02/22/2020 18:14:58 - INFO - root -   Writing example 0 of 559
02/22/2020 18:14:59 - INFO - root -   Saving features into cached file /scratch/da2734/twitter/sana/data/cache/cached_bert_dev_multi_label_512_dev.csv
num_labels 5


In [15]:
# Inspect one training item: index 2 selects the third example of the training dataset,
# and position 3 of that item is its label vector (displayed below as a 5-element tensor).
databunch.train_dl.dataset[2][3]

tensor([0., 0., 0., 1., 0.])

In [16]:
# Metrics handed to the learner, each described by a display name and the function to call.
metrics = [
    {'name': metric_name, 'function': metric_fn}
    for metric_name, metric_fn in (
        ('accuracy_thresh', accuracy_thresh),
        ('roc_auc', roc_auc),
        ('fbeta', fbeta),
    )
]

In [17]:
# Create the learner from the pretrained checkpoint named by args.model_name.
# FINETUNED_PATH is None in this notebook, so no fine-tuned weights path is supplied.
# NOTE(review): logging_steps is passed as a literal 0 rather than args.logging_steps (50);
# confirm that this is intentional.
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path=args.model_name,
    metrics=metrics,
    device=device,
    logger=logger,
    output_dir=args.output_dir,
    finetuned_wgts_path=FINETUNED_PATH,
    warmup_steps=args.warmup_steps,
    multi_gpu=args.multi_gpu,
    is_fp16=args.fp16,
    multi_label=True,
    logging_steps=0,
)

02/22/2020 18:17:02 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/da2734/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
02/22/2020 18:17:02 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings

In [18]:
# Train the model for args.num_train_epochs epochs at args.learning_rate. With validate=True
# the recorded log shows an evaluation pass and metric report after every epoch.
learner.fit(args.num_train_epochs, args.learning_rate, validate=True)

02/22/2020 18:19:01 - INFO - root -   ***** Running training *****
02/22/2020 18:19:01 - INFO - root -     Num examples = 1303
02/22/2020 18:19:01 - INFO - root -     Num Epochs = 6
02/22/2020 18:19:01 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
02/22/2020 18:19:01 - INFO - root -     Gradient Accumulation steps = 1
02/22/2020 18:19:01 - INFO - root -     Total optimization steps = 978


02/22/2020 18:20:40 - INFO - root -   Running evaluation
02/22/2020 18:20:40 - INFO - root -     Num examples = 559
02/22/2020 18:20:40 - INFO - root -     Batch size = 16


02/22/2020 18:20:51 - INFO - root -   eval_loss after epoch 1: 0.6733051231929235: 
02/22/2020 18:20:51 - INFO - root -   eval_accuracy_thresh after epoch 1: 0.5595706701278687: 
02/22/2020 18:20:51 - INFO - root -   eval_roc_auc after epoch 1: 0.508515282946023: 
02/22/2020 18:20:51 - INFO - root -   eval_fbeta after epoch 1: 0.3264578878879547: 
02/22/2020 18:20:51 - INFO - root -   lr after epoch 1: 1.63e-05
02/22/2020 18:20:51 - INFO - root -   train_loss after epoch 1: 0.6946481689353662
02/22/2020 18:20:51 - INFO - root -   





02/22/2020 18:22:28 - INFO - root -   Running evaluation
02/22/2020 18:22:28 - INFO - root -     Num examples = 559
02/22/2020 18:22:28 - INFO - root -     Batch size = 16


02/22/2020 18:22:38 - INFO - root -   eval_loss after epoch 2: 0.5981588346617562: 
02/22/2020 18:22:38 - INFO - root -   eval_accuracy_thresh after epoch 2: 0.7230768799781799: 
02/22/2020 18:22:38 - INFO - root -   eval_roc_auc after epoch 2: 0.5420363159926728: 
02/22/2020 18:22:38 - INFO - root -   eval_fbeta after epoch 2: 0.32337433099746704: 
02/22/2020 18:22:38 - INFO - root -   lr after epoch 2: 3.26e-05
02/22/2020 18:22:38 - INFO - root -   train_loss after epoch 2: 0.6457013242815169
02/22/2020 18:22:38 - INFO - root -   

02/22/2020 18:24:14 - INFO - root -   Running evaluation
02/22/2020 18:24:14 - INFO - root -     Num examples = 559
02/22/2020 18:24:14 - INFO - root -     Batch size = 16


02/22/2020 18:24:24 - INFO - root -   eval_loss after epoch 3: 0.47361881732940675: 
02/22/2020 18:24:24 - INFO - root -   eval_accuracy_thresh after epoch 3: 0.8540250062942505: 
02/22/2020 18:24:24 - INFO - root -   eval_roc_auc after epoch 3: 0.6426220048136557: 
02/22/2020 18:24:24 - INFO - root -   eval_fbeta after epoch 3: 0.2806064486503601: 
02/22/2020 18:24:24 - INFO - root -   lr after epoch 3: 4.89e-05
02/22/2020 18:24:24 - INFO - root -   train_loss after epoch 3: 0.5357613417268531
02/22/2020 18:24:24 - INFO - root -   

02/22/2020 18:26:01 - INFO - root -   Running evaluation
02/22/2020 18:26:01 - INFO - root -     Num examples = 559
02/22/2020 18:26:01 - INFO - root -     Batch size = 16


02/22/2020 18:26:11 - INFO - root -   eval_loss after epoch 4: 0.4071083588259561: 
02/22/2020 18:26:11 - INFO - root -   eval_accuracy_thresh after epoch 4: 0.8540250062942505: 
02/22/2020 18:26:11 - INFO - root -   eval_roc_auc after epoch 4: 0.7660417539449798: 
02/22/2020 18:26:11 - INFO - root -   eval_fbeta after epoch 4: 0.1293693333864212: 
02/22/2020 18:26:11 - INFO - root -   lr after epoch 4: 3.8528583998355094e-05
02/22/2020 18:26:11 - INFO - root -   train_loss after epoch 4: 0.4379907358277795
02/22/2020 18:26:11 - INFO - root -   

02/22/2020 18:27:47 - INFO - root -   Running evaluation
02/22/2020 18:27:47 - INFO - root -     Num examples = 559
02/22/2020 18:27:47 - INFO - root -     Batch size = 16


02/22/2020 18:27:57 - INFO - root -   eval_loss after epoch 5: 0.3876696701560702: 
02/22/2020 18:27:57 - INFO - root -   eval_accuracy_thresh after epoch 5: 0.8540250062942505: 
02/22/2020 18:27:57 - INFO - root -   eval_roc_auc after epoch 5: 0.7805628116349179: 
02/22/2020 18:27:57 - INFO - root -   eval_fbeta after epoch 5: 0.12540818750858307: 
02/22/2020 18:27:57 - INFO - root -   lr after epoch 5: 1.3025330901416888e-05
02/22/2020 18:27:57 - INFO - root -   train_loss after epoch 5: 0.3969135406924172
02/22/2020 18:27:57 - INFO - root -   

02/22/2020 18:29:34 - INFO - root -   Running evaluation
02/22/2020 18:29:34 - INFO - root -     Num examples = 559
02/22/2020 18:29:34 - INFO - root -     Batch size = 16


02/22/2020 18:29:44 - INFO - root -   eval_loss after epoch 6: 0.38508441490786416: 
02/22/2020 18:29:44 - INFO - root -   eval_accuracy_thresh after epoch 6: 0.8540250062942505: 
02/22/2020 18:29:44 - INFO - root -   eval_roc_auc after epoch 6: 0.7829953095607745: 
02/22/2020 18:29:44 - INFO - root -   eval_fbeta after epoch 6: 0.12640202045440674: 
02/22/2020 18:29:44 - INFO - root -   lr after epoch 6: 0.0
02/22/2020 18:29:44 - INFO - root -   train_loss after epoch 6: 0.3859975963282439
02/22/2020 18:29:44 - INFO - root -   



(978, 0.5161687846320294)

In [19]:
# Run a standalone evaluation pass; the recorded cell output is a dict holding the loss
# and one entry per configured metric.
learner.validate()

02/22/2020 18:29:52 - INFO - root -   Running evaluation
02/22/2020 18:29:52 - INFO - root -     Num examples = 559
02/22/2020 18:29:52 - INFO - root -     Batch size = 16


{'loss': 0.38508441490786416,
 'accuracy_thresh': 0.8540250062942505,
 'roc_auc': 0.7829953095607745,
 'fbeta': 0.12640202045440674}

In [20]:
# Save the trained model; the recorded log shows config.json and pytorch_model.bin being
# written under the output directory's model_out/ folder.
learner.save_model()

02/22/2020 18:30:24 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/sana/output/model_out/config.json
02/22/2020 18:30:24 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/sana/output/model_out/pytorch_model.bin


In [24]:
# Smoke-test the trained model on a single hand-written sentence.
texts = ['I just received a job offer']
predictions = learner.predict_batch(texts)
# Show the result for the first (only) input; the recorded output is a list of
# (label, score) pairs.
print(predictions[0])

02/22/2020 18:33:33 - INFO - root -   Writing example 0 of 1
[('job_loss', 0.29054415225982666), ('is_unemployed', 0.2861913740634918), ('is_hired', 0.2792271375656128), ('job_search', 0.17347979545593262), ('job_offer', 0.11866018921136856)]
