In [1]:
#gets all this setup
import time
start_time = time.time()

from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
# import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import *

torch.cuda.empty_cache()

pd.set_option('display.max_colwidth', -1)
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

LOG_PATH=Path('/scratch/da2734/twitter/mturk_mar6/log/')
DATA_PATH=Path('/scratch/da2734/twitter/mturk_mar6/data')
LABEL_PATH=Path('/scratch/da2734/twitter/mturk_mar6/data/')
OUTPUT_PATH=Path('/scratch/da2734/twitter/mturk_mar6/output_100')
FINETUNED_PATH = None

args = Box({
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "labor_market_classification",
    "no_cuda": False,
#     "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 10,
    "warmup_proportion": 0.0,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "seed": 42,
    "loss_scale": 128,
    "task_name": 'intent',
    "model_name": 'bert-base-uncased',
    "model_type": 'bert'
})

import logging

logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

logger.info(args)

device = torch.device('cuda')
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

# label_cols = ["job_loss","is_unemployed","job_search","is_hired","job_offer"]
label_cols = ["is_unemployed", "lost_job_1mo", "job_search","is_hired_1mo","job_offer"]

databunch = BertDataBunch(
                        args['data_dir'], 
                        LABEL_PATH, 
                        args.model_name, 
                        train_file='train.csv', 
                        val_file='val.csv',
                        # test_data='test.csv',
                        text_col="text", #this is the name of the column in the train file that containts the tweet text
                        label_col=label_cols,
                        batch_size_per_gpu=args['train_batch_size'], 
                        max_seq_length=args['max_seq_length'], 
                        multi_gpu=args.multi_gpu, 
                        multi_label=True, 
                        model_type=args.model_type)

num_labels = len(databunch.labels)
print('num_labels', num_labels)


print('time taken to load all this stuff:', str(time.time() - start_time), 'seconds')

03/06/2020 21:04:41 - INFO - root -   {'run_text': 'multilabel toxic comments with freezable layers', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/mturk_mar6/log'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data'), 'data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data'), 'task_name': 'intent', 'no_cuda': False, 'output_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/output_100'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 5e-05, 'num_train_epochs': 10, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': False, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'overwrite_cache': False, 'loss_



03/06/2020 21:04:42 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/da2734/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
03/06/2020 21:04:42 - INFO - root -   Loading features from cached file /scratch/da2734/twitter/mturk_mar6/data/cache/cached_bert_train_multi_label_512_train.csv
03/06/2020 21:04:42 - INFO - root -   Loading features from cached file /scratch/da2734/twitter/mturk_mar6/data/cache/cached_bert_dev_multi_label_512_val.csv
num_labels 5
time taken to load all this stuff: 217.17217087745667


In [2]:
databunch.train_dl.dataset[2][3] # this train_dlgives us the training dataset for example 2's labels

tensor([1., 0., 0., 0., 0.])

In [2]:
# metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'accuracy_multilabel', 'function': accuracy_multilabel})


In [3]:
learner = BertLearner.from_pretrained_model(
                                            databunch, 
                                            pretrained_path=args.model_name, 
                                            metrics=metrics, 
                                            device=device, 
                                            logger=logger, 
                                            output_dir=args.output_dir, 
                                            finetuned_wgts_path=FINETUNED_PATH, 
                                            warmup_steps=args.warmup_steps,
                                            multi_gpu=args.multi_gpu, 
                                            is_fp16=args.fp16, 
                                            multi_label=True, 
                                            logging_steps=0)

03/06/2020 21:05:02 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/da2734/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
03/06/2020 21:05:02 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings

In [4]:
learner.fit(args.num_train_epochs, args.learning_rate, validate=True) #this trains the model

03/06/2020 21:05:50 - INFO - root -   ***** Running training *****
03/06/2020 21:05:50 - INFO - root -     Num examples = 2948
03/06/2020 21:05:50 - INFO - root -     Num Epochs = 10
03/06/2020 21:05:50 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
03/06/2020 21:05:50 - INFO - root -     Gradient Accumulation steps = 1
03/06/2020 21:05:50 - INFO - root -     Total optimization steps = 3690


03/06/2020 21:08:31 - INFO - root -   Running evaluation
03/06/2020 21:08:31 - INFO - root -     Num examples = 737
03/06/2020 21:08:31 - INFO - root -     Batch size = 16


03/06/2020 21:08:42 - INFO - root -   eval_loss after epoch 1: 0.5329679757990735: 
03/06/2020 21:08:42 - INFO - root -   eval_accuracy_thresh after epoch 1: 0.8966078758239746: 
03/06/2020 21:08:42 - INFO - root -   eval_roc_auc after epoch 1: 0.556025306158764: 
03/06/2020 21:08:42 - INFO - root -   eval_fbeta after epoch 1: 0.24764427542686462: 
03/06/2020 21:08:42 - INFO - root -   eval_accuracy after epoch 1: 0.0: 
03/06/2020 21:08:42 - INFO - root -   eval_accuracy_multilabel after epoch 1: 0.12075983717774763: 
03/06/2020 21:08:42 - INFO - root -   lr after epoch 1: 3.69e-05
03/06/2020 21:08:42 - INFO - root -   train_loss after epoch 1: 0.6337756782689392
03/06/2020 21:08:42 - INFO - root -   





03/06/2020 21:11:22 - INFO - root -   Running evaluation
03/06/2020 21:11:22 - INFO - root -     Num examples = 737
03/06/2020 21:11:22 - INFO - root -     Batch size = 16


03/06/2020 21:11:33 - INFO - root -   eval_loss after epoch 2: 0.34493391945007: 
03/06/2020 21:11:33 - INFO - root -   eval_accuracy_thresh after epoch 2: 0.896879255771637: 
03/06/2020 21:11:33 - INFO - root -   eval_roc_auc after epoch 2: 0.6808255165138256: 
03/06/2020 21:11:33 - INFO - root -   eval_fbeta after epoch 2: 0.11397557705640793: 
03/06/2020 21:11:33 - INFO - root -   eval_accuracy after epoch 2: 0.0: 
03/06/2020 21:11:33 - INFO - root -   eval_accuracy_multilabel after epoch 2: 0.2225237449118046: 
03/06/2020 21:11:33 - INFO - root -   lr after epoch 2: 4.931641405405896e-05
03/06/2020 21:11:33 - INFO - root -   train_loss after epoch 2: 0.43517155832229915
03/06/2020 21:11:33 - INFO - root -   

03/06/2020 21:14:13 - INFO - root -   Running evaluation
03/06/2020 21:14:13 - INFO - root -     Num examples = 737
03/06/2020 21:14:13 - INFO - root -     Batch size = 16


03/06/2020 21:14:24 - INFO - root -   eval_loss after epoch 3: 0.2972912934232265: 
03/06/2020 21:14:24 - INFO - root -   eval_accuracy_thresh after epoch 3: 0.9088195562362671: 
03/06/2020 21:14:24 - INFO - root -   eval_roc_auc after epoch 3: 0.7865277433541146: 
03/06/2020 21:14:24 - INFO - root -   eval_fbeta after epoch 3: 0.1166892796754837: 
03/06/2020 21:14:24 - INFO - root -   eval_accuracy after epoch 3: 0.0: 
03/06/2020 21:14:24 - INFO - root -   eval_accuracy_multilabel after epoch 3: 0.6770691994572592: 
03/06/2020 21:14:24 - INFO - root -   lr after epoch 3: 4.566455127328748e-05
03/06/2020 21:14:24 - INFO - root -   train_loss after epoch 3: 0.319216335045936
03/06/2020 21:14:24 - INFO - root -   

03/06/2020 21:17:03 - INFO - root -   Running evaluation
03/06/2020 21:17:03 - INFO - root -     Num examples = 737
03/06/2020 21:17:03 - INFO - root -     Batch size = 16


03/06/2020 21:17:14 - INFO - root -   eval_loss after epoch 4: 0.27507450796188193: 
03/06/2020 21:17:14 - INFO - root -   eval_accuracy_thresh after epoch 4: 0.9115332961082458: 
03/06/2020 21:17:14 - INFO - root -   eval_roc_auc after epoch 4: 0.8401214149078824: 
03/06/2020 21:17:14 - INFO - root -   eval_fbeta after epoch 4: 0.11533243209123611: 
03/06/2020 21:17:14 - INFO - root -   eval_accuracy after epoch 4: 0.0: 
03/06/2020 21:17:14 - INFO - root -   eval_accuracy_multilabel after epoch 4: 0.7245590230664858: 
03/06/2020 21:17:14 - INFO - root -   lr after epoch 4: 3.931362928732317e-05
03/06/2020 21:17:14 - INFO - root -   train_loss after epoch 4: 0.2773852293407368
03/06/2020 21:17:14 - INFO - root -   

03/06/2020 21:19:54 - INFO - root -   Running evaluation
03/06/2020 21:19:54 - INFO - root -     Num examples = 737
03/06/2020 21:19:54 - INFO - root -     Batch size = 16


03/06/2020 21:20:05 - INFO - root -   eval_loss after epoch 5: 0.2626908423101648: 
03/06/2020 21:20:05 - INFO - root -   eval_accuracy_thresh after epoch 5: 0.9118046760559082: 
03/06/2020 21:20:05 - INFO - root -   eval_roc_auc after epoch 5: 0.8590779966063564: 
03/06/2020 21:20:05 - INFO - root -   eval_fbeta after epoch 5: 0.11961835622787476: 
03/06/2020 21:20:05 - INFO - root -   eval_accuracy after epoch 5: 0.0: 
03/06/2020 21:20:05 - INFO - root -   eval_accuracy_multilabel after epoch 5: 0.7340569877883311: 
03/06/2020 21:20:05 - INFO - root -   lr after epoch 5: 3.109316112168412e-05
03/06/2020 21:20:05 - INFO - root -   train_loss after epoch 5: 0.2523988006399253
03/06/2020 21:20:05 - INFO - root -   

03/06/2020 21:22:45 - INFO - root -   Running evaluation
03/06/2020 21:22:45 - INFO - root -     Num examples = 737
03/06/2020 21:22:45 - INFO - root -     Batch size = 16


03/06/2020 21:22:56 - INFO - root -   eval_loss after epoch 6: 0.2517178993909917: 
03/06/2020 21:22:56 - INFO - root -   eval_accuracy_thresh after epoch 6: 0.9120759963989258: 
03/06/2020 21:22:56 - INFO - root -   eval_roc_auc after epoch 6: 0.8769097189122546: 
03/06/2020 21:22:56 - INFO - root -   eval_fbeta after epoch 6: 0.16961245238780975: 
03/06/2020 21:22:56 - INFO - root -   eval_accuracy after epoch 6: 0.0: 
03/06/2020 21:22:56 - INFO - root -   eval_accuracy_multilabel after epoch 6: 0.7367706919945726: 
03/06/2020 21:22:56 - INFO - root -   lr after epoch 6: 2.20768468524104e-05
03/06/2020 21:22:56 - INFO - root -   train_loss after epoch 6: 0.23377781900448527
03/06/2020 21:22:56 - INFO - root -   

03/06/2020 21:25:35 - INFO - root -   Running evaluation
03/06/2020 21:25:35 - INFO - root -     Num examples = 737
03/06/2020 21:25:35 - INFO - root -     Batch size = 16


03/06/2020 21:25:46 - INFO - root -   eval_loss after epoch 7: 0.241286538224271: 
03/06/2020 21:25:46 - INFO - root -   eval_accuracy_thresh after epoch 7: 0.9134328961372375: 
03/06/2020 21:25:46 - INFO - root -   eval_roc_auc after epoch 7: 0.8882647613963509: 
03/06/2020 21:25:46 - INFO - root -   eval_fbeta after epoch 7: 0.180252805352211: 
03/06/2020 21:25:46 - INFO - root -   eval_accuracy after epoch 7: 0.0: 
03/06/2020 21:25:46 - INFO - root -   eval_accuracy_multilabel after epoch 7: 0.621438263229308: 
03/06/2020 21:25:46 - INFO - root -   lr after epoch 7: 1.3442334409213062e-05
03/06/2020 21:25:46 - INFO - root -   train_loss after epoch 7: 0.21938050936069592
03/06/2020 21:25:46 - INFO - root -   

03/06/2020 21:28:26 - INFO - root -   Running evaluation
03/06/2020 21:28:26 - INFO - root -     Num examples = 737
03/06/2020 21:28:26 - INFO - root -     Batch size = 16


03/06/2020 21:28:37 - INFO - root -   eval_loss after epoch 8: 0.2372553678269082: 
03/06/2020 21:28:37 - INFO - root -   eval_accuracy_thresh after epoch 8: 0.9161465764045715: 
03/06/2020 21:28:37 - INFO - root -   eval_roc_auc after epoch 8: 0.8925306476520943: 
03/06/2020 21:28:37 - INFO - root -   eval_fbeta after epoch 8: 0.19271020591259003: 
03/06/2020 21:28:37 - INFO - root -   eval_accuracy after epoch 8: 0.0: 
03/06/2020 21:28:37 - INFO - root -   eval_accuracy_multilabel after epoch 8: 0.6295793758480326: 
03/06/2020 21:28:37 - INFO - root -   lr after epoch 8: 6.317403436757782e-06
03/06/2020 21:28:37 - INFO - root -   train_loss after epoch 8: 0.21028170064293594
03/06/2020 21:28:37 - INFO - root -   

03/06/2020 21:31:17 - INFO - root -   Running evaluation
03/06/2020 21:31:17 - INFO - root -     Num examples = 737
03/06/2020 21:31:17 - INFO - root -     Batch size = 16


03/06/2020 21:31:28 - INFO - root -   eval_loss after epoch 9: 0.23567049760133663: 
03/06/2020 21:31:28 - INFO - root -   eval_accuracy_thresh after epoch 9: 0.9177747964859009: 
03/06/2020 21:31:28 - INFO - root -   eval_roc_auc after epoch 9: 0.893885880790325: 
03/06/2020 21:31:28 - INFO - root -   eval_fbeta after epoch 9: 0.1969798505306244: 
03/06/2020 21:31:28 - INFO - root -   eval_accuracy after epoch 9: 0.0: 
03/06/2020 21:31:28 - INFO - root -   eval_accuracy_multilabel after epoch 9: 0.6485753052917232: 
03/06/2020 21:31:28 - INFO - root -   lr after epoch 9: 1.6326626026727688e-06
03/06/2020 21:31:28 - INFO - root -   train_loss after epoch 9: 0.2048721521042873
03/06/2020 21:31:28 - INFO - root -   

03/06/2020 21:34:08 - INFO - root -   Running evaluation
03/06/2020 21:34:08 - INFO - root -     Num examples = 737
03/06/2020 21:34:08 - INFO - root -     Batch size = 16


03/06/2020 21:34:18 - INFO - root -   eval_loss after epoch 10: 0.23552671804073008: 
03/06/2020 21:34:18 - INFO - root -   eval_accuracy_thresh after epoch 10: 0.9175034165382385: 
03/06/2020 21:34:18 - INFO - root -   eval_roc_auc after epoch 10: 0.8943037311014089: 
03/06/2020 21:34:18 - INFO - root -   eval_fbeta after epoch 10: 0.1971413791179657: 
03/06/2020 21:34:18 - INFO - root -   eval_accuracy after epoch 10: 0.0: 
03/06/2020 21:34:18 - INFO - root -   eval_accuracy_multilabel after epoch 10: 0.6390773405698779: 
03/06/2020 21:34:18 - INFO - root -   lr after epoch 10: 0.0
03/06/2020 21:34:18 - INFO - root -   train_loss after epoch 10: 0.2032173075246294
03/06/2020 21:34:18 - INFO - root -   



(3690, 0.298947709025487)

In [5]:
learner.validate()

03/06/2020 23:41:11 - INFO - root -   Running evaluation
03/06/2020 23:41:11 - INFO - root -     Num examples = 737
03/06/2020 23:41:11 - INFO - root -     Batch size = 16


{'loss': 0.23552671804073008,
 'accuracy_thresh': 0.9175034165382385,
 'roc_auc': 0.8943037311014089,
 'fbeta': 0.1971413791179657,
 'accuracy': 0.0,
 'accuracy_multilabel': 0.6390773405698779}

In [6]:
learner.save_model()

03/06/2020 23:41:38 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/mturk_mar6/output_100/model_out/config.json
03/06/2020 23:41:41 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/mturk_mar6/output_100/model_out/pytorch_model.bin


In [7]:
texts = ['I just received a job offer']
predictions = learner.predict_batch(texts)
print(predictions[0])

03/06/2020 23:41:52 - INFO - root -   Writing example 0 of 1
[('is_hired_1mo', 0.35232582688331604), ('lost_job_1mo', 0.31937041878700256), ('is_unemployed', 0.31286755204200745), ('job_search', 0.23360265791416168), ('job_offer"', 0.056336939334869385)]
