In [1]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
# import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

torch.cuda.empty_cache()

pd.set_option('display.max_colwidth', -1)
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

LOG_PATH=Path('/scratch/da2734/twitter/toxic-test/log/')
DATA_PATH=Path('/scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/data')
LABEL_PATH=Path('/scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/label/')
OUTPUT_PATH=Path('/scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/output/')
FINETUNED_PATH = None

args = Box({
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "labor_market_classification",
    "no_cuda": False,
#     "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 6,
    "warmup_proportion": 0.0,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "seed": 42,
    "loss_scale": 128,
    "task_name": 'intent',
    "model_name": 'bert-base-uncased',
    "model_type": 'bert'
})

import logging

logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

logger.info(args)

device = torch.device('cuda')
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

databunch = BertDataBunch(
args['data_dir'], 
LABEL_PATH, 
args.model_name, 
train_file='train_sample.csv', 
val_file='val_sample.csv',
# test_data='test.csv',
text_col="comment_text", 
label_col=label_cols,
batch_size_per_gpu=args['train_batch_size'], 
max_seq_length=args['max_seq_length'], 
multi_gpu=args.multi_gpu, 
multi_label=True, 
model_type=args.model_type)

num_labels = len(databunch.labels)
print('num_labels', num_labels)


02/21/2020 14:45:11 - INFO - root -   {'run_text': 'multilabel toxic comments with freezable layers', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/toxic-test/log'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/data'), 'data_dir': PosixPath('/scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/data'), 'task_name': 'intent', 'no_cuda': False, 'output_dir': PosixPath('/scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/output'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 5e-05, 'num_train_epochs': 6, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': False, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkp



02/21/2020 14:45:11 - INFO - root -   Loading features from cached file /scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/data/cache/cached_bert_dev_multi_label_512_val_sample.csv


In [2]:
databunch.train_dl.dataset[0][3]

tensor([0., 0., 0., 0., 0., 0.])

In [7]:
metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})

In [9]:
learner = BertLearner.from_pretrained_model(
                                            databunch, 
                                            pretrained_path=args.model_name, 
                                            metrics=metrics, 
                                            device=device, 
                                            logger=logger, 
                                            output_dir=args.output_dir, 
                                            finetuned_wgts_path=FINETUNED_PATH, 
                                            warmup_steps=args.warmup_steps,
                                            multi_gpu=args.multi_gpu, 
                                            is_fp16=args.fp16, 
                                            multi_label=True, 
                                            logging_steps=0)

02/21/2020 14:59:05 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/da2734/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
02/21/2020 14:59:05 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings

In [12]:
learner.fit(args.num_train_epochs, args.learning_rate, validate=True)

02/21/2020 15:00:51 - INFO - root -   ***** Running training *****
02/21/2020 15:00:51 - INFO - root -     Num examples = 1000
02/21/2020 15:00:51 - INFO - root -     Num Epochs = 6
02/21/2020 15:00:51 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 16
02/21/2020 15:00:51 - INFO - root -     Gradient Accumulation steps = 1
02/21/2020 15:00:51 - INFO - root -     Total optimization steps = 378




02/21/2020 15:01:41 - INFO - root -   Running evaluation
02/21/2020 15:01:41 - INFO - root -     Num examples = 1000
02/21/2020 15:01:41 - INFO - root -     Batch size = 32


02/21/2020 15:01:54 - INFO - root -   eval_loss after epoch 1: 0.681287344545126: 
02/21/2020 15:01:54 - INFO - root -   eval_accuracy_thresh after epoch 1: 0.4490000009536743: 
02/21/2020 15:01:54 - INFO - root -   eval_roc_auc after epoch 1: 0.3151577318115324: 
02/21/2020 15:01:54 - INFO - root -   eval_fbeta after epoch 1: 0.05643989518284798: 
02/21/2020 15:01:54 - INFO - root -   lr after epoch 1: 6.300000000000001e-06
02/21/2020 15:01:54 - INFO - root -   train_loss after epoch 1: 0.6865965203633384
02/21/2020 15:01:54 - INFO - root -   





02/21/2020 15:02:40 - INFO - root -   Running evaluation
02/21/2020 15:02:40 - INFO - root -     Num examples = 1000
02/21/2020 15:02:40 - INFO - root -     Batch size = 32


02/21/2020 15:02:52 - INFO - root -   eval_loss after epoch 2: 0.6670222021639347: 
02/21/2020 15:02:52 - INFO - root -   eval_accuracy_thresh after epoch 2: 0.534500002861023: 
02/21/2020 15:02:52 - INFO - root -   eval_roc_auc after epoch 2: 0.3294101413082496: 
02/21/2020 15:02:52 - INFO - root -   eval_fbeta after epoch 2: 0.05643989518284798: 
02/21/2020 15:02:52 - INFO - root -   lr after epoch 2: 1.2600000000000001e-05
02/21/2020 15:02:52 - INFO - root -   train_loss after epoch 2: 0.677210976207067
02/21/2020 15:02:52 - INFO - root -   

02/21/2020 15:03:38 - INFO - root -   Running evaluation
02/21/2020 15:03:38 - INFO - root -     Num examples = 1000
02/21/2020 15:03:38 - INFO - root -     Batch size = 32


02/21/2020 15:03:51 - INFO - root -   eval_loss after epoch 3: 0.6441659070551395: 
02/21/2020 15:03:51 - INFO - root -   eval_accuracy_thresh after epoch 3: 0.7038333415985107: 
02/21/2020 15:03:51 - INFO - root -   eval_roc_auc after epoch 3: 0.36211155670664164: 
02/21/2020 15:03:51 - INFO - root -   eval_fbeta after epoch 3: 0.05643989518284798: 
02/21/2020 15:03:51 - INFO - root -   lr after epoch 3: 1.8900000000000002e-05
02/21/2020 15:03:51 - INFO - root -   train_loss after epoch 3: 0.6588888981985668
02/21/2020 15:03:51 - INFO - root -   

02/21/2020 15:04:37 - INFO - root -   Running evaluation
02/21/2020 15:04:37 - INFO - root -     Num examples = 1000
02/21/2020 15:04:37 - INFO - root -     Batch size = 32


02/21/2020 15:04:49 - INFO - root -   eval_loss after epoch 4: 0.6120012123137712: 
02/21/2020 15:04:49 - INFO - root -   eval_accuracy_thresh after epoch 4: 0.8451666831970215: 
02/21/2020 15:04:49 - INFO - root -   eval_roc_auc after epoch 4: 0.41641497381107695: 
02/21/2020 15:04:49 - INFO - root -   eval_fbeta after epoch 4: 0.05643989518284798: 
02/21/2020 15:04:49 - INFO - root -   lr after epoch 4: 2.5200000000000003e-05
02/21/2020 15:04:49 - INFO - root -   train_loss after epoch 4: 0.6327008056262183
02/21/2020 15:04:49 - INFO - root -   

02/21/2020 15:05:35 - INFO - root -   Running evaluation
02/21/2020 15:05:35 - INFO - root -     Num examples = 1000
02/21/2020 15:05:35 - INFO - root -     Batch size = 32


02/21/2020 15:05:48 - INFO - root -   eval_loss after epoch 5: 0.5655335988849401: 
02/21/2020 15:05:48 - INFO - root -   eval_accuracy_thresh after epoch 5: 0.9321666359901428: 
02/21/2020 15:05:48 - INFO - root -   eval_roc_auc after epoch 5: 0.46807923451920075: 
02/21/2020 15:05:48 - INFO - root -   eval_fbeta after epoch 5: 0.05591175705194473: 
02/21/2020 15:05:48 - INFO - root -   lr after epoch 5: 3.15e-05
02/21/2020 15:05:48 - INFO - root -   train_loss after epoch 5: 0.5960280393797254
02/21/2020 15:05:48 - INFO - root -   

02/21/2020 15:06:34 - INFO - root -   Running evaluation
02/21/2020 15:06:34 - INFO - root -     Num examples = 1000
02/21/2020 15:06:34 - INFO - root -     Batch size = 32


02/21/2020 15:06:46 - INFO - root -   eval_loss after epoch 6: 0.5007277810946107: 
02/21/2020 15:06:46 - INFO - root -   eval_accuracy_thresh after epoch 6: 0.9616666436195374: 
02/21/2020 15:06:46 - INFO - root -   eval_roc_auc after epoch 6: 0.5371402159186967: 
02/21/2020 15:06:46 - INFO - root -   eval_fbeta after epoch 6: 0.053894586861133575: 
02/21/2020 15:06:46 - INFO - root -   lr after epoch 6: 3.7800000000000004e-05
02/21/2020 15:06:46 - INFO - root -   train_loss after epoch 6: 0.5420517159832848
02/21/2020 15:06:46 - INFO - root -   



(378, 0.6322461592930334)

In [13]:
learner.validate()

02/21/2020 15:07:22 - INFO - root -   Running evaluation
02/21/2020 15:07:22 - INFO - root -     Num examples = 1000
02/21/2020 15:07:22 - INFO - root -     Batch size = 32


{'loss': 0.5007277810946107,
 'accuracy_thresh': 0.9616666436195374,
 'roc_auc': 0.5371402159186967,
 'fbeta': 0.053894586861133575}

In [14]:
learner.save_model()

02/21/2020 15:07:49 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/output/model_out/config.json
02/21/2020 15:07:49 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/toxic-test/multi_label_toxic_comments/output/model_out/pytorch_model.bin


In [20]:
texts = ['I really love the Netflix original movies', 'this movie is not worth watching']
predictions = learner.predict_batch(texts)
print(predictions[0])

02/21/2020 15:09:03 - INFO - root -   Writing example 0 of 2
[('threat', 0.45684200525283813), ('severe_toxic', 0.4514032006263733), ('toxic', 0.4384770393371582), ('insult', 0.4311715364456177), ('identity_hate', 0.41412460803985596), ('obscene', 0.3430982530117035)]


In [21]:
learner.predict_batch(list(pd.read_csv('../data/test.csv')['comment_text'].values))

FileNotFoundError: [Errno 2] File ../data/test.csv does not exist: '../data/test.csv'