In [7]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
# import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

In [8]:
torch.cuda.empty_cache()

In [9]:
pd.set_option('display.max_colwidth', -1)
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

  """Entry point for launching an IPython kernel.


In [10]:
LOG_PATH=Path('/scratch/spf248/twitter/log/')
DATA_PATH=Path('/scratch/spf248/twitter/data/classification/')
LABEL_PATH=Path('/scratch/spf248/twitter/data/classification/')
OUTPUT_PATH=Path('/scratch/spf248/twitter/data/classification/output-0/')
FINETUNED_PATH = None

In [11]:
args = Box({
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "labor_market_classification",
    "no_cuda": False,
#     "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 6,
    "warmup_proportion": 0.0,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "seed": 42,
    "loss_scale": 128,
    "task_name": 'intent',
    "model_name": 'bert-base-uncased',
    "model_type": 'bert'
})

In [6]:
import logging

logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

PermissionError: [Errno 13] Permission denied: '/scratch/spf248/twitter/log/log-2020-02-21_14-12-28-multilabel toxic comments with freezable layers.txt'

In [12]:
logger.info(args)

NameError: name 'logger' is not defined

In [8]:
device = torch.device('cuda')
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

In [9]:
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [10]:
databunch = BertDataBunch(
args['data_dir'], 
LABEL_PATH, 
args.model_name, 
train_file='train_sample.csv', 
val_file='val_sample.csv',
# test_data='test.csv',
text_col="comment_text", 
label_col=label_cols,
batch_size_per_gpu=args['train_batch_size'], 
max_seq_length=args['max_seq_length'], 
multi_gpu=args.multi_gpu, 
multi_label=True, 
model_type=args.model_type)

12/23/2019 07:37:27 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/spf248/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
12/23/2019 07:37:27 - INFO - root -   Loading features from cached file /scratch/spf248/twitter/data/classification/cache/cached_bert_train_multi_label_512_train_sample.csv
12/23/2019 07:37:27 - INFO - root -   Loading features from cached file /scratch/spf248/twitter/data/classification/cache/cached_bert_dev_multi_label_512_val_sample.csv


In [11]:
databunch.train_dl.dataset[0][3]

tensor([0., 0., 0., 0., 0., 0.])

In [12]:
num_labels = len(databunch.labels)
num_labels

6

In [13]:
metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})

In [14]:
learner = BertLearner.from_pretrained_model(
databunch, 
args.model_name, 
metrics=metrics, 
device=device, 
logger=logger, 
output_dir=args.output_dir, 
finetuned_wgts_path=FINETUNED_PATH, 
warmup_steps=args.warmup_steps,
multi_gpu=args.multi_gpu, 
is_fp16=args.fp16, 
multi_label=True, 
logging_steps=0)

12/23/2019 07:37:28 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/spf248/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
12/23/2019 07:37:28 - INFO - transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 6,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,


In [15]:
learner.fit(args.num_train_epochs, args.learning_rate, validate=True)

12/23/2019 07:37:37 - INFO - root -   ***** Running training *****
12/23/2019 07:37:37 - INFO - root -     Num examples = 1000
12/23/2019 07:37:37 - INFO - root -     Num Epochs = 6
12/23/2019 07:37:37 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
12/23/2019 07:37:37 - INFO - root -     Gradient Accumulation steps = 1
12/23/2019 07:37:37 - INFO - root -     Total optimization steps = 750


12/23/2019 07:40:51 - INFO - root -   Running evaluation
12/23/2019 07:40:51 - INFO - root -     Num examples = 1000
12/23/2019 07:40:51 - INFO - root -     Batch size = 16


12/23/2019 07:41:56 - INFO - root -   eval_loss after epoch 1: 0.6285575126844739: 
12/23/2019 07:41:56 - INFO - root -   eval_accuracy_thresh after epoch 1: 0.6639999747276306: 
12/23/2019 07:41:56 - INFO - root -   eval_roc_auc after epoch 1: 0.3302805321928832: 
12/23/2019 07:41:56 - INFO - root -   eval_fbeta after epoch 1: 0.05643989518284798: 
12/23/2019 07:41:56 - INFO - root -   lr after epoch 1: 1.25e-05
12/23/2019 07:41:56 - INFO - root -   train_loss after epoch 1: 0.6460264811515808
12/23/2019 07:41:56 - INFO - root -   

12/23/2019 07:45:11 - INFO - root -   Running evaluation
12/23/2019 07:45:11 - INFO - root -     Num examples = 1000
12/23/2019 07:45:11 - INFO - root -     Batch size = 16


12/23/2019 07:46:16 - INFO - root -   eval_loss after epoch 2: 0.5609555632349045: 
12/23/2019 07:46:16 - INFO - root -   eval_accuracy_thresh after epoch 2: 0.949999988079071: 
12/23/2019 07:46:16 - INFO - root -   eval_roc_auc after epoch 2: 0.39329992010159065: 
12/23/2019 07:46:16 - INFO - root -   eval_fbeta after epoch 2: 0.05643989518284798: 
12/23/2019 07:46:16 - INFO - root -   lr after epoch 2: 2.5e-05
12/23/2019 07:46:16 - INFO - root -   train_loss after epoch 2: 0.6022105584144593
12/23/2019 07:46:16 - INFO - root -   

12/23/2019 07:49:32 - INFO - root -   Running evaluation
12/23/2019 07:49:32 - INFO - root -     Num examples = 1000
12/23/2019 07:49:32 - INFO - root -     Batch size = 16


12/23/2019 07:50:36 - INFO - root -   eval_loss after epoch 3: 0.4331205047312237: 
12/23/2019 07:50:36 - INFO - root -   eval_accuracy_thresh after epoch 3: 0.9696666598320007: 
12/23/2019 07:50:36 - INFO - root -   eval_roc_auc after epoch 3: 0.5390792962764253: 
12/23/2019 07:50:36 - INFO - root -   eval_fbeta after epoch 3: 0.04984325170516968: 
12/23/2019 07:50:36 - INFO - root -   lr after epoch 3: 3.7500000000000003e-05
12/23/2019 07:50:36 - INFO - root -   train_loss after epoch 3: 0.5083287932872772
12/23/2019 07:50:36 - INFO - root -   

12/23/2019 07:53:52 - INFO - root -   Running evaluation
12/23/2019 07:53:52 - INFO - root -     Num examples = 1000
12/23/2019 07:53:52 - INFO - root -     Batch size = 16


12/23/2019 07:54:56 - INFO - root -   eval_loss after epoch 4: 0.32372991480524577: 
12/23/2019 07:54:56 - INFO - root -   eval_accuracy_thresh after epoch 4: 0.9703333377838135: 
12/23/2019 07:54:56 - INFO - root -   eval_roc_auc after epoch 4: 0.5947423372793627: 
12/23/2019 07:54:56 - INFO - root -   eval_fbeta after epoch 4: 0.003888495732098818: 
12/23/2019 07:54:56 - INFO - root -   lr after epoch 4: 5e-05
12/23/2019 07:54:56 - INFO - root -   train_loss after epoch 4: 0.3876285011768341
12/23/2019 07:54:56 - INFO - root -   

12/23/2019 07:58:12 - INFO - root -   Running evaluation
12/23/2019 07:58:12 - INFO - root -     Num examples = 1000
12/23/2019 07:58:12 - INFO - root -     Batch size = 16


12/23/2019 07:59:17 - INFO - root -   eval_loss after epoch 5: 0.2665877581115753: 
12/23/2019 07:59:17 - INFO - root -   eval_accuracy_thresh after epoch 5: 0.9703333377838135: 
12/23/2019 07:59:17 - INFO - root -   eval_roc_auc after epoch 5: 0.710155975590457: 
12/23/2019 07:59:17 - INFO - root -   eval_fbeta after epoch 5: 0.0006250000442378223: 
12/23/2019 07:59:17 - INFO - root -   lr after epoch 5: 2.5e-05
12/23/2019 07:59:17 - INFO - root -   train_loss after epoch 5: 0.30569919633865356
12/23/2019 07:59:17 - INFO - root -   

12/23/2019 08:02:33 - INFO - root -   Running evaluation
12/23/2019 08:02:33 - INFO - root -     Num examples = 1000
12/23/2019 08:02:33 - INFO - root -     Batch size = 16


12/23/2019 08:03:37 - INFO - root -   eval_loss after epoch 6: 0.2569584676197597: 
12/23/2019 08:03:37 - INFO - root -   eval_accuracy_thresh after epoch 6: 0.9703333377838135: 
12/23/2019 08:03:37 - INFO - root -   eval_roc_auc after epoch 6: 0.7013661856036189: 
12/23/2019 08:03:37 - INFO - root -   eval_fbeta after epoch 6: 0.0007142857066355646: 
12/23/2019 08:03:37 - INFO - root -   lr after epoch 6: 0.0
12/23/2019 08:03:37 - INFO - root -   train_loss after epoch 6: 0.27559094417095187
12/23/2019 08:03:37 - INFO - root -   



(750, 0.4542474124232928)

In [16]:
learner.validate()

12/23/2019 08:03:37 - INFO - root -   Running evaluation
12/23/2019 08:03:37 - INFO - root -     Num examples = 1000
12/23/2019 08:03:37 - INFO - root -     Batch size = 16


{'loss': 0.2569584676197597,
 'accuracy_thresh': 0.9703333377838135,
 'roc_auc': 0.7013661856036189,
 'fbeta': 0.0007142857066355646}

In [17]:
learner.save_model()

12/23/2019 08:04:43 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/spf248/twitter/data/classification/output-0/model_out/config.json
12/23/2019 08:04:44 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/spf248/twitter/data/classification/output-0/model_out/pytorch_model.bin


In [19]:
learner.predict_batch(list(pd.read_csv('../data/test.csv')['comment_text'].values))

FileNotFoundError: [Errno 2] File b'../data/test.csv' does not exist: b'../data/test.csv'