In [2]:
# Imports and environment setup (also starts a wall-clock timer).
import time

start_time = time.time()

from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
# import apex
from sklearn.model_selection import train_test_split

import datetime

import sys

sys.path.append('../')
from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, \
    convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import *

import logging


def _configure_run_logging(logfile):
    """Send root-logger output to ``logfile`` and stdout, replacing old handlers.

    ``logging.basicConfig`` is a no-op once the root logger already has
    handlers, so calling it once per column left every column after the first
    writing into the first column's log file.  Removing (and closing) the
    existing handlers first makes each call take effect; this is the same thing
    ``basicConfig(force=True)`` does on Python 3.8+.

    Args:
        logfile: Path (as ``str``) of the log file for the current run.

    Returns:
        The root logger, configured at INFO level.
    """
    root_logger = logging.getLogger()
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()  # releases the previous run's log file handle

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        handlers=[
            logging.FileHandler(logfile),
            logging.StreamHandler(sys.stdout)
        ])
    return root_logger


# Root directory holding the data, log and output folders of this experiment.
BASE_PATH = Path('/scratch/da2734/twitter/mturk_mar6')

# One independent binary classifier is fine-tuned per label column.
LABEL_COLUMNS = ["is_unemployed", "lost_job_1mo", "job_search", "is_hired_1mo", "job_offer"]

# Do not truncate long text when a DataFrame is displayed.  `None` is the
# supported spelling for "unlimited"; negative values were deprecated in
# pandas 1.0.  Set once here because it does not depend on the column.
pd.set_option('display.max_colwidth', None)

for column in LABEL_COLUMNS:
    # Timed per column, so the figure printed below is not inflated by the
    # training time of the columns processed before this one.
    column_start_time = time.time()

    print(column, 'creating model and loading..')

    torch.cuda.empty_cache()

    run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

    LOG_PATH = BASE_PATH / 'log_{}'.format(column)
    DATA_PATH = BASE_PATH / 'data_binary'
    LABEL_PATH = BASE_PATH / 'data_binary'
    OUTPUT_PATH = BASE_PATH / 'output_binary_{}'.format(column)
    FINETUNED_PATH = None

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    LOG_PATH.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

    # Each key appears exactly once.  The original literal repeated `no_cuda`,
    # `seed` and `task_name`; the values kept here are the ones that actually
    # took effect (the last occurrence wins in a dict literal).
    args = Box({
        "run_text": "labor mturk ar 6 binary",
        "train_size": -1,
        "val_size": -1,
        "log_path": LOG_PATH,
        "full_data_dir": DATA_PATH,
        "data_dir": DATA_PATH,
        "task_name": 'intent',
        "no_cuda": False,
        #     "bert_model": BERT_PRETRAINED_PATH,
        "output_dir": OUTPUT_PATH,
        "max_seq_length": 512,
        "do_train": True,
        "do_eval": True,
        "do_lower_case": True,
        "train_batch_size": 8,
        "eval_batch_size": 16,
        "learning_rate": 5e-5,
        "num_train_epochs": 1,
        "warmup_proportion": 0.0,
        "local_rank": -1,
        "seed": 42,
        "gradient_accumulation_steps": 1,
        "optimize_on_cpu": False,
        "fp16": False,
        "fp16_opt_level": "O1",
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "max_grad_norm": 1.0,
        "max_steps": -1,
        "warmup_steps": 500,
        "logging_steps": 50,
        "eval_all_checkpoints": True,
        "overwrite_output_dir": True,
        "overwrite_cache": True,
        "loss_scale": 128,
        "model_name": 'bert-base-uncased',
        "model_type": 'bert'
    })

    logfile = str(LOG_PATH / 'log-{}-{}.txt'.format(run_start_time, args["run_text"]))
    logger = _configure_run_logging(logfile)

    logger.info(args)

    # Apply the configured seed so that every column's run is reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Honour `no_cuda` and fall back to CPU when no GPU is visible instead of
    # failing on a hard-coded 'cuda' device.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device('cuda' if use_cuda else 'cpu')
    args.multi_gpu = use_cuda and torch.cuda.device_count() > 1

    # Single label column for the binary task.
    # Earlier variants: ["job_loss","is_unemployed","job_search","is_hired","job_offer"]
    # and ['pos', 'neg'].
    label_cols = ['pos']

    databunch = BertDataBunch(
        args['data_dir'],
        LABEL_PATH,
        args.model_name,
        train_file='train_{}.csv'.format(column),
        val_file='val_{}.csv'.format(column),
        label_file='label_{}.csv'.format(column),
        # test_data='test.csv',
        text_col="text",  # name of the column in the train file that contains the tweet text
        label_col=label_cols,
        batch_size_per_gpu=args['train_batch_size'],
        max_seq_length=args['max_seq_length'],
        multi_gpu=args.multi_gpu,
        multi_label=False,
        model_type=args.model_type)

    num_labels = len(databunch.labels)
    print('num_labels', num_labels)

    print('time taken to load all this stuff:', str(time.time() - column_start_time), 'seconds')

    # metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
    # Only plain accuracy is tracked; accuracy_thresh, roc_auc, fbeta and
    # accuracy_multilabel were used by earlier variants of this notebook.
    metrics = [{'name': 'accuracy', 'function': accuracy}]

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=args.model_name,
        metrics=metrics,
        device=device,
        logger=logger,
        output_dir=args.output_dir,
        finetuned_wgts_path=FINETUNED_PATH,
        warmup_steps=args.warmup_steps,
        multi_gpu=args.multi_gpu,
        is_fp16=args.fp16,
        multi_label=False,
        logging_steps=0)  # NOTE(review): args.logging_steps (50) is not used here — confirm 0 is intended

    learner.fit(args.num_train_epochs, args.learning_rate, validate=True)  # this trains the model


#     break

is_unemployed creating model and loading..
03/30/2020 15:59:13 - INFO - root -   {'run_text': 'labor mturk ar 6 binary', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/mturk_mar6/log_is_unemployed'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'task_name': 'intent', 'no_cuda': False, 'output_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/output_is_unemployed'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 5e-05, 'num_train_epochs': 1, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': False, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overw



03/30/2020 15:59:13 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/da2734/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
03/30/2020 15:59:13 - INFO - root -   Loading features from cached file /scratch/da2734/twitter/mturk_mar6/data_binary/cache/cached_bert_train_multi_class_512_train_is_unemployed.csv
03/30/2020 15:59:14 - INFO - root -   Loading features from cached file /scratch/da2734/twitter/mturk_mar6/data_binary/cache/cached_bert_dev_multi_class_512_val_is_unemployed.csv
num_labels 2
time taken to load all this stuff: 0.8670964241027832 seconds
03/30/2020 15:59:14 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/da2734/.cache/torch/transfo

saving to file
/scratch/da2734/twitter/mturk_mar6/output_is_unemployed/model_out_0
03/30/2020 15:59:18 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/mturk_mar6/output_is_unemployed/model_out_0/config.json
03/30/2020 15:59:20 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/mturk_mar6/output_is_unemployed/model_out_0/pytorch_model.bin
03/30/2020 16:03:15 - INFO - root -   Running evaluation
03/30/2020 16:03:15 - INFO - root -     Num examples = 1101
03/30/2020 16:03:15 - INFO - root -     Batch size = 16


03/30/2020 16:03:31 - INFO - root -   eval_loss after epoch 1: 0.5354319494487583: 
03/30/2020 16:03:31 - INFO - root -   eval_accuracy after epoch 1: 0.7801998183469573: 
>> saving to..  /scratch/da2734/twitter/mturk_mar6/output_is_unemployed/results.json
03/30/2020 16:03:31 - INFO - root -   lr after epoch 1: 0.0
03/30/2020 16:03:31 - INFO - root -   train_loss after epoch 1: 0.43081418589812226
03/30/2020 16:03:31 - INFO - root -   

lost_job_1mo creating model and loading..
03/30/2020 16:03:31 - INFO - root -   {'run_text': 'labor mturk ar 6 binary', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/mturk_mar6/log_lost_job_1mo'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'task_name': 'intent', 'no_cuda': False, 'output_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/output_lost_job_1mo'), 'max_seq_length': 512, 'do_train': True, 'do_eval': 

saving to file
/scratch/da2734/twitter/mturk_mar6/output_lost_job_1mo/model_out_0
03/30/2020 16:03:46 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/mturk_mar6/output_lost_job_1mo/model_out_0/config.json
03/30/2020 16:03:48 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/mturk_mar6/output_lost_job_1mo/model_out_0/pytorch_model.bin
03/30/2020 16:07:44 - INFO - root -   Running evaluation
03/30/2020 16:07:44 - INFO - root -     Num examples = 1103
03/30/2020 16:07:44 - INFO - root -     Batch size = 16


03/30/2020 16:08:00 - INFO - root -   eval_loss after epoch 1: 0.3221706247524075: 
03/30/2020 16:08:00 - INFO - root -   eval_accuracy after epoch 1: 0.898458748866727: 
>> saving to..  /scratch/da2734/twitter/mturk_mar6/output_lost_job_1mo/results.json
03/30/2020 16:08:00 - INFO - root -   lr after epoch 1: 0.0
03/30/2020 16:08:00 - INFO - root -   train_loss after epoch 1: 0.44141741756997677
03/30/2020 16:08:00 - INFO - root -   

job_search creating model and loading..
03/30/2020 16:08:00 - INFO - root -   {'run_text': 'labor mturk ar 6 binary', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/mturk_mar6/log_job_search'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'task_name': 'intent', 'no_cuda': False, 'output_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/output_job_search'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'd

saving to file
/scratch/da2734/twitter/mturk_mar6/output_job_search/model_out_0
03/30/2020 16:08:15 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/mturk_mar6/output_job_search/model_out_0/config.json
03/30/2020 16:08:17 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/mturk_mar6/output_job_search/model_out_0/pytorch_model.bin
03/30/2020 16:12:14 - INFO - root -   Running evaluation
03/30/2020 16:12:14 - INFO - root -     Num examples = 1107
03/30/2020 16:12:14 - INFO - root -     Batch size = 16


03/30/2020 16:12:30 - INFO - root -   eval_loss after epoch 1: 0.3901121497154236: 
03/30/2020 16:12:30 - INFO - root -   eval_accuracy after epoch 1: 0.8654019873532068: 
>> saving to..  /scratch/da2734/twitter/mturk_mar6/output_job_search/results.json
03/30/2020 16:12:30 - INFO - root -   lr after epoch 1: 0.0
03/30/2020 16:12:30 - INFO - root -   train_loss after epoch 1: 0.5369812721379827
03/30/2020 16:12:30 - INFO - root -   

is_hired_1mo creating model and loading..
03/30/2020 16:12:30 - INFO - root -   {'run_text': 'labor mturk ar 6 binary', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/mturk_mar6/log_is_hired_1mo'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'task_name': 'intent', 'no_cuda': False, 'output_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/output_is_hired_1mo'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True

saving to file
/scratch/da2734/twitter/mturk_mar6/output_is_hired_1mo/model_out_0
03/30/2020 16:12:45 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/mturk_mar6/output_is_hired_1mo/model_out_0/config.json
03/30/2020 16:12:48 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/mturk_mar6/output_is_hired_1mo/model_out_0/pytorch_model.bin
03/30/2020 16:16:44 - INFO - root -   Running evaluation
03/30/2020 16:16:44 - INFO - root -     Num examples = 1104
03/30/2020 16:16:44 - INFO - root -     Batch size = 16


03/30/2020 16:17:00 - INFO - root -   eval_loss after epoch 1: 0.23548285317593726: 
03/30/2020 16:17:00 - INFO - root -   eval_accuracy after epoch 1: 0.9365942028985508: 
>> saving to..  /scratch/da2734/twitter/mturk_mar6/output_is_hired_1mo/results.json
03/30/2020 16:17:00 - INFO - root -   lr after epoch 1: 0.0
03/30/2020 16:17:00 - INFO - root -   train_loss after epoch 1: 0.44552475091400023
03/30/2020 16:17:00 - INFO - root -   

job_offer creating model and loading..
03/30/2020 16:17:00 - INFO - root -   {'run_text': 'labor mturk ar 6 binary', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/mturk_mar6/log_job_offer'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'data_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/data_binary'), 'task_name': 'intent', 'no_cuda': False, 'output_dir': PosixPath('/scratch/da2734/twitter/mturk_mar6/output_job_offer'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do

saving to file
/scratch/da2734/twitter/mturk_mar6/output_job_offer/model_out_0
03/30/2020 16:17:15 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/mturk_mar6/output_job_offer/model_out_0/config.json
03/30/2020 16:17:17 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/mturk_mar6/output_job_offer/model_out_0/pytorch_model.bin
03/30/2020 16:21:13 - INFO - root -   Running evaluation
03/30/2020 16:21:13 - INFO - root -     Num examples = 1105
03/30/2020 16:21:13 - INFO - root -     Batch size = 16


03/30/2020 16:21:30 - INFO - root -   eval_loss after epoch 1: 0.23045690836650984: 
03/30/2020 16:21:30 - INFO - root -   eval_accuracy after epoch 1: 0.9149321266968325: 
>> saving to..  /scratch/da2734/twitter/mturk_mar6/output_job_offer/results.json
03/30/2020 16:21:30 - INFO - root -   lr after epoch 1: 0.0
03/30/2020 16:21:30 - INFO - root -   train_loss after epoch 1: 0.4242126832549438
03/30/2020 16:21:30 - INFO - root -   



In [5]:
# Run evaluation with the current `learner` and display the metrics dict it returns.
# NOTE(review): `learner` is rebound on every pass of the training loop, so at this
# point it is the model trained for the last column only — confirm that is intended.
# NOTE(review): the output stored under this cell is dated 03/06/2020 and lists
# multi-label metrics, i.e. it predates the binary training run; re-run to refresh.
learner.validate()

03/06/2020 23:41:11 - INFO - root -   Running evaluation
03/06/2020 23:41:11 - INFO - root -     Num examples = 737
03/06/2020 23:41:11 - INFO - root -     Batch size = 16


{'loss': 0.23552671804073008,
 'accuracy_thresh': 0.9175034165382385,
 'roc_auc': 0.8943037311014089,
 'fbeta': 0.1971413791179657,
 'accuracy': 0.0,
 'accuracy_multilabel': 0.6390773405698779}

In [6]:
# Persist the current `learner`'s model to disk.  The stored log shows a
# config.json and a pytorch_model.bin being written into a `model_out/` directory.
# NOTE(review): only the learner left over from the last loop iteration is saved
# here — verify that the other columns' models are saved elsewhere if needed.
learner.save_model()

03/06/2020 23:41:38 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/mturk_mar6/output_100/model_out/config.json
03/06/2020 23:41:41 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/mturk_mar6/output_100/model_out/pytorch_model.bin


In [7]:
# Smoke test: score one hand-written tweet with the current `learner`.
# `predict_batch` takes a list of texts; the first element of its result is printed.
# In the stored output that element is a list of (label, score) pairs.
# NOTE(review): the stored output contains the label 'job_offer"' with a stray
# double quote — presumably a typo in the label file used for that run; verify.
texts = ['I just received a job offer']
predictions = learner.predict_batch(texts)
print(predictions[0])

03/06/2020 23:41:52 - INFO - root -   Writing example 0 of 1
[('is_hired_1mo', 0.35232582688331604), ('lost_job_1mo', 0.31937041878700256), ('is_unemployed', 0.31286755204200745), ('job_search', 0.23360265791416168), ('job_offer"', 0.056336939334869385)]
