In [4]:
# Set up imports, paths and hyperparameters, then fine-tune a BERT classifier for one label column.
import time

# Wall-clock start, taken before the transformers/torch imports below so the
# "time taken to load" report printed later also covers import time.
start_time = time.time()

from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
# import apex
from sklearn.model_selection import train_test_split

import datetime

import sys

sys.path.append('../')
from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, \
    convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import *

# Label column to train a binary classifier for. Hard-coded for notebook use;
# the commented lines are the script/loop variants of the same selection.
# column = sys.argv[1]
column = 'is_unemployed'

# for column in ["is_unemployed", "lost_job_1mo", "job_search", "is_hired_1mo", "job_offer"]:

print(column, 'creating model and loading..')

# Release cached GPU memory left over from earlier cells/runs.
torch.cuda.empty_cache()

# None means "never truncate cell text". The old sentinel -1 is deprecated
# since pandas 1.0 and rejected outright by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Timestamp used to give each run its own log file name.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

LOG_PATH = Path('/scratch/da2734/twitter/jobs/training_binary/model_log_binary_pos_neg_{}/'.format(column))
OUTPUT_PATH = Path('/scratch/da2734/twitter/jobs/training_binary/models_may5_7Klabels_removed_allzeros/output_{}'.format(column))

# Create the per-column log and model-output directories. exist_ok avoids the
# check-then-create race and keeps each path literal in exactly one place.
LOG_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

print('LOG_PATH', LOG_PATH)

# Training/validation CSVs and the label file live in the same directory.
DATA_PATH = Path('/scratch/da2734/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros/')
LABEL_PATH = Path('/scratch/da2734/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros/')

# No previously fine-tuned weights are loaded for this run.
FINETUNED_PATH = None

# Run configuration. Box allows both args["key"] and args.key access.
# The original literal listed "task_name" and "seed" twice; in a dict literal
# the last duplicate wins, so "task_name" was effectively 'intent' (the earlier
# "labor_market_classification" was silently discarded). Each key now appears
# once, at its original position and with its effective value, so the logged
# configuration is unchanged.
# NOTE(review): confirm 'intent' is the intended task name.
args = Box({
    "run_text": "labor mturk may 5 binary",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": 'intent',
    #     "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 50,
    "warmup_proportion": 0.0,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": True,
    "loss_scale": 128,
    "model_name": 'bert-base-uncased',
    "model_type": 'bert',
})

import logging

# One log file per run, named after the run's start timestamp and description.
logfile = str(LOG_PATH.joinpath('log-{}-{}.txt'.format(run_start_time, args.run_text)))

# Send every INFO-and-above record both to the run's log file and to stdout.
logging.basicConfig(
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout),
    ],
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)

# Root logger; it is also handed to the learner further down.
logger = logging.getLogger()

# Record the full run configuration at the top of the log.
logger.info(args)

# Pick the training device. The original always requested 'cuda' and ignored
# args.no_cuda; now the flag is honoured and a machine without CUDA falls back
# to the CPU (with a warning) instead of failing later inside the learner.
device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
if device.type != 'cuda':
    logger.warning('CUDA is unavailable or disabled via no_cuda; running on CPU')

# Multi-GPU mode only applies when actually running on CUDA with more than one
# visible device.
args.multi_gpu = device.type == 'cuda' and torch.cuda.device_count() > 1

# Name of the column in the train and val CSV files that holds the labels.
label_cols = ['class']

# Build the data bunch from the per-column train/val/label CSV files.
# The first three positional arguments are the data directory, the label
# directory and the model name (passed in the tokenizer position).
databunch = BertDataBunch(
    args.data_dir,
    LABEL_PATH,
    args['model_name'],
    train_file='train_{}.csv'.format(column),
    val_file='val_{}.csv'.format(column),
    label_file='label_{}.csv'.format(column),
    # test_data='test.csv',
    text_col="text",  # name of the column in the train file that contains the tweet text
    label_col=label_cols,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args['multi_gpu'],
    multi_label=False,
    model_type=args['model_type'],
)

# Number of distinct classes found by the data bunch.
num_labels = len(databunch.labels)
print('num_labels', num_labels)

# Elapsed wall-clock time since start_time was taken at the top of the cell.
print('time taken to load all this stuff:', time.time() - start_time, 'seconds')

# Validation metrics handed to the learner. Each entry pairs a display name
# with a metric function imported from fast_bert.metrics.
# metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
# Alternatives left disabled for this single-label run:
# {'name': 'accuracy_thresh', 'function': accuracy_thresh}
# {'name': 'roc_auc', 'function': roc_auc}
# {'name': 'fbeta', 'function': fbeta}
# {'name': 'accuracy_multilabel', 'function': accuracy_multilabel}
metrics = [
    {'name': 'accuracy', 'function': accuracy},
]

# Build the learner from the pretrained model named in args, writing its
# output to args.output_dir.
# NOTE(review): logging_steps is passed as 0 here although args.logging_steps
# is 50 - confirm which value is intended.
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path=args['model_name'],
    metrics=metrics,
    device=device,
    logger=logger,
    output_dir=args['output_dir'],
    finetuned_wgts_path=FINETUNED_PATH,
    warmup_steps=args['warmup_steps'],
    multi_gpu=args['multi_gpu'],
    is_fp16=args['fp16'],
    multi_label=False,
    logging_steps=0,
)

# Train the model for the configured number of epochs and learning rate,
# with validation enabled.
learner.fit(args['num_train_epochs'], args['learning_rate'], validate=True)


#     break

is_unemployed creating model and loading..
LOG_PATH /scratch/da2734/twitter/jobs/training_binary/model_log_binary_pos_neg_is_unemployed
05/05/2020 16:00:20 - INFO - root -   {'run_text': 'labor mturk may 5 binary', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('/scratch/da2734/twitter/jobs/training_binary/model_log_binary_pos_neg_is_unemployed'), 'full_data_dir': PosixPath('/scratch/da2734/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros'), 'data_dir': PosixPath('/scratch/da2734/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros'), 'task_name': 'intent', 'output_dir': PosixPath('/scratch/da2734/twitter/jobs/training_binary/models_may5_7Klabels_removed_allzeros/output_is_unemployed'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 5e-05, 'num_train_epochs': 50, 'warmup_proportion': 0.0, 'no_cuda': False, 'local_rank': -1, 'seed': 42, 'g



05/05/2020 16:00:22 - INFO - root -   Writing example 0 of 2147
05/05/2020 16:00:23 - INFO - root -   Saving features into cached file /scratch/da2734/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros/cache/cached_bert_train_multi_class_512_train_is_unemployed.csv
05/05/2020 16:00:24 - INFO - root -   Writing example 0 of 536
05/05/2020 16:00:25 - INFO - root -   Saving features into cached file /scratch/da2734/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros/cache/cached_bert_dev_multi_class_512_val_is_unemployed.csv
num_labels 2
time taken to load all this stuff: 4.663341283798218 seconds
05/05/2020 16:00:25 - INFO - filelock -   Lock 47383918855440 acquired on /home/da2734/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517.lock
05/05/2020 16:00:25 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/ber

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…


05/05/2020 16:00:25 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json in cache at /home/da2734/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
05/05/2020 16:00:25 - INFO - transformers.file_utils -   creating metadata file for /home/da2734/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
05/05/2020 16:00:25 - INFO - filelock -   Lock 47383918855440 released on /home/da2734/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517.lock
05/05/2020 16:00:25 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.

saving to file
/scratch/da2734/twitter/jobs/training_binary/models_may5_7Klabels_removed_allzeros/output_is_unemployed/model_out_0
05/05/2020 16:00:36 - INFO - transformers.configuration_utils -   Configuration saved in /scratch/da2734/twitter/jobs/training_binary/models_may5_7Klabels_removed_allzeros/output_is_unemployed/model_out_0/config.json
05/05/2020 16:00:36 - INFO - transformers.modeling_utils -   Model weights saved in /scratch/da2734/twitter/jobs/training_binary/models_may5_7Klabels_removed_allzeros/output_is_unemployed/model_out_0/pytorch_model.bin


KeyboardInterrupt: 