## make sure you have all libraries installed. 
use conda environment: /scratch/da2734/twitter/worldbank_twitter_environment.yml

In [62]:
import os
workers = os.cpu_count()
if 'sched_getaffinity' in dir(os):
    workers = len(os.sched_getaffinity(0))
print('number of cpus:', workers)

import re
with open('/proc/meminfo') as f:
    meminfo = f.read()
matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
if matched: 
    mem_total_kB = int(matched.groups()[0])
# meminfo 
    
print('memory available (GB):', mem_total_kB/1024/1024)

# import os
# mem=str(os.popen('free -t -m').readlines())
# mem

number of cpus: 16
memory available (GB): 251.8069610595703


In [None]:
#gets all this setup
import time
start_time = time.time()
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
# import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
# from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc, accuracy
from fast_bert.metrics import *

torch.cuda.empty_cache()

pd.set_option('display.max_colwidth', -1)
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

# LOG_PATH=Path('/scratch/da2734/twitter/sana/log/')
# DATA_PATH=Path('/scratch/da2734/twitter/sana/data')
# LABEL_PATH=Path('/scratch/da2734/twitter/sana/data/')
# OUTPUT_PATH=Path('/scratch/da2734/twitter/sana/output/')
LOG_PATH=Path('/scratch/da2734/twitter/mturk_mar6/log/')
DATA_PATH=Path('/scratch/da2734/twitter/mturk_mar6/data')
LABEL_PATH=Path('/scratch/da2734/twitter/mturk_mar6/data/')
OUTPUT_PATH=Path('/scratch/da2734/twitter/mturk_mar6/output_100')
FINETUNED_PATH = None

args = Box({
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "labor_market_classification",
    "no_cuda": False,
#     "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 200,
    "learning_rate": 5e-5,
    "num_train_epochs": 6,
    "warmup_proportion": 0.0,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "seed": 42,
    "loss_scale": 128,
    "task_name": 'intent',
    "model_name": 'bert-base-uncased',
    "model_type": 'bert'
})

import logging

logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

# logger.info(args)

device = torch.device('cuda')
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

label_cols = ["job_loss","is_unemployed","job_search","is_hired","job_offer"]

# databunch defined here https://github.com/kaushaltrivedi/fast-bert/blob/master/fast_bert/data_cls.py
databunch = BertDataBunch(
                        args['data_dir'], 
                        LABEL_PATH, 
                        args.model_name, 
                        train_file='train.csv', 
                        val_file='val.csv',
                        # test_data='test.csv',
                        text_col="text", #this is the name of the column in the train file that containts the tweet text
                        label_col=label_cols,
                        batch_size_per_gpu=args['train_batch_size'], 
                        max_seq_length=args['max_seq_length'], 
                        multi_gpu=args.multi_gpu, 
                        multi_label=True, 
                        model_type=args.model_type)

num_labels = len(databunch.labels)
print('num_labels', num_labels)

print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION')
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
print('Active CUDA Device: GPU', torch.cuda.current_device())

print ('Available devices ', torch.cuda.device_count())
# print ('Current cuda device ', torch.cuda.current_device)

# metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'roc_auc_save_to_plot', 'function': roc_auc_save_to_plot})
metrics.append({'name': 'fbeta', 'function': fbeta})
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'accuracy_multilabel', 'function': accuracy_multilabel})


learner = BertLearner.from_pretrained_model(
                                            databunch, 
                                            pretrained_path='/scratch/da2734/twitter/mturk_mar6/output_100/model_out/', 
                                            metrics=metrics, 
                                            device=device, 
                                            logger=logger, 
                                            output_dir=args.output_dir, 
                                            finetuned_wgts_path=FINETUNED_PATH, 
                                            warmup_steps=args.warmup_steps,
                                            multi_gpu=args.multi_gpu, 
                                            is_fp16=args.fp16, 
                                            multi_label=True, 
                                            logging_steps=0)

print('time taken to load all this stuff:', str(time.time() - start_time), 'seconds')



# loading random and filtered samples

In [12]:
# filtered contains 8G of data!!
import time
start_time = time.time()
import pyarrow.parquet as pq
from glob import glob
import os
country_code = 'US'
month = '2012-1'
path_to_data = '/scratch/spf248/twitter/data/classification/US/filtered/'
tweets_filtered=pq.ParquetDataset(glob(os.path.join(path_to_data,                                           
#                                            country_code,
#                                            month,
                                           '*.parquet'))).read().to_pandas()
print('time taken to load keyword filtered sample:', str(time.time() - start_time), 'seconds')
print(tweets_filtered.shape)

time taken to load keyword filtered sample: 329.5322268009186 seconds
(92121093, 11)


In [13]:
# random contains 7.3G of data!!
import time
start_time = time.time()
import pyarrow.parquet as pq
from glob import glob
import os
country_code = 'US'
month = '2012-1'
path_to_data = '/scratch/spf248/twitter/data/classification/US/random'
tweets_random=pq.ParquetDataset(glob(os.path.join(path_to_data,                                           
#                                            country_code,
#                                            month,
                                           '*.parquet'))).read().to_pandas()
print('time taken to load random sample:', str(time.time() - start_time), 'seconds')
print(tweets_random.shape)

time taken to load random sample: 320.91191005706787 seconds
(92114009, 11)


# read csv output from model

In [22]:
import glob
import pandas as pd
import time
start_time = time.time()
model_output_path = '/scratch/spf248/twitter/data/classification/US/BERT/twitter_sam/mturk_mar6/pred/'
model_output_filtered = pd.concat([pd.read_csv(f) for f in glob.glob(model_output_path+'filtered*.csv')], ignore_index = True)
print('time taken to load filtered sample:', str(time.time() - start_time), 'seconds')

import glob
import pandas as pd
import time
start_time = time.time()
model_output_path = '/scratch/spf248/twitter/data/classification/US/BERT/twitter_sam/mturk_mar6/pred/'
model_output_random = pd.concat([pd.read_csv(f) for f in glob.glob(model_output_path+'random*.csv')], ignore_index = True)
print('time taken to load random sample:', str(time.time() - start_time), 'seconds')

model_output_random.columns = ['tweet_id', 'offer_model', 'search_model', 'unemployed_model', 'hired_model', 'loss_model']
model_output_filtered.columns = ['tweet_id', 'search_model', 'unemployed_model', 'offer_model', 'hired_model', 'loss_model']

time taken to load filtered sample: 71.29864716529846 seconds
time taken to load random sample: 69.16849756240845 seconds


In [51]:
model_output_filtered.head()

Unnamed: 0,tweet_id,search_model,unemployed_model,offer_model,hired_model,loss_model
0,367703789572079616,0.069985,0.05853,0.048582,0.03625,0.029396
1,367703936251490304,0.056376,0.046692,0.075235,0.03498,0.032195
2,367704099858300929,0.069512,0.049335,0.056562,0.043067,0.029765
3,367704339076243457,0.064265,0.044906,0.063739,0.041979,0.0369
4,367704640948682752,0.080633,0.048776,0.059704,0.051603,0.030082


In [50]:
model_output_random.columns = ['tweet_id', 'offer_model', 'search_model', 'unemployed_model', 'hired_model', 'loss_model']
model_output_filtered.columns = ['tweet_id', 'search_model', 'unemployed_model', 'offer_model', 'hired_model', 'loss_model']

In [44]:
tweets_random.head()

Unnamed: 0,tweet_id,text,fired,hired,job,laid_off,position,quit,unemployed,work,keyword
0,367881326273105920,@shoebydoo32 I only left to go back home for t...,False,False,False,False,False,False,False,False,False
1,367881326281519105,oh my god bina id idnt read that u played GTA ...,False,False,False,False,False,False,False,False,False
2,367882185916702722,I Have To Make What I Think Is The Best Decisi...,False,False,False,False,False,False,False,False,False
3,367883553121394690,@elizrod_ that's from hard work 😉,False,False,False,False,False,False,False,True,True
4,367883867719348224,RT @MileenaSucks: Can I just lay out in the gr...,False,False,False,False,False,False,False,False,False


# working with smaller samples for each

In [63]:
tweets_random_sample = tweets_random.head(10000)
print(tweets_random_sample.shape)
tweets_filtered_sample = tweets_filtered.head(10000)
print(tweets_filtered_sample.shape)

model_output_filtered_sample = model_output_filtered.head(10000)
print(model_output_filtered_sample.shape)
model_output_random_sample = model_output_random.head(10000)
print(model_output_random_sample.shape)

(10000, 11)
(10000, 11)
(10000, 6)
(10000, 6)


In [68]:
tweets_filtered_sample.head()

Unnamed: 0,tweet_id,text,fired,hired,job,laid_off,position,quit,unemployed,work,keyword
0,276933934124765184,Damn i have to much homework,False,False,False,False,False,False,False,True,True
1,277143632232992768,Does a bedazzler work on leather? Serious ques...,False,False,False,False,False,False,False,True,True
2,277150490276540416,RT @porcelain10: Washington Post D Milbank bas...,False,False,False,False,False,False,False,True,True
3,277151157208637440,Finally off work,False,False,False,False,False,False,False,True,True
4,277175194454462465,Irrational: No BioShock PS Vita until Sony and...,False,False,False,False,False,False,False,True,True


In [69]:
model_output_filtered_sample.head()

Unnamed: 0,tweet_id,search_model,unemployed_model,offer_model,hired_model,loss_model
0,367703789572079616,0.069985,0.05853,0.048582,0.03625,0.029396
1,367703936251490304,0.056376,0.046692,0.075235,0.03498,0.032195
2,367704099858300929,0.069512,0.049335,0.056562,0.043067,0.029765
3,367704339076243457,0.064265,0.044906,0.063739,0.041979,0.0369
4,367704640948682752,0.080633,0.048776,0.059704,0.051603,0.030082


In [71]:
# tweets_filtered_sample.dtypes
# tweets_filtered_sample['tweet_id'] = tweets_filtered_sample['tweet_id'].apply(pd.to_numeric)
# tweets_filtered_sample.dtypes
# merged_filtered_sample = tweets_filtered_sample.merge(model_output_filtered_sample, on='tweet_id')

merged_filtered_sample = pd.concat([tweets_filtered_sample, model_output_filtered_sample], join="inner", axis = 'tweet_id')

merged_filtered_sample.head()

ValueError: No axis named tweet_id for object type <class 'pandas.core.frame.DataFrame'>

# converting original tweets column type to int from object

In [41]:
start_time = time.time()
tweets_filtered['tweet_id'] = tweets_filtered['tweet_id'].apply(pd.to_numeric)
print('time taken for filtered conversion:', str(time.time() - start_time), 'seconds')

KeyboardInterrupt: 

In [None]:
start_time = time.time()
tweets_random['tweet_id'] = tweets_random['tweet_id'].apply(pd.to_numeric)
print('time taken for random conversion:', str(time.time() - start_time), 'seconds')

# merging original tweets with model output

In [None]:
start_time = time.time()
merged_filtered = tweets_filtered.merge(model_output_filtered, on='tweet_id')
print(merged_filtered.shape)
print('time taken for merge filtered:', str(time.time() - start_time), 'seconds')

In [None]:
start_time = time.time()
merged_random = tweets_random.merge(model_output_random, on='tweet_id')
print(merged_random.shape)
print('time taken for merge random:', str(time.time() - start_time), 'seconds')

# then we can pick tweets close to any threshold...

In [58]:
threshold = 0.5
boundary_width = 0.05
columns = ['offer_model','search_model','unemployed_model','hired_model','loss_model']

for column in columns:
    start_time = time.time()
    all_filtered_boundary = merged_filtered.loc[(merged_filtered[column] >= threshold - boundary_width) & 
                                                (merged_filtered[column] <= threshold + boundary_width)]
    all_filtered_boundary.to_csv('../mturk_mar6/boundary/filtered_{}.csv'.format(column))
    # print(all_filtered_boundary['text'])
    print(column, 'filtered time taken:', str(time.time() - start_time), 'seconds')

offer_model filtered time taken: 0.00632476806640625 seconds
search_model filtered time taken: 0.005393505096435547 seconds
unemployed_model filtered time taken: 0.004188060760498047 seconds
hired_model filtered time taken: 0.015344381332397461 seconds
loss_model filtered time taken: 0.0038330554962158203 seconds


In [60]:
# merged_random = merged_filtered_sample

for column in columns:
    start_time = time.time()
    all_filtered_boundary = merged_random.loc[(merged_random[column] >= threshold - boundary_width) & 
                                                (merged_random[column] <= threshold + boundary_width)]
    all_filtered_boundary.to_csv('../mturk_mar6/boundary/random_{}.csv'.format(column))
    # print(all_filtered_boundary['text'])
    print(column, 'random time taken:', str(time.time() - start_time), 'seconds')

offer_model random time taken: 0.004136085510253906 seconds
search_model random time taken: 0.002611875534057617 seconds
unemployed_model random time taken: 0.0023164749145507812 seconds
hired_model random time taken: 0.0022461414337158203 seconds
loss_model random time taken: 0.0023136138916015625 seconds
