In [1]:
import os
import sys
import numpy as np
import pandas as pd
from timeit import default_timer as timer
from glob import glob
import pyarrow.parquet as pq
import socket
import logging

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from pytorch_pretrained_bert.modeling import BertForNextSentencePrediction
from pytorch_pretrained_bert import BertTokenizer

from fastprogress import master_bar, progress_bar

# Parameters

In [2]:
# Send INFO-and-above records to the root handler using a timestamped layout.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Named logger used by the cells below.
logger = logging.getLogger("bert")

In [3]:
# Seed sentences keyed by integer index. Insertion order is the order in which
# they are listed below; commented-out strings are alternatives that are not used.
targets = {
    0: 'I lost my job today',
    5: 'I was fired earlier this week',
    # 'I recently got laid off',
    # 'I just quit my job',

    1: 'Now I am unemployed',
    6: 'I am currently not working',

    2: 'I am searching for a new position',
    7: 'Anyone hiring?',
    # 'I am looking for a job',

    3: 'I got hired today',
    8: 'I recently started working at my new job',
    # 'I just found a position',

    4: 'Here is a job opportunity you might be interested in',
    9: 'Looking for a new position?',
}

print('Target Sentences:\n')
print(*targets.values(), sep='\n')

Target Sentences:

I lost my job today
I was fired earlier this week
Now I am unemployed
I am currently not working
I am searching for a new position
Anyone hiring?
I got hired today
I recently started working at my new job
Here is a job opportunity you might be interested in
Looking for a new position?


In [4]:
def get_env_var(varname,default):
    """Read an integer setting from the environment, falling back to `default`.

    The resolved value is printed; the line is tagged "(Default)" when the
    variable is not set.

    Args:
        varname: Name of the environment variable to look up.
        default: Value returned as-is (not cast) when the variable is unset.

    Returns:
        `int(value)` when the variable is set, otherwise `default`.

    Raises:
        ValueError: If the variable is set but is not a valid integer literal.
    """
    # Look the variable up once so the presence test and the parse see the same value.
    raw = os.environ.get(varname)
    if raw is not None:
        var = int(raw)
        print(varname,':', var)
    else:
        var = default
        print(varname,':', var,'(Default)')
    return var

# Falls back to 0 when SLURM_ARRAY_TASK_ID is not set in the environment.
SLURM_ARRAY_TASK_ID = get_env_var('SLURM_ARRAY_TASK_ID',0)

SLURM_ARRAY_TASK_ID : 0 (Default)


In [5]:
n_partition = 50
print('# Partitions:', n_partition)

# There is one unit of work per (target sentence, partition) pair;
# a task id at or beyond that count has nothing to do.
if len(targets) * n_partition <= SLURM_ARRAY_TASK_ID:
    sys.exit('Sorry come again ;)')

# Partitions: 50


In [6]:
# Decode the flat task id into a (target, partition) index pair on a
# len(targets) x n_partition grid (row-major, numpy's default order).
i_target, i_partition = np.unravel_index(SLURM_ARRAY_TASK_ID, (len(targets), n_partition))
print("Target Sentence Index:", i_target, ' / ', len(targets)-1)
print("Partition Index:", i_partition, ' / ', n_partition-1)

Target Sentence Index: 0  /  9
Partition Index: 0  /  49


In [7]:
# Seed sentence this task compares every tweet against.
target = targets[i_target]
print("Target Sentence:", target)

Target Sentence: I lost my job today


In [8]:
country = 'US'
print('Country:', country)

# Pick the data root from the host name: a relative path on hosts whose name
# contains 'samuel', the scratch filesystem path everywhere else.
if 'samuel' in socket.gethostname().lower():
    path_to_data = os.path.join('../../data/classification/',country)
else:
    path_to_data = os.path.join('/scratch/spf248/twitter/data/classification/',country)
print("path to data:", path_to_data)

os.makedirs(os.path.join(path_to_data,'similarity'), exist_ok=True)

output_file  = 'target-'+str(i_target)+'-partition-'+str(i_partition)+'.csv'
print("output file:", output_file)

# Skip tasks whose result already exists. The scores are written to the
# 'similarity' subfolder, so that is where the check has to look; checking
# path_to_data directly never matched and finished tasks were recomputed.
if os.path.exists(os.path.join(path_to_data,'similarity',output_file)):
    sys.exit('Let"s go for a walk!')

Country: US
path to data: ../../data/classification/US
output file: target-0-partition-0.csv


In [9]:
# Fraction of the pooled tweets passed to `.sample(frac=...)`; 1.0 keeps all of them.
frac = 1.0
print('Sampling:', frac*100,'%')

Sampling: 1.0 %


In [10]:
# Default
# Length every encoded sentence pair is truncated/padded to, including the
# [CLS] and two [SEP] tokens.
max_seq_length = 128
print('max_seq_length:', max_seq_length)

max_seq_length: 128


In [11]:
# Default
# Number of sentence pairs per DataLoader batch during inference.
BATCH_SIZE = 32
print('BATCH_SIZE:', BATCH_SIZE)

BATCH_SIZE: 32


# Load Data

In [12]:
start = timer()

# List the parquet files in a stable (sorted) order, then keep only the
# slice that belongs to this task's partition.
filtered_shards = sorted(glob(os.path.join(path_to_data, 'filtered', '*.parquet')))
files_filtered = list(np.array_split(filtered_shards, n_partition)[i_partition])
print('Import Files:')
print(*files_filtered, sep='\n')
print('# Files:', len(files_filtered))

# Keep only the 'text' column, indexed by 'tweet_id'.
tweets_filtered = (
    pq.ParquetDataset(files_filtered)
    .read()
    .to_pandas()
    .set_index('tweet_id')['text']
)
print('# Tweets Containing At Least One Keyword:', len(tweets_filtered))

print("Done in", round(timer() - start), "sec")

Import Files:
../../data/classification/US/filtered/part-00000-8b187f2d-a1f8-446d-845c-ad55c236e103-c000.snappy.parquet
# Files: 1
# Tweets Containing At Least One Keyword: 9111
Done in 0 sec


In [13]:
start = timer()

# List the parquet files in a stable (sorted) order, then keep only the
# slice that belongs to this task's partition.
random_shards = sorted(glob(os.path.join(path_to_data, 'random', '*.parquet')))
files_random = list(np.array_split(random_shards, n_partition)[i_partition])
print('Import Files:')
print(*files_random, sep='\n')
print('# Files:', len(files_random))

# Keep only the 'text' column, indexed by 'tweet_id'.
tweets_random = (
    pq.ParquetDataset(files_random)
    .read()
    .to_pandas()
    .set_index('tweet_id')['text']
)
print('# Random Tweets:', len(tweets_random))

print("Done in", round(timer() - start), "sec")

Import Files:
../../data/classification/US/random/part-00000-ee5d86d2-6cc0-4365-82d3-7693f30340c7-c000.snappy.parquet
# Files: 1
# Random Tweets: 8667
Done in 0 sec


In [14]:
print('Partition Data...')
start = timer()

# Pool both sources, then draw a seeded (reproducible) sample of size `frac`.
data = (
    pd.concat([tweets_filtered, tweets_random])
    .sample(frac=frac, random_state=0)
    .copy()
)
print('# Tweets Sampled:', len(data))
# The per-source series are no longer needed once pooled.
del tweets_filtered, tweets_random

print("Done in", round(timer() - start), "sec")

Partition Data...
# Tweets Sampled: 178
Done in 0 sec


In [15]:
# 95th percentile of len(text) over the sampled tweets (assumes every entry is a string).
data.map(len).quantile(0.95)

262.2500000000001

In [16]:
# Peek at the first few sampled tweets.
data.head()

tweet_id
1035899433400000512    RT @nyssabarfield: If another girl got your at...
1016808169560002560    RT @DavidBegnaud: The officer in this video - ...
1050222272743698432    @politico Nikki Haley isn't as stupid as the N...
1042124258703421440    @CBS4Dom @RobCBS4 Beautiful @RobCBS4! Always g...
1029291462633172992    @StormyDaniels So are you going to tell us wha...
Name: text, dtype: object

# Load Model

In [17]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [18]:
# Tokenizer for the "bert-base-uncased" vocabulary.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

12/22/2019 17:06:36 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/samuelfraiberger/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [19]:
# Pretrained "bert-base-uncased" weights with the next-sentence-prediction head,
# moved to the selected device.
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased").to(device)

12/22/2019 17:06:37 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/samuelfraiberger/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
12/22/2019 17:06:37 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/samuelfraiberger/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/f9/mwb36gs54dl_yvpsjtjrsz_80000gn/T/tmpry4tn3qt
12/22/2019 17:06:40 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attent

# Format Data

In [20]:
class InputFeatures:
    """Plain container for the encoded form of one example.

    The four constructor arguments are stored unchanged as attributes of
    the same name: `input_ids`, `input_mask`, `segment_ids`, `target`.
    """

    def __init__(self, input_ids, input_mask, segment_ids, target):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.target = segment_ids, target

In [21]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [22]:
def convert_sentence_pair(first_sentences, second_sentences, max_seq_length, tokenizer):
    """Encode sentence pairs as fixed-length BERT inputs.

    Sentences are paired positionally with `zip`, so the result has as many
    entries as the shorter of the two inputs. The first five encoded examples
    are logged at INFO level through the module-level `logger`.

    Args:
        first_sentences: Iterable of strings used as sequence A.
        second_sentences: Iterable of strings used as sequence B.
        max_seq_length: Exact length of every produced id/mask/segment list,
            including the [CLS] token and the two [SEP] tokens.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.

    Returns:
        A list of `InputFeatures`, one per pair, each with `target` set to 1.
    """
    features = []
    for (ex_index, (first_sentence, second_sentence)) in enumerate(zip(first_sentences, second_sentences)):
        tokens_a = tokenizer.tokenize(first_sentence)
        tokens_b = tokenizer.tokenize(second_sentence)
        # Trim both token lists in place so the pair fits once the three
        # special tokens ([CLS], [SEP], [SEP]) are added.
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

        # Layout of an encoded pair:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # Segment (type) id 0 marks the first sequence together with [CLS] and
        # its [SEP]; id 1 marks the second sequence and its closing [SEP].
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        # An empty second sequence produces a single-sequence input
        # (no second [SEP], all segment ids 0).
        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens.
        input_mask = [1] * len(input_ids)

        # Zero-pad ids, mask and segment ids up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        # Dump the first few examples for a visual sanity check.
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("tokens: %s", " ".join(map(str, tokens)))
            logger.info("input_ids: %s", " ".join(map(str, input_ids)))
            logger.info("input_mask: %s", " ".join(map(str, input_mask)))
            logger.info("segment_ids: %s", " ".join(map(str, segment_ids)))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                target=1,
            )
        )
    return features

In [23]:
# This cell encodes the data; the earlier sampling cell already prints
# 'Partition Data...', so the label here is changed to match what it does.
print('Format Data...')
start = timer()

# Pair the single target sentence (sequence A) with every sampled tweet (sequence B).
sentence_pairs  = convert_sentence_pair(
    [target]*data.shape[0],
    data.tolist(),
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
)

print("Done in", round(timer()-start), "sec")

12/22/2019 17:06:43 - INFO - bert -   *** Example ***
12/22/2019 17:06:43 - INFO - bert -   tokens: [CLS] i lost my job today [SEP] rt @ ny ##ssa ##bar ##field : if another girl got your attention , ion want yours no more [SEP]
12/22/2019 17:06:43 - INFO - bert -   input_ids: 101 1045 2439 2026 3105 2651 102 19387 1030 6396 11488 8237 3790 1024 2065 2178 2611 2288 2115 3086 1010 10163 2215 6737 2053 2062 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/22/2019 17:06:43 - INFO - bert -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/22/2019 17:06:43 - INFO - bert -   segment_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1

Partition Data...
Done in 0 sec


# Infer Sentence Similarities

In [24]:
# BATCH_SIZE comes from the parameters cell; values tried while benchmarking:
# 16, 32, 64, 128, 256, 512, 1024.

print("Infer Similarities...")
start = timer()

logger.info("***** Running evaluation *****")
all_input_ids   = torch.tensor([f.input_ids for f in sentence_pairs], dtype=torch.long)
all_input_mask  = torch.tensor([f.input_mask for f in sentence_pairs], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in sentence_pairs], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

# Sequential order keeps the predictions aligned with `data.index` below.
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE)

logger.info("  Num examples = %d", len(data))
logger.info("  Batch size = %d", BATCH_SIZE)

# Switch the model to evaluation mode before scoring.
model.eval()

res = []

mb = progress_bar(eval_dataloader)
for input_ids, input_mask, segment_ids in mb:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    # No gradients are tracked here, so the result needs no `.detach()`.
    with torch.no_grad():
        # The model is called positionally as (ids, segment ids, attention mask).
        logits = model(input_ids, segment_ids, input_mask)
        # Column 0 of the softmax is kept as the score — presumably the
        # "sequence B follows sequence A" class; confirm against the model docs.
        res.append(nn.functional.softmax(logits, dim=1)[:, 0].cpu().numpy())

res = np.concatenate(res)
scores = pd.DataFrame({'score':res,'target':[target]*len(res)},index=data.index)

print("Done in", round(timer()-start), "sec")

12/22/2019 17:06:43 - INFO - bert -   ***** Running evaluation *****
12/22/2019 17:06:43 - INFO - bert -     Num examples = 178
12/22/2019 17:06:43 - INFO - bert -     Batch size = 32


Infer Similarities...


Done in 65 sec


In [25]:
print("Save...")
start = timer()

# Results are written to the 'similarity' subfolder of the data directory.
scores_path = os.path.join(path_to_data, 'similarity', output_file)
scores.to_csv(scores_path)

print("Done in", round(timer() - start), "sec")

Save...
Done in 0 sec


In [26]:
print('50 most similar tweets:\n')
# `axis` is passed by keyword: pandas 2.0 made every `concat` argument after
# `objs` keyword-only, so the positional form raises TypeError there.
ranked = pd.concat([scores,data],axis=1).sort_values(by='score',ascending=False)
print('\n'.join(ranked['text'].head(50).tolist()))

50 most similar tweets:

I half ass did my makeup today cause I never do my makeup anymore and I have finally quit crying over A Star is Born 💫 https://t.co/GLjZbMUPm9
@AP I read no remorse in the article. Just woe is me. Sorry she finally got caught and it cost her. I am glad the rest of the cast isn’t losing their jobs though.
Had a long ass week at Busch Gardens but I had a lot of fun with my new co workers and here’s a video of one of the the dopest rides in Florida https://t.co/HfptNzvtGW
Alright, work is a shit show. I already fucked shit up. I’m ready for my shift to be over and I still have 5 hours too go. Shoot me.
RT @tryna_be_famous: Just seen the nigga that possibly got fired two nights ago walk into the building. Lemme take my ass home.
On my way to work. Good morning everyone I hope all has a blessed day!
I really love my job.
RT @nhannahjones: There is a man in Greenwood TODAY, whose teeth got shot out of his face for working on voting rights, who lives on the ed…
if i j

In [27]:
# Stop execution here — presumably so that a full run never reaches the
# benchmark notes in the "Test Speed" section below.
sys.exit('See Ya :-)')

SystemExit: See Ya :-)

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# Test Speed

In [None]:
# BATCH_SIZE = 128
# max_seq_length = 140
# 6 40 Intel(R) Skylake @ 2.40GHz 376 4 Tesla V100
# Recorded timings at the settings above. Presumably N is the number of
# tweets scored and T the elapsed time in seconds — TODO confirm units.
pd.DataFrame([
(4933,13),
(49330,131),
(493300,1315),
(4932959,13150), # MaxRSS: 41446568K
],columns=['N','T'])

In [None]:
# N = 49330
# max_seq_length = 140
# 6 40 Intel(R) Skylake @ 2.40GHz 376 4 Tesla V100
# Recorded timings for a fixed N while varying the batch size. Presumably T is
# the elapsed time in seconds — TODO confirm units.
pd.DataFrame([
(1024,132),
(512,132),
(256,131),
(128,131),
(64,134),
(32,138),
(16,147)],columns=['BATCH_SIZE','T'])