In [1]:
!pip install pyarrow --user

Collecting pyarrow
  Downloading pyarrow-1.0.1-cp38-cp38-manylinux2014_x86_64.whl (17.3 MB)
[K     |████████████████████████████████| 17.3 MB 6.4 MB/s eta 0:00:01     |███████████████████▎            | 10.4 MB 6.4 MB/s eta 0:00:02
Installing collected packages: pyarrow
Successfully installed pyarrow-1.0.1


In [1]:
import transformers

In [2]:
from transformers import BertTokenizer, BertModel, BertConfig, BertForTokenClassification, BertForMaskedLM
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, FillMaskPipeline
import torch
import os
import pandas as pd
import pyarrow
import time

In [9]:
pd.options.display.max_colwidth = 100

## Utils

In [3]:
def get_token_in_sequence_with_most_attention(model, tokenizer, input_sequence):
    """Run an input sequence through the BERT model, collect and average attention scores per token and return token with most average attention."""
    tokenized_input_sequence = tokenizer.tokenize(input_sequence)
    input_ids = torch.tensor(tokenizer.encode(input_sequence, add_special_tokens=False)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states, pooler_outputs, hidden_states, attentions = outputs
    attention_tensor = torch.squeeze(torch.stack(attentions))
    attention_tensor_averaged = torch.mean(attention_tensor, (0, 1))
    attention_average_scores_per_token = torch.sum(attention_tensor_averaged, dim=0)
    attention_scores_dict = dict()
    for token_position in range(len(tokenized_input_sequence)):
        attention_scores_dict[token_position] = attention_average_scores_per_token[token_position].item()
    print('Attention scores dictionary: ', attention_scores_dict, '\n')
    return {'token_index': max(attention_scores_dict, key=attention_scores_dict.get),
            'token_str': tokenized_input_sequence[max(attention_scores_dict, key=attention_scores_dict.get)]}

In [4]:
def extract_keywords_from_mlm_results(mlm_results_list, K_kw_explore):
    selected_keywords_list = list()
    for rank_mlm_keyword in range(K_kw_explore):
        selected_keywords_list.append(mlm_results_list[rank_mlm_keyword]['token_str'])
    return selected_keywords_list

In [3]:
model_folder_path = '/scratch/mt4493/twitter_labor/trained_models/DeepPavlov_bert-base-cased-conversational_jul23_iter0_preprocessed_12207397'
data_folder_path = '/scratch/mt4493/twitter_labor/twitter-labor-data/data/inference/DeepPavlov_bert-base-cased-conversational_jul23_iter0_preprocessed_12207397-12226078/output'

In [4]:
column = 'job_offer'
job_offer_df = pd.read_parquet(os.path.join(data_folder_path, column, '{}_all.parquet'.format(column)))
job_offer_df = job_offer_df[~job_offer_df.text.str.contains("RT", na=False)].reset_index(drop=True)
job_offer_df = job_offer_df.sort_values(by=["score"], ascending=False).reset_index()
job_offer_df.head()

Unnamed: 0,index,score,text
0,25897079,0.993284,Patient Registration Representative - Patient ...
1,10520163,0.99302,Raleigh #NC #USA - Senior Software Engineer Us...
2,51369362,0.993005,#Career #opportunity for #Administrative Assis...
3,47480907,0.992976,"We're #hiring! Click to apply: Truck Driver, C..."
4,55766589,0.992974,Patient Services Representative(PSR) -... - Su...


In [10]:
top_job_offer_df = job_offer_df[:350000]
top_job_offer_df.tail(n=50)

Unnamed: 0,index,score,text
349950,34394126,0.979542,#jobs #oklahoma Registered Nurse I or II: Overview: We\'re a Little Different Our mission is… ht...
349951,20340313,0.97954,"San Clemente, CA - Administrative Assistant - Bilingual Spanish - PrideStaff: to 4PM. The starti..."
349952,38615196,0.97954,LIFE HEALTH: Optician (Optometry Tech) Ireland Army Community Hospital at Bristol Bay Native (Fo...
349953,51168110,0.979539,"https://t.co/YrirGZBSRy Retweeted #Stockton, CA #Nursing : Nursing Supervisor RN - Part Time - W..."
349954,24972159,0.979538,I have a job opening for a SAP MM/MRP expert. Contact Me in midwest http://t.co/ggT9mWm1tx #cont...
349955,29731906,0.979537,Production artist needed! North Charlotte area. $14-16/hr Send resume &amp; portfolio to Gavan.L...
349956,54109235,0.979537,#Job #Charlotte ☀☀$360.Cash/Day. (For 12 Days) Start Tomorrow__Help Moving &amp; Cleaning Big Ap...
349957,52821087,0.979536,Pompano Beach FL - Need Graphic Designer - Service needed Presentation PowerPoint. Type of organ...
349958,7746026,0.979535,United States Postal Service is now hiring CITY CARRIER ASSISTANT.\nhttps://t.co/pA0xXDZH4p
349959,32278021,0.979535,Attention Employers! Get FREE #Staffing for #SalesJobs. CFM #Talent4Hire. No Fees Staffing... ht...


In [11]:
top_job_offer_df.to_parquet(os.path.join(data_folder_path, column, '{}_top.parquet'.format(column)))

In [4]:
column = 'is_hired_1mo'
is_hired_1mo_df = pd.read_parquet(os.path.join(data_folder_path, column, '{}_all.parquet'.format(column)))
is_hired_1mo_df = is_hired_1mo_df.sort_values(by=["score"], ascending=False).reset_index()
is_hired_1mo_df.head()

Unnamed: 0,tweet_id,score,text
0,329284724180787201,0.967185,I just got hired !
1,298117857798144000,0.966951,"Well, I got the job!"
2,978991385151004672,0.96651,I got the job!!!
3,662755069675327489,0.966293,Got hired today!!!
4,460953833707802624,0.966203,I got the job!!!! 🙏


In [5]:
    labels = ['is_hired_1mo', 'is_unemployed', 'job_offer', 'job_search', 'lost_job_1mo']
    base_rates = [
        1.7342911457049017e-05,
        0.0003534645020523677,
        0.005604641971672389,
        0.00015839552996469054,
        1.455338466552472e-05]
    N_random = 92114009
    base_ranks = [int(x * N_random) for x in base_rates]
    label2rank = dict(zip(labels, base_ranks))

In [10]:
    model_path = os.path.join(model_folder_path, column, 'models', 'best_model')
    config = BertConfig.from_pretrained(model_path, output_hidden_states=True, output_attentions=True)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertModel.from_pretrained(model_path, config=config)

In [6]:
df = is_hired_1mo_df[:label2rank[column]]
df.head()

Unnamed: 0,tweet_id,score,text
0,329284724180787201,0.967185,I just got hired !
1,298117857798144000,0.966951,"Well, I got the job!"
2,978991385151004672,0.96651,I got the job!!!
3,662755069675327489,0.966293,Got hired today!!!
4,460953833707802624,0.966203,I got the job!!!! 🙏


In [36]:
df['tokenized_text'] = df['text'].apply(tokenizer.tokenize)
df_wordcount = df.explode('tokenized_text')
df_wordcount.head(n=50)
#df.groupby('tokenized_text').count()
#df_wordcount.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokenized_text'] = df['text'].apply(tokenizer.tokenize)


Unnamed: 0,tweet_id,score,text,tokenized_text
0,329284724180787201,0.967185,I just got hired !,I
0,329284724180787201,0.967185,I just got hired !,just
0,329284724180787201,0.967185,I just got hired !,got
0,329284724180787201,0.967185,I just got hired !,hired
0,329284724180787201,0.967185,I just got hired !,!
1,298117857798144000,0.966951,"Well, I got the job!",Well
1,298117857798144000,0.966951,"Well, I got the job!",","
1,298117857798144000,0.966951,"Well, I got the job!",I
1,298117857798144000,0.966951,"Well, I got the job!",got
1,298117857798144000,0.966951,"Well, I got the job!",the


In [37]:
df_wordcount = df_wordcount['tokenized_text'].value_counts().rename_axis('word').reset_index(name='count_top_tweets')

Unnamed: 0,unique_values,counts
0,!,1264
1,job,1160
2,new,816
3,I,810
4,my,671
...,...,...
2477,tables,1
2478,begin,1
2479,Cl,1
2480,##pool,1


## Data and model import 

In [7]:
start_time = time.time()
for column in ['is_hired_1mo', 'is_unemployed', 'job_offer', 'job_search', 'lost_job_1mo']:
    print('****************{}****************'.format(column))
    #load model
    model_path = os.path.join(model_folder_path, column, 'models', 'best_model')
    config = BertConfig.from_pretrained(model_path, output_hidden_states=True, output_attentions=True)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertModel.from_pretrained(model_path, config=config)
    #load data
    data_path = os.path.join(data_folder_path, column, '{}_all.parquet'.format(column))
    df = pd.read_parquet(data_path)
    df = df[~df.text.str.contains("RT", na=False)].reset_index(drop=True)
    df = df.sort_values(by=['score'], ascending=False).reset_index(drop=True)
    for tweet_index in range(10):
        print('---> Tweet #{}'.format(str(tweet_index)))
        input_sequence = df['text'][tweet_index]
        tokenized_tweet = tokenizer.tokenize(input_sequence)
        print('Tweet: {} \n'.format(input_sequence))
        print('Tokenized version of tweet: {} \n'.format(tokenized_tweet))
        #identify high attention token
        print('$$$ Attention $$$\n')
        print('')
        attention_results_dict = get_token_in_sequence_with_most_attention(model, tokenizer, input_sequence)
        attention_token_str = attention_results_dict['token_str']
        attention_token_index = attention_results_dict['token_index']
        print('Token with the most attention: "{}"'.format(attention_token_str))
    del df
    print("--- %s seconds ---" % (time.time() - start_time))

****************is_hired_1mo****************
---> Tweet #0
Tweet: I just got hired ! 

Tokenized version of tweet: ['I', 'just', 'got', 'hired', '!'] 

$$$ Attention $$$


Attention scores dictionary:  {0: 1.448033332824707, 1: 0.82781982421875, 2: 0.7723453044891357, 3: 1.2157444953918457, 4: 0.7360572218894958} 

Token with the most attention: "I"
---> Tweet #1
Tweet: Well, I got the job! 

Tokenized version of tweet: ['Well', ',', 'I', 'got', 'the', 'job', '!'] 

$$$ Attention $$$


Attention scores dictionary:  {0: 1.6094640493392944, 1: 0.776115894317627, 2: 0.8612417578697205, 3: 1.0840789079666138, 4: 0.7891156673431396, 5: 1.083493709564209, 6: 0.7964902520179749} 

Token with the most attention: "Well"
---> Tweet #2
Tweet: I got the job!!! 

Tokenized version of tweet: ['I', 'got', 'the', 'job', '!', '!', '!'] 

$$$ Attention $$$


Attention scores dictionary:  {0: 1.718056082725525, 1: 1.1336854696273804, 2: 0.9232265949249268, 3: 1.216264009475708, 4: 0.680431604385376, 5: 0

Attention scores dictionary:  {0: 2.0183258056640625, 1: 0.6758396029472351, 2: 0.7621186971664429, 3: 1.097381353378296, 4: 1.0040760040283203, 5: 0.5110704898834229, 6: 0.8319424986839294, 7: 1.099245548248291} 

Token with the most attention: "J"
---> Tweet #8
Tweet: I'm homeless https://t.co/m28haQJToV 

Tokenized version of tweet: ['I', "'", 'm', 'homeless', 'https', ':', '/', '/', 't', '.', 'co', '/', 'm', '##28', '##ha', '##Q', '##J', '##To', '##V'] 

$$$ Attention $$$


Attention scores dictionary:  {0: 2.175520658493042, 1: 0.8550349473953247, 2: 1.0308195352554321, 3: 1.5380719900131226, 4: 1.2373863458633423, 5: 0.5842524170875549, 6: 0.5080752968788147, 7: 0.6225125193595886, 8: 0.9367058873176575, 9: 1.0400174856185913, 10: 0.8522230386734009, 11: 0.7216823101043701, 12: 0.8540211319923401, 13: 1.1991965770721436, 14: 0.9593065977096558, 15: 1.1466953754425049, 16: 1.0031530857086182, 17: 0.8571969866752625, 18: 0.878128170967102} 

Token with the most attention: "I"
---> 

Attention scores dictionary:  {0: 3.6904642581939697, 1: 1.1555966138839722, 2: 1.1546025276184082, 3: 2.0413217544555664, 4: 1.0345762968063354, 5: 0.9986982941627502, 6: 0.8164278268814087, 7: 0.9660937786102295, 8: 1.2609678506851196, 9: 0.9256583452224731, 10: 0.9077053666114807, 11: 1.0381320714950562, 12: 1.3715018033981323, 13: 0.9855841994285583, 14: 1.0388861894607544, 15: 0.9589338898658752, 16: 1.1530513763427734, 17: 1.360892653465271, 18: 1.1869503259658813, 19: 0.964089572429657, 20: 0.8633710741996765, 21: 0.8711097240447998, 22: 0.5749786496162415, 23: 0.768746018409729, 24: 1.0240448713302612, 25: 0.9553028345108032, 26: 0.7948763370513916, 27: 0.9602640867233276, 28: 0.6192051768302917, 29: 0.6011557579040527, 30: 0.7452808022499084, 31: 0.8988987803459167, 32: 0.9006732106208801, 33: 0.9963534474372864, 34: 1.1866179704666138, 35: 0.7976015210151672, 36: 0.8171982765197754, 37: 0.839331865310669, 38: 1.0395092964172363, 39: 0.7954014539718628, 40: 0.8549489378929138,

Attention scores dictionary:  {0: 3.5131876468658447, 1: 0.690924346446991, 2: 0.8967656493186951, 3: 0.7658674120903015, 4: 0.6766733527183533, 5: 0.6440050005912781, 6: 0.7597180008888245, 7: 1.1327658891677856, 8: 1.1024794578552246, 9: 1.1467875242233276, 10: 0.763604462146759, 11: 0.9770479202270508, 12: 0.8791464567184448, 13: 1.9595192670822144, 14: 1.0470134019851685, 15: 1.3681585788726807, 16: 0.9180591702461243, 17: 1.8195639848709106, 18: 1.1955755949020386, 19: 1.137458086013794, 20: 0.9018426537513733, 21: 0.9596684575080872, 22: 0.9891238212585449, 23: 1.1339647769927979, 24: 0.8622927665710449, 25: 1.2562669515609741, 26: 1.201363205909729, 27: 1.020580768585205, 28: 1.0250861644744873, 29: 0.9228612780570984, 30: 1.3134238719940186, 31: 0.9545947313308716, 32: 0.686636745929718, 33: 0.6280810832977295, 34: 0.7086610794067383, 35: 0.8452686667442322, 36: 0.8896167874336243, 37: 0.9595435261726379, 38: 1.1402227878570557, 39: 0.8525086045265198, 40: 0.6813849210739136, 4

OSError: Could not open parquet input source '/scratch/mt4493/twitter_labor/twitter-labor-data/data/inference/DeepPavlov_bert-base-cased-conversational_jul23_iter0_preprocessed_12207397-12226078/output/job_search/job_search_all.parquet': Invalid: Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

### is_hired_1mo

In [26]:
#load inference data from random set
data_path = '/home/manuto/Documents/world_bank/bert_twitter_labor/data/inference/convBERT/it0/random'
is_hired_1mo_df = pd.read_pickle(os.path.join(data_path,'is_hired_1mo_ONNX_BERT_ST_merged_random_100m_jun22.pkl')).reset_index(drop=True)
is_hired_1mo_df = is_hired_1mo_df[~is_hired_1mo_df.text.str.contains("RT", na=False)].reset_index(drop=True)
is_hired_1mo_df = is_hired_1mo_df[['tweet_id','second','text']]
is_hired_1mo_df.columns = ['tweet_id','score','text']
is_hired_1mo_df.head()

Unnamed: 0,tweet_id,score,text
0,662755069675327489,0.985959,Got hired today!!!
1,459847518688665600,0.985849,Just got hired at Google.
2,535195703203880961,0.985819,Just got hired at Hobby Lobby!
3,331873004168552448,0.985771,Got hired at Chick Fil A!
4,327479239966330881,0.985664,Got hired today at the hooters on the beach! 😉...


In [27]:
#load model
PATH_MODEL_FOLDER = '/home/manuto/Downloads/best_model'
config = BertConfig.from_pretrained(PATH_MODEL_FOLDER, output_hidden_states=True, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(PATH_MODEL_FOLDER)
model = BertModel.from_pretrained(PATH_MODEL_FOLDER, config=config)

We identify the token with most attention in each of the top 40 tweets. 

For each of these high-attention tokens, we identify 5 most similar tokens through masked language modeling (MLM).

In [29]:
mlm_pipeline_custom = pipeline('fill-mask', model=PATH_MODEL_FOLDER, tokenizer=PATH_MODEL_FOLDER,
                        config=PATH_MODEL_FOLDER, topk=5)

Some weights of the model checkpoint at /home/manuto/Downloads/best_model were not used when initializing BertForMaskedLM: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at /home/manuto/Downloads/best_model and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight

#### MLM with Vanilla BERT

In [43]:
for tweet_index in range(10):
    print('*********** Tweet #{} ***********'.format(str(tweet_index)))
    input_sequence = is_hired_1mo_df['text'][tweet_index]
    tokenized_tweet = tokenizer.tokenize(input_sequence)
    print('Tweet: {} \n'.format(input_sequence))
    print('Tokenized version of tweet: {} \n'.format(tokenized_tweet))
    #identify high attention token
    print('$$$ Attention $$$')
    print('')
    attention_results_dict = get_token_in_sequence_with_most_attention(model, tokenizer, input_sequence)
    attention_token_str = attention_results_dict['token_str']
    attention_token_index = attention_results_dict['token_index']
    print('Token with most attention on average: {}'.format(attention_token_str))
    print('Token index with most attention on average: {} \n'.format(attention_token_index))
    #do MLM
    ## replace high-attention token by a [MASK] token
    print('$$$ MLM $$$ \n')
    tokenized_tweet[attention_token_index] = '[MASK]'
    mlm_results_list = mlm_pipeline_bert(' '.join(tokenized_tweet))
    selected_keywords_list = extract_keywords_from_mlm_results(mlm_results_list, 5)
    print('Ranked MLM selected keywords for token "{}": \n'.format(attention_results_dict['token_str']), selected_keywords_list)
    for i in range(len(mlm_results_list)):
        keyword = mlm_results_list[i]['token_str']
        print('\n MLM candidate #{} : {}'.format(str(i), keyword))
        print(mlm_results_list[i])
        tweets_containing_keyword_df = is_hired_1mo_df[is_hired_1mo_df['text'].str.contains(keyword)]
        print('Random sample of tweets containing keyword {}:'.format(keyword))
        print(tweets_containing_keyword_df.sample(n=5)[['score','text']].head())
    print('*********** End of keyword exploration on tweet #{} ***********'.format(str(tweet_index)))

*********** Tweet #0 ***********
Tweet: Got hired today!!! 

Tokenized version of tweet: ['Got', 'hired', 'today', '!', '!', '!'] 

$$$ Attention $$$

Attention scores dictionary:  {0: 1.512681484222412, 1: 1.4664392471313477, 2: 0.8879140615463257, 3: 0.6898781657218933, 4: 0.6860101222991943, 5: 0.7570768594741821}
Token with most attention on average: Got
Token index with most attention on average: 0 

$$$ MLM $$$ 

Ranked MLM selected keywords for token "Got": 
 ['get', 'got', '"', 'i', 'just']

 MLM candidate #0 : get
{'sequence': '[CLS] get hired today!!! [SEP]', 'score': 0.18747079372406006, 'token': 2131, 'token_str': 'get'}
Random sample of tweets containing keyword get:
            score                                               text
458648   0.314845  It's officially the first weekend of the Summe...
2995072  0.018597  Not really wanting to get up, but it's gym tim...
2541645  0.022016  So happy me and Ricky get to sleep in together...
3111085  0.017911  i have the worst

#### MLM with our fine-tuned ConvBERT model

In [46]:
for tweet_index in range(10):
    print('*********** Tweet #{} ***********'.format(str(tweet_index)))
    input_sequence = is_hired_1mo_df['text'][tweet_index]
    tokenized_tweet = tokenizer.tokenize(input_sequence)
    print('Tweet: {} \n'.format(input_sequence))
    print('Tokenized version of tweet: {} \n'.format(tokenized_tweet))
    #identify high attention token
    print('$$$ Attention $$$')
    print('')
    attention_results_dict = get_token_in_sequence_with_most_attention(model, tokenizer, input_sequence)
    attention_token_str = attention_results_dict['token_str']
    attention_token_index = attention_results_dict['token_index']
    print('Token with most attention on average: {}'.format(attention_token_str))
    print('Token index with most attention on average: {} \n'.format(attention_token_index))
    #do MLM
    ## replace high-attention token by a [MASK] token
    print('$$$ MLM $$$ \n')
    tokenized_tweet[attention_token_index] = '[MASK]'
    mlm_results_list = mlm_pipeline_custom(' '.join(tokenized_tweet))
    selected_keywords_list = extract_keywords_from_mlm_results(mlm_results_list, 5)
    print('Ranked MLM selected keywords for token "{}": \n'.format(attention_results_dict['token_str']), selected_keywords_list)
    for i in range(len(mlm_results_list)):
        keyword = mlm_results_list[i]['token_str']
        keyword = keyword.replace('##', '')
        print('\n MLM candidate #{} : {}'.format(str(i), keyword))
        print(mlm_results_list[i])
        tweets_containing_keyword_df = is_hired_1mo_df[is_hired_1mo_df['text'].str.contains(keyword)]
        print('Random sample of tweets containing keyword {}:'.format(keyword))
        print(tweets_containing_keyword_df.sample(n=5)[['score','text']].head())
    print('*********** End of keyword exploration on tweet #{} ***********'.format(str(tweet_index)))

*********** Tweet #0 ***********
Tweet: Got hired today!!! 

Tokenized version of tweet: ['Got', 'hired', 'today', '!', '!', '!'] 

$$$ Attention $$$

Attention scores dictionary:  {0: 1.512681484222412, 1: 1.4664392471313477, 2: 0.8879140615463257, 3: 0.6898781657218933, 4: 0.6860101222991943, 5: 0.7570768594741821}
Token with most attention on average: Got
Token index with most attention on average: 0 

$$$ MLM $$$ 

Ranked MLM selected keywords for token "Got": 
 ['h', '##ghan', '##ark', '##ahan', '##CF']

 MLM candidate #0 : h
{'sequence': '[CLS] h hired today!!! [SEP]', 'score': 0.00047591223847121, 'token': 177, 'token_str': 'h'}
Random sample of tweets containing keyword h:
            score                                               text
3954877  0.014232  Editing a hilarious @Tim__Kang. Can't stop lau...
2359046  0.023845  i don't work til 330, niggas should have fled ...
3002059  0.018553  You have a beautiful soul and that's why every...
187451   0.855582  We started some