In [1]:
import os
import logging
import argparse
import pandas as pd
import numpy as np
import pytz
import pyarrow
from pathlib import Path
from transformers import BertTokenizer, BertModel, BertConfig, BertForTokenClassification, pipeline, \
    AutoModelForTokenClassification, AutoTokenizer
import torch
import nltk
from nltk.corpus import stopwords
import string
from nltk.util import skipgrams
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from collections import Counter

In [2]:
def get_token_in_sequence_with_most_attention(model, tokenizer, input_sequence):
    """
    Run an input sequence through the BERT model, collect and average attention scores per token and return token with
    most average attention.

    :param model: callable that takes a (1, seq_len) tensor of input ids and returns a 4-tuple unpacked as
        (last_hidden_states, pooler_outputs, hidden_states, attentions).
    :param tokenizer: object exposing `tokenize(text)` and `encode(text, add_special_tokens=False)`.
    :param input_sequence: raw text to score.
    :return: dict with 'token_index' (position of the most attended token) and 'token_str' (its string).
    """
    tokenized_input_sequence = tokenizer.tokenize(input_sequence)
    input_ids = torch.tensor(tokenizer.encode(input_sequence, add_special_tokens=False)).unsqueeze(0)
    # Inference only: gradients are never used, so do not build the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states, pooler_outputs, hidden_states, attentions = outputs
    # Stack the per-layer attentions and drop ONLY the batch axis (dim=1 after stacking). A bare squeeze() would
    # also collapse the sequence axes of a single-token input and break the per-token indexing below.
    # assumes each layer's attention is (batch=1, heads, seq_len, seq_len) - TODO confirm for non-BERT models
    attention_tensor = torch.stack(attentions).squeeze(1)
    # Average over layers and heads, then sum over the first remaining axis to get one score per token.
    attention_tensor_averaged = torch.mean(attention_tensor, (0, 1))
    attention_average_scores_per_token = torch.sum(attention_tensor_averaged, dim=0)
    attention_scores_dict = {
        token_position: attention_average_scores_per_token[token_position].item()
        for token_position in range(len(tokenized_input_sequence))
    }
    most_attended_position = max(attention_scores_dict, key=attention_scores_dict.get)
    return {'token_index': most_attended_position,
            'token_str': tokenized_input_sequence[most_attended_position]}


def extract_keywords_from_mlm_results(mlm_results_list, K_kw_explore):
    """
    Return the 'token_str' value of the first K_kw_explore entries of mlm_results_list, in order.

    Raises IndexError when mlm_results_list holds fewer than K_kw_explore entries.
    """
    return [mlm_results_list[rank_mlm_keyword]['token_str'] for rank_mlm_keyword in range(K_kw_explore)]


def drop_stopwords_punctuation(df):
    """
    Return df without the rows whose 'word' value is an NLTK English stopword or a single punctuation character.

    The index of the returned frame is reset.
    """
    excluded_words = stopwords.words('english') + list(string.punctuation)
    keep_mask = ~df['word'].isin(excluded_words)
    return df[keep_mask].reset_index(drop=True)


def calculate_lift(top_df, nb_keywords):
    """
    Return up to nb_keywords words from the top tweets, ordered by decreasing lift and restricted to lift > 1.

    lift = (count_top_tweets / count) * N_random / label2rank[column], where 'count_top_tweets' is the number of
    occurrences of the word in top_df['tokenized_text'] and 'count' comes from the random-sample word-count
    parquet. N_random, label2rank and column are read from module-level variables, so they must be defined
    before this function is called.
    """
    # One row per token, then occurrences per distinct token.
    exploded_tokens = top_df.explode('tokenized_text')['tokenized_text']
    top_wordcount_df = exploded_tokens.value_counts().rename_axis('word').reset_index(name='count_top_tweets')
    full_random_wordcount_df = pd.read_parquet(
        '/scratch/mt4493/twitter_labor/twitter-labor-data/data/wordcount_random/wordcount_random.parquet')
    # join(on=...) matches the 'word' column against the other frame's index.
    # assumes the parquet frame is indexed by word and has a 'count' column - TODO confirm
    wordcount_df = drop_stopwords_punctuation(top_wordcount_df.join(full_random_wordcount_df, on=['word']))
    frequency_ratio = wordcount_df['count_top_tweets'] / wordcount_df['count']
    wordcount_df['lift'] = frequency_ratio * N_random / label2rank[column]
    wordcount_df = wordcount_df.sort_values(by=["lift"], ascending=False).reset_index()
    # Keep only word with lift > 1
    wordcount_df = wordcount_df[wordcount_df['lift'] > 1]
    # Slicing already returns every row when fewer than nb_keywords remain.
    return wordcount_df['word'][:nb_keywords].tolist()


def sample_tweets_containing_selected_keywords(keyword, nb_tweets_per_keyword, data_df, lowercase):
    """
    Return the nb_tweets_per_keyword highest-scored tweets of data_df whose text contains keyword.

    :param keyword: substring to look for; it is matched literally, not as a regular expression.
    :param nb_tweets_per_keyword: maximum number of tweets to return.
    :param data_df: DataFrame with a 'score' column and a 'text' column (plus 'lowercased_text' when lowercase
        is truthy).
    :param lowercase: when truthy, search in 'lowercased_text' instead of 'text'.
    :return: DataFrame sorted by decreasing 'score' with a fresh index; all matching tweets are returned (and a
        message is printed) when fewer than nb_tweets_per_keyword match.
    """
    text_column = 'lowercased_text' if lowercase else 'text'
    # regex=False: keywords are plain tokens and may contain regex metacharacters ('c++', '(', '.').
    # na=False: rows with missing text simply do not match instead of producing a NaN mask.
    contains_keyword = data_df[text_column].str.contains(keyword, regex=False, na=False)
    tweets_containing_keyword_df = data_df[contains_keyword].sort_values(
        by=["score"], ascending=False).reset_index(drop=True)
    if tweets_containing_keyword_df.shape[0] < nb_tweets_per_keyword:
        print("Only {} tweets containing keyword {} (< {}). Sending all of them to labelling.".format(
            str(tweets_containing_keyword_df.shape[0]), keyword, str(nb_tweets_per_keyword)))
        return tweets_containing_keyword_df
    return tweets_containing_keyword_df[:nb_tweets_per_keyword]


def mlm_with_selected_keywords(top_df, model_name, keyword_list, nb_tweets_per_keyword, nb_keywords_per_tweet,
                               lowercase):
    """
    For each keyword K in the keyword_list list, select nb_tweets_per_keyword tweets containing the keyword.
    For each of the nb_tweets_per_keyword tweets, do masked language modelling on keyword K.
    Retain the top nb_keywords_per_tweet keywords from MLM and store them in the final_selected_keywords list.

    :param top_df: DataFrame of candidate tweets, passed to sample_tweets_containing_selected_keywords.
    :param model_name: name used for the model, tokenizer and config of the fill-mask pipeline.
    :param keyword_list: keywords to mask; lowercased first when lowercase is truthy.
    :param nb_tweets_per_keyword: maximum number of tweets sampled per keyword.
    :param nb_keywords_per_tweet: number of fill-mask predictions kept per tweet.
    :param lowercase: when truthy, keywords are matched case-insensitively.
    :return: flat list of predicted tokens accumulated over all keywords and tweets.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    mlm_pipeline = pipeline('fill-mask', model=model_name, tokenizer=model_name,
                            config=model_name, topk=nb_keywords_per_tweet)
    final_selected_keywords_list = list()
    if lowercase:
        keyword_list = [keyword.lower() for keyword in keyword_list]
    # Tweets are masked in their original casing, so a lowercased keyword must be matched case-insensitively.
    regex_flags = re.IGNORECASE if lowercase else 0
    for keyword in keyword_list:
        tweets_containing_keyword_df = sample_tweets_containing_selected_keywords(keyword, nb_tweets_per_keyword,
                                                                                  top_df, lowercase)
        keyword_pattern = re.compile(re.escape(keyword), regex_flags)
        for tweet in tweets_containing_keyword_df['text']:
            # Mask only the first occurrence so the input carries a single mask token.
            masked_tweet, nb_replacements = keyword_pattern.subn(lambda _match: '[MASK]', tweet, count=1)
            if nb_replacements == 0:
                # Nothing was masked (keyword not found in the original text): nothing to predict.
                continue
            mlm_results_list = mlm_pipeline(masked_tweet)
            # Accumulate predictions across tweets and keywords.
            final_selected_keywords_list += extract_keywords_from_mlm_results(mlm_results_list,
                                                                              nb_keywords_per_tweet)
    return final_selected_keywords_list


def eliminate_keywords_contained_in_positives_from_training(
        keyword_list, column,
        preprocessed_dir='/scratch/mt4493/twitter_labor/twitter-labor-data/data/jul23_iter0/preprocessed'):
    """
    Return the keywords of keyword_list that appear in none of the positive training tweets for column.

    :param keyword_list: candidate keywords; each one is matched literally (not as a regular expression).
    :param column: label name; the training file read is 'train_<column>.csv'.
    :param preprocessed_dir: directory holding the training CSV files; defaults to the original cluster path.
    :return: list of the keywords, in their original order, found in no tweet whose 'class' equals 1.
    """
    train_df = pd.read_csv(
        os.path.join(preprocessed_dir, 'train_{}.csv'.format(column)),
        lineterminator='\n')
    positive_texts = train_df.loc[train_df['class'] == 1, 'text']
    # regex=False: keywords may contain regex metacharacters; na=False: missing text never matches.
    return [keyword for keyword in keyword_list
            if not positive_texts.str.contains(keyword, regex=False, na=False).any()]

def k_skip_n_grams(sent, k, n):
    """Return, as a list, everything produced by NLTK's skipgrams(sent, k=k, n=n)."""
    skipgram_iterator = skipgrams(sent, k=k, n=n)
    return list(skipgram_iterator)


# Shared ekphrasis pre-processor used to tokenize and normalize tweets throughout the notebook.
text_processor = TextPreProcessor(
    # terms that will be normalized (each term listed once)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own.
    # The tokenizer should take a string as input and return a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)

def ekphrasis_preprocessing(tweet):
    """Pre-process tweet with the module-level text_processor and join the resulting tokens with single spaces."""
    processed_tokens = text_processor.pre_process_doc(tweet)
    return " ".join(processed_tokens)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [3]:
    # Fill-mask pipeline built from the bert-base-uncased checkpoint, returning 10 predictions per mask.
    mlm_pipeline = pipeline(
        'fill-mask',
        model='bert-base-uncased',
        tokenizer='bert-base-uncased',
        config='bert-base-uncased',
        topk=10)
    # Labels and their base rates, listed in the same order.
    labels = ['is_hired_1mo', 'is_unemployed', 'job_offer', 'job_search', 'lost_job_1mo']
    base_rates = [
        1.7342911457049017e-05,
        0.0003534645020523677,
        0.005604641971672389,
        0.00015839552996469054,
        1.455338466552472e-05]
    N_random = 92114009
    # Base rate scaled by N_random and truncated to an integer, one value per label.
    base_ranks = [int(base_rate * N_random) for base_rate in base_rates]
    label2rank = {label: base_rank for label, base_rank in zip(labels, base_ranks)}

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Tokenizer loaded from the 'DeepPavlov/bert-base-cased-conversational' checkpoint; it is applied to the tweet
# text further down to build the 'tokenized_text' column.
# NOTE(review): this checkpoint differs from the 'bert-base-uncased' / 'bert-base-cased' names used for the
# fill-mask pipelines in this notebook - confirm this is intended.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/bert-base-cased-conversational')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=207979.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=24.0, style=ProgressStyle(description_w…




In [None]:
# Label analysed below: used as the key into label2rank and read as a global inside calculate_lift.
column='is_hired_1mo'
# NOTE(review): absolute path on a local machine - the notebook cannot be re-run elsewhere without editing it.
all_data_df = pd.read_parquet('/home/manuto/Downloads/test_active_learning/is_hired_1mo_all.parquet')
all_data_df.head()

In [None]:
# Drop retweets, then order tweets by decreasing score.
# "RT" is matched as a standalone token (\bRT\b) so that tweets merely containing the letters "RT" inside
# another word (e.g. "START") are kept; rows with missing text are kept as well (na=False).
all_data_df = (
    all_data_df
    .loc[lambda frame: ~frame['text'].str.contains(r'\bRT\b', regex=True, na=False)]
    .sort_values(by=["score"], ascending=False)
    .reset_index(drop=True)
)
# Derived text columns used by the keyword-exploration cells.
all_data_df['tokenized_text'] = all_data_df['text'].apply(tokenizer.tokenize)
all_data_df['lowercased_text'] = all_data_df['text'].str.lower()
all_data_df['tokenized_preprocessed_text'] = all_data_df['text'].apply(text_processor.pre_process_doc)
all_data_df.head()

In [None]:
# all_data_df is sorted by decreasing score, so this keeps its first label2rank[column] rows.
top_df = all_data_df.iloc[:label2rank[column]]
top_df.head()

In [None]:
# NOTE(review): final_nb_tweets_per_keyword is not referenced by any other visible cell - confirm it is needed.
final_nb_tweets_per_keyword = 5
# The keyword exploration runs on the top-scored tweets.
explore_kw_data_df = top_df
# Up to 10 words with the highest lift (> 1) among the top tweets.
top_lift_keywords_list = calculate_lift(explore_kw_data_df, nb_keywords=10)

In [None]:
# For each top-lift keyword, mask it in 1 sampled tweet and keep the 5 best fill-mask predictions.
# NOTE(review): lowercase=True is combined with the cased checkpoint 'bert-base-cased' - confirm this pairing.
selected_keywords_list = mlm_with_selected_keywords(top_df=explore_kw_data_df, model_name='bert-base-cased',
                                                            keyword_list=top_lift_keywords_list,
                                                            nb_tweets_per_keyword=1,
                                                            nb_keywords_per_tweet=5, lowercase=True
                                                            )

In [103]:
# Training set for the 'is_unemployed' label (the displayed columns are tweet_id, text and class).
# NOTE(review): absolute path on a local machine - adjust before re-running elsewhere.
df = pd.read_csv("/home/manuto/Documents/world_bank/bert_twitter_labor/twitter-labor-data/data/jul23_iter0/preprocessed/train_is_unemployed.csv", lineterminator='\n')
df.head()

Unnamed: 0,tweet_id,text,class
0,524448969363963905,I only work 4 days next week which is fine but...,0.0
1,516983754045526016,@Lawwren__ We saw that you need a job we would...,0.0
2,440127142966210560,me: if I eat food in the kitchen will I get la...,0.0
3,557797964396388352,Currently curled in fetal position because of ...,0.0
4,412754561527316480,I am currently texting my nephews ex girlfrien...,0.0


In [104]:
# Tokenize each tweet with the ekphrasis text_processor; the result (a list of tokens per tweet) is stored
# as a new column on df.
df['tokenized_preprocessed_text'] = df['text'].apply(text_processor.pre_process_doc)
df.head()

Unnamed: 0,tweet_id,text,class,tokenized_preprocessed_text
0,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh..."
1,516983754045526016,@Lawwren__ We saw that you need a job we would...,0.0,"[<user>, we, saw, that, you, need, a, job, we,..."
2,440127142966210560,me: if I eat food in the kitchen will I get la...,0.0,"[me, :, if, i, eat, food, in, the, kitchen, wi..."
3,557797964396388352,Currently curled in fetal position because of ...,0.0,"[currently, curled, in, fetal, position, becau..."
4,412754561527316480,I am currently texting my nephews ex girlfrien...,0.0,"[i, am, currently, texting, my, nephews, ex, g..."


In [105]:
# Build, for every tweet, the list returned by k_skip_n_grams with k=2 and n=3 on its token list.
df['skipgrams'] = df['tokenized_preprocessed_text'].apply(k_skip_n_grams,k=2,n=3)

In [106]:
# One row per skipgram: explode repeats the tweet's other columns for each element of its 'skipgrams' list.
skipgrams_count = df.explode('skipgrams').reset_index(drop=True)
skipgrams_count.head()

Unnamed: 0,tweet_id,text,class,tokenized_preprocessed_text,skipgrams
0,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, work)"
1,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, <number>)"
2,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, days)"
3,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, work, <number>)"
4,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, work, days)"


In [107]:
punctuation_list = [i for i in string.punctuation]


def _share_of_matching_tokens(token_list, is_match):
    """Return the fraction of tokens in token_list for which is_match(token) is true (NaN for a missing skipgram)."""
    if not isinstance(token_list, (list, tuple)) or len(token_list) == 0:
        # explode() leaves NaN where a tweet produced no skipgram.
        return np.nan
    return sum(is_match(str(token)) for token in token_list) / len(token_list)


def _is_punctuation_token(token):
    """True when token is a single punctuation character."""
    return token in punctuation_list


def _is_specific_token(token):
    """True for tokens containing '<' (e.g. '<number>'), excluding bare punctuation already counted elsewhere."""
    return '<' in token and not _is_punctuation_token(token)


# Every token counts individually (repeated punctuation is not deduplicated) and at most once, so the
# total share stays within [0, 1].
skipgrams_count['share_specific_tokens'] = skipgrams_count['skipgrams'].apply(
    _share_of_matching_tokens, is_match=_is_specific_token)
skipgrams_count['share_punctuation'] = skipgrams_count['skipgrams'].apply(
    _share_of_matching_tokens, is_match=_is_punctuation_token)
skipgrams_count['total_share_irrelevant_tokens'] = (
    skipgrams_count['share_specific_tokens'] + skipgrams_count['share_punctuation'])
skipgrams_count.head(n=50)

Unnamed: 0,tweet_id,text,class,tokenized_preprocessed_text,skipgrams,share_specific_tokens,share_punctuation,total_share_irrelevant_tokens
0,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, work)",0.0,0.0,0.0
1,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, <number>)",0.333333,0.0,0.333333
2,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, days)",0.0,0.0,0.0
3,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, work, <number>)",0.333333,0.0,0.333333
4,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, work, days)",0.0,0.0,0.0
5,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, <number>, days)",0.333333,0.0,0.333333
6,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(only, work, <number>)",0.333333,0.0,0.333333
7,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(only, work, days)",0.0,0.0,0.0
8,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(only, work, next)",0.0,0.0,0.0
9,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(only, <number>, days)",0.333333,0.0,0.333333


In [108]:
# Keep the skipgrams whose share of irrelevant tokens is strictly below two thirds, then count how often
# each remaining skipgram occurs.
skipgrams_count = (
    skipgrams_count
    .loc[lambda frame: frame['total_share_irrelevant_tokens'] < (2 / 3)]
    .reset_index(drop=True)
)
top_structures_dict = dict(skipgrams_count['skipgrams'].value_counts(dropna=False))
top_structures_dict

{('i', 'am', 'currently'): 150,
 ('i', 'am', 'unemployed'): 98,
 ('.', 'i', 'am'): 82,
 ('i', 'laid', 'off'): 61,
 ('i', 'am', 'a'): 53,
 ('i', 'am', 'to'): 50,
 ('i', 'am', '.'): 49,
 ('i', 'am', 'not'): 49,
 ('and', 'i', 'am'): 45,
 ('i', 'am', 'for'): 44,
 ('i', 'am', 'working'): 43,
 (',', 'i', 'am'): 43,
 ('i', 'quit', 'i'): 43,
 ('laid', 'off', '.'): 42,
 ('quit', 'i', 'quit'): 40,
 ('i', 'do', 'not'): 38,
 ('i', 'am', 'in'): 37,
 ('it', "'", 's'): 35,
 ('i', 'am', 'the'): 34,
 ('i', 'am', 'and'): 32,
 ('now', 'i', 'am'): 32,
 ('got', 'laid', 'off'): 31,
 ('i', 'am', 'now'): 30,
 ('.', 'i', 'have'): 29,
 ('was', 'laid', 'off'): 29,
 ('am', 'currently', '.'): 26,
 ('i', 'am', 'on'): 26,
 ('laid', 'off', '<number>'): 25,
 ('am', 'unemployed', '.'): 25,
 ('i', 'have', 'a'): 25,
 ('am', 'currently', 'working'): 24,
 ('i', 'have', 'been'): 24,
 ('am', 'currently', 'a'): 24,
 ('for', 'a', 'job'): 24,
 ('i', 'was', 'off'): 23,
 ('quit', 'my', 'job'): 23,
 ('i', 'was', 'laid'): 23,
 ('i'

In [116]:
# Counter is already imported in the notebook's first cell, so it is not imported again here.
# Show the 3 most frequent skipgram structures.
d = Counter(top_structures_dict)
d.most_common(3)

[(('i', 'am', 'currently'), 150),
 (('i', 'am', 'unemployed'), 98),
 (('.', 'i', 'am'), 82)]

In [113]:
def sample_tweets_from_selected_structures(structure_dict, data_df, nb_top_structures, nb_tweets_per_structure):
    """
    Iterate over the nb_top_structures most frequent structures of structure_dict.

    NOTE(review): the body looks unfinished - data_df and nb_tweets_per_structure are not used yet and nothing
    is returned.
    """
    # (structure, count) pairs ordered by decreasing count.
    structure_list = Counter(structure_dict).most_common(nb_top_structures)
    # Indexing by range(nb_top_structures) raises IndexError when structure_dict holds fewer entries.
    for index in range(nb_top_structures):
        structure = structure_list[index][0]
        

0