In [100]:
import string
from collections import Counter

import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from nltk.util import skipgrams

In [101]:
def k_skip_n_grams(sent, k, n):
    """Materialize the k-skip-n-grams of a token sequence.

    Thin wrapper around ``nltk.util.skipgrams`` that turns the iterable it
    returns into a list.

    Args:
        sent: Sequence of tokens to build the grams from.
        k: Skip distance, forwarded to ``skipgrams`` as ``k``.
        n: Number of tokens per gram, forwarded to ``skipgrams`` as ``n``.

    Returns:
        list: Every item produced by ``skipgrams(sent, n=n, k=k)``.
    """
    grams = skipgrams(sent, n=n, k=k)
    return [*grams]

In [102]:
# Tweet pre-processing pipeline built on ekphrasis: normalizes noisy spans to
# placeholder tags (e.g. <number>, <user>), annotates stylistic cues, and
# tokenizes with the lowercasing SocialTokenizer.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # ('url' used to be listed twice; the redundant second entry is removed)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtags", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [103]:
# Load the "is_unemployed" training split. The explicit '\n' line terminator
# is passed so the parser only breaks rows on newlines.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory before sharing this notebook.
train_csv_path = "/home/manuto/Documents/world_bank/bert_twitter_labor/twitter-labor-data/data/jul23_iter0/preprocessed/train_is_unemployed.csv"
df = pd.read_csv(train_csv_path, lineterminator='\n')
df.head()

Unnamed: 0,tweet_id,text,class
0,524448969363963905,I only work 4 days next week which is fine but...,0.0
1,516983754045526016,@Lawwren__ We saw that you need a job we would...,0.0
2,440127142966210560,me: if I eat food in the kitchen will I get la...,0.0
3,557797964396388352,Currently curled in fetal position because of ...,0.0
4,412754561527316480,I am currently texting my nephews ex girlfrien...,0.0


In [104]:
# Run every raw tweet through the ekphrasis pipeline and store the resulting
# token list next to the original text.
df = df.assign(
    tokenized_preprocessed_text=df['text'].apply(text_processor.pre_process_doc)
)
df.head()

Unnamed: 0,tweet_id,text,class,tokenized_preprocessed_text
0,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh..."
1,516983754045526016,@Lawwren__ We saw that you need a job we would...,0.0,"[<user>, we, saw, that, you, need, a, job, we,..."
2,440127142966210560,me: if I eat food in the kitchen will I get la...,0.0,"[me, :, if, i, eat, food, in, the, kitchen, wi..."
3,557797964396388352,Currently curled in fetal position because of ...,0.0,"[currently, curled, in, fetal, position, becau..."
4,412754561527316480,I am currently texting my nephews ex girlfrien...,0.0,"[i, am, currently, texting, my, nephews, ex, g..."


In [105]:
# For each tokenized tweet, build its list of 3-token skipgrams with skip
# distance k=2.
df['skipgrams'] = df['tokenized_preprocessed_text'].apply(
    lambda tokens: k_skip_n_grams(tokens, k=2, n=3)
)

In [106]:
# Reshape to one row per (tweet, skipgram) pair. The index is rebuilt
# afterwards because explode repeats the source row's index for every gram.
skipgrams_count = (
    df
    .explode('skipgrams')
    .reset_index(drop=True)
)
skipgrams_count.head()

Unnamed: 0,tweet_id,text,class,tokenized_preprocessed_text,skipgrams
0,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, work)"
1,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, <number>)"
2,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, days)"
3,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, work, <number>)"
4,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, work, days)"


In [107]:
# Punctuation marks; a token counts as punctuation only when the whole token
# is exactly one of these characters.
punctuation_list = [i for i in string.punctuation]
_punctuation_set = frozenset(punctuation_list)


def _share_specific_tokens(token_list):
    """Return the fraction of tokens in a skipgram that contain '<'.

    Tokens containing '<' are the placeholder tags seen in the tokenized
    tweets (e.g. ``<number>``, ``<user>``). A bare punctuation token such as
    ``'<'`` is excluded here because ``_share_punctuation`` already counts
    it; counting it in both shares would let one token weigh twice in the
    total.

    Returns NaN when ``token_list`` is not a tuple/list, which is what
    ``DataFrame.explode`` leaves behind for a tweet with an empty skipgram
    list.
    """
    if not isinstance(token_list, (tuple, list)):
        return float('nan')
    tokens = [str(token) for token in token_list]
    hits = sum('<' in token and token not in _punctuation_set for token in tokens)
    return hits / len(tokens)


def _share_punctuation(token_list):
    """Return the fraction of tokens in a skipgram that are punctuation.

    Every occurrence is counted: ``('.', '.', 'i')`` yields 2/3. The previous
    set-intersection version counted distinct marks only (1/3 here), so such
    skipgrams escaped the downstream ``< 2/3`` filter.

    Returns NaN when ``token_list`` is not a tuple/list (see
    ``_share_specific_tokens``).
    """
    if not isinstance(token_list, (tuple, list)):
        return float('nan')
    hits = sum(str(token) in _punctuation_set for token in token_list)
    return hits / len(token_list)


skipgrams_count['share_specific_tokens'] = skipgrams_count['skipgrams'].apply(_share_specific_tokens)
skipgrams_count['share_punctuation'] = skipgrams_count['skipgrams'].apply(_share_punctuation)
# The two shares are disjoint by construction, so their sum is the fraction
# of tokens that are either a tag or a punctuation mark.
skipgrams_count['total_share_irrelevant_tokens'] = (
    skipgrams_count['share_specific_tokens'] + skipgrams_count['share_punctuation']
)
skipgrams_count.head(n=50)

Unnamed: 0,tweet_id,text,class,tokenized_preprocessed_text,skipgrams,share_specific_tokens,share_punctuation,total_share_irrelevant_tokens
0,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, work)",0.0,0.0,0.0
1,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, <number>)",0.333333,0.0,0.333333
2,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, only, days)",0.0,0.0,0.0
3,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, work, <number>)",0.333333,0.0,0.333333
4,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, work, days)",0.0,0.0,0.0
5,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(i, <number>, days)",0.333333,0.0,0.333333
6,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(only, work, <number>)",0.333333,0.0,0.333333
7,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(only, work, days)",0.0,0.0,0.0
8,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(only, work, next)",0.0,0.0,0.0
9,524448969363963905,I only work 4 days next week which is fine but...,0.0,"[i, only, work, <number>, days, next, week, wh...","(only, <number>, days)",0.333333,0.0,0.333333


In [108]:
# Keep only skipgrams whose combined tag/punctuation share is strictly below
# 2/3, then tally how often each remaining skipgram occurs.
skipgrams_count = (
    skipgrams_count
    .loc[lambda frame: frame['total_share_irrelevant_tokens'] < (2/3)]
    .reset_index(drop=True)
)
top_structures_dict = dict(
    skipgrams_count['skipgrams'].value_counts(dropna=False)
)
top_structures_dict

{('i', 'am', 'currently'): 150,
 ('i', 'am', 'unemployed'): 98,
 ('.', 'i', 'am'): 82,
 ('i', 'laid', 'off'): 61,
 ('i', 'am', 'a'): 53,
 ('i', 'am', 'to'): 50,
 ('i', 'am', '.'): 49,
 ('i', 'am', 'not'): 49,
 ('and', 'i', 'am'): 45,
 ('i', 'am', 'for'): 44,
 ('i', 'am', 'working'): 43,
 (',', 'i', 'am'): 43,
 ('i', 'quit', 'i'): 43,
 ('laid', 'off', '.'): 42,
 ('quit', 'i', 'quit'): 40,
 ('i', 'do', 'not'): 38,
 ('i', 'am', 'in'): 37,
 ('it', "'", 's'): 35,
 ('i', 'am', 'the'): 34,
 ('i', 'am', 'and'): 32,
 ('now', 'i', 'am'): 32,
 ('got', 'laid', 'off'): 31,
 ('i', 'am', 'now'): 30,
 ('.', 'i', 'have'): 29,
 ('was', 'laid', 'off'): 29,
 ('am', 'currently', '.'): 26,
 ('i', 'am', 'on'): 26,
 ('laid', 'off', '<number>'): 25,
 ('am', 'unemployed', '.'): 25,
 ('i', 'have', 'a'): 25,
 ('am', 'currently', 'working'): 24,
 ('i', 'have', 'been'): 24,
 ('am', 'currently', 'a'): 24,
 ('for', 'a', 'job'): 24,
 ('i', 'was', 'off'): 23,
 ('quit', 'my', 'job'): 23,
 ('i', 'was', 'laid'): 23,
 ('i'

In [116]:
# Peek at the three most frequent skipgram structures.
# `Counter` is imported in the notebook's top import cell so that cells using
# it do not depend on this cell having been executed first.
d = Counter(top_structures_dict)
d.most_common(3)

[(('i', 'am', 'currently'), 150),
 (('i', 'am', 'unemployed'), 98),
 (('.', 'i', 'am'), 82)]

In [113]:
def sample_tweets_from_selected_structures(structure_dict, data_df, nb_top_structures, nb_tweets_per_structure):
    structure_list = Counter(structure_dict).most_common(nb_top_structures)
    for index in range(nb_top_structures):
        structure = structure_list[index][0]
        

0