In [19]:
import os
from ast import literal_eval

import gensim.downloader as api
import numpy as np
import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import scale

## Utils

In [20]:
def get_w2v_general(tweet, size, vectors, aggregation, tf_idf_weighting, **args):
    """Aggregate the word vectors of a whitespace-tokenised text into one row.

    Parameters
    ----------
    tweet : str
        Text to embed. It is split on whitespace and every token is
        lowercased before being looked up in ``vectors``.
    size : int
        Dimensionality of the word vectors.
    vectors : mapping
        Object supporting ``vectors[word]`` that raises ``KeyError`` for
        unknown words. Tokens that raise ``KeyError`` are skipped.
    aggregation : {'mean', 'sum'}
        ``'mean'`` divides the accumulated vector by the number of tokens
        that were found; ``'sum'`` returns the accumulated vector as is.
    tf_idf_weighting : bool
        When true, each word vector is multiplied by a tf-idf score before
        being accumulated.
    **args
        Optional ``cv`` (object exposing ``vocabulary_``) and ``matrix``
        (object exposing ``toarray()``) used for the tf-idf weighting. When
        they are not passed, the module-level names ``cv`` and ``matrix`` are
        used, as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token was found.

    Raises
    ------
    ValueError
        If ``aggregation`` is neither ``'mean'`` nor ``'sum'`` (previously
        the function silently returned ``None``).
    """
    if aggregation not in ('mean', 'sum'):
        raise ValueError(
            "aggregation must be 'mean' or 'sum', got {!r}".format(aggregation))

    vec = np.zeros(size).reshape((1, size))
    count = 0.
    vectorizer = None
    tfidf_row = None
    for word in tweet.split():
        try:
            weight = None
            if tf_idf_weighting:
                if vectorizer is None:
                    vectorizer = args['cv'] if 'cv' in args else cv
                # The vocabulary is queried with the token as written, while
                # the vector lookup below uses the lowercased token.
                word_id = vectorizer.vocabulary_[word]
                if tfidf_row is None:
                    tfidf_source = args['matrix'] if 'matrix' in args else matrix
                    # Densify once per call instead of once per word.
                    # NOTE(review): row 0 is always used; presumably the
                    # matrix holds the tf-idf scores of this text only.
                    # TODO confirm against the caller.
                    tfidf_row = tfidf_source.toarray()[0]
                weight = tfidf_row[word_id]
            word_vec = vectors[word.lower()].reshape((1, size))
        except KeyError:
            # Word missing from the tf-idf vocabulary or from the embeddings.
            continue
        vec += word_vec if weight is None else weight * word_vec
        count += 1.

    if aggregation == 'mean' and count != 0:
        vec /= count
    return vec

In [77]:
def which_word_is_picked_up(text, vectors=None):
    """Return the whitespace-separated tokens of ``text`` that have a vector.

    Parameters
    ----------
    text : str
        Text to inspect. Tokens are looked up exactly as written (no
        lowercasing), so the result depends on how ``text`` was preprocessed.
    vectors : mapping, optional
        Object supporting ``vectors[word]`` that raises ``KeyError`` for
        unknown words. Defaults to the module-level ``glove_twitter``, which
        must have been loaded before the first call.

    Returns
    -------
    list of str
        The tokens found in ``vectors``, in order of appearance and with
        repetitions kept.
    """
    if vectors is None:
        vectors = glove_twitter
    word_list = []
    for word in text.split():
        try:
            # Only the lookup matters here; the vector itself is not needed.
            vectors[word]
        except KeyError:
            continue
        word_list.append(word)
    return word_list

In [22]:
def perf_measure(y_actual, y_hat):
    """Count the confusion-matrix cells of a binary (0/1) prediction.

    Parameters
    ----------
    y_actual : sequence
        True labels.
    y_hat : sequence
        Predicted labels, paired with ``y_actual`` by position.

    Returns
    -------
    tuple of int
        ``(TP, FP, TN, FN)``. Predictions that are neither 0 nor 1 are not
        counted in any cell.

    Raises
    ------
    ValueError
        If the two inputs do not have the same length.
    """
    if len(y_actual) != len(y_hat):
        raise ValueError(
            "y_actual and y_hat must have the same length, got {} and {}".format(
                len(y_actual), len(y_hat)))

    TP = FP = TN = FN = 0
    # Iterate over values rather than indexing with ``[i]``: integer indexing
    # on a pandas Series is a label lookup and fails for a non-default index.
    for actual, predicted in zip(y_actual, y_hat):
        if predicted == 1:
            if actual == 1:
                TP += 1
            else:
                FP += 1
        elif predicted == 0:
            if actual == 0:
                TN += 1
            else:
                FN += 1

    return (TP, FP, TN, FN)

## Preprocessing pipeline

In [23]:
# Shared ekphrasis pipeline applied to every tweet by `ekphrasis_preprocessing`.
text_processor = TextPreProcessor(
    # terms that will be normalized ('url' was listed twice; once is enough)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [24]:
def ekphrasis_preprocessing(tweet):
    """Run ``tweet`` through the module-level ``text_processor`` and return
    the resulting tokens joined by single spaces."""
    tokens = text_processor.pre_process_doc(tweet)
    return " ".join(tokens)

In [25]:
# Target label and the directory holding the train/validation CSVs for it.
label = "is_unemployed"
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/preprocessed_glove"
# File names follow the pattern <split>_<label>.csv.
train_file_name, val_file_name = [
    template.format(label) for template in ("train_{}.csv", "val_{}.csv")
]

In [26]:
# Load the training split and append the ekphrasis-preprocessed text of every tweet.
df_train = (
    pd.read_csv(os.path.join(path, train_file_name))
    .assign(ekphrasis_text=lambda frame: frame['text'].apply(ekphrasis_preprocessing))
)
df_train.head()

Unnamed: 0,id,text,class,ProcessedText,ProcessedText_length,ekphrasis_text
0,2772,I was late again got work today. They gonna fi...,0,i was late again got work today. they gonna fi...,16,i was late again got work today . they gonna f...
1,1205,Got laid off today :/#work,1,got laid off today :/#work,5,got laid off today <annoyed> <hashtag> work </...
2,4512,Microsoft posted a job you might be interested...,0,microsoft posted a job you might be interested...,22,microsoft posted a job you might be interested...
3,7249,Now the owners wife is having a breakdown and ...,0,now the owners wife is having a breakdown and ...,16,now the owners wife is having a breakdown and ...
4,9453,I am happy today. Good. Time to get some food ...,1,i am happy today. good. time to get some food ...,27,i am happy today . good . time to get some foo...


In [27]:
# Load the validation split and append the ekphrasis-preprocessed text of every tweet.
df_val = (
    pd.read_csv(os.path.join(path, val_file_name))
    .assign(ekphrasis_text=lambda frame: frame['text'].apply(ekphrasis_preprocessing))
)
df_val.head()

Unnamed: 0,id,text,class,ProcessedText,ProcessedText_length,ekphrasis_text
0,3792,Unemployed.... What am I gonna do with myself?,1,unemployed.... what am i gonna do with myself?,8,unemployed . <repeated> what am i gonna do wit...
1,4842,I have taken over 40 pictures of my brothers c...,1,i have taken over 40 pictures of my brothers c...,18,i have taken over <number> pictures of my brot...
2,5576,I am unable to quit as I am currently too legi...,0,i am unable to quit as i am currently too legi...,15,i am unable to quit as i am currently too legi...
3,5687,I was part of a corporate restructuring at my ...,1,i was part of a corporate restructuring at my ...,23,i was part of a corporate restructuring at my ...
4,7995,@realDonaldTrump Have you tried setting a card...,0,@realdonaldtrump have you tried setting a card...,43,<user> have you tried setting a card table out...


In [28]:
# Load the GloVe predictions on the filtered sample, drop rows whose text is not
# a string, then add the lowercased and ekphrasis-preprocessed text columns.
df_filtered = pd.read_csv("/home/manuto/Documents/world_bank/bert_twitter_labor/data/glove_cnn_prediction_data/glove_predictions/GLOVE_ST_filtered_ALL_is_unemployed.csv")
# reset_index gives a fresh 0..n-1 index after the filter, as is done for
# df_random and df_random_bert; it also returns a new frame, so the column
# assignments below do not write into a slice of the frame read from disk.
df_filtered = df_filtered[df_filtered['text'].apply(lambda x: isinstance(x, str))].reset_index(drop=True)
df_filtered['ProcessedText'] = df_filtered["text"].apply(str.lower)
df_filtered['ekphrasis_text'] = df_filtered['text'].apply(ekphrasis_preprocessing)
df_filtered.head()

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text
0,330275754157948928,0.9999999999917516,How To Avoid Work-at-Home Scams http://t.co/mm...,"[127, 17, 5840, 321989, 69097, 0, 616487]",0.300822,how to avoid work-at-home scams http://t.co/mm...,how to avoid work - at - home scams <url> <has...
1,326440717394255873,0.9999999999906112,RT @FastCompany Instead Of Taking Your Daughte...,"[4, 0, 1660, 40, 1069, 62, 17361, 17, 0, 17140...",0.116052,rt @fastcompany instead of taking your daughte...,rt <user> instead of taking your daughters to ...
2,328029087752929280,0.9999999999905302,#mothersdaygift #itworks #Tighten #Tone #FullT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 616487, 0, 0]",0.607089,#mothersdaygift #itworks #tighten #tone #fullt...,<hashtag> mothers day gift </hashtag> <hashtag...
3,345961088258547712,0.999999999990338,Find Other Moms In Your Area http://t.co/PSkL3...,"[471, 470, 3948, 36, 62, 3701, 0, 17, 923, 86,...",0.216608,find other moms in your area http://t.co/pskl3...,find other moms in your area <url> to meet up ...
4,364496673935851521,0.9999999999901836,EntrepreneurMoms Pilot Turned Linen-Maker Jenn...,"[0, 11831, 2924, 0, 13510, 0, 0, 1030, 0, 0, 0...",0.35823,entrepreneurmoms pilot turned linen-maker jenn...,entrepreneurmoms pilot turned linen - maker je...


In [29]:
# Load the GloVe predictions on the random sample, keep rows whose text is a
# string, renumber the index, then add the lowercased and ekphrasis text columns.
df_random = (
    pd.read_csv("/home/manuto/Documents/world_bank/bert_twitter_labor/data/glove_cnn_prediction_data/glove_predictions/GLOVE_ST_random_ALL_is_unemployed.csv")
    .loc[lambda frame: frame['text'].apply(lambda x: isinstance(x, str))]
    .reset_index(drop=True)
    .assign(ProcessedText=lambda frame: frame["text"].apply(str.lower))
    .assign(ekphrasis_text=lambda frame: frame['text'].apply(ekphrasis_preprocessing))
)
df_random.head()

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text
0,1067199112435679233,1.0,December Giveaway \n#storkvisionmarion #storkv...,"[3129, 2250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 268...",0.353126,december giveaway \n#storkvisionmarion #storkv...,december giveaway <hashtag> stork vision mario...
1,444443081644179456,1.0,RT @Reza848: #LORDJASONJEROME @LORDJASONJEROME...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @reza848: #lordjasonjerome @lordjasonjerome...,rt <user> : <hashtag> lordjasonjerome </hashta...
2,443970350230298624,1.0,RT @HingerLovera: #LORDJASONJEROME @LORDJASONJ...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @hingerlovera: #lordjasonjerome @lordjasonj...,rt <user> : <hashtag> lordjasonjerome </hashta...
3,444230598459392000,1.0,RT @Bethany2211: #LORDJASONJEROME @LORDJASONJE...,"[4, 0, 0, 0, 0, 0, 59614, 0, 0, 0]",0.58685,rt @bethany2211: #lordjasonjerome @lordjasonje...,rt <user> : <hashtag> lordjasonjerome </hashta...
4,444277343730008064,1.0,RT @azamater19: #LORDJASONJEROME @LORDJASONJER...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @azamater19: #lordjasonjerome @lordjasonjer...,rt <user> : <hashtag> lordjasonjerome </hashta...


In [106]:
# Load the BERT predictions on the random sample, keep rows whose text is a
# string, renumber the index, then add the lowercased and ekphrasis text columns.
df_random_bert = (
    pd.read_csv("/home/manuto/Documents/world_bank/bert_twitter_labor/data/glove_cnn_prediction_data/glove_predictions/BERT_ST_random_ALL_is_unemployed.csv")
    .loc[lambda frame: frame['text'].apply(lambda x: isinstance(x, str))]
    .reset_index(drop=True)
    .assign(ProcessedText=lambda frame: frame["text"].apply(str.lower))
    .assign(ekphrasis_text=lambda frame: frame['text'].apply(ekphrasis_preprocessing))
)
df_random_bert.head()

Unnamed: 0,tweet_id,pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text
0,728443901963444224,0.938718,I got fired today. 😂,"[11, 144, 9118, 382215, 0]",0.987249,i got fired today. 😂,i got fired today . 😂
1,785875357358358529,0.936386,I was fired today. I'm a badass,"[11, 94, 9118, 382215, 0, 12, 8267]",0.764424,i was fired today. i'm a badass,i was fired today . i am a badass
2,336917153766834177,0.935609,Lol Vinny just got fired,"[89, 37790, 60, 144, 9118]",0.837681,lol vinny just got fired,lol vinny just got fired
3,285522710845603841,0.934539,Andy Reid got fired... About time,"[5423, 22022, 144, 0, 122, 136]",0.679201,andy reid got fired... about time,andy reid got fired . <repeated> about time
4,496205184654008321,0.934478,i got fired https://t.co/FjMrgTf8kJ,"[11, 144, 9118, 0, 0]",0.959649,i got fired https://t.co/fjmrgtf8kj,i got fired <url>


## Analysis of words picked up by GloVe with different preprocessing methods

In [30]:
# Load the "glove-twitter-200" vectors through gensim's downloader API.
# `which_word_is_picked_up` reads this module-level name and assumes
# 200-dimensional vectors, so this cell must run before that function is called.
glove_twitter = api.load("glove-twitter-200")

### Training set

In [79]:
# (source text column, matched-token column, match-count column) per preprocessing variant.
train_pickup_columns = (
    ("text", "word_picked_up_by_glove", "len_list_words_picked_up"),
    ("ProcessedText", "word_picked_up_by_glove_lowercased", "len_list_words_picked_up_lowercased"),
    ("ekphrasis_text", "word_picked_up_by_glove_ekphrasis", "len_list_words_picked_up_ekphrasis"),
)
# First record which tokens GloVe knows for each variant...
for text_col, words_col, count_col in train_pickup_columns:
    df_train[words_col] = df_train[text_col].apply(which_word_is_picked_up)
# ...then how many there are (separate pass keeps the original column order).
for text_col, words_col, count_col in train_pickup_columns:
    df_train[count_col] = df_train[words_col].apply(len)
df_train.head()

Unnamed: 0,id,text,class,ProcessedText,ProcessedText_length,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,word_picked_up_by_glove,len_list_words_picked_up
0,2772,I was late again got work today. They gonna fi...,0,i was late again got work today. they gonna fi...,16,i was late again got work today . they gonna f...,"[i, was, late, again, got, work, today., they,...","[i, was, late, again, got, work, today, ., the...",16,17,"[was, late, again, got, work, today., gonna, f...",14
1,1205,Got laid off today :/#work,1,got laid off today :/#work,5,got laid off today <annoyed> <hashtag> work </...,"[got, laid, off, today]","[got, laid, off, today, <hashtag>, work]",4,6,"[laid, off, today]",3
2,4512,Microsoft posted a job you might be interested...,0,microsoft posted a job you might be interested...,22,microsoft posted a job you might be interested...,"[microsoft, posted, a, job, you, might, be, in...","[microsoft, posted, a, job, you, might, be, in...",22,25,"[posted, a, job, you, might, be, interested, i...",11
3,7249,Now the owners wife is having a breakdown and ...,0,now the owners wife is having a breakdown and ...,16,now the owners wife is having a breakdown and ...,"[now, the, owners, wife, is, having, a, breakd...","[now, the, owners, wife, is, having, a, breakd...",15,17,"[the, owners, wife, is, having, a, breakdown, ...",13
4,9453,I am happy today. Good. Time to get some food ...,1,i am happy today. good. time to get some food ...,27,i am happy today . good . time to get some foo...,"[i, am, happy, today., good., time, to, get, s...","[i, am, happy, today, ., good, ., time, to, ge...",25,30,"[am, happy, today., to, get, some, food, and, ...",20


In [81]:
# Report the mean number of GloVe-matched tokens per tweet for each variant.
for message, count_col in (
    ("Average number of words picked up by GloVe without lowercasing: ", "len_list_words_picked_up"),
    ("Average number of words picked up by GloVe with lowercasing: ", "len_list_words_picked_up_lowercased"),
    ("Average number of words picked up by GloVe with ekphrasis preprocessing: ", "len_list_words_picked_up_ekphrasis"),
):
    print(message, df_train[count_col].mean())

Average number of words picked up by GloVe without lowercasing:  12.736338797814208
Average number of words picked up by GloVe with lowercasing:  16.04678961748634
Average number of words picked up by GloVe with ekphrasis preprocessing:  21.021174863387976


In [33]:
# Mean number of GloVe-matched ekphrasis tokens among training tweets with class == 1.
df_train.loc[df_train['class'] == 1, 'len_list_words_picked_up_ekphrasis'].mean()

20.67991775188485

In [34]:
def _train_subset_with_min_words(min_words, message):
    """Return the training rows with at least ``min_words`` GloVe-matched
    ekphrasis tokens (index renumbered) and print ``message`` with their count."""
    has_enough_words = df_train["len_list_words_picked_up_ekphrasis"] >= min_words
    subset = df_train.loc[has_enough_words].reset_index(drop=True)
    print(message, subset.shape[0])
    return subset


print("Original size of training set: ", df_train.shape[0])
df_train_2 = _train_subset_with_min_words(
    2, "Original size of training set without tweets in which less than 2 words were picked up: ")
df_train_3 = _train_subset_with_min_words(
    3, "Original size of training set without tweets in which less than 3 words were picked up: ")
df_train_4 = _train_subset_with_min_words(
    4, "Original size of training set without tweets in which less than 4 words were picked up: ")
df_train_5 = _train_subset_with_min_words(
    5, "Original size of training set without tweets in which less than 5 words were picked up: ")
df_train_10 = _train_subset_with_min_words(
    10, "Original size of training set without tweets in which less than 10 words were picked up: ")

Original size of training set:  2928
Original size of training set without tweets in which less than 2 words were picked up:  2928
Original size of training set without tweets in which less than 3 words were picked up:  2927
Original size of training set without tweets in which less than 4 words were picked up:  2919
Original size of training set without tweets in which less than 5 words were picked up:  2905
Original size of training set without tweets in which less than 10 words were picked up:  2720


### Validation set

In [35]:
# (source text column, matched-token column, match-count column) per preprocessing variant.
val_pickup_columns = (
    ("ProcessedText", "word_picked_up_by_glove_lowercased", "len_list_words_picked_up_lowercased"),
    ("ekphrasis_text", "word_picked_up_by_glove_ekphrasis", "len_list_words_picked_up_ekphrasis"),
)
for text_col, words_col, count_col in val_pickup_columns:
    df_val[words_col] = df_val[text_col].apply(which_word_is_picked_up)
# Counts are added in a second pass to keep the original column order.
for text_col, words_col, count_col in val_pickup_columns:
    df_val[count_col] = df_val[words_col].apply(len)
df_val.head()

Unnamed: 0,id,text,class,ProcessedText,ProcessedText_length,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis
0,3792,Unemployed.... What am I gonna do with myself?,1,unemployed.... what am i gonna do with myself?,8,unemployed . <repeated> what am i gonna do wit...,"[what, am, i, gonna, do, with]","[unemployed, ., what, am, i, gonna, do, with, ...",6,10
1,4842,I have taken over 40 pictures of my brothers c...,1,i have taken over 40 pictures of my brothers c...,18,i have taken over <number> pictures of my brot...,"[i, have, taken, over, pictures, of, my, broth...","[i, have, taken, over, <number>, pictures, of,...",14,22
2,5576,I am unable to quit as I am currently too legi...,0,i am unable to quit as i am currently too legi...,15,i am unable to quit as i am currently too legi...,"[i, am, unable, to, quit, as, i, am, currently...","[i, am, unable, to, quit, as, i, am, currently...",10,23
3,5687,I was part of a corporate restructuring at my ...,1,i was part of a corporate restructuring at my ...,23,i was part of a corporate restructuring at my ...,"[i, was, part, of, a, corporate, restructuring...","[i, was, part, of, a, corporate, restructuring...",17,24
4,7995,@realDonaldTrump Have you tried setting a card...,0,@realdonaldtrump have you tried setting a card...,43,<user> have you tried setting a card table out...,"[have, you, tried, setting, a, card, table, ou...","[<user>, have, you, tried, setting, a, card, t...",39,48


In [36]:
# Report the mean number of GloVe-matched tokens per validation tweet.
for message, count_col in (
    ("Average number of words picked up by GloVe with lowercasing: ", "len_list_words_picked_up_lowercased"),
    ("Average number of words picked up by GloVe with ekphrasis preprocessing: ", "len_list_words_picked_up_ekphrasis"),
):
    print(message, df_val[count_col].mean())

Average number of words picked up by GloVe with lowercasing:  15.860655737704919
Average number of words picked up by GloVe with ekphrasis preprocessing:  20.82103825136612


### Filtered set

In [83]:
# (source text column, matched-token column, match-count column) per preprocessing variant.
filtered_pickup_columns = (
    ("text", "word_picked_up", "len_list_words_picked_up"),
    ("ProcessedText", "word_picked_up_by_glove_lowercased", "len_list_words_picked_up_lowercased"),
    ("ekphrasis_text", "word_picked_up_by_glove_ekphrasis", "len_list_words_picked_up_ekphrasis"),
)
for text_col, words_col, count_col in filtered_pickup_columns:
    df_filtered[words_col] = df_filtered[text_col].apply(which_word_is_picked_up)
# Counts are added in a second pass to keep the original column order.
for text_col, words_col, count_col in filtered_pickup_columns:
    df_filtered[count_col] = df_filtered[words_col].apply(len)
df_filtered.head()

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,330275754157948928,0.9999999999917516,How To Avoid Work-at-Home Scams http://t.co/mm...,"[127, 17, 5840, 321989, 69097, 0, 616487]",0.300822,how to avoid work-at-home scams http://t.co/mm...,how to avoid work - at - home scams <url> <has...,"[how, to, avoid, work-at-home, scams, #moms]","[how, to, avoid, work, -, at, -, home, scams, ...",6,...,0.734378,0.02065,0.012303,0.02065,0.020642,0.020755,0.020805,0.012317,[#moms],1
1,326440717394255873,0.9999999999906112,RT @FastCompany Instead Of Taking Your Daughte...,"[4, 0, 1660, 40, 1069, 62, 17361, 17, 0, 17140...",0.116052,rt @fastcompany instead of taking your daughte...,rt <user> instead of taking your daughters to ...,"[rt, instead, of, taking, your, daughters, to,...","[rt, <user>, instead, of, taking, your, daught...",12,...,0.355912,0.127769,0.103971,0.127769,0.127459,0.122626,0.126385,0.089704,[#moms],1
2,328029087752929280,0.9999999999905302,#mothersdaygift #itworks #Tighten #Tone #FullT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 616487, 0, 0]",0.607089,#mothersdaygift #itworks #tighten #tone #fullt...,<hashtag> mothers day gift </hashtag> <hashtag...,[#moms],"[<hashtag>, mothers, day, gift, <hashtag>, it,...",1,...,1.0,0.539414,0.56757,0.539414,0.537024,0.5298,0.527055,0.495295,[#moms],1
3,345961088258547712,0.999999999990338,Find Other Moms In Your Area http://t.co/PSkL3...,"[471, 470, 3948, 36, 62, 3701, 0, 17, 923, 86,...",0.216608,find other moms in your area http://t.co/pskl3...,find other moms in your area <url> to meet up ...,"[find, other, moms, in, your, area, to, meet, ...","[find, other, moms, in, your, area, <url>, to,...",11,...,0.621039,0.161171,0.163799,0.161171,0.160041,0.152524,0.15195,0.12397,[#moms],1
4,364496673935851521,0.9999999999901836,EntrepreneurMoms Pilot Turned Linen-Maker Jenn...,"[0, 11831, 2924, 0, 13510, 0, 0, 1030, 0, 0, 0...",0.35823,entrepreneurmoms pilot turned linen-maker jenn...,entrepreneurmoms pilot turned linen - maker je...,"[pilot, turned, jenny, main, photo, #moms]","[pilot, turned, linen, -, maker, jenny, davids...",6,...,0.907417,0.254525,0.292834,0.254525,0.254443,0.260042,0.247254,0.204188,[#moms],1


In [85]:
# Report the mean number of GloVe-matched tokens per tweet of the filtered set.
for message, count_col in (
    ("Average number of words picked up by GloVe without lowercasing: ", "len_list_words_picked_up"),
    ("Average number of words picked up by GloVe with lowercasing: ", "len_list_words_picked_up_lowercased"),
    ("Average number of words picked up by GloVe with ekphrasis preprocessing: ", "len_list_words_picked_up_ekphrasis"),
):
    print(message, df_filtered[count_col].mean())

Average number of words picked up by GloVe without lowercasing:  7.756749386419417
Average number of words picked up by GloVe with lowercasing:  11.000954458685573
Average number of words picked up by GloVe with ekphrasis preprocessing:  19.65998545586765


### Random set

In [87]:
# (source text column, matched-token column, match-count column) per preprocessing variant.
random_pickup_columns = (
    ("text", "word_picked_up", "len_list_words_picked_up"),
    ("ProcessedText", "word_picked_up_by_glove_lowercased", "len_list_words_picked_up_lowercased"),
    ("ekphrasis_text", "word_picked_up_by_glove_ekphrasis", "len_list_words_picked_up_ekphrasis"),
)
for text_col, words_col, count_col in random_pickup_columns:
    df_random[words_col] = df_random[text_col].apply(which_word_is_picked_up)
# Counts are added in a second pass to keep the original column order.
for text_col, words_col, count_col in random_pickup_columns:
    df_random[count_col] = df_random[words_col].apply(len)
df_random.head()

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,1067199112435679233,1.0,December Giveaway \n#storkvisionmarion #storkv...,"[3129, 2250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 268...",0.353126,december giveaway \n#storkvisionmarion #storkv...,december giveaway <hashtag> stork vision mario...,"[december, giveaway, #mom]","[december, giveaway, <hashtag>, stork, vision,...",3,...,0.983625,0.24833,0.331122,0.24833,0.246965,0.246123,0.259627,0.17279,[#mom],1
1,444443081644179456,1.0,RT @Reza848: #LORDJASONJEROME @LORDJASONJEROME...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @reza848: #lordjasonjerome @lordjasonjerome...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",2,...,0.997048,0.189556,0.194416,0.189556,0.188968,0.189258,0.189435,0.177733,[#iphonegames],1
2,443970350230298624,1.0,RT @HingerLovera: #LORDJASONJEROME @LORDJASONJ...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @hingerlovera: #lordjasonjerome @lordjasonj...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",2,...,0.997048,0.189556,0.194416,0.189556,0.188968,0.189258,0.189435,0.177733,[#iphonegames],1
3,444230598459392000,1.0,RT @Bethany2211: #LORDJASONJEROME @LORDJASONJE...,"[4, 0, 0, 0, 0, 0, 59614, 0, 0, 0]",0.58685,rt @bethany2211: #lordjasonjerome @lordjasonje...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",2,...,0.997048,0.054976,0.056643,0.054976,0.054914,0.057704,0.057663,0.066567,[#iphonegames],1
4,444277343730008064,1.0,RT @azamater19: #LORDJASONJEROME @LORDJASONJER...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @azamater19: #lordjasonjerome @lordjasonjer...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",2,...,0.997048,0.018662,0.01849,0.018662,0.018675,0.020568,0.02109,0.019658,[#iphonegames],1


In [88]:
# Report the mean number of GloVe-matched tokens per tweet of the random set.
for message, count_col in (
    ("Average number of words picked up by GloVe without lowercasing: ", "len_list_words_picked_up"),
    ("Average number of words picked up by GloVe with lowercasing: ", "len_list_words_picked_up_lowercased"),
    ("Average number of words picked up by GloVe with ekphrasis preprocessing: ", "len_list_words_picked_up_ekphrasis"),
):
    print(message, df_random[count_col].mean())

Average number of words picked up by GloVe without lowercasing:  6.828454545454545
Average number of words picked up by GloVe with lowercasing:  9.421727272727273
Average number of words picked up by GloVe with ekphrasis preprocessing:  16.50581818181818


In [110]:
# (source text column, matched-token column, match-count column) per preprocessing variant.
random_bert_pickup_columns = (
    ("text", "word_picked_up", "len_list_words_picked_up"),
    ("ProcessedText", "word_picked_up_by_glove_lowercased", "len_list_words_picked_up_lowercased"),
    ("ekphrasis_text", "word_picked_up_by_glove_ekphrasis", "len_list_words_picked_up_ekphrasis"),
)
for text_col, words_col, count_col in random_bert_pickup_columns:
    df_random_bert[words_col] = df_random_bert[text_col].apply(which_word_is_picked_up)
# Counts are added in a second pass to keep the original column order.
for text_col, words_col, count_col in random_bert_pickup_columns:
    df_random_bert[count_col] = df_random_bert[words_col].apply(len)
df_random_bert.head()

Unnamed: 0,tweet_id,pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis
0,728443901963444224,0.938718,I got fired today. 😂,"[11, 144, 9118, 382215, 0]",0.987249,i got fired today. 😂,i got fired today . 😂,"[got, fired, today.]","[i, got, fired, today.]","[i, got, fired, today, .]",3,4,5
1,785875357358358529,0.936386,I was fired today. I'm a badass,"[11, 94, 9118, 382215, 0, 12, 8267]",0.764424,i was fired today. i'm a badass,i was fired today . i am a badass,"[was, fired, today., a, badass]","[i, was, fired, today., a, badass]","[i, was, fired, today, ., i, am, a, badass]",5,6,9
2,336917153766834177,0.935609,Lol Vinny just got fired,"[89, 37790, 60, 144, 9118]",0.837681,lol vinny just got fired,lol vinny just got fired,"[just, got, fired]","[lol, vinny, just, got, fired]","[lol, vinny, just, got, fired]",3,5,5
3,285522710845603841,0.934539,Andy Reid got fired... About time,"[5423, 22022, 144, 0, 122, 136]",0.679201,andy reid got fired... about time,andy reid got fired . <repeated> about time,"[got, time]","[andy, reid, got, about, time]","[andy, reid, got, fired, ., about, time]",2,5,7
4,496205184654008321,0.934478,i got fired https://t.co/FjMrgTf8kJ,"[11, 144, 9118, 0, 0]",0.959649,i got fired https://t.co/fjmrgtf8kj,i got fired <url>,"[i, got, fired]","[i, got, fired]","[i, got, fired, <url>]",3,3,4


## Training and validation results

### TF-IDF vectorizer

In [98]:
# Tag every frame with the name of the split it represents, so the rows can be
# told apart once the frames are stacked.
for split_name, split_frame in (
    ("train", df_train),
    ("val", df_val),
    ("filtered", df_filtered),
    ("random", df_random),
):
    split_frame["data"] = split_name

In [99]:
# Stack the four tagged datasets into one frame with a fresh 0..n-1 index.
# The frames do not share the same columns; sort=False states the column
# ordering explicitly (order of first appearance) instead of relying on the
# deprecated implicit sort that triggers a FutureWarning. Columns are only
# accessed by name afterwards in this section.
all_data_df = pd.concat([df_train, df_val, df_filtered, df_random], sort=False).reset_index(drop=True)
all_data_df["lower_cased_text"] = all_data_df['text'].apply(str.lower)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [100]:
def get_tfidf_matrix(data, matrix):
    """Return the rows of ``matrix`` belonging to one dataset.

    ``data`` is a value of the ``'data'`` column of the module-level
    ``all_data_df``; the index labels of the matching rows are used to index
    ``matrix``.
    """
    belongs_to_split = all_data_df['data'] == data
    row_labels = all_data_df.loc[belongs_to_split].index
    return matrix[row_labels]

In [101]:
# Stack the four tagged datasets into one frame with a fresh 0..n-1 index.
# sort=False states the column ordering explicitly (order of first appearance)
# instead of relying on the deprecated implicit sort that triggers a
# FutureWarning; columns are only accessed by name below.
all_data_df = pd.concat([df_train, df_val, df_filtered, df_random], sort=False).reset_index(drop=True)
all_data_df["lower_cased_text"] = all_data_df['text'].apply(str.lower)
# TfidfVectorizer comes from sklearn.feature_extraction.text and must be
# imported in the imports cell for this cell to run on a fresh kernel.
# NOTE(review): both vectorizers are fitted on train, val, filtered and random
# together, so the vocabulary/idf statistics include non-training data — confirm
# this is intended.
#lowercased data
cv_lowercased = TfidfVectorizer()
X_lowercased = cv_lowercased.fit_transform(all_data_df['lower_cased_text'])
#preprocessed data
cv_ekphrasis = TfidfVectorizer()
X_ekphrasis = cv_ekphrasis.fit_transform(all_data_df['ekphrasis_text'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [102]:
# Slice the TF-IDF matrices fitted on all data back into one matrix per dataset.
# Lowercased text:
(X_lowercased_train,
 X_lowercased_val,
 X_lowercased_filtered,
 X_lowercased_random) = [
    get_tfidf_matrix(split_name, X_lowercased)
    for split_name in ("train", "val", "filtered", "random")
]
# Ekphrasis-preprocessed text:
(X_ekphrasis_train,
 X_ekphrasis_val,
 X_ekphrasis_filtered,
 X_ekphrasis_random) = [
    get_tfidf_matrix(split_name, X_ekphrasis)
    for split_name in ("train", "val", "filtered", "random")
]

In [108]:
def _scaled_mean_glove(texts):
    """Embed every text as the unweighted mean of its 200-d GloVe vectors,
    stack the rows into one array and pass the result through ``scale``."""
    rows = [
        get_w2v_general(text, 200, glove_twitter, 'mean', tf_idf_weighting=False)
        for text in texts
    ]
    return scale(np.concatenate(rows))


# NOTE(review): scale() is called on each dataset separately, so every dataset
# is standardised with its own statistics rather than those of the training
# set — confirm this is intended.

# Lowercased embeddings: the raw "text" column is used because get_w2v_general
# lowercases every token before the lookup.
train_vecs_glove_mean_lowercased = _scaled_mean_glove(df_train["text"])
validation_vecs_glove_mean_lowercased = _scaled_mean_glove(df_val["text"])
filtered_vecs_glove_mean_lowercased = _scaled_mean_glove(df_filtered["text"])
random_vecs_glove_mean_lowercased = _scaled_mean_glove(df_random["text"])
random_vecs_bert_mean_lowercased = _scaled_mean_glove(df_random_bert["text"])

# Ekphrasis embeddings, including the training subsets filtered by the minimum
# number of matched words.
train_vecs_glove_mean_ekphrasis = _scaled_mean_glove(df_train["ekphrasis_text"])
train_vecs_glove_mean_ekphrasis_2 = _scaled_mean_glove(df_train_2["ekphrasis_text"])
train_vecs_glove_mean_ekphrasis_3 = _scaled_mean_glove(df_train_3["ekphrasis_text"])
train_vecs_glove_mean_ekphrasis_4 = _scaled_mean_glove(df_train_4["ekphrasis_text"])
train_vecs_glove_mean_ekphrasis_5 = _scaled_mean_glove(df_train_5["ekphrasis_text"])
train_vecs_glove_mean_ekphrasis_10 = _scaled_mean_glove(df_train_10["ekphrasis_text"])
validation_vecs_glove_mean_ekphrasis = _scaled_mean_glove(df_val["ekphrasis_text"])
filtered_vecs_glove_mean_ekphrasis = _scaled_mean_glove(df_filtered["ekphrasis_text"])
random_vecs_glove_mean_ekphrasis = _scaled_mean_glove(df_random["ekphrasis_text"])
random_vecs_bert_mean_ekphrasis = _scaled_mean_glove(df_random_bert["ekphrasis_text"])


### Validation results for model trained on lowercased data

In [43]:
# Logistic regression on the mean-GloVe features of the lowercased training text.
clf_lowercased = LogisticRegression(max_iter=1000).fit(
    train_vecs_glove_mean_lowercased,
    df_train["class"],
)
clf_lowercased  # echo the fitted estimator as the cell output

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
# Score the validation split with the lowercased-text model: hard labels plus the
# probability of the second class (column 1 of predict_proba).
df_val["class_predict_lowercased"] = clf_lowercased.predict(
    validation_vecs_glove_mean_lowercased
)
df_val["y_predict_proba_lowercased"] = clf_lowercased.predict_proba(
    validation_vecs_glove_mean_lowercased
)[:, 1]

# Confusion counts -> precision and recall.
TP, FP, TN, FN = perf_measure(
    df_val["class"],
    df_val["class_predict_lowercased"],
)
print("Precision: ", TP / (TP + FP))
print("Recall: ", TP / (TP + FN))

# Area under the ROC curve, computed from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(
    df_val["class"],
    df_val["y_predict_proba_lowercased"],
)
print("AUC: ", auc(fpr, tpr))

Precision:  0.7331606217616581
Recall:  0.7628032345013477
AUC:  0.8070274992346805


### Validation results for model trained on ekphrasis data

In [45]:
# Logistic regression on the mean-GloVe features of the ekphrasis-preprocessed training text.
clf_ekphrasis = LogisticRegression(max_iter=1000).fit(
    train_vecs_glove_mean_ekphrasis,
    df_train["class"],
)
clf_ekphrasis  # echo the fitted estimator as the cell output

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
# Score the validation split with the ekphrasis-text model: hard labels plus the
# probability of the second class (column 1 of predict_proba).
df_val["class_predict_ekphrasis"] = clf_ekphrasis.predict(
    validation_vecs_glove_mean_ekphrasis
)
df_val["y_predict_proba_ekphrasis"] = clf_ekphrasis.predict_proba(
    validation_vecs_glove_mean_ekphrasis
)[:, 1]

# Confusion counts -> precision and recall.
TP, FP, TN, FN = perf_measure(
    df_val["class"],
    df_val["class_predict_ekphrasis"],
)
print("Precision: ", TP / (TP + FP))
print("Recall: ", TP / (TP + FN))

# Area under the ROC curve, computed from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(
    df_val["class"],
    df_val["y_predict_proba_ekphrasis"],
)
print("AUC: ", auc(fpr, tpr))

Precision:  0.7301587301587301
Recall:  0.7439353099730458
AUC:  0.8198475334313937


### Validation results for model trained on ekphrasis data with number of words picked up as feature

In [47]:
# Append the number of words picked up by GloVe as an extra (201st) column of the
# training feature matrix.
# column_stack derives the row count from the data instead of a hard-coded (2928, 201)
# buffer, and the counts are taken positionally so they line up with the row order of the
# embedding matrix whatever df_train's index labels are.
train_array_new_feature = np.column_stack((
    train_vecs_glove_mean_ekphrasis,
    np.asarray(df_train['len_list_words_picked_up_ekphrasis'], dtype=float),
))

In [48]:
# Append the number of words picked up by GloVe as an extra (201st) column of the
# validation feature matrix.
# column_stack derives the row count from the data instead of a hard-coded (732, 201)
# buffer, and the counts are taken positionally so they line up with the row order of the
# embedding matrix whatever df_val's index labels are.
validation_array_new_feature = np.column_stack((
    validation_vecs_glove_mean_ekphrasis,
    np.asarray(df_val['len_list_words_picked_up_ekphrasis'], dtype=float),
))

In [49]:
# Logistic regression on the ekphrasis GloVe features extended with the picked-up-word count.
clf_ekphrasis_new = LogisticRegression(max_iter=1000).fit(
    train_array_new_feature,
    df_train["class"],
)
clf_ekphrasis_new  # echo the fitted estimator as the cell output

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [50]:
# Score the validation split with the extra-feature model: hard labels plus the
# probability of the second class (column 1 of predict_proba).
df_val["class_predict_ekphrasis_new"] = clf_ekphrasis_new.predict(
    validation_array_new_feature
)
df_val["y_predict_proba_ekphrasis_new"] = clf_ekphrasis_new.predict_proba(
    validation_array_new_feature
)[:, 1]

# Confusion counts -> precision and recall.
TP, FP, TN, FN = perf_measure(
    df_val["class"],
    df_val["class_predict_ekphrasis_new"],
)
print("Precision: ", TP / (TP + FP))
print("Recall: ", TP / (TP + FN))

# Area under the ROC curve, computed from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(
    df_val["class"],
    df_val["y_predict_proba_ekphrasis_new"],
)
print("AUC: ", auc(fpr, tpr))

Precision:  0.746031746031746
Recall:  0.7601078167115903
AUC:  0.8236853305060067


### Train models on training set with a minimum threshold of 2, 3, 4, 5 and 10 words picked up

In [51]:
# One logistic regression per minimum-words threshold (2, 3, 4, 5, 10); each model is
# fitted on the embedding matrix and labels of the training frame with the same suffix.
clf_ekphrasis_2 = LogisticRegression(max_iter=1000).fit(
    train_vecs_glove_mean_ekphrasis_2, df_train_2["class"]
)
clf_ekphrasis_3 = LogisticRegression(max_iter=1000).fit(
    train_vecs_glove_mean_ekphrasis_3, df_train_3["class"]
)
clf_ekphrasis_4 = LogisticRegression(max_iter=1000).fit(
    train_vecs_glove_mean_ekphrasis_4, df_train_4["class"]
)
clf_ekphrasis_5 = LogisticRegression(max_iter=1000).fit(
    train_vecs_glove_mean_ekphrasis_5, df_train_5["class"]
)
clf_ekphrasis_10 = LogisticRegression(max_iter=1000).fit(
    train_vecs_glove_mean_ekphrasis_10, df_train_10["class"]
)
clf_ekphrasis_10  # echo the last fitted estimator as the cell output

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Analysis filtered set

In [89]:
# Rebuild a default 0..n-1 integer index (the previous index is dropped, not kept as a
# column), so that integer labels coincide with row positions.
df_filtered = df_filtered.reset_index(drop=True)

In [90]:
# Append the number of words picked up by GloVe as an extra (201st) column of the
# filtered-set feature matrix.
# column_stack derives the row count from the data instead of a hard-coded (22002, 201)
# buffer, and the counts are taken positionally so they line up with the row order of the
# embedding matrix whatever df_filtered's index labels are.
filtered_array_new_feature = np.column_stack((
    filtered_vecs_glove_mean_ekphrasis,
    np.asarray(df_filtered['len_list_words_picked_up_ekphrasis'], dtype=float),
))

In [91]:
# Score the filtered set with every trained model. Each entry is
# (output column, classifier, feature matrix); column 1 of predict_proba is stored.
_filtered_scoring_plan = [
    ('y_predict_proba_lowercased', clf_lowercased, filtered_vecs_glove_mean_lowercased),
    ('y_predict_proba_ekphrasis', clf_ekphrasis, filtered_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_new', clf_ekphrasis_new, filtered_array_new_feature),
    ('y_predict_proba_ekphrasis_2', clf_ekphrasis_2, filtered_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_3', clf_ekphrasis_3, filtered_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_4', clf_ekphrasis_4, filtered_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_5', clf_ekphrasis_5, filtered_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_10', clf_ekphrasis_10, filtered_vecs_glove_mean_ekphrasis),
]
for _column, _model, _features in _filtered_scoring_plan:
    df_filtered[_column] = _model.predict_proba(_features)[:, 1]

df_filtered.head(n=50)

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,330275754157948928,0.9999999999917516,How To Avoid Work-at-Home Scams http://t.co/mm...,"[127, 17, 5840, 321989, 69097, 0, 616487]",0.300822,how to avoid work-at-home scams http://t.co/mm...,how to avoid work - at - home scams <url> <has...,"[how, to, avoid, work-at-home, scams, #moms]","[how, to, avoid, work, -, at, -, home, scams, ...",6,...,0.734378,0.02065,0.012303,0.02065,0.020642,0.020755,0.020805,0.012317,[#moms],1
1,326440717394255873,0.9999999999906112,RT @FastCompany Instead Of Taking Your Daughte...,"[4, 0, 1660, 40, 1069, 62, 17361, 17, 0, 17140...",0.116052,rt @fastcompany instead of taking your daughte...,rt <user> instead of taking your daughters to ...,"[rt, instead, of, taking, your, daughters, to,...","[rt, <user>, instead, of, taking, your, daught...",12,...,0.355912,0.127769,0.103971,0.127769,0.127459,0.122626,0.126385,0.089704,[#moms],1
2,328029087752929280,0.9999999999905302,#mothersdaygift #itworks #Tighten #Tone #FullT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 616487, 0, 0]",0.607089,#mothersdaygift #itworks #tighten #tone #fullt...,<hashtag> mothers day gift </hashtag> <hashtag...,[#moms],"[<hashtag>, mothers, day, gift, <hashtag>, it,...",1,...,1.0,0.539414,0.56757,0.539414,0.537024,0.5298,0.527055,0.495295,[#moms],1
3,345961088258547712,0.999999999990338,Find Other Moms In Your Area http://t.co/PSkL3...,"[471, 470, 3948, 36, 62, 3701, 0, 17, 923, 86,...",0.216608,find other moms in your area http://t.co/pskl3...,find other moms in your area <url> to meet up ...,"[find, other, moms, in, your, area, to, meet, ...","[find, other, moms, in, your, area, <url>, to,...",11,...,0.621039,0.161171,0.163799,0.161171,0.160041,0.152524,0.15195,0.12397,[#moms],1
4,364496673935851521,0.9999999999901836,EntrepreneurMoms Pilot Turned Linen-Maker Jenn...,"[0, 11831, 2924, 0, 13510, 0, 0, 1030, 0, 0, 0...",0.35823,entrepreneurmoms pilot turned linen-maker jenn...,entrepreneurmoms pilot turned linen - maker je...,"[pilot, turned, jenny, main, photo, #moms]","[pilot, turned, linen, -, maker, jenny, davids...",6,...,0.907417,0.254525,0.292834,0.254525,0.254443,0.260042,0.247254,0.204188,[#moms],1
5,362708247825743874,0.9999999999882372,Tattoo markers. #funatwork #mom http://t.co/Y2...,"[3172, 0, 0, 268056, 0]",0.744509,tattoo markers. #funatwork #mom http://t.co/y2...,tattoo markers . <hashtag> fun at work </hasht...,"[tattoo, #mom]","[tattoo, markers, ., <hashtag>, fun, at, work,...",2,...,0.999973,0.872795,0.8284,0.872795,0.87191,0.864451,0.849355,0.737899,[#mom],1
6,783869653558956033,0.99999999998679,RT @KKL_fan: Beautiful Ad!! \n#TrumpPence16 \n...,"[4, 0, 533, 0, 0, 0, 0, 0, 0, 0, 0]",0.149115,rt @kkl_fan: beautiful ad!! \n#trumppence16 \n...,rt <user> : beautiful ad ! <repeated> <hashtag...,"[rt, beautiful, #moms]","[rt, <user>, :, beautiful, ad, !, <hashtag>, t...",3,...,0.995786,0.182804,0.184161,0.182804,0.181328,0.183467,0.186367,0.187729,[#moms],1
7,443820277991227392,0.9999999999805111,RT @goodfirednada: #LORDJASONJEROME @LORDJASON...,"[4, 0, 0, 0, 0, 184303, 0, 111748, 208615, 0]",0.120411,rt @goodfirednada: #lordjasonjerome @lordjason...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames, #gain, #followback, #android]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",5,...,0.745385,0.115092,0.107949,0.115092,0.114885,0.112703,0.116738,0.110964,[#iphonegames],1
8,780094360566325250,0.9999999999802708,My girl💘 #coolkids #nyckids #citykids #mygirl ...,"[30, 0, 0, 0, 0, 0, 0, 268056, 0, 0, 0, 0, 0]",0.640879,my girl💘 #coolkids #nyckids #citykids #mygirl ...,my girl 💘 <hashtag> cool kids </hashtag> <hash...,"[my, #mom]","[my, girl, <hashtag>, cool, kids, <hashtag>, n...",2,...,0.999928,0.649088,0.685884,0.649088,0.64671,0.648431,0.647792,0.658387,[#mom],1
9,455557903442001922,0.9999999999796404,What if? #changeyourlife #extraincome #ItWorks...,"[87, 0, 0, 0, 0, 616487, 0, 0, 0, 0]",0.611903,what if? #changeyourlife #extraincome #itworks...,what if ? <hashtag> change your life </hashtag...,"[what, #moms]","[what, if, ?, <hashtag>, change, your, life, <...",2,...,0.999987,0.54719,0.544872,0.54719,0.544565,0.533584,0.526693,0.508063,[#moms],1


The model trained on only lowercased data (without preprocessing) assigns very high probabilities to irrelevant tweets.

In [92]:
# Five filtered tweets ranked highest by the lowercased-text model.
(
    df_filtered
    .sort_values(by='y_predict_proba_lowercased', ascending=False)
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,328029087752929280,0.9999999999905302,#mothersdaygift #itworks #Tighten #Tone #FullT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 616487, 0, 0]",0.607089,#mothersdaygift #itworks #tighten #tone #fullt...,<hashtag> mothers day gift </hashtag> <hashtag...,[#moms],"[<hashtag>, mothers, day, gift, <hashtag>, it,...",1,...,1.0,0.539414,0.56757,0.539414,0.537024,0.5298,0.527055,0.495295,[#moms],1
1,488313257741287424,0.9999967353806156,#itWorks #Skinny #healthy #babyBounceBack #bab...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 269014, 0, 0, 0]",0.749485,#itworks #skinny #healthy #babybounceback #bab...,<hashtag> it works </hashtag> <hashtag> skinny...,[#sexy],"[<hashtag>, it, works, <hashtag>, skinny, <has...",1,...,1.0,0.343103,0.378475,0.343103,0.341331,0.349394,0.362769,0.321221,[#sexy],1
2,479375690911862784,0.9999983752894293,#WCW _mylannnn #sexy #lilbooty #prettyface #Bl...,"[0, 0, 269014, 0, 0, 0, 0, 0, 0]",0.829723,#wcw _mylannnn #sexy #lilbooty #prettyface #bl...,<hashtag> wcw </hashtag> _mylannnn <hashtag> s...,[#sexy],"[<hashtag>, wcw, <hashtag>, sexy, <hashtag>, l...",1,...,1.0,0.116543,0.114173,0.116543,0.115749,0.11841,0.119056,0.089743,[#sexy],1
3,737972811143221248,0.9999972745549393,I'm live: https://t.co/zwQRiKSP50 #sexy #femdo...,"[0, 0, 0, 269014, 0, 0, 0, 0]",0.776279,i'm live: https://t.co/zwqriksp50 #sexy #femdo...,i am live : <url> <hashtag> sexy </hashtag> <h...,[#sexy],"[i, am, live, :, <url>, <hashtag>, sexy, <hash...",1,...,1.0,0.308325,0.305389,0.308325,0.307622,0.287075,0.296658,0.213396,[#sexy],1
4,946788483754053632,0.999999895358982,Floorwork @tcdanceacademy #yogatrapezetraining...,"[0, 0, 0, 269014, 0, 0, 0, 0]",0.776279,floorwork @tcdanceacademy #yogatrapezetraining...,floorwork <user> <hashtag> yoga trapeze traini...,[#sexy],"[<user>, <hashtag>, yoga, trapeze, training, <...",1,...,1.0,0.017406,0.013319,0.017406,0.01738,0.018747,0.019393,0.020997,[#sexy],1


On the other hand, the model trained on preprocessed text performs much better: its top 5 tweets are all relevant to unemployment.

In [93]:
# Five filtered tweets ranked highest by the ekphrasis-text model.
(
    df_filtered
    .sort_values(by='y_predict_proba_ekphrasis', ascending=False)
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,467423895654395904,0.9999882321081556,unemployed biketoworkday rodeanyway,"[32814, 0, 0]",0.865931,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,[unemployed],[unemployed],1,...,0.999982,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1
1,459512998047277056,0.999987560761154,Officially unemployed,"[3621, 32814]",0.991599,officially unemployed,officially unemployed,"[officially, unemployed]","[officially, unemployed]",2,...,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1
2,360837810544058371,0.9999909659593824,Officially unemployed 😔,"[3621, 32814, 0]",0.991599,officially unemployed 😔,officially unemployed 😔,"[officially, unemployed]","[officially, unemployed]",2,...,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1
3,366301696806223873,0.9999914615842797,Officially unemployed 👌,"[3621, 32814, 0]",0.991599,officially unemployed 👌,officially unemployed 👌,"[officially, unemployed]","[officially, unemployed]",2,...,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1
4,628295079162056704,0.999988388445644,Officially unemployed 👀,"[3621, 32814, 0]",0.991599,officially unemployed 👀,officially unemployed 👀,"[officially, unemployed]","[officially, unemployed]",2,...,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1


Adding the number of words picked up by GloVe as a feature in the covariates doesn't change the results, at least for the top tweets.

In [94]:
# Five filtered tweets ranked highest by the model that also uses the picked-up-word count.
(
    df_filtered
    .sort_values(by='y_predict_proba_ekphrasis_new', ascending=False)
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,467423895654395904,0.9999882321081556,unemployed biketoworkday rodeanyway,"[32814, 0, 0]",0.865931,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,[unemployed],[unemployed],1,...,0.999982,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1
1,360837810544058371,0.9999909659593824,Officially unemployed 😔,"[3621, 32814, 0]",0.991599,officially unemployed 😔,officially unemployed 😔,"[officially, unemployed]","[officially, unemployed]",2,...,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1
2,459512998047277056,0.999987560761154,Officially unemployed,"[3621, 32814]",0.991599,officially unemployed,officially unemployed,"[officially, unemployed]","[officially, unemployed]",2,...,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1
3,628295079162056704,0.999988388445644,Officially unemployed 👀,"[3621, 32814, 0]",0.991599,officially unemployed 👀,officially unemployed 👀,"[officially, unemployed]","[officially, unemployed]",2,...,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1
4,366301696806223873,0.9999914615842797,Officially unemployed 👌,"[3621, 32814, 0]",0.991599,officially unemployed 👌,officially unemployed 👌,"[officially, unemployed]","[officially, unemployed]",2,...,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[unemployed],1


In [95]:
# Columns exported for the slides, as (name in df_filtered, name in the exported table).
_export_columns = [
    ('text', 'text'),
    ('glove_pos_model', 'glove_logit_basic_proba'),
    ('word_picked_up', 'words_glove_no_lower'),
    ('len_list_words_picked_up', 'nb_words_glove_no_lower'),
    ('y_predict_proba_lowercased', 'glove_logit_lower_proba'),
    ('len_list_words_picked_up_lowercased', 'nb_words_glove_lower'),
    ('word_picked_up_by_glove_lowercased', 'words_glove_lower'),
    ('y_predict_proba_ekphrasis', 'glove_logit_preprocess_proba'),
    ('len_list_words_picked_up_ekphrasis', 'nb_words_glove_preprocess'),
    ('word_picked_up_by_glove_ekphrasis', 'words_glove_preprocess'),
    ('y_predict_proba_ekphrasis_new', 'glove_logit_preprocess_extra_feature_proba'),
    ('glove_cnn_class_pred', 'glove_cnn_proba'),
]

# Select the source columns in that order, then relabel them positionally.
df_filtered_save = df_filtered[[source for source, _ in _export_columns]]
df_filtered_save.columns = [exported for _, exported in _export_columns]
df_filtered_save.head()

Unnamed: 0,text,glove_logit_basic_proba,words_glove_no_lower,nb_words_glove_no_lower,glove_logit_lower_proba,nb_words_glove_lower,words_glove_lower,glove_logit_preprocess_proba,nb_words_glove_preprocess,words_glove_preprocess,glove_logit_preprocess_extra_feature_proba,glove_cnn_proba
0,How To Avoid Work-at-Home Scams http://t.co/mm...,0.9999999999917516,[#moms],1,0.734378,6,"[how, to, avoid, work-at-home, scams, #moms]",0.02065,12,"[how, to, avoid, work, -, at, -, home, scams, ...",0.012303,0.300822
1,RT @FastCompany Instead Of Taking Your Daughte...,0.9999999999906112,[#moms],1,0.355912,12,"[rt, instead, of, taking, your, daughters, to,...",0.127769,17,"[rt, <user>, instead, of, taking, your, daught...",0.103971,0.116052
2,#mothersdaygift #itworks #Tighten #Tone #FullT...,0.9999999999905302,[#moms],1,1.0,1,[#moms],0.539414,29,"[<hashtag>, mothers, day, gift, <hashtag>, it,...",0.56757,0.607089
3,Find Other Moms In Your Area http://t.co/PSkL3...,0.999999999990338,[#moms],1,0.621039,11,"[find, other, moms, in, your, area, to, meet, ...",0.161171,30,"[find, other, moms, in, your, area, <url>, to,...",0.163799,0.216608
4,EntrepreneurMoms Pilot Turned Linen-Maker Jenn...,0.9999999999901836,[#moms],1,0.907417,6,"[pilot, turned, jenny, main, photo, #moms]",0.254525,21,"[pilot, turned, linen, -, maker, jenny, davids...",0.292834,0.35823


In [96]:
# Export the comparison table. The output directory defaults to the original location
# but can be redirected with the GLOVE_RESULTS_DIR environment variable, so the notebook
# is not tied to one machine's home directory.
RESULTS_DIR = os.environ.get(
    "GLOVE_RESULTS_DIR",
    "/home/manuto/Documents/world_bank/bert_twitter_labor/data/glove_cnn_prediction_data/glove_predictions/results_slides",
)
df_filtered_save.to_csv(os.path.join(RESULTS_DIR, "glove_filtered.csv"), index=False)

### Threshold analysis on filtered set

#### Minimum threshold of picked up words = 2

Top 5 tweets for model trained on tweets where at least 2 words are picked up

In [264]:
# Filtered set ranked by the model trained with a minimum threshold of 2 picked-up words.
df_filtered_2 = (
    df_filtered
    .sort_values(by='y_predict_proba_ekphrasis_2', ascending=False)
    .reset_index(drop=True)
)
df_filtered_2.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,467423895654395904,0.999988,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,[unemployed],[unemployed],1,1,0.999982,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,360837810544058371,0.999991,Officially unemployed 😔,officially unemployed 😔,officially unemployed 😔,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,628295079162056704,0.999988,Officially unemployed 👀,officially unemployed 👀,officially unemployed 👀,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,459512998047277056,0.999988,Officially unemployed,officially unemployed,officially unemployed,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,366301696806223873,0.999991,Officially unemployed 👌,officially unemployed 👌,officially unemployed 👌,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Top 5 tweets for model trained on tweets where at least 2 words are picked up, excluding tweets where fewer than 2 words were picked up

In [265]:
# Same ranking, restricted to tweets with at least 2 picked-up words.
_has_min_words_2 = df_filtered_2['len_list_words_picked_up_ekphrasis'] >= 2
df_filtered_2.loc[_has_min_words_2].reset_index(drop=True).head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,360837810544058371,0.999991,Officially unemployed 😔,officially unemployed 😔,officially unemployed 😔,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,628295079162056704,0.999988,Officially unemployed 👀,officially unemployed 👀,officially unemployed 👀,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,459512998047277056,0.999988,Officially unemployed,officially unemployed,officially unemployed,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,366301696806223873,0.999991,Officially unemployed 👌,officially unemployed 👌,officially unemployed 👌,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,363167939265961986,0.999991,Never unemployed,never unemployed,never unemployed,"[never, unemployed]","[never, unemployed]",2,2,0.998368,0.999996,0.999997,0.999996,0.999996,0.999996,0.999995,0.999994


#### Minimum threshold of picked up words = 3

Top 5 tweets for model trained on tweets where at least 3 words are picked up

In [266]:
# Filtered set ranked by the model trained with a minimum threshold of 3 picked-up words.
df_filtered_3 = (
    df_filtered
    .sort_values(by='y_predict_proba_ekphrasis_3', ascending=False)
    .reset_index(drop=True)
)
df_filtered_3.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,467423895654395904,0.999988,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,[unemployed],[unemployed],1,1,0.999982,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,366301696806223873,0.999991,Officially unemployed 👌,officially unemployed 👌,officially unemployed 👌,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,360837810544058371,0.999991,Officially unemployed 😔,officially unemployed 😔,officially unemployed 😔,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,628295079162056704,0.999988,Officially unemployed 👀,officially unemployed 👀,officially unemployed 👀,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,459512998047277056,0.999988,Officially unemployed,officially unemployed,officially unemployed,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Top 5 tweets for model trained on tweets where at least 3 words are picked up, excluding tweets where fewer than 3 words were picked up

In [267]:
# Same ranking, restricted to tweets with at least 3 picked-up words.
_has_min_words_3 = df_filtered_3['len_list_words_picked_up_ekphrasis'] >= 3
df_filtered_3.loc[_has_min_words_3].reset_index(drop=True).head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,351971366259535874,1.0,Ily workaholics ☺😊,ily workaholics ☺😊,ily workaholics ☺ 😊,"[ily, workaholics]","[ily, workaholics, ☺]",2,3,0.999935,0.999983,0.999952,0.999983,0.999982,0.999981,0.999979,0.999962
1,291581684883812352,1.0,workaholics #GETWEIRD,workaholics #getweird,workaholics <hashtag> getweird </hashtag>,[workaholics],"[workaholics, <hashtag>, getweird]",1,3,1.0,0.999977,0.99996,0.999977,0.999976,0.999975,0.999975,0.999903
2,649649233084026880,0.999988,I'm unemployed,i'm unemployed,i am unemployed,[unemployed],"[i, am, unemployed]",1,3,0.999982,0.999957,0.999968,0.999957,0.999956,0.99995,0.999946,0.999937
3,915207920572358656,0.999996,That's unemployed,that's unemployed,that ' s unemployed,[unemployed],"[that, ', s, unemployed]",1,4,0.999982,0.999834,0.999679,0.999834,0.99983,0.999799,0.999792,0.999664
4,279790255048118274,0.99999,jobless #nothappy,jobless #nothappy,jobless <hashtag> not happy </hashtag>,[jobless],"[jobless, <hashtag>, not, happy]",1,4,0.999884,0.999541,0.999318,0.999541,0.99953,0.999442,0.999452,0.998075


#### Minimum threshold of picked up words = 4

Top 5 tweets for model trained on tweets where at least 4 words are picked up

In [268]:
# Filtered set ranked by the model trained with a minimum threshold of 4 picked-up words.
df_filtered_4 = (
    df_filtered
    .sort_values(by='y_predict_proba_ekphrasis_4', ascending=False)
    .reset_index(drop=True)
)
df_filtered_4.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,467423895654395904,0.999988,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,[unemployed],[unemployed],1,1,0.999982,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,366301696806223873,0.999991,Officially unemployed 👌,officially unemployed 👌,officially unemployed 👌,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,360837810544058371,0.999991,Officially unemployed 😔,officially unemployed 😔,officially unemployed 😔,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,459512998047277056,0.999988,Officially unemployed,officially unemployed,officially unemployed,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,628295079162056704,0.999988,Officially unemployed 👀,officially unemployed 👀,officially unemployed 👀,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Top 5 tweets for model trained on tweets where at least 4 words are picked up, excluding tweets where fewer than 4 words were picked up

In [269]:
# Same ranking, restricted to tweets with at least 4 picked-up words.
_has_min_words_4 = df_filtered_4['len_list_words_picked_up_ekphrasis'] >= 4
df_filtered_4.loc[_has_min_words_4].reset_index(drop=True).head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,915207920572358656,0.999996,That's unemployed,that's unemployed,that ' s unemployed,[unemployed],"[that, ', s, unemployed]",1,4,0.999982,0.999834,0.999679,0.999834,0.99983,0.999799,0.999792,0.999664
1,279790255048118274,0.99999,jobless #nothappy,jobless #nothappy,jobless <hashtag> not happy </hashtag>,[jobless],"[jobless, <hashtag>, not, happy]",1,4,0.999884,0.999541,0.999318,0.999541,0.99953,0.999442,0.999452,0.998075
2,361310806324936705,0.999999,"Homework time, rested enough! 😳","homework time, rested enough! 😳","homework time , rested enough ! 😳","[homework, rested]","[homework, time, ,, rested, enough, !]",2,6,0.99945,0.999397,0.998939,0.999397,0.999387,0.999268,0.999241,0.999417
3,926698476423057408,0.999991,unemployed tweet: pussy juice,unemployed tweet: pussy juice,unemployed tweet : pussy juice,"[unemployed, pussy, juice]","[unemployed, tweet, :, pussy, juice]",3,5,0.999334,0.999354,0.999302,0.999354,0.999347,0.999216,0.999222,0.998407
4,327965523281317888,0.899829,Finally off work..stepping out tonight! ✌💃,finally off work..stepping out tonight! ✌💃,finally off work . <repeated> stepping out ton...,"[finally, off, out]","[finally, off, work, ., stepping, out, tonight...",3,9,0.991541,0.998574,0.998233,0.998574,0.998557,0.998398,0.998068,0.996591


#### Minimum threshold of picked up words = 5

Top 5 tweets for model trained on tweets where at least 5 words are picked up

In [270]:
# Filtered set ranked by the model trained with a minimum threshold of 5 picked-up words.
df_filtered_5 = (
    df_filtered
    .sort_values(by='y_predict_proba_ekphrasis_5', ascending=False)
    .reset_index(drop=True)
)
df_filtered_5.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,467423895654395904,0.999988,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,[unemployed],[unemployed],1,1,0.999982,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,360837810544058371,0.999991,Officially unemployed 😔,officially unemployed 😔,officially unemployed 😔,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,366301696806223873,0.999991,Officially unemployed 👌,officially unemployed 👌,officially unemployed 👌,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,628295079162056704,0.999988,Officially unemployed 👀,officially unemployed 👀,officially unemployed 👀,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,459512998047277056,0.999988,Officially unemployed,officially unemployed,officially unemployed,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Top 5 tweets for model trained on tweets where at least 5 words are picked up, excluding tweets where fewer than 5 words were picked up

In [271]:
# Filtered set ranked by the model trained with a minimum threshold of 10 picked-up words.
# The ranking must use that model's own probability column, 'y_predict_proba_ekphrasis_10';
# sorting by the threshold-5 column would just duplicate df_filtered_5.
# NOTE(review): the markdown heading above this cell announces the threshold-5 view that
# excludes tweets with too few picked-up words; this cell does not compute that view.
df_filtered_10 = (
    df_filtered
    .sort_values(by='y_predict_proba_ekphrasis_10', ascending=False)
    .reset_index(drop=True)
)
df_filtered_10.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,467423895654395904,0.999988,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,unemployed biketoworkday rodeanyway,[unemployed],[unemployed],1,1,0.999982,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,360837810544058371,0.999991,Officially unemployed 😔,officially unemployed 😔,officially unemployed 😔,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,366301696806223873,0.999991,Officially unemployed 👌,officially unemployed 👌,officially unemployed 👌,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,628295079162056704,0.999988,Officially unemployed 👀,officially unemployed 👀,officially unemployed 👀,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,459512998047277056,0.999988,Officially unemployed,officially unemployed,officially unemployed,"[officially, unemployed]","[officially, unemployed]",2,2,0.999858,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Analysis random set 

In [111]:
# Append the number of words picked up by GloVe (ekphrasis-preprocessed text) as one
# extra trailing feature column to the embedding matrix of the BERT-ranked random set.
#
# The result is sized from the data instead of a hard-coded (22000, 201) buffer, so a
# different number of tweets or embedding width neither raises an IndexError nor
# leaves all-zero padding rows behind.
random_array_new_feature = np.column_stack([
    # one flat feature row per tweet, as float64 like the former np.zeros buffer
    np.asarray(random_vecs_bert_mean_ekphrasis, dtype=float).reshape(
        len(random_vecs_bert_mean_ekphrasis), -1
    ),
    # word counts taken in positional row order; assumes df_random_bert rows are in
    # the same order as the embedding matrix (column_stack raises on length mismatch)
    df_random_bert['len_list_words_picked_up_ekphrasis'].to_numpy(),
])

In [112]:
# Score the BERT-ranked random set with every GloVe classifier variant and store the
# second column of predict_proba (presumably the positive class) in df_random_bert.
# Each entry is (target column, classifier, feature matrix); order is the column order.
_bert_scoring_plan = [
    ('y_predict_proba_lowercased', clf_lowercased, random_vecs_bert_mean_lowercased),
    ('y_predict_proba_ekphrasis', clf_ekphrasis, random_vecs_bert_mean_ekphrasis),
    # this variant is fed the matrix that carries the extra word-count feature
    ('y_predict_proba_ekphrasis_new', clf_ekphrasis_new, random_array_new_feature),
    ('y_predict_proba_ekphrasis_2', clf_ekphrasis_2, random_vecs_bert_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_3', clf_ekphrasis_3, random_vecs_bert_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_4', clf_ekphrasis_4, random_vecs_bert_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_5', clf_ekphrasis_5, random_vecs_bert_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_10', clf_ekphrasis_10, random_vecs_bert_mean_ekphrasis),
]
for _column, _model, _features in _bert_scoring_plan:
    df_random_bert[_column] = _model.predict_proba(_features)[:, 1]
df_random_bert.head()

Unnamed: 0,tweet_id,pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,...,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,728443901963444224,0.938718,I got fired today. 😂,"[11, 144, 9118, 382215, 0]",0.987249,i got fired today. 😂,i got fired today . 😂,"[got, fired, today.]","[i, got, fired, today.]","[i, got, fired, today, .]",...,4,5,0.840077,0.864211,0.861721,0.864211,0.863059,0.86536,0.849934,0.713432
1,785875357358358529,0.936386,I was fired today. I'm a badass,"[11, 94, 9118, 382215, 0, 12, 8267]",0.764424,i was fired today. i'm a badass,i was fired today . i am a badass,"[was, fired, today., a, badass]","[i, was, fired, today., a, badass]","[i, was, fired, today, ., i, am, a, badass]",...,6,9,0.741759,0.875303,0.852757,0.875303,0.874311,0.873575,0.867218,0.791571
2,336917153766834177,0.935609,Lol Vinny just got fired,"[89, 37790, 60, 144, 9118]",0.837681,lol vinny just got fired,lol vinny just got fired,"[just, got, fired]","[lol, vinny, just, got, fired]","[lol, vinny, just, got, fired]",...,5,5,0.822723,0.935286,0.93592,0.935286,0.934522,0.933309,0.926477,0.873339
3,285522710845603841,0.934539,Andy Reid got fired... About time,"[5423, 22022, 144, 0, 122, 136]",0.679201,andy reid got fired... about time,andy reid got fired . <repeated> about time,"[got, time]","[andy, reid, got, about, time]","[andy, reid, got, fired, ., about, time]",...,5,7,0.800994,0.974387,0.966022,0.974387,0.974051,0.972589,0.974389,0.940988
4,496205184654008321,0.934478,i got fired https://t.co/FjMrgTf8kJ,"[11, 144, 9118, 0, 0]",0.959649,i got fired https://t.co/fjmrgtf8kj,i got fired <url>,"[i, got, fired]","[i, got, fired]","[i, got, fired, <url>]",...,3,4,0.860262,0.656734,0.675738,0.656734,0.655003,0.65756,0.6462,0.462698


In [115]:
# Select the columns exported for the slides and give them presentation names.
# Keys are the columns of df_random_bert (in export order), values the exported names.
_bert_export_names = {
    'text': 'text',
    'pos_model': 'bert_proba',
    'word_picked_up': 'words_glove_no_lower',
    'len_list_words_picked_up': 'nb_words_glove_no_lower',
    'y_predict_proba_lowercased': 'glove_logit_lower_proba',
    'len_list_words_picked_up_lowercased': 'nb_words_glove_lower',
    'word_picked_up_by_glove_lowercased': 'words_glove_lower',
    'y_predict_proba_ekphrasis': 'glove_logit_preprocess_proba',
    'len_list_words_picked_up_ekphrasis': 'nb_words_glove_preprocess',
    'word_picked_up_by_glove_ekphrasis': 'words_glove_preprocess',
    'y_predict_proba_ekphrasis_new': 'glove_logit_preprocess_extra_feature_proba',
    'glove_cnn_class_pred': 'glove_cnn_proba',
}
df_random_bert_save = df_random_bert[list(_bert_export_names)].rename(columns=_bert_export_names)
df_random_bert_save.head()

Unnamed: 0,text,bert_proba,words_glove_no_lower,nb_words_glove_no_lower,glove_logit_lower_proba,nb_words_glove_lower,words_glove_lower,glove_logit_preprocess_proba,nb_words_glove_preprocess,words_glove_preprocess,glove_logit_preprocess_extra_feature_proba,glove_cnn_proba
0,I got fired today. 😂,0.938718,"[got, fired, today.]",3,0.840077,4,"[i, got, fired, today.]",0.864211,5,"[i, got, fired, today, .]",0.861721,0.987249
1,I was fired today. I'm a badass,0.936386,"[was, fired, today., a, badass]",5,0.741759,6,"[i, was, fired, today., a, badass]",0.875303,9,"[i, was, fired, today, ., i, am, a, badass]",0.852757,0.764424
2,Lol Vinny just got fired,0.935609,"[just, got, fired]",3,0.822723,5,"[lol, vinny, just, got, fired]",0.935286,5,"[lol, vinny, just, got, fired]",0.93592,0.837681
3,Andy Reid got fired... About time,0.934539,"[got, time]",2,0.800994,5,"[andy, reid, got, about, time]",0.974387,7,"[andy, reid, got, fired, ., about, time]",0.966022,0.679201
4,i got fired https://t.co/FjMrgTf8kJ,0.934478,"[i, got, fired]",3,0.860262,3,"[i, got, fired]",0.656734,4,"[i, got, fired, <url>]",0.675738,0.959649


In [116]:
# Write the BERT random-set comparison table to disk, without the index column.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
_bert_random_csv_path = "/home/manuto/Documents/world_bank/bert_twitter_labor/data/glove_cnn_prediction_data/glove_predictions/results_slides/bert_random.csv"
df_random_bert_save.to_csv(_bert_random_csv_path, index=False)

In [97]:
# Append the number of words picked up by GloVe (ekphrasis-preprocessed text) as one
# extra trailing feature column to the embedding matrix of the GloVe-ranked random set.
#
# The result is sized from the data instead of a hard-coded (22000, 201) buffer, so a
# different number of tweets or embedding width neither raises an IndexError nor
# leaves all-zero padding rows behind.
random_array_new_feature = np.column_stack([
    # one flat feature row per tweet, as float64 like the former np.zeros buffer
    np.asarray(random_vecs_glove_mean_ekphrasis, dtype=float).reshape(
        len(random_vecs_glove_mean_ekphrasis), -1
    ),
    # word counts taken in positional row order; assumes df_random rows are in the
    # same order as the embedding matrix (column_stack raises on length mismatch)
    df_random['len_list_words_picked_up_ekphrasis'].to_numpy(),
])

In [98]:
# Score the GloVe-ranked random set with every GloVe classifier variant and store the
# second column of predict_proba (presumably the positive class) in df_random.
# Each entry is (target column, classifier, feature matrix); order is the column order.
_glove_scoring_plan = [
    ('y_predict_proba_lowercased', clf_lowercased, random_vecs_glove_mean_lowercased),
    ('y_predict_proba_ekphrasis', clf_ekphrasis, random_vecs_glove_mean_ekphrasis),
    # this variant is fed the matrix that carries the extra word-count feature
    ('y_predict_proba_ekphrasis_new', clf_ekphrasis_new, random_array_new_feature),
    ('y_predict_proba_ekphrasis_2', clf_ekphrasis_2, random_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_3', clf_ekphrasis_3, random_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_4', clf_ekphrasis_4, random_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_5', clf_ekphrasis_5, random_vecs_glove_mean_ekphrasis),
    ('y_predict_proba_ekphrasis_10', clf_ekphrasis_10, random_vecs_glove_mean_ekphrasis),
]
for _column, _model, _features in _glove_scoring_plan:
    df_random[_column] = _model.predict_proba(_features)[:, 1]
df_random.head()

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,1067199112435679233,1.0,December Giveaway \n#storkvisionmarion #storkv...,"[3129, 2250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 268...",0.353126,december giveaway \n#storkvisionmarion #storkv...,december giveaway <hashtag> stork vision mario...,"[december, giveaway, #mom]","[december, giveaway, <hashtag>, stork, vision,...",3,...,0.983625,0.24833,0.331122,0.24833,0.246965,0.246123,0.259627,0.17279,[#mom],1
1,444443081644179456,1.0,RT @Reza848: #LORDJASONJEROME @LORDJASONJEROME...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @reza848: #lordjasonjerome @lordjasonjerome...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",2,...,0.997048,0.189556,0.194416,0.189556,0.188968,0.189258,0.189435,0.177733,[#iphonegames],1
2,443970350230298624,1.0,RT @HingerLovera: #LORDJASONJEROME @LORDJASONJ...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @hingerlovera: #lordjasonjerome @lordjasonj...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",2,...,0.997048,0.189556,0.194416,0.189556,0.188968,0.189258,0.189435,0.177733,[#iphonegames],1
3,444230598459392000,1.0,RT @Bethany2211: #LORDJASONJEROME @LORDJASONJE...,"[4, 0, 0, 0, 0, 0, 59614, 0, 0, 0]",0.58685,rt @bethany2211: #lordjasonjerome @lordjasonje...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",2,...,0.997048,0.054976,0.056643,0.054976,0.054914,0.057704,0.057663,0.066567,[#iphonegames],1
4,444277343730008064,1.0,RT @azamater19: #LORDJASONJEROME @LORDJASONJER...,"[4, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.373738,rt @azamater19: #lordjasonjerome @lordjasonjer...,rt <user> : <hashtag> lordjasonjerome </hashta...,"[rt, #iphonegames]","[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",2,...,0.997048,0.018662,0.01849,0.018662,0.018675,0.020568,0.02109,0.019658,[#iphonegames],1


As on the filtered set, using a model that was trained only on lowercased data yields completely irrelevant results.

In [99]:
# Top of the random set when ranked by the lowercased-model score, highest first.
(
    df_random
    .sort_values(by='y_predict_proba_lowercased', ascending=False)
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,610540812754132992,0.999996,#threesome #reallesbianexposed \nhttp://t.co/V...,"[0, 0, 0, 0]",0.441638,#threesome #reallesbianexposed \nhttp://t.co/v...,<hashtag> threesome </hashtag> <hashtag> real ...,[#ass],"[<hashtag>, threesome, <hashtag>, real, lesbia...",1,...,1.0,0.307233,0.245269,0.307233,0.305873,0.313333,0.315944,0.187831,[#ass],1
1,341989350650814464,0.999997,#Trifling #ass #Niggas https://t.co/WRv4dpQHNG,"[0, 349204, 0, 0]",0.824067,#trifling #ass #niggas https://t.co/wrv4dpqhng,<hashtag> trifling </hashtag> <hashtag> ass </...,[#ass],"[<hashtag>, trifling, <hashtag>, ass, <hashtag...",1,...,1.0,0.416145,0.294801,0.416145,0.412668,0.434646,0.412222,0.271311,[#ass],1
2,433294693183787008,0.999998,#ass #feet http://t.co/woHmtU1RJD,"[349204, 0, 0]",0.59578,#ass #feet http://t.co/wohmtu1rjd,<hashtag> ass </hashtag> <hashtag> feet </hash...,[#ass],"[<hashtag>, ass, <hashtag>, feet, <url>]",1,...,1.0,0.260524,0.161764,0.260524,0.258107,0.266978,0.272988,0.146677,[#ass],1
3,619387321151037440,0.999996,#ass #babes \nhttp://t.co/o0d77Hf1bk\n#latina ...,"[349204, 0, 0, 0]",0.59578,#ass #babes \nhttp://t.co/o0d77hf1bk\n#latina ...,<hashtag> ass </hashtag> <hashtag> babes </has...,[#ass],"[<hashtag>, ass, <hashtag>, babes, <url>, <has...",1,...,1.0,0.150904,0.121078,0.150904,0.150265,0.155618,0.154512,0.087654,[#ass],1
4,280459904865693697,0.999998,#ass http://t.co/8UE7WWEH,"[349204, 0]",0.59578,#ass http://t.co/8ue7wweh,<hashtag> ass </hashtag> <url>,[#ass],"[<hashtag>, ass, <url>]",1,...,1.0,0.075355,0.049055,0.075355,0.074751,0.07887,0.080857,0.039335,[#ass],1


When using the model trained on preprocessed data, the top tweets are now "cleaner", but they are still not very relevant to the topic of unemployment.

In [100]:
# First 50 tweets of the random set when ranked by the ekphrasis-model score,
# highest first.
(
    df_random
    .sort_values(by='y_predict_proba_ekphrasis', ascending=False)
    .reset_index(drop=True)
    .head(50)
)

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,531330010204733440,0.999988,Emotionally drained,"[11814, 23137]",0.782337,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,...,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[drained],1
1,634556365634150400,0.999993,emotionally drained,"[11814, 23137]",0.782337,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,...,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[emotionally, drained]",2
2,394584354086600705,0.999989,Emotionally drained 😩,"[11814, 23137, 0]",0.782337,emotionally drained 😩,emotionally drained 😩,"[emotionally, drained]","[emotionally, drained]",2,...,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[drained],1
3,429894177703591936,1.0,restless,[29812],0.570592,restless,restless,[restless],[restless],1,...,0.999954,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[restless],1
4,530639166330376194,0.999989,😂😂😂😂😂 dormiree,"[0, 417675]",0.744703,😂😂😂😂😂 dormiree,😂 😂 😂 😂 😂 dormiree,[dormiree],[dormiree],1,...,0.999965,1.0,1.0,1.0,1.0,1.0,1.0,0.999998,[dormiree],1
5,430634728257892352,0.999995,Constant sorrows,"[12956, 48195]",0.802352,constant sorrows,constant sorrows,"[constant, sorrows]","[constant, sorrows]",2,...,0.999996,1.0,1.0,1.0,1.0,1.0,1.0,0.999999,[sorrows],1
6,409070624519102464,0.999998,Weak emotionally,"[2774, 11814]",0.340478,weak emotionally,weak emotionally,"[weak, emotionally]","[weak, emotionally]",2,...,0.999915,1.0,1.0,1.0,1.0,1.0,1.0,0.999999,[emotionally],1
7,633418991868973056,0.999988,drained,[23137],0.698684,drained,drained,[drained],[drained],1,...,0.999975,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[drained],1
8,430942493945757697,0.999995,So emotionally drained,"[56, 11814, 23137]",0.879681,so emotionally drained,so emotionally drained,"[so, emotionally, drained]","[so, emotionally, drained]",3,...,0.999936,1.0,1.0,1.0,1.0,1.0,1.0,0.999999,"[emotionally, drained]",2
9,551266254234066947,0.99999,clumsy,[20733],0.390743,clumsy,clumsy,[clumsy],[clumsy],1,...,0.988624,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[clumsy],1


There is not much difference when adding the number of words picked up by GloVe as a feature.

In [101]:
# Top of the random set when ranked by the score of the model that also receives the
# picked-up word count as a feature, highest first.
(
    df_random
    .sort_values(by='y_predict_proba_ekphrasis_new', ascending=False)
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,tweet_id,glove_pos_model,text,text_tokenized,glove_cnn_class_pred,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,...,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10,word_picked_up,len_list_words_picked_up
0,394584354086600705,0.999989,Emotionally drained 😩,"[11814, 23137, 0]",0.782337,emotionally drained 😩,emotionally drained 😩,"[emotionally, drained]","[emotionally, drained]",2,...,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[drained],1
1,634556365634150400,0.999993,emotionally drained,"[11814, 23137]",0.782337,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,...,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[emotionally, drained]",2
2,531330010204733440,0.999988,Emotionally drained,"[11814, 23137]",0.782337,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,...,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[drained],1
3,429894177703591936,1.0,restless,[29812],0.570592,restless,restless,[restless],[restless],1,...,0.999954,1.0,1.0,1.0,1.0,1.0,1.0,1.0,[restless],1
4,530639166330376194,0.999989,😂😂😂😂😂 dormiree,"[0, 417675]",0.744703,😂😂😂😂😂 dormiree,😂 😂 😂 😂 😂 dormiree,[dormiree],[dormiree],1,...,0.999965,1.0,1.0,1.0,1.0,1.0,1.0,0.999998,[dormiree],1


In [103]:
# Select the columns exported for the slides and give them presentation names.
# Keys are the columns of df_random (in export order), values the exported names.
_glove_export_names = {
    'text': 'text',
    'glove_pos_model': 'glove_logit_basic_proba',
    'word_picked_up': 'words_glove_no_lower',
    'len_list_words_picked_up': 'nb_words_glove_no_lower',
    'y_predict_proba_lowercased': 'glove_logit_lower_proba',
    'len_list_words_picked_up_lowercased': 'nb_words_glove_lower',
    'word_picked_up_by_glove_lowercased': 'words_glove_lower',
    'y_predict_proba_ekphrasis': 'glove_logit_preprocess_proba',
    'len_list_words_picked_up_ekphrasis': 'nb_words_glove_preprocess',
    'word_picked_up_by_glove_ekphrasis': 'words_glove_preprocess',
    'y_predict_proba_ekphrasis_new': 'glove_logit_preprocess_extra_feature_proba',
    'glove_cnn_class_pred': 'glove_cnn_proba',
}
df_random_save = df_random[list(_glove_export_names)].rename(columns=_glove_export_names)
df_random_save.head()

Unnamed: 0,text,glove_logit_basic_proba,words_glove_no_lower,nb_words_glove_no_lower,glove_logit_lower_proba,nb_words_glove_lower,words_glove_lower,glove_logit_preprocess_proba,nb_words_glove_preprocess,words_glove_preprocess,glove_logit_preprocess_extra_feature_proba,glove_cnn_proba
0,December Giveaway \n#storkvisionmarion #storkv...,1.0,[#mom],1,0.983625,3,"[december, giveaway, #mom]",0.24833,40,"[december, giveaway, <hashtag>, stork, vision,...",0.331122,0.353126
1,RT @Reza848: #LORDJASONJEROME @LORDJASONJEROME...,1.0,[#iphonegames],1,0.997048,2,"[rt, #iphonegames]",0.189556,23,"[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",0.194416,0.373738
2,RT @HingerLovera: #LORDJASONJEROME @LORDJASONJ...,1.0,[#iphonegames],1,0.997048,2,"[rt, #iphonegames]",0.189556,23,"[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",0.194416,0.373738
3,RT @Bethany2211: #LORDJASONJEROME @LORDJASONJE...,1.0,[#iphonegames],1,0.997048,2,"[rt, #iphonegames]",0.054976,22,"[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",0.056643,0.58685
4,RT @azamater19: #LORDJASONJEROME @LORDJASONJER...,1.0,[#iphonegames],1,0.997048,2,"[rt, #iphonegames]",0.018662,21,"[rt, <user>, :, <hashtag>, lordjasonjerome, <u...",0.01849,0.373738


In [105]:
# Write the GloVe random-set comparison table to disk, without the index column.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
_glove_random_csv_path = "/home/manuto/Documents/world_bank/bert_twitter_labor/data/glove_cnn_prediction_data/glove_predictions/results_slides/glove_random.csv"
df_random_save.to_csv(_glove_random_csv_path, index=False)

### Threshold analysis on random set

#### Minimum threshold of picked up words = 2

In [168]:
# Random set re-ranked by the clf_ekphrasis_2 score (highest first), with a fresh
# 0..n-1 index so the row label is the rank.
df_random_2 = (
    df_random
    .sort_values(by='y_predict_proba_ekphrasis_2', ascending=False)
    .reset_index(drop=True)
)
df_random_2.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,394584354086600705,0.999989,Emotionally drained 😩,emotionally drained 😩,emotionally drained 😩,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,531330010204733440,0.999988,Emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,634556365634150400,0.999993,emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,429894177703591936,1.0,restless,restless,restless,[restless],[restless],1,1,0.999954,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,530639166330376194,0.999989,😂😂😂😂😂 dormiree,😂😂😂😂😂 dormiree,😂 😂 😂 😂 😂 dormiree,[dormiree],[dormiree],1,1,0.999965,1.0,1.0,1.0,1.0,1.0,1.0,0.999998


In [169]:
# Same ranking, keeping only tweets with at least 2 picked-up words (ekphrasis text).
(
    df_random_2
    .loc[lambda ranked: ranked['len_list_words_picked_up_ekphrasis'] >= 2]
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,394584354086600705,0.999989,Emotionally drained 😩,emotionally drained 😩,emotionally drained 😩,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,531330010204733440,0.999988,Emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,634556365634150400,0.999993,emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,430634728257892352,0.999995,Constant sorrows,constant sorrows,constant sorrows,"[constant, sorrows]","[constant, sorrows]",2,2,0.999996,1.0,1.0,1.0,1.0,1.0,1.0,0.999999
4,409070624519102464,0.999998,Weak emotionally,weak emotionally,weak emotionally,"[weak, emotionally]","[weak, emotionally]",2,2,0.999915,1.0,1.0,1.0,1.0,1.0,1.0,0.999999


#### Minimum threshold of picked up words = 3

In [170]:
# Random set re-ranked by the clf_ekphrasis_3 score (highest first), with a fresh
# 0..n-1 index so the row label is the rank.
df_random_3 = (
    df_random
    .sort_values(by='y_predict_proba_ekphrasis_3', ascending=False)
    .reset_index(drop=True)
)
df_random_3.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,394584354086600705,0.999989,Emotionally drained 😩,emotionally drained 😩,emotionally drained 😩,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,531330010204733440,0.999988,Emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,634556365634150400,0.999993,emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,429894177703591936,1.0,restless,restless,restless,[restless],[restless],1,1,0.999954,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,530639166330376194,0.999989,😂😂😂😂😂 dormiree,😂😂😂😂😂 dormiree,😂 😂 😂 😂 😂 dormiree,[dormiree],[dormiree],1,1,0.999965,1.0,1.0,1.0,1.0,1.0,1.0,0.999998


In [172]:
# Same ranking, keeping only tweets with at least 3 picked-up words (ekphrasis text);
# show the first 10.
(
    df_random_3
    .loc[lambda ranked: ranked['len_list_words_picked_up_ekphrasis'] >= 3]
    .reset_index(drop=True)
    .head(10)
)

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,430942493945757697,0.999995,So emotionally drained,so emotionally drained,so emotionally drained,"[so, emotionally, drained]","[so, emotionally, drained]",3,3,0.999936,1.0,1.0,1.0,1.0,1.0,1.0,0.999999
1,287073122753064960,0.999998,emotionally charged!,emotionally charged!,emotionally charged !,[emotionally],"[emotionally, charged, !]",1,3,1.0,0.999999,0.999999,0.999999,0.999999,0.999999,0.999999,0.999998
2,517537246963306496,1.0,Finally ungrounded yesyesyes,finally ungrounded yesyesyes,finally ungrounded yesyesyes,"[finally, ungrounded, yesyesyes]","[finally, ungrounded, yesyesyes]",3,3,0.999839,0.99999,0.999993,0.99999,0.99999,0.999986,0.999985,0.999917
3,404296747343966208,0.999995,I'm emotionally drained,i'm emotionally drained,i am emotionally drained,"[emotionally, drained]","[i, am, emotionally, drained]",2,4,0.999997,0.99999,0.99999,0.99999,0.99999,0.999988,0.999987,0.999984
4,492391379390439424,0.999997,"Physically, mentally, emotionally tired.","physically, mentally, emotionally tired.","physically , mentally , emotionally tired .",[emotionally],"[physically, ,, mentally, ,, emotionally, tire...",1,7,1.0,0.999976,0.999968,0.999976,0.999975,0.999971,0.99997,0.999918
5,629510894771437569,0.999996,I'm emotionally drained.,i'm emotionally drained.,i am emotionally drained .,[emotionally],"[i, am, emotionally, drained, .]",1,5,1.0,0.999932,0.999922,0.999932,0.999931,0.999921,0.999914,0.999893
6,703342166861459456,0.999998,I 💙 Twitter fights,i 💙 twitter fights,i 💙 twitter fights,"[i, twitter, fights]","[i, twitter, fights]",3,3,0.99967,0.99988,0.999754,0.99988,0.999877,0.999869,0.999862,0.999768
7,778303015333797889,1.0,@KumarsSalehi strawberries blueberries lingonb...,@kumarssalehi strawberries blueberries lingonb...,<user> strawberries blueberries lingonberries ...,"[strawberries, blueberries]","[<user>, strawberries, blueberries]",2,3,0.999968,0.999859,0.999845,0.999859,0.999857,0.999816,0.999838,0.999684
8,284144890466680833,0.999993,Twitter jail time!!!,twitter jail time!!!,twitter jail time ! <repeated>,"[twitter, jail]","[twitter, jail, time, !]",2,4,0.999908,0.999711,0.999552,0.999711,0.999707,0.999724,0.999712,0.999557
9,284588819783761921,1.0,I'm restless 😩😩,i'm restless 😩😩,i am restless 😩 😩,[restless],"[i, am, restless]",1,3,0.999954,0.999709,0.999617,0.999709,0.999704,0.999684,0.999672,0.999721


#### Minimum threshold of picked up words = 4

In [173]:
# Random set re-ranked by the clf_ekphrasis_4 score (highest first), with a fresh
# 0..n-1 index so the row label is the rank.
df_random_4 = (
    df_random
    .sort_values(by='y_predict_proba_ekphrasis_4', ascending=False)
    .reset_index(drop=True)
)
df_random_4.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,394584354086600705,0.999989,Emotionally drained 😩,emotionally drained 😩,emotionally drained 😩,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,531330010204733440,0.999988,Emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,634556365634150400,0.999993,emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,429894177703591936,1.0,restless,restless,restless,[restless],[restless],1,1,0.999954,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,530639166330376194,0.999989,😂😂😂😂😂 dormiree,😂😂😂😂😂 dormiree,😂 😂 😂 😂 😂 dormiree,[dormiree],[dormiree],1,1,0.999965,1.0,1.0,1.0,1.0,1.0,1.0,0.999998


In [174]:
# Same ranking, keeping only tweets with at least 4 picked-up words (ekphrasis text);
# show the first 10.
(
    df_random_4
    .loc[lambda ranked: ranked['len_list_words_picked_up_ekphrasis'] >= 4]
    .reset_index(drop=True)
    .head(10)
)

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,404296747343966208,0.999995,I'm emotionally drained,i'm emotionally drained,i am emotionally drained,"[emotionally, drained]","[i, am, emotionally, drained]",2,4,0.999997,0.99999,0.99999,0.99999,0.99999,0.999988,0.999987,0.999984
1,492391379390439424,0.999997,"Physically, mentally, emotionally tired.","physically, mentally, emotionally tired.","physically , mentally , emotionally tired .",[emotionally],"[physically, ,, mentally, ,, emotionally, tire...",1,7,1.0,0.999976,0.999968,0.999976,0.999975,0.999971,0.99997,0.999918
2,629510894771437569,0.999996,I'm emotionally drained.,i'm emotionally drained.,i am emotionally drained .,[emotionally],"[i, am, emotionally, drained, .]",1,5,1.0,0.999932,0.999922,0.999932,0.999931,0.999921,0.999914,0.999893
3,284144890466680833,0.999993,Twitter jail time!!!,twitter jail time!!!,twitter jail time ! <repeated>,"[twitter, jail]","[twitter, jail, time, !]",2,4,0.999908,0.999711,0.999552,0.999711,0.999707,0.999724,0.999712,0.999557
4,359054874572365824,0.999992,Kidney beans .. Lol!,kidney beans .. lol!,kidney beans . <repeated> lol !,"[kidney, beans]","[kidney, beans, ., lol, !]",2,5,0.998325,0.999096,0.99861,0.999096,0.999084,0.999114,0.998996,0.999127
5,440352028934832129,0.999998,Good night. 👋✌,good night. 👋✌,good night . 👋 ✌,"[good, night.]","[good, night, ., ✌]",2,4,0.999511,0.999122,0.99867,0.999122,0.999111,0.999002,0.998759,0.99787
6,800037550840651776,0.999999,"Dear Tom Herman,\nPlease \nPlease\nPlease\nPle...","dear tom herman,\nplease \nplease\nplease\nple...","dear tom herman , please please please please ...","[dear, tom, please, please, please, please, pl...","[dear, tom, herman, ,, please, please, please,...",13,16,0.992586,0.999137,0.998979,0.999137,0.999134,0.998988,0.998974,0.998553
7,288466000691736576,0.999998,"RT @Drakee_YMCMB: Physically, mentally, emotio...","rt @drakee_ymcmb: physically, mentally, emotio...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677
8,377037789654573056,0.999998,"RT @diaryforteens: Physically, mentally, emoti...","rt @diaryforteens: physically, mentally, emoti...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677
9,410273680988864513,0.999998,"RT @ComedyTruth: Physically, mentally, emotion...","rt @comedytruth: physically, mentally, emotion...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677


#### Minimum threshold of picked up words = 5

In [176]:
# Random set re-ranked by the clf_ekphrasis_5 score (highest first), with a fresh
# 0..n-1 index so the row label is the rank.
df_random_5 = (
    df_random
    .sort_values(by='y_predict_proba_ekphrasis_5', ascending=False)
    .reset_index(drop=True)
)
df_random_5.head()

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,394584354086600705,0.999989,Emotionally drained 😩,emotionally drained 😩,emotionally drained 😩,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,531330010204733440,0.999988,Emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,634556365634150400,0.999993,emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,429894177703591936,1.0,restless,restless,restless,[restless],[restless],1,1,0.999954,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,530639166330376194,0.999989,😂😂😂😂😂 dormiree,😂😂😂😂😂 dormiree,😂 😂 😂 😂 😂 dormiree,[dormiree],[dormiree],1,1,0.999965,1.0,1.0,1.0,1.0,1.0,1.0,0.999998


In [177]:
# Same ranking, keeping only tweets with at least 5 picked-up words (ekphrasis text);
# show the first 10.
(
    df_random_5
    .loc[lambda ranked: ranked['len_list_words_picked_up_ekphrasis'] >= 5]
    .reset_index(drop=True)
    .head(10)
)

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,492391379390439424,0.999997,"Physically, mentally, emotionally tired.","physically, mentally, emotionally tired.","physically , mentally , emotionally tired .",[emotionally],"[physically, ,, mentally, ,, emotionally, tire...",1,7,1.0,0.999976,0.999968,0.999976,0.999975,0.999971,0.99997,0.999918
1,629510894771437569,0.999996,I'm emotionally drained.,i'm emotionally drained.,i am emotionally drained .,[emotionally],"[i, am, emotionally, drained, .]",1,5,1.0,0.999932,0.999922,0.999932,0.999931,0.999921,0.999914,0.999893
2,359054874572365824,0.999992,Kidney beans .. Lol!,kidney beans .. lol!,kidney beans . <repeated> lol !,"[kidney, beans]","[kidney, beans, ., lol, !]",2,5,0.998325,0.999096,0.99861,0.999096,0.999084,0.999114,0.998996,0.999127
3,800037550840651776,0.999999,"Dear Tom Herman,\nPlease \nPlease\nPlease\nPle...","dear tom herman,\nplease \nplease\nplease\nple...","dear tom herman , please please please please ...","[dear, tom, please, please, please, please, pl...","[dear, tom, herman, ,, please, please, please,...",13,16,0.992586,0.999137,0.998979,0.999137,0.999134,0.998988,0.998974,0.998553
4,380955454026694656,0.999998,"RT @Drakee_YMCMB: Physically, mentally, emotio...","rt @drakee_ymcmb: physically, mentally, emotio...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677
5,709921217977966593,0.999996,"RT @omfgcomplex: Physically, mentally, emotion...","rt @omfgcomplex: physically, mentally, emotion...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677
6,437115125082767360,0.999998,"RT @FUCKtheBULLSHlT: Physically, mentally, emo...","rt @fuckthebullshlt: physically, mentally, emo...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677
7,681292048108945408,0.999996,"RT @beyondxdarkness: physically, mentally, emo...","rt @beyondxdarkness: physically, mentally, emo...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677
8,404119709974417408,0.999998,"RT @SincerelyTumblr: Physically, mentally, emo...","rt @sincerelytumblr: physically, mentally, emo...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677
9,288466000691736576,0.999998,"RT @Drakee_YMCMB: Physically, mentally, emotio...","rt @drakee_ymcmb: physically, mentally, emotio...","rt <user> : physically , mentally , emotionall...","[rt, emotionally]","[rt, <user>, :, physically, ,, mentally, ,, em...",2,10,0.99902,0.998874,0.998546,0.998874,0.998855,0.998768,0.998728,0.997677


In [178]:
# Preview the first five rows of df_random.
df_random.head(n=5)

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,394584354086600705,0.999989,Emotionally drained 😩,emotionally drained 😩,emotionally drained 😩,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,634556365634150400,0.999993,emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,531330010204733440,0.999988,Emotionally drained,emotionally drained,emotionally drained,"[emotionally, drained]","[emotionally, drained]",2,2,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,429894177703591936,1.0,restless,restless,restless,[restless],[restless],1,1,0.999954,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,530639166330376194,0.999989,😂😂😂😂😂 dormiree,😂😂😂😂😂 dormiree,😂 😂 😂 😂 😂 dormiree,[dormiree],[dormiree],1,1,0.999965,1.0,1.0,1.0,1.0,1.0,1.0,0.999998


In [183]:
# Preview the first five rows of the training frame (all rows, no filter).
df_train.head()

Unnamed: 0,id,text,class,ProcessedText,ProcessedText_length,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis
0,2772,I was late again got work today. They gonna fi...,0,i was late again got work today. they gonna fi...,16,i was late again got work today . they gonna f...,"[i, was, late, again, got, work, today., they,...","[i, was, late, again, got, work, today, ., the...",16,17
1,1205,Got laid off today :/#work,1,got laid off today :/#work,5,got laid off today <annoyed> <hashtag> work </...,"[got, laid, off, today]","[got, laid, off, today, <hashtag>, work]",4,6
2,4512,Microsoft posted a job you might be interested...,0,microsoft posted a job you might be interested...,22,microsoft posted a job you might be interested...,"[microsoft, posted, a, job, you, might, be, in...","[microsoft, posted, a, job, you, might, be, in...",22,25
3,7249,Now the owners wife is having a breakdown and ...,0,now the owners wife is having a breakdown and ...,16,now the owners wife is having a breakdown and ...,"[now, the, owners, wife, is, having, a, breakd...","[now, the, owners, wife, is, having, a, breakd...",15,17
4,9453,I am happy today. Good. Time to get some food ...,1,i am happy today. good. time to get some food ...,27,i am happy today . good . time to get some foo...,"[i, am, happy, today., good., time, to, get, s...","[i, am, happy, today, ., good, ., time, to, ge...",25,30


In [277]:
# Preview the first five rows of df_filtered.
df_filtered.head(n=5)

Unnamed: 0,tweet_id,glove_pos_model,text,ProcessedText,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis,y_predict_proba_lowercased,y_predict_proba_ekphrasis,y_predict_proba_ekphrasis_new,y_predict_proba_ekphrasis_2,y_predict_proba_ekphrasis_3,y_predict_proba_ekphrasis_4,y_predict_proba_ekphrasis_5,y_predict_proba_ekphrasis_10
0,330275754157948928,1.0,How To Avoid Work-at-Home Scams http://t.co/mm...,how to avoid work-at-home scams http://t.co/mm...,how to avoid work - at - home scams <url> <has...,"[how, to, avoid, work-at-home, scams, #moms]","[how, to, avoid, work, -, at, -, home, scams, ...",6,12,0.734351,0.020623,0.012285,0.020623,0.020615,0.020726,0.020776,0.012291
1,326440717394255873,1.0,RT @FastCompany Instead Of Taking Your Daughte...,rt @fastcompany instead of taking your daughte...,rt <user> instead of taking your daughters to ...,"[rt, instead, of, taking, your, daughters, to,...","[rt, <user>, instead, of, taking, your, daught...",12,17,0.355899,0.12764,0.103868,0.12764,0.12733,0.122493,0.126251,0.089601
2,328029087752929280,1.0,#mothersdaygift #itworks #Tighten #Tone #FullT...,#mothersdaygift #itworks #tighten #tone #fullt...,<hashtag> mothers day gift </hashtag> <hashtag...,[#moms],"[<hashtag>, mothers, day, gift, <hashtag>, it,...",1,29,1.0,0.53964,0.567707,0.53964,0.53725,0.530023,0.527263,0.495505
3,345961088258547712,1.0,Find Other Moms In Your Area http://t.co/PSkL3...,find other moms in your area http://t.co/pskl3...,find other moms in your area <url> to meet up ...,"[find, other, moms, in, your, area, to, meet, ...","[find, other, moms, in, your, area, <url>, to,...",11,30,0.621035,0.161189,0.163784,0.161189,0.160059,0.152525,0.151944,0.124003
4,364496673935851521,1.0,EntrepreneurMoms Pilot Turned Linen-Maker Jenn...,entrepreneurmoms pilot turned linen-maker jenn...,entrepreneurmoms pilot turned linen - maker je...,"[pilot, turned, jenny, main, photo, #moms]","[pilot, turned, linen, -, maker, jenny, davids...",6,21,0.907398,0.254193,0.292495,0.254193,0.254112,0.259702,0.24692,0.203777


In [278]:
# Print every df_filtered tweet whose ekphrasis token list contains the keyword,
# followed by the value stored in each y_predict_proba_* column for that tweet.
keyword = 'unemployed'

# (printed label, column name) pairs, in the order they are displayed.
proba_columns = [
    ('Probability lowercased:', 'y_predict_proba_lowercased'),
    ('Probability ekphrasis:', 'y_predict_proba_ekphrasis'),
    ('Probability ekphrasis with number of picked up words as extra feature:', 'y_predict_proba_ekphrasis_new'),
    ('Probability ekphrasis, threshold = 2', 'y_predict_proba_ekphrasis_2'),
    ('Probability ekphrasis, threshold = 3', 'y_predict_proba_ekphrasis_3'),
    ('Probability ekphrasis, threshold = 4', 'y_predict_proba_ekphrasis_4'),
    ('Probability ekphrasis, threshold = 5', 'y_predict_proba_ekphrasis_5'),
    ('Probability ekphrasis, threshold = 10', 'y_predict_proba_ekphrasis_10'),
]

# Row positions (not index labels) of the matching tweets, so the selection
# keeps working when the frame's index is not exactly 0..n-1.
hit_positions = [
    position
    for position, tokens in enumerate(df_filtered['word_picked_up_by_glove_ekphrasis'])
    if keyword in tokens
]

for _, row in df_filtered.iloc[hit_positions].iterrows():
    print('Tweet with {} keyword: '.format(keyword), row['text'])
    for label, column in proba_columns:
        print(label, row[column])

Tweet with unemployed keyword:  Nothing. Because, unemployed https://t.co/b7ddYyefHF
Probability lowercased: 0.9999819119809213
Probability ekphrasis: 0.9918468360119767
Probability ekphrasis with number of picked up words as extra feature: 0.9908613984505192
Probability ekphrasis, threshold = 2 0.9918468360119767
Probability ekphrasis, threshold = 3 0.9917395496914316
Probability ekphrasis, threshold = 4 0.9918203464768416
Probability ekphrasis, threshold = 5 0.9911425106519366
Probability ekphrasis, threshold = 10 0.9850425910083895
Tweet with unemployed keyword:  That's unemployed
Probability lowercased: 0.9999819119809213
Probability ekphrasis: 0.999833595446299
Probability ekphrasis with number of picked up words as extra feature: 0.9996786525352986
Probability ekphrasis, threshold = 2 0.999833595446299
Probability ekphrasis, threshold = 3 0.9998298442872565
Probability ekphrasis, threshold = 4 0.9997989606384389
Probability ekphrasis, threshold = 5 0.9997918356734267
Probability 

Tweet with unemployed keyword:  @PrisonPlanet 😂😂😂 of course those kids protesting get in trouble when their leader is a 70yr old unemployed dude who thinks is cool. Go to bed Moore .
Probability lowercased: 0.47182158724205187
Probability ekphrasis: 0.6898920239083361
Probability ekphrasis with number of picked up words as extra feature: 0.7330892545472667
Probability ekphrasis, threshold = 2 0.6898920239083361
Probability ekphrasis, threshold = 3 0.6885442657739844
Probability ekphrasis, threshold = 4 0.6818619468546897
Probability ekphrasis, threshold = 5 0.6743576402901466
Probability ekphrasis, threshold = 10 0.6774673217782274
Tweet with unemployed keyword:  RT @surlyrevenant: @BillPeriman   America wasn't created BY a multi-cultural cast of gay &amp; lesbian, unemployed atheists.  Nor was it create…
Probability lowercased: 0.5312851804379389
Probability ekphrasis: 0.6909620459254037
Probability ekphrasis with number of picked up words as extra feature: 0.711296070189942
Probabili

Tweet with unemployed keyword:  #jobs #unemployed Assistant Vice President for Technology Systems at College of Staten Island (NY): As a key m...  http://t.co/a5LnvVDQY9
Probability lowercased: 0.21069169869022594
Probability ekphrasis: 0.5751790579760193
Probability ekphrasis with number of picked up words as extra feature: 0.6109082523762793
Probability ekphrasis, threshold = 2 0.5751790579760193
Probability ekphrasis, threshold = 3 0.5736361328338561
Probability ekphrasis, threshold = 4 0.5792690140523298
Probability ekphrasis, threshold = 5 0.5842208028321593
Probability ekphrasis, threshold = 10 0.5121979786728813


In [279]:
# Print every df_random tweet whose ekphrasis token list contains the keyword,
# followed by the value stored in each y_predict_proba_* column for that tweet.
keyword = 'unemployed'

# (printed label, column name) pairs, in the order they are displayed.
proba_columns = [
    ('Probability lowercased:', 'y_predict_proba_lowercased'),
    ('Probability ekphrasis:', 'y_predict_proba_ekphrasis'),
    ('Probability ekphrasis with number of picked up words as extra feature:', 'y_predict_proba_ekphrasis_new'),
    ('Probability ekphrasis, threshold = 2', 'y_predict_proba_ekphrasis_2'),
    ('Probability ekphrasis, threshold = 3', 'y_predict_proba_ekphrasis_3'),
    ('Probability ekphrasis, threshold = 4', 'y_predict_proba_ekphrasis_4'),
    ('Probability ekphrasis, threshold = 5', 'y_predict_proba_ekphrasis_5'),
    ('Probability ekphrasis, threshold = 10', 'y_predict_proba_ekphrasis_10'),
]

# Row positions (not index labels) of the matching tweets, so the selection
# keeps working when the frame's index is not exactly 0..n-1.
hit_positions = [
    position
    for position, tokens in enumerate(df_random['word_picked_up_by_glove_ekphrasis'])
    if keyword in tokens
]

for _, row in df_random.iloc[hit_positions].iterrows():
    print('Tweet with {} keyword: '.format(keyword), row['text'])
    for label, column in proba_columns:
        print(label, row[column])

Tweet with unemployed keyword:  Are you unemployed? http://t.co/ikFqUmf2JS
Probability lowercased: 0.35074165357748643
Probability ekphrasis: 0.9149607601697434
Probability ekphrasis with number of picked up words as extra feature: 0.9188663934074717
Probability ekphrasis, threshold = 2 0.9149607601697434
Probability ekphrasis, threshold = 3 0.914406780399996
Probability ekphrasis, threshold = 4 0.905412557125823
Probability ekphrasis, threshold = 5 0.9020243712700314
Probability ekphrasis, threshold = 10 0.8128594682709777


In [280]:
# Preview the first five rows of df_train.
df_train.head(n=5)

Unnamed: 0,id,text,class,ProcessedText,ProcessedText_length,ekphrasis_text,word_picked_up_by_glove_lowercased,word_picked_up_by_glove_ekphrasis,len_list_words_picked_up_lowercased,len_list_words_picked_up_ekphrasis
0,2772,I was late again got work today. They gonna fi...,0,i was late again got work today. they gonna fi...,16,i was late again got work today . they gonna f...,"[i, was, late, again, got, work, today., they,...","[i, was, late, again, got, work, today, ., the...",16,17
1,1205,Got laid off today :/#work,1,got laid off today :/#work,5,got laid off today <annoyed> <hashtag> work </...,"[got, laid, off, today]","[got, laid, off, today, <hashtag>, work]",4,6
2,4512,Microsoft posted a job you might be interested...,0,microsoft posted a job you might be interested...,22,microsoft posted a job you might be interested...,"[microsoft, posted, a, job, you, might, be, in...","[microsoft, posted, a, job, you, might, be, in...",22,25
3,7249,Now the owners wife is having a breakdown and ...,0,now the owners wife is having a breakdown and ...,16,now the owners wife is having a breakdown and ...,"[now, the, owners, wife, is, having, a, breakd...","[now, the, owners, wife, is, having, a, breakd...",15,17
4,9453,I am happy today. Good. Time to get some food ...,1,i am happy today. good. time to get some food ...,27,i am happy today . good . time to get some foo...,"[i, am, happy, today., good., time, to, get, s...","[i, am, happy, today, ., good, ., time, to, ge...",25,30


In [None]:
# NOTE(review): unfinished scratch cell - it has no execution count, and a bare
# `df_random.loc` only evaluates to the indexer object without selecting any rows.
# Complete the intended row selection or delete this cell.
df_random.loc

In [281]:
# Print every df_random tweet whose ekphrasis token list contains the keyword,
# followed by the value stored in each y_predict_proba_* column for that tweet.
# The header line names the keyword that is actually searched for.
keyword = 'fired'

# (printed label, column name) pairs, in the order they are displayed.
proba_columns = [
    ('Probability lowercased:', 'y_predict_proba_lowercased'),
    ('Probability ekphrasis:', 'y_predict_proba_ekphrasis'),
    ('Probability ekphrasis with number of picked up words as extra feature:', 'y_predict_proba_ekphrasis_new'),
    ('Probability ekphrasis, threshold = 2', 'y_predict_proba_ekphrasis_2'),
    ('Probability ekphrasis, threshold = 3', 'y_predict_proba_ekphrasis_3'),
    ('Probability ekphrasis, threshold = 4', 'y_predict_proba_ekphrasis_4'),
    ('Probability ekphrasis, threshold = 5', 'y_predict_proba_ekphrasis_5'),
    ('Probability ekphrasis, threshold = 10', 'y_predict_proba_ekphrasis_10'),
]

# Row positions (not index labels) of the matching tweets, so the selection
# keeps working when the frame's index is not exactly 0..n-1.
hit_positions = [
    position
    for position, tokens in enumerate(df_random['word_picked_up_by_glove_ekphrasis'])
    if keyword in tokens
]

for _, row in df_random.iloc[hit_positions].iterrows():
    print('Tweet with {} keyword: '.format(keyword), row['text'])
    for label, column in proba_columns:
        print(label, row[column])

Tweet with need and job keywords:  I fired steven as my friend
Probability lowercased: 0.6966021543911461
Probability ekphrasis: 0.9479174550702677
Probability ekphrasis with number of picked up words as extra feature: 0.9226919133910368
Probability ekphrasis, threshold = 2 0.9479174550702677
Probability ekphrasis, threshold = 3 0.9472218587516258
Probability ekphrasis, threshold = 4 0.9452371842072906
Probability ekphrasis, threshold = 5 0.9413266398305504
Probability ekphrasis, threshold = 10 0.8931813280806957
Tweet with need and job keywords:  #GoodbyeBrooklyn. RT @WojYahooNBA The Brooklyn Nets have fired coach Avery Johnson, league source tells Yahoo! Sports.
Probability lowercased: 0.7099214798537863
Probability ekphrasis: 0.9061101606609383
Probability ekphrasis with number of picked up words as extra feature: 0.9227126038168239
Probability ekphrasis, threshold = 2 0.9061101606609383
Probability ekphrasis, threshold = 3 0.9054479863528433
Probability ekphrasis, threshold = 4 0.9

In [283]:
# Print every df_random tweet whose ekphrasis token list contains the keyword,
# followed by the value stored in each y_predict_proba_* column for that tweet.
# The header line names the keyword that is actually searched for.
keyword = 'interview'

# (printed label, column name) pairs, in the order they are displayed.
proba_columns = [
    ('Probability lowercased:', 'y_predict_proba_lowercased'),
    ('Probability ekphrasis:', 'y_predict_proba_ekphrasis'),
    ('Probability ekphrasis with number of picked up words as extra feature:', 'y_predict_proba_ekphrasis_new'),
    ('Probability ekphrasis, threshold = 2', 'y_predict_proba_ekphrasis_2'),
    ('Probability ekphrasis, threshold = 3', 'y_predict_proba_ekphrasis_3'),
    ('Probability ekphrasis, threshold = 4', 'y_predict_proba_ekphrasis_4'),
    ('Probability ekphrasis, threshold = 5', 'y_predict_proba_ekphrasis_5'),
    ('Probability ekphrasis, threshold = 10', 'y_predict_proba_ekphrasis_10'),
]

# Row positions (not index labels) of the matching tweets, so the selection
# keeps working when the frame's index is not exactly 0..n-1.
hit_positions = [
    position
    for position, tokens in enumerate(df_random['word_picked_up_by_glove_ekphrasis'])
    if keyword in tokens
]

for _, row in df_random.iloc[hit_positions].iterrows():
    print('Tweet with {} keyword: '.format(keyword), row['text'])
    for label, column in proba_columns:
        print(label, row[column])

Tweet with need and job keywords:  Got goose bumps....#interview
Probability lowercased: 0.9808196814466333
Probability ekphrasis: 0.7315295492849467
Probability ekphrasis with number of picked up words as extra feature: 0.5339955014866204
Probability ekphrasis, threshold = 2 0.7315295492849467
Probability ekphrasis, threshold = 3 0.7300510228655088
Probability ekphrasis, threshold = 4 0.7301396611947966
Probability ekphrasis, threshold = 5 0.7177609900170498
Probability ekphrasis, threshold = 10 0.7702355704929579
Tweet with need and job keywords:  RT @noahpeep: No jumper interview ASAP @tanamongeau @adam22 @nojumper https://t.co/6MNOKYUkPj
Probability lowercased: 0.8337343436332543
Probability ekphrasis: 0.3092175659367394
Probability ekphrasis with number of picked up words as extra feature: 0.26741465128396563
Probability ekphrasis, threshold = 2 0.3092175659367394
Probability ekphrasis, threshold = 3 0.3093206336654421
Probability ekphrasis, threshold = 4 0.3107942023680224
Probab

Tweet with need and job keywords:  Martha Stewart Talks Sex, Tacos, &amp; Truffle Oil http://t.co/Mm4XuNEMCO #gwynethpaltrow #interview
Probability lowercased: 0.030773144276184976
Probability ekphrasis: 0.5776326954433308
Probability ekphrasis with number of picked up words as extra feature: 0.4696620914943703
Probability ekphrasis, threshold = 2 0.5776326954433308
Probability ekphrasis, threshold = 3 0.5755398357695478
Probability ekphrasis, threshold = 4 0.5930630880164182
Probability ekphrasis, threshold = 5 0.5992962058421395
Probability ekphrasis, threshold = 10 0.47844850225546753
Tweet with need and job keywords:  via @filmcourage Story Mastery &amp; The Director's Journey - Full Interview with Michael Hauge &amp; M... http://t.co/iw7yKZEjy9 #insider #tips
Probability lowercased: 0.22648015521255327
Probability ekphrasis: 0.5735712316196844
Probability ekphrasis with number of picked up words as extra feature: 0.5487850860802361
Probability ekphrasis, threshold = 2 0.5735712316