In [31]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from ast import literal_eval
from sklearn.metrics import classification_report
import gensim.downloader as api
from sklearn.preprocessing import scale
import enchant
import itertools

## Utils

In [2]:
def get_w2v_general(tweet, size, vectors, aggregation='mean'):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tweet.split():
        try:
            vec += vectors[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if aggregation == 'mean':
        if count != 0:
            vec /= count
        return vec
    elif aggregation == 'sum':
        return vec

In [35]:
def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y_actual[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
           FP += 1
        if y_actual[i]==y_hat[i]==0:
           TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
           FN += 1

    return(TP, FP, TN, FN)

In [3]:
glove_twitter = api.load("glove-twitter-200")

In [9]:
word_list = glove_twitter.vocab.keys()

In [10]:
english_dic = enchant.Dict("en_US")

In [32]:
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced"

## Word score determination

The objective is to determine which words have the biggest influence on the classification decision for each label. 

To do this, we import word embeddings from GloVe (trained on Twitter data) and train one logistic regression per label, using the embeddings as input and each label as target. For each label, we then use the coefficient vector $b$ containing  the coefficient assigned to each dimension of the word embedding $x$. For each label $y$, the score assigned to each word $w$ is defined as follows: 

$$P(y=1| w) = \frac{1}{1 + e^{-b*x_w}} $$ with $x_w$ the Twitter GloVe word embedding of word $w$

In [39]:
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    #import data
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #determine unique words in input data
    words = df_train['text'].str.lower().str.findall("\w+")
    unique_words = set()
    for x in words:
        unique_words.update(x)
    #import word embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["text"]]))
    #define and train model
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #model evaluation
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = 2*(precision*recall)/(precision+recall)
    print("************ Label {} ************".format(label))
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-1 Score:", f1)
    print("**********************************")
    #define regression coefficients
    coeff_array = clf.coef_[0]
    score_dict = {}
    for word in word_list:
        if word in unique_words: #and english_dic.check(word)
            word_vector = glove_twitter[word]
            word_score = 1/(1+np.exp(-np.sum(word_vector*coeff_array)))
            score_dict[word] = word_score
    print("Top 30 word scores for label {}:".format(label))
    for word in sorted(score_dict, key=score_dict.get, reverse=True)[:30]:
        print(word,score_dict[word])
    print("                                 ")

************ Label is_hired_1mo ************
Precision:  0.7894736842105263
Recall:  0.7741935483870968
F-1 Score: 0.7817589576547231
**********************************
Top 30 word scores for label is_hired_1mo:
ultrasound 0.9925333400677396
panerai 0.9917193725300089
resigns 0.990287840329032
tyj 0.9902841202466814
shoreline 0.9893935403492077
battisti 0.9872657569351381
montanas 0.985815433265916
unopened 0.985789008853867
riverdale 0.9824460327078864
hired 0.980605939113789
bloomfield 0.9774575055206486
divorcee 0.9761434360421456
accepted 0.9699050573807927
mortenson 0.9697020065360366
whoooop 0.9689413331541138
booked 0.9676561110821127
refurbished 0.9645882085584606
armani 0.9623800075560839
seatac 0.9622044768977256
dancers 0.9618885124298453
magnolia 0.9616251512357683
openings 0.9611380844425051
morphed 0.9605509218306135
babysit 0.9600816933843663
dental 0.9600648948965494
salesmen 0.9575998108468805
cfo 0.9565865108141255
bartend 0.9552069732815022
strippers 0.95364455729538