In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from ast import literal_eval
from sklearn.metrics import classification_report
import gensim.downloader as api
from sklearn.preprocessing import scale

## Utils

In [2]:
def get_w2v_general(tweet, size, vectors, aggregation='mean'):
    """Aggregate the word vectors of a whitespace-tokenized text into one row.

    Parameters
    ----------
    tweet : str
        Text to embed; it is split on whitespace. A non-string value (for
        example the NaN pandas produces for an empty CSV cell) is treated as
        a text without tokens instead of raising ``AttributeError``.
    size : int
        Number of components of each vector returned by ``vectors[word]``.
    vectors : mapping-like
        Object supporting ``vectors[word]``; the lookup must return an array
        with ``size`` elements and raise ``KeyError`` for unknown words.
    aggregation : {'mean', 'sum'}, default 'mean'
        ``'sum'`` returns the sum of the vectors of all known words;
        ``'mean'`` divides that sum by the number of known words.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``. It is all zeros when no word of the
        text is found in ``vectors``.

    Raises
    ------
    ValueError
        If ``aggregation`` is neither ``'mean'`` nor ``'sum'``.
    """
    # Fail loudly on a typo instead of silently returning None.
    if aggregation not in ('mean', 'sum'):
        raise ValueError(
            "aggregation must be 'mean' or 'sum', got {!r}".format(aggregation)
        )

    tokens = tweet.split() if isinstance(tweet, str) else []

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            word_vec = vectors[word]
        except KeyError:
            # Out-of-vocabulary words contribute nothing to the aggregate.
            continue
        vec += word_vec.reshape((1, size))
        count += 1

    # Leave the zero vector untouched when no word was found (avoids 0/0).
    if aggregation == 'mean' and count:
        vec /= count
    return vec

In [3]:
def perf_measure(y_actual, y_hat):
    """Count confusion-matrix cells for binary labels encoded as 0 and 1.

    The two inputs are paired by position, so pandas Series with a
    non-default index are handled the same way as lists or arrays.

    Parameters
    ----------
    y_actual : sized iterable
        Ground-truth labels.
    y_hat : sized iterable
        Predicted labels, same length as ``y_actual``.

    Returns
    -------
    tuple of int
        ``(TP, FP, TN, FN)``. A prediction of 1 counts as TP when the actual
        label equals it and as FP otherwise; a prediction of 0 counts as TN
        when the actual label equals it and as FN otherwise. Predictions
        that are neither 0 nor 1 are not counted.

    Raises
    ------
    ValueError
        If the two inputs do not have the same length.
    """
    if len(y_actual) != len(y_hat):
        raise ValueError(
            "y_actual and y_hat must have the same length, got {} and {}".format(
                len(y_actual), len(y_hat)
            )
        )

    TP = 0
    FP = 0
    TN = 0
    FN = 0

    # zip() walks the values in order; indexing a Series with [i] would be
    # label-based and break on a shuffled or filtered index.
    for actual, predicted in zip(y_actual, y_hat):
        if predicted == 1:
            if actual == 1:
                TP += 1
            else:
                FP += 1
        elif predicted == 0:
            if actual == 0:
                TN += 1
            else:
                FN += 1

    return (TP, FP, TN, FN)

## Logit + GloVe Twitter 200d on May5_7Klabels 

In [4]:
# Load the "glove-twitter-200" vectors through gensim.downloader; the cells
# below look words up in this object and pass size=200 to get_w2v_general.
glove_twitter = api.load("glove-twitter-200")

### Without preprocessing

In [5]:
# Directory with the per-label train/validation CSV files (may5_7Klabels data).
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros/preprocessed_glove"
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))

    train_file_name = "preprocessed_train_{}.csv".format(label)
    val_file_name = "preprocessed_val_{}.csv".format(label)
    # load the train / validation splits from the local CSV files
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    # create embeddings: one mean-pooled GloVe vector per row of the raw "text" column
    # NOTE(review): scale() is called on each split separately, so the validation
    # features are standardized with their own mean/std, not the training ones.
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["text"]]))
    # train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    # evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    # Print NaN instead of raising ZeroDivisionError (which would abort the
    # remaining labels) when there are no predicted or no actual positives.
    precision = TP/(TP+FP) if (TP+FP) else float("nan")
    recall = TP/(TP+FN) if (TP+FN) else float("nan")
    print("Precision: ", precision)
    print("Recall: ", recall)

************ is_hired_1mo ************
Precision:  0.7678571428571429
Recall:  0.7543859649122807
************ is_unemployed ************
Precision:  0.8113207547169812
Recall:  0.8301158301158301
************ job_offer ************
Precision:  0.8918918918918919
Recall:  0.924
************ job_search ************
Precision:  0.7662337662337663
Recall:  0.7375
************ lost_job_1mo ************
Precision:  0.8181818181818182
Recall:  0.75


### With preprocessing

In [6]:
# `path` is the data directory assigned in an earlier cell of this notebook.
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))

    train_file_name = "preprocessed_train_{}.csv".format(label)
    val_file_name = "preprocessed_val_{}.csv".format(label)
    # load the train / validation splits from the local CSV files
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    # create embeddings: one mean-pooled GloVe vector per row of the "ProcessedText" column
    # NOTE(review): scale() is called on each split separately, so the validation
    # features are standardized with their own mean/std, not the training ones.
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["ProcessedText"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["ProcessedText"]]))
    # train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    # evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    # Print NaN instead of raising ZeroDivisionError (which would abort the
    # remaining labels) when there are no predicted or no actual positives.
    precision = TP/(TP+FP) if (TP+FP) else float("nan")
    recall = TP/(TP+FN) if (TP+FN) else float("nan")
    print("Precision: ", precision)
    print("Recall: ", recall)

************ is_hired_1mo ************
Precision:  0.8446601941747572
Recall:  0.7631578947368421
************ is_unemployed ************
Precision:  0.8051470588235294
Recall:  0.8455598455598455
************ job_offer ************
Precision:  0.91796875
Recall:  0.94
************ job_search ************
Precision:  0.7975460122699386
Recall:  0.8125
************ lost_job_1mo ************
Precision:  0.7857142857142857
Recall:  0.7638888888888888


## Logit + GloVe Twitter 200d on May11_9Klabels 

### Without preprocessing

In [8]:
# Directory with the per-label train/validation CSV files (may11_9Klabels data).
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may11_9Klabels/data_binary_pos_neg_balanced_removed_allzeros_BAD/preprocessed_glove"
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))

    train_file_name = "preprocessed_train_{}.csv".format(label)
    val_file_name = "preprocessed_val_{}.csv".format(label)
    # load the train / validation splits from the local CSV files
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    # create embeddings: one mean-pooled GloVe vector per row of the raw "text" column
    # NOTE(review): scale() is called on each split separately, so the validation
    # features are standardized with their own mean/std, not the training ones.
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["text"]]))
    # train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    # evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    # Print NaN instead of raising ZeroDivisionError (which would abort the
    # remaining labels) when there are no predicted or no actual positives.
    precision = TP/(TP+FP) if (TP+FP) else float("nan")
    recall = TP/(TP+FN) if (TP+FN) else float("nan")
    print("Precision: ", precision)
    print("Recall: ", recall)

************ is_hired_1mo ************
Precision:  0.7961783439490446
Recall:  0.8064516129032258
************ is_unemployed ************
Precision:  0.8412698412698413
Recall:  0.8571428571428571
************ job_offer ************
Precision:  0.9051724137931034
Recall:  0.8823529411764706
************ job_search ************
Precision:  0.7216981132075472
Recall:  0.68
************ lost_job_1mo ************
Precision:  0.8333333333333334
Recall:  0.7734806629834254


### With preprocessing

In [9]:
# `path` is the data directory assigned in an earlier cell of this notebook.
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))

    train_file_name = "preprocessed_train_{}.csv".format(label)
    val_file_name = "preprocessed_val_{}.csv".format(label)
    # load the train / validation splits from the local CSV files
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    # create embeddings: one mean-pooled GloVe vector per row of the "ProcessedText" column
    # NOTE(review): scale() is called on each split separately, so the validation
    # features are standardized with their own mean/std, not the training ones.
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["ProcessedText"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["ProcessedText"]]))
    # train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    # evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    # Print NaN instead of raising ZeroDivisionError (which would abort the
    # remaining labels) when there are no predicted or no actual positives.
    precision = TP/(TP+FP) if (TP+FP) else float("nan")
    recall = TP/(TP+FN) if (TP+FN) else float("nan")
    print("Precision: ", precision)
    print("Recall: ", recall)

************ is_hired_1mo ************
Precision:  0.8732394366197183
Recall:  0.8
************ is_unemployed ************
Precision:  0.8198433420365535
Recall:  0.8463611859838275
************ job_offer ************
Precision:  0.9346590909090909
Recall:  0.9215686274509803
************ job_search ************
Precision:  0.7405660377358491
Recall:  0.6977777777777778
************ lost_job_1mo ************
Precision:  0.8268156424581006
Recall:  0.8176795580110497


## Logit + GloVe Wiki 300d on May5_7Klabels

In [10]:
# Load the "glove-wiki-gigaword-300" vectors through gensim.downloader; the
# cells below look words up in this object and pass size=300 to get_w2v_general.
glove_wiki = api.load("glove-wiki-gigaword-300")

### Without preprocessing

In [11]:
# Directory with the per-label train/validation CSV files (may5_7Klabels data).
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros/preprocessed_glove"
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))

    train_file_name = "preprocessed_train_{}.csv".format(label)
    val_file_name = "preprocessed_val_{}.csv".format(label)
    # load the train / validation splits from the local CSV files
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    # create embeddings: one mean-pooled GloVe vector per row of the raw "text" column
    # NOTE(review): scale() is called on each split separately, so the validation
    # features are standardized with their own mean/std, not the training ones.
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_val["text"]]))
    # train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    # evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    # Print NaN instead of raising ZeroDivisionError (which would abort the
    # remaining labels) when there are no predicted or no actual positives.
    precision = TP/(TP+FP) if (TP+FP) else float("nan")
    recall = TP/(TP+FN) if (TP+FN) else float("nan")
    print("Precision: ", precision)
    print("Recall: ", recall)

************ is_hired_1mo ************
Precision:  0.7407407407407407
Recall:  0.7017543859649122
************ is_unemployed ************
Precision:  0.8157894736842105
Recall:  0.8378378378378378
************ job_offer ************
Precision:  0.8928571428571429
Recall:  0.9
************ job_search ************
Precision:  0.8013698630136986
Recall:  0.73125
************ lost_job_1mo ************
Precision:  0.84
Recall:  0.7291666666666666


### With preprocessing

In [12]:
# `path` is the data directory assigned in an earlier cell of this notebook.
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))

    train_file_name = "preprocessed_train_{}.csv".format(label)
    val_file_name = "preprocessed_val_{}.csv".format(label)
    # load the train / validation splits from the local CSV files
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    # create embeddings: one mean-pooled GloVe vector per row of the "ProcessedText" column
    # NOTE(review): scale() is called on each split separately, so the validation
    # features are standardized with their own mean/std, not the training ones.
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_train["ProcessedText"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_val["ProcessedText"]]))
    # train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    # evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    # Print NaN instead of raising ZeroDivisionError (which would abort the
    # remaining labels) when there are no predicted or no actual positives.
    precision = TP/(TP+FP) if (TP+FP) else float("nan")
    recall = TP/(TP+FN) if (TP+FN) else float("nan")
    print("Precision: ", precision)
    print("Recall: ", recall)

************ is_hired_1mo ************
Precision:  0.8207547169811321
Recall:  0.7631578947368421
************ is_unemployed ************
Precision:  0.8120300751879699
Recall:  0.833976833976834
************ job_offer ************
Precision:  0.9322709163346613
Recall:  0.936
************ job_search ************
Precision:  0.7677419354838709
Recall:  0.74375
************ lost_job_1mo ************
Precision:  0.8048780487804879
Recall:  0.6875


## Logit + GloVe Wiki 300d on May11_9Klabels

### Without preprocessing

In [13]:
# Directory with the per-label train/validation CSV files (may11_9Klabels data).
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may11_9Klabels/data_binary_pos_neg_balanced_removed_allzeros_BAD/preprocessed_glove"
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))

    train_file_name = "preprocessed_train_{}.csv".format(label)
    val_file_name = "preprocessed_val_{}.csv".format(label)
    # load the train / validation splits from the local CSV files
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    # create embeddings: one mean-pooled GloVe vector per row of the raw "text" column
    # NOTE(review): scale() is called on each split separately, so the validation
    # features are standardized with their own mean/std, not the training ones.
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_val["text"]]))
    # train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    # evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    # Print NaN instead of raising ZeroDivisionError (which would abort the
    # remaining labels) when there are no predicted or no actual positives.
    precision = TP/(TP+FP) if (TP+FP) else float("nan")
    recall = TP/(TP+FN) if (TP+FN) else float("nan")
    print("Precision: ", precision)
    print("Recall: ", recall)

************ is_hired_1mo ************
Precision:  0.7880794701986755
Recall:  0.7677419354838709
************ is_unemployed ************
Precision:  0.8282548476454293
Recall:  0.8059299191374663
************ job_offer ************
Precision:  0.9048991354466859
Recall:  0.8795518207282913
************ job_search ************
Precision:  0.7407407407407407
Recall:  0.7111111111111111
************ lost_job_1mo ************
Precision:  0.8106508875739645
Recall:  0.7569060773480663


### With preprocessing

In [14]:
# `path` is the data directory assigned in an earlier cell of this notebook.
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))

    train_file_name = "preprocessed_train_{}.csv".format(label)
    val_file_name = "preprocessed_val_{}.csv".format(label)
    # load the train / validation splits from the local CSV files
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    # create embeddings: one mean-pooled GloVe vector per row of the "ProcessedText" column
    # NOTE(review): scale() is called on each split separately, so the validation
    # features are standardized with their own mean/std, not the training ones.
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_train["ProcessedText"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_val["ProcessedText"]]))
    # train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    # evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    # Print NaN instead of raising ZeroDivisionError (which would abort the
    # remaining labels) when there are no predicted or no actual positives.
    precision = TP/(TP+FP) if (TP+FP) else float("nan")
    recall = TP/(TP+FN) if (TP+FN) else float("nan")
    print("Precision: ", precision)
    print("Recall: ", recall)

************ is_hired_1mo ************
Precision:  0.8355263157894737
Recall:  0.8193548387096774
************ is_unemployed ************
Precision:  0.8218085106382979
Recall:  0.8328840970350404
************ job_offer ************
Precision:  0.9307479224376731
Recall:  0.9411764705882353
************ job_search ************
Precision:  0.7511961722488039
Recall:  0.6977777777777778
************ lost_job_1mo ************
Precision:  0.8034682080924855
Recall:  0.7679558011049724
