In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from ast import literal_eval
from sklearn.metrics import classification_report, roc_curve, auc
import gensim.downloader as api
from sklearn.preprocessing import scale

## Utils

In [2]:
def get_w2v_general(tweet, size, vectors, aggregation='mean'):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tweet.split():
        try:
            vec += vectors[word.lower()].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if aggregation == 'mean':
        if count != 0:
            vec /= count
        return vec
    elif aggregation == 'sum':
        return vec

In [3]:
def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y_actual[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
           FP += 1
        if y_actual[i]==y_hat[i]==0:
           TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
           FN += 1

    return(TP, FP, TN, FN)

## Logit + GloVe Twitter 200d on May5_7Klabels 

In [4]:
glove_twitter = api.load("glove-twitter-200")

### Without preprocessing

In [93]:
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros/preprocessed_glove"
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    #download data
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #create embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["text"]]))
    #train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    df_val["y_predict_proba"] = clf.predict_proba(validation_vecs_glove_mean)[:, 1]
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    print("Precision: ", TP/(TP+FP))
    print("Recall: ", TP/(TP+FN))
    fpr, tpr, thresholds = roc_curve(df_val["class"], df_val["y_predict_proba"])
    print("AUC: ", auc(fpr, tpr))

************ is_hired_1mo ************
Precision:  0.8446601941747572
Recall:  0.7631578947368421
AUC:  0.8838133068520357
************ is_unemployed ************
Precision:  0.8051470588235294
Recall:  0.8455598455598455
AUC:  0.902046192659911
************ job_offer ************
Precision:  0.91796875
Recall:  0.94
AUC:  0.975321033210332
************ job_search ************
Precision:  0.7975460122699386
Recall:  0.8125
AUC:  0.8866776315789474
************ lost_job_1mo ************
Precision:  0.7857142857142857
Recall:  0.7638888888888888
AUC:  0.8674045138888888


### With preprocessing

In [94]:
for label in ["is_hired_1mo","is_unemployed","job_offer","job_search","lost_job_1mo"]:
    print("************ {} ************".format(label))
    
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    #download data
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #create embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["ProcessedText"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["ProcessedText"]]))
    #train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    df_val["y_predict_proba"] = clf.predict_proba(validation_vecs_glove_mean)[:, 1]
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    print("Precision: ", TP/(TP+FP))
    print("Recall: ", TP/(TP+FN))
    fpr, tpr, thresholds = roc_curve(df_val["class"], df_val["y_predict_proba"])
    print("AUC: ", auc(fpr, tpr))

************ is_hired_1mo ************
Precision:  0.8446601941747572
Recall:  0.7631578947368421
AUC:  0.8838133068520357
************ is_unemployed ************
Precision:  0.8051470588235294
Recall:  0.8455598455598455
AUC:  0.902046192659911
************ job_offer ************
Precision:  0.91796875
Recall:  0.94
AUC:  0.975321033210332
************ job_search ************
Precision:  0.7975460122699386
Recall:  0.8125
AUC:  0.8866776315789474
************ lost_job_1mo ************
Precision:  0.7857142857142857
Recall:  0.7638888888888888
AUC:  0.8674045138888888


## Logit + GloVe Twitter 200 on May20_9Klabels

### Without preprocessing

In [95]:
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/preprocessed_glove"
for label in ["lost_job_1mo","is_unemployed","job_search","is_hired_1mo","job_offer"]:
    print("************ {} ************".format(label))
    
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    #download data
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #create embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["text"]]))
    #train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    df_val["y_predict_proba"] = clf.predict_proba(validation_vecs_glove_mean)[:, 1]
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    print("Precision: ", TP/(TP+FP))
    print("Recall: ", TP/(TP+FN))
    fpr, tpr, thresholds = roc_curve(df_val["class"], df_val["y_predict_proba"])
    print("AUC: ", auc(fpr, tpr))

************ lost_job_1mo ************
Precision:  0.7814207650273224
Recall:  0.7900552486187845
AUC:  0.8616287417673316
************ is_unemployed ************
Precision:  0.7331606217616581
Recall:  0.7628032345013477
AUC:  0.8070274992346805
************ job_search ************
Precision:  0.75
Recall:  0.7333333333333333
AUC:  0.8286887835703001
************ is_hired_1mo ************
Precision:  0.8450704225352113
Recall:  0.7741935483870968
AUC:  0.879418446160836
************ job_offer ************
Precision:  0.9080779944289693
Recall:  0.9131652661064426
AUC:  0.9619925582173169


### With preprocessing

In [96]:
for label in ["lost_job_1mo","is_unemployed","job_search","is_hired_1mo","job_offer"]:
    print("************ {} ************".format(label))
    
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    #download data
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #create embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_train["ProcessedText"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in df_val["ProcessedText"]]))
    #train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    df_val["y_predict_proba"] = clf.predict_proba(validation_vecs_glove_mean)[:, 1]
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    print("Precision: ", TP/(TP+FP))
    print("Recall: ", TP/(TP+FN))
    fpr, tpr, thresholds = roc_curve(df_val["class"], df_val["y_predict_proba"])
    print("AUC: ", auc(fpr, tpr))

************ lost_job_1mo ************
Precision:  0.7814207650273224
Recall:  0.7900552486187845
AUC:  0.8616287417673316
************ is_unemployed ************
Precision:  0.7331606217616581
Recall:  0.7628032345013477
AUC:  0.8070274992346805
************ job_search ************
Precision:  0.75
Recall:  0.7333333333333333
AUC:  0.8286887835703001
************ is_hired_1mo ************
Precision:  0.8450704225352113
Recall:  0.7741935483870968
AUC:  0.879418446160836
************ job_offer ************
Precision:  0.9080779944289693
Recall:  0.9131652661064426
AUC:  0.9619925582173169


## Logit + GloVe Wiki 300 on May5_7Klabels 

In [97]:
glove_wiki = api.load("glove-wiki-gigaword-300")

#### Without preprocessing

In [101]:
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros/preprocessed_glove"
for label in ["lost_job_1mo","is_unemployed","job_search","is_hired_1mo","job_offer"]:
    print("************ {} ************".format(label))
    
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    #download data
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #create embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_val["text"]]))
    #train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    df_val["y_predict_proba"] = clf.predict_proba(validation_vecs_glove_mean)[:, 1]
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    print("Precision: ", TP/(TP+FP))
    print("Recall: ", TP/(TP+FN))
    fpr, tpr, thresholds = roc_curve(df_val["class"], df_val["y_predict_proba"])
    print("AUC: ", auc(fpr, tpr))

************ lost_job_1mo ************
Precision:  0.8048780487804879
Recall:  0.6875
AUC:  0.8640407986111112
************ is_unemployed ************
Precision:  0.8120300751879699
Recall:  0.833976833976834
AUC:  0.9001087214083604
************ job_search ************
Precision:  0.7677419354838709
Recall:  0.74375
AUC:  0.8546052631578948
************ is_hired_1mo ************
Precision:  0.8207547169811321
Recall:  0.7631578947368421
AUC:  0.8729725256537569
************ job_offer ************
Precision:  0.9322709163346613
Recall:  0.936
AUC:  0.9803837638376384


#### With preprocessing

In [102]:
for label in ["lost_job_1mo","is_unemployed","job_search","is_hired_1mo","job_offer"]:
    print("************ {} ************".format(label))
    
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    #download data
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #create embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_train["ProcessedText"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_val["ProcessedText"]]))
    #train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    df_val["y_predict_proba"] = clf.predict_proba(validation_vecs_glove_mean)[:, 1]
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    print("Precision: ", TP/(TP+FP))
    print("Recall: ", TP/(TP+FN))
    fpr, tpr, thresholds = roc_curve(df_val["class"], df_val["y_predict_proba"])
    print("AUC: ", auc(fpr, tpr))

************ lost_job_1mo ************
Precision:  0.8048780487804879
Recall:  0.6875
AUC:  0.8640407986111112
************ is_unemployed ************
Precision:  0.8120300751879699
Recall:  0.833976833976834
AUC:  0.9001087214083604
************ job_search ************
Precision:  0.7677419354838709
Recall:  0.74375
AUC:  0.8546052631578948
************ is_hired_1mo ************
Precision:  0.8207547169811321
Recall:  0.7631578947368421
AUC:  0.8729725256537569
************ job_offer ************
Precision:  0.9322709163346613
Recall:  0.936
AUC:  0.9803837638376384


## Logit Wiki + GloVe 300 on May20_9Klabels

### Without preprocessing

In [104]:
path = "/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/preprocessed_glove"
for label in ["lost_job_1mo","is_unemployed","job_search","is_hired_1mo","job_offer"]:
    print("************ {} ************".format(label))
    
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    #download data
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #create embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_train["text"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_val["text"]]))
    #train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    df_val["y_predict_proba"] = clf.predict_proba(validation_vecs_glove_mean)[:, 1]
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    print("Precision: ", TP/(TP+FP))
    print("Recall: ", TP/(TP+FN))
    fpr, tpr, thresholds = roc_curve(df_val["class"], df_val["y_predict_proba"])
    print("AUC: ", auc(fpr, tpr))

************ lost_job_1mo ************
Precision:  0.7965116279069767
Recall:  0.7569060773480663
AUC:  0.8564472328869744
************ is_unemployed ************
Precision:  0.7243243243243244
Recall:  0.7223719676549866
AUC:  0.8007854790899792
************ job_search ************
Precision:  0.7023255813953488
Recall:  0.6711111111111111
AUC:  0.774555028962612
************ is_hired_1mo ************
Precision:  0.8378378378378378
Recall:  0.8
AUC:  0.8891867333030441
************ job_offer ************
Precision:  0.8851540616246498
Recall:  0.8851540616246498
AUC:  0.9524687486935073


### With preprocessing

In [105]:
for label in ["lost_job_1mo","is_unemployed","job_search","is_hired_1mo","job_offer"]:
    print("************ {} ************".format(label))
    
    train_file_name = "train_{}.csv".format(label)
    val_file_name = "val_{}.csv".format(label)
    #download data
    df_train = pd.read_csv(os.path.join(path, train_file_name))
    df_val = pd.read_csv(os.path.join(path, val_file_name))
    #create embeddings
    train_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_train["ProcessedText"]]))
    validation_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 300, glove_wiki,'mean') for z in df_val["ProcessedText"]]))
    #train
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_vecs_glove_mean,df_train["class"])
    #evaluate
    df_val["class_predict"] = clf.predict(validation_vecs_glove_mean)
    df_val["y_predict_proba"] = clf.predict_proba(validation_vecs_glove_mean)[:, 1]
    TP, FP, TN, FN = perf_measure(df_val["class"], df_val["class_predict"])
    print("Precision: ", TP/(TP+FP))
    print("Recall: ", TP/(TP+FN))
    fpr, tpr, thresholds = roc_curve(df_val["class"], df_val["y_predict_proba"])
    print("AUC: ", auc(fpr, tpr))

************ lost_job_1mo ************
Precision:  0.7965116279069767
Recall:  0.7569060773480663
AUC:  0.8564472328869744
************ is_unemployed ************
Precision:  0.7243243243243244
Recall:  0.7223719676549866
AUC:  0.8007854790899792
************ job_search ************
Precision:  0.7023255813953488
Recall:  0.6711111111111111
AUC:  0.774555028962612
************ is_hired_1mo ************
Precision:  0.8378378378378378
Recall:  0.8
AUC:  0.8891867333030441
************ job_offer ************
Precision:  0.8851540616246498
Recall:  0.8851540616246498
AUC:  0.9524687486935073
