In [1]:
import pandas as pd
import numpy as np
from time import time
import re
import socket
from glob import glob
import os
import pyarrow.parquet as pq

In [2]:
# Experiment grid: iteration indices, model names and the label columns to export.
iterations=range(2)
models=['GLOVE','BERT']
labels=[
    'is_hired_1mo',
    'is_unemployed',
    'job_offer',
    'job_search',
    'lost_job_1mo',
]

country_code = "US"
print('Country:', country_code)

# Pick the data root from the machine name: hosts whose name contains
# 'samuel' use the relative directory, every other host uses the /scratch path.
hostname=socket.gethostname()
print('Hostname:', hostname)
path_to_data='../../data' if 'samuel' in hostname.lower() else '/scratch/spf248/twitter/data'

Country: US
Hostname: Samuels-MBP.home


In [3]:
# Load the table of manual labels for the configured country.
print('Extract labeled tweets:\n')
labels_path=os.path.join(
    path_to_data,
    'classification',
    country_code,
    'labeling',
    'labels.pkl',
)
labeled_tweets=pd.read_pickle(labels_path)
print('# labels:', len(labeled_tweets.index))

Extract labeled tweets:

# labels: 13183


In [4]:
# Preview the first five labeled rows (the default row count, written out).
labeled_tweets.head(n=5)

Unnamed: 0,tweet_id,is_unemployed,lost_job_1mo,job_search,is_hired_1mo,job_offer
0,278307497121554433,no,no,no,no,no
1,278519817328279552,yes,no,yes,no,no
2,278534362910322688,no,no,yes,no,no
3,278660548453888001,yes,no,no,no,no
4,278690533809991681,no,no,no,no,no


In [5]:
# Export one training CSV per (iteration, model, label).
#
# For each iteration/model pair: read every sampled-tweet file, keep one row per
# tweet_id, join with the manual labels, then for each label write a two-column
# CSV ('text', 'class') where class is 1 for 'yes' and 0 for 'no'.

# Annotator answer -> class. 'unsure' maps to NaN so those rows are removed by
# dropna() below. Any other answer raises KeyError, so unexpected values are not
# silently dropped.
label_to_class={'yes':1,'no':0,'unsure':np.nan}

for iteration in iterations:
    print()
    print('********* Iteration:',iteration,'*********')
    for model in models:
        print()
        print('*** Model:',model,'***')
        name='iteration_'+str(iteration)+'_'+model
        sample_files=glob(os.path.join(path_to_data,'classification',country_code,'labeling',str(iteration),'sampled','training','*'))
        # From iteration 1 onwards only files whose path mentions the model are
        # used; iteration 0 uses every file in the directory for every model.
        if iteration!=0:
            sample_files=[x for x in sample_files if model.lower() in x.lower()]

        # Collect the per-file frames and concatenate once, instead of growing a
        # DataFrame with repeated pd.concat calls.
        sample_frames=[]
        for sample_file in sample_files:
            if '.pkl' in sample_file:
                tmp=pd.read_pickle(sample_file)[['tweet_id','text']]
            elif '.parquet' in sample_file:
                tmp=pd.read_parquet(sample_file)[['tweet_id','text']]
            else:
                print('Extension error:', sample_file)
                # Skip unsupported files (e.g. a '_SUCCESS' marker). Falling
                # through would reuse the previous file's frame, or raise
                # NameError when the unsupported file is listed first.
                continue
            tmp['tweet_id']=tmp['tweet_id'].astype(str)
            sample_frames.append(tmp)

        if not sample_frames:
            # Nothing readable for this pair: say so explicitly rather than
            # failing later with a KeyError on the missing 'tweet_id' column.
            print('No readable sample files:', name)
            continue

        sampled_tweets=pd.concat(sample_frames).drop_duplicates('tweet_id')
        # Joins on the columns shared by both frames.
        # NOTE(review): presumably only 'tweet_id' is shared - confirm against labels.pkl.
        sampled_tweets=sampled_tweets.merge(labeled_tweets)
        print('# tweets:', name, sampled_tweets.shape[0])

        output_dir=os.path.join(path_to_data,'classification',country_code,'labeling',str(iteration),'labeled','training')
        for label in labels:
            print('*** Label:',label,'***')
            training_tweets=sampled_tweets[['text',label]].copy()
            training_tweets[label]=training_tweets[label].apply(lambda x:label_to_class[x])
            # dropna() removes 'unsure' rows as well as rows with a missing text.
            training_tweets=training_tweets.dropna().rename(columns={label:'class'})
            print('# tweets:', training_tweets.shape[0])
            print('% Positives (taking all the labeled tweets across boundaries for that model):', round(training_tweets['class'].mean(),2))
            training_tweets.to_csv(os.path.join(output_dir,'labels_iteration_'+str(iteration)+'_'+model+'_'+label+'.csv'),index=False)


********* Iteration: 0 *********

*** Model: GLOVE ***
Extension error: ../../data/classification/US/labeling/0/sampled/training/_SUCCESS
# tweets: iteration_0_GLOVE 9746
*** Label: is_hired_1mo ***
# tweets: 9715
% Positives (taking all the labeled tweets across boundaries for that model): 0.08
*** Label: is_unemployed ***
# tweets: 9679
% Positives (taking all the labeled tweets across boundaries for that model): 0.2
*** Label: job_offer ***
# tweets: 9716
% Positives (taking all the labeled tweets across boundaries for that model): 0.18
*** Label: job_search ***
# tweets: 9731
% Positives (taking all the labeled tweets across boundaries for that model): 0.12
*** Label: lost_job_1mo ***
# tweets: 9661
% Positives (taking all the labeled tweets across boundaries for that model): 0.1

*** Model: BERT ***
Extension error: ../../data/classification/US/labeling/0/sampled/training/_SUCCESS
# tweets: iteration_0_BERT 9746
*** Label: is_hired_1mo ***
# tweets: 9715
% Positives (taking all t