In [1]:
import pandas as pd
import numpy as np
from time import time
import re
import string
import socket
from glob import glob
import os
import pyarrow.parquet as pq

In [2]:
# Run configuration: which country, labeling iteration and model this notebook processes
country_code = "US"
print('Country:', country_code)
model = 'BERT'
iteration = '1'

# Folder name is ITER_<iteration>, with _<model> appended only when a model is set
iter_and_model = 'ITER_' + iteration + ('_' + model if model else '')
print('Folder:', iter_and_model)

# The data root depends on the machine: a hostname containing 'samuel' is treated
# as the local checkout, anything else as the cluster
path_to_data = os.path.join(
    '../../data/classification'
    if 'samuel' in socket.gethostname().lower()
    else '/scratch/spf248/twitter/data/classification',
    country_code,
)
print('Path to data:', path_to_data)

Country: US
Folder: ITER_1_BERT
Path to data: ../../data/classification/US


# Collect all existing labels

In [3]:
# One labels.csv is matched per survey folder, so the number of matches is the number of surveys
print("Surveys:",len(glob(os.path.join(path_to_data,'labeling','*','qualtrics','*','labels.csv'))))

Surveys: 18


In [4]:
# Only keep one label per worker and tweet
label_keys=['tweet_id','class_id','QIDWorker']

# glob returns paths in arbitrary order; sort them so that the row kept by
# drop_duplicates (the first one met for each key) is the same on every run
label_files=sorted(glob(os.path.join(path_to_data,'labeling','*','qualtrics','*','labels.csv')))
if not label_files:
    # pd.concat would raise an uninformative "No objects to concatenate" here
    raise ValueError('No labels.csv found under '+os.path.join(path_to_data,'labeling'))

# De-duplicate before sorting: the surviving row is then the first in file order
# and does not depend on how the sort treats ties
labels=pd.concat(
    [pd.read_csv(file) for file in label_files]
).drop_duplicates(label_keys).sort_values(by=label_keys).set_index(label_keys)

print('# labels:', labels.shape[0])

# labels: 204480


In [5]:
# Preview the de-duplicated labels; the index is (tweet_id, class_id, QIDWorker)
labels

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score
tweet_id,class_id,QIDWorker,Unnamed: 3_level_1
278519817328279552,1,A170EDGL8ZWMSL,yes
278519817328279552,1,A1CFPKUOCGJIM6,yes
278519817328279552,1,A1NHTBY5YB9JH7,yes
278519817328279552,1,A3B7TNVOISSZ2O,yes
278519817328279552,1,AVCXJ9M71WDCB,yes
...,...,...,...
1079723333548093440,4,AGYZ0GAAUIJZX,no
1079723333548093440,5,A1ETJBNTO9ZWZ8,yes
1079723333548093440,5,A2MFMT03E21ZIT,yes
1079723333548093440,5,A3135Y3RMFC3PK,yes


In [6]:
def is_labeled(x):
    """Tell whether one answer clearly wins for a (tweet, class) pair.

    Parameters
    ----------
    x : list
        Number of workers behind each distinct answer. The first entry is
        expected to be the most frequent answer (the caller builds the list
        from ``value_counts``).

    Returns
    -------
    bool
        True when the top answer was given more than once and is either the
        only answer or strictly more frequent than the runner-up.
    """
    top = x[0]
    # A single vote for the top answer is never enough
    if not top > 1:
        return False
    # Unanimous: no competing answer at all
    if len(x) == 1:
        return True
    # Otherwise the runner-up must be strictly less popular (ties are rejected)
    return bool(x[1] < top)

# For every (tweet, class) pair, tally how many workers gave each answer
counts=(
    labels
    .groupby(['tweet_id','class_id'])['score']
    .value_counts()
    .rename('count')
)

# Keep the tweets for which is_labeled holds on exactly 5 (tweet, class) pairs,
# i.e. the top answer was given more than once and strictly beats the runner-up
ids_labeled=(
    counts
    .groupby(['tweet_id','class_id'])
    .apply(list)                         # tallies of one pair as a plain list
    .apply(is_labeled)                   # True when the pair has a clear winner
    .groupby('tweet_id')
    .sum()                               # number of pairs with a clear winner
    .where(lambda n_clear:n_clear==5)
    .dropna()
    .index
)
print('# labeled tweets:', len(ids_labeled))

# labeled tweets: 10522


In [7]:
# Column name used for each class id
class2name=dict(zip(range(1,6),[
'is_unemployed',
'job_loss',
'job_search',
'is_hired',
'job_offer',
]))

# Question wording kept for reference only; nothing below reads it.
# NOTE(review): presumably listed in the same order as class ids 1-5 -- confirm against the survey
class_questions=[
'Does this tweet indicate that the user is currently unemployed?',
'Does this tweet indicate that the user became unemployed within the last month?',
'Does this tweet indicate that the user is currently searching for a job?',
'Does this tweet indicate that the user was hired within the last month?',
'Does this tweet contain a job offer?',
]

# Keep most popular label sequence: restrict to the retained tweets, then take the
# first listed score of each (tweet, class) pair and pivot classes into columns.
# NOTE: this rebinds `labels` from the per-worker table to a wide per-tweet table,
# so the cell computing `counts` cannot be re-run after this one without reloading.
labels=counts.reindex(ids_labeled,level='tweet_id').reset_index(
level='score').groupby(['tweet_id','class_id'])['score'].first().unstack()
labels.index=labels.index.astype(str)

# Rename class ids to readable names and turn tweet_id back into a column
# (one rebinding instead of two inplace mutations)
labels=labels.rename(columns=lambda x:class2name[x]).reset_index()

# Merge with input data

In [8]:
# Load every sample file of this iteration. The paths are sorted because glob's
# order is arbitrary and drop_duplicates below keeps the first row per tweet_id.
sample_files=sorted(glob(os.path.join(path_to_data,'labeling',iter_and_model,'sample','*')))
sample_frames=[]
for sample_file in sample_files:
    if '.pkl' in sample_file:
        # NOTE: unpickling can execute arbitrary code; only load trusted sample files
        sample_frames.append(pd.read_pickle(sample_file))
    elif '.parquet' in sample_file:
        sample_frames.append(pd.read_parquet(sample_file))
    else:
        print('Extension error:', sample_file)

# Concatenate once; growing the frame inside the loop re-copies all rows at every step
tweets=pd.concat(sample_frames) if sample_frames else pd.DataFrame()

# One row per tweet, with tweet_id as a string so it matches the key in `labels`
tweets=tweets.drop_duplicates('tweet_id')[['tweet_id','text']].astype({'tweet_id':str})
print('# Input tweets:',tweets.shape[0])

# Keep only the sampled tweets that have an agreed label (inner join on tweet_id)
tweets=tweets.merge(labels,on='tweet_id',how='inner')
print('# Labeled tweets:',tweets.shape[0])

# Input tweets: 990
# Labeled tweets: 755


In [9]:
# Share of labeled tweets answered 'yes' for each class.
# Vectorised comparison replaces DataFrame.applymap, which is deprecated in recent pandas.
tweets[list(class2name.values())].eq('yes').mean()

is_unemployed    0.250331
job_loss         0.123179
job_search       0.137748
is_hired         0.123179
job_offer        0.182781
dtype: float64

In [10]:
# Persist the merged tweets/labels table as labels.pkl in this iteration's labeling folder
labeled_tweets_path=os.path.join(path_to_data,'labeling',iter_and_model,'labels.pkl')
tweets.to_pickle(labeled_tweets_path)