In [1]:
import pandas as pd
import numpy as np
from time import time
import re
import string
import socket
from glob import glob
import os
import pyarrow.parquet as pq

In [2]:
# Country whose labels are consolidated in this notebook
country_code = "US"
print('Country:', country_code)

# Data root: relative checkout when the hostname contains 'samuel' (local machine),
# cluster scratch space otherwise
path_to_data = '../../data' if 'samuel' in socket.gethostname().lower() else '/scratch/spf248/twitter/data'
print('Path to data:',path_to_data)

Country: US
Path to data: ../../data


# Collect all existing labels

In [3]:
# One labels.csv is matched per survey folder under 'qualtrics', so the number of
# matches is the number of surveys (no need to sort or extract the folder names).
print("Surveys:",len(glob(os.path.join(path_to_data,'classification',country_code,'labeling','qualtrics','*','labels.csv'))))

Surveys: 33


In [4]:
# Only keep one label per worker and tweet
# (a label is identified by the tweet, the class and the worker who answered)
label_key=['tweet_id','class_id','QIDWorker']

# glob returns matches in an arbitrary, filesystem-dependent order; sorting the
# file list makes the concatenation order reproducible across machines, so the
# row retained for a duplicated key does not depend on where the notebook runs.
label_files=sorted(glob(os.path.join(path_to_data,'classification',country_code,'labeling','qualtrics','*','labels.csv')))

labels=(
    pd.concat([pd.read_csv(file) for file in label_files])
    .sort_values(by=label_key)
    .drop_duplicates(label_key)  # keeps the first row seen for each key
    .set_index(label_key)
)

print('# labels:', labels.shape[0])

# labels: 268390


In [5]:
def is_labeled(x):
    """Tell whether the leading count in ``x`` is a clear winner.

    ``x`` is a non-empty sequence of counts in which the first entry is
    treated as the most popular answer and the second (if any) as the
    runner-up.

    Returns True when the first count is greater than one and either
    stands alone or is strictly greater than the second count; returns
    False otherwise.
    """
    # The leading answer must have been given more than once
    if not x[0]>1:
        return False
    # No competing answer at all
    if len(x)==1:
        return True
    # The runner-up must be strictly less popular than the leader
    if x[1]<x[0]:
        return True
    return False

# Number of labels received by each score, for every (tweet, class) pair
counts=labels.groupby(['tweet_id','class_id'])['score'].value_counts().rename('count')

# A (tweet, class) pair is settled when is_labeled accepts its list of counts,
# i.e. the first count is above one and strictly beats the second
# (this relies on value_counts listing the most frequent score first).
settled=counts.groupby(['tweet_id','class_id']).apply(list).apply(is_labeled)

# Keep the tweets for which exactly 5 (tweet, class) pairs are settled
n_settled=settled.groupby('tweet_id').sum()
ids_labeled=n_settled[n_settled==5].index
print('# labeled tweets:', len(ids_labeled))

# labeled tweets: 13620


In [6]:
# Column name for each class id (1-5). The trailing comments carry the question
# texts that were previously kept as an unused list literal in this cell, in the
# same order (presumably the survey wording for each class).
class2name=dict(zip(range(1,6),[
'is_unemployed', # Does this tweet indicate that the user is currently unemployed?
'lost_job_1mo',  # Does this tweet indicate that the user became unemployed within the last month?
'job_search',    # Does this tweet indicate that the user is currently searching for a job?
'is_hired_1mo',  # Does this tweet indicate that the user was hired within the last month?
'job_offer',     # Does this tweet contain a job offer?
]))

# Keep most popular label sequence: restrict the counts to the retained tweets,
# take the first score listed for each (tweet, class) pair and pivot the
# classes into columns (one row per tweet).
labels=(
    counts.reindex(ids_labeled,level='tweet_id')
    .reset_index(level='score')
    .groupby(['tweet_id','class_id'])['score']
    .first()
    .unstack()
)
# Tweet ids are stored as strings
labels.index=labels.index.astype(str)

# Rename class-id columns (a KeyError is raised for an id missing from class2name)
# and turn tweet_id back into a regular column
labels=labels.rename(columns=lambda x:class2name[x]).reset_index()
labels.columns.name=''

In [7]:
# Save the consolidated label table as a pickle in the country's labeling folder
labels_path=os.path.join(path_to_data,'classification',country_code,'labeling','labels.pkl')
labels.to_pickle(labels_path)

In [8]:
# Preview the last five rows of the consolidated label table
labels.tail(5)

Unnamed: 0,tweet_id,is_unemployed,lost_job_1mo,job_search,is_hired_1mo,job_offer
13615,1077961226733473792,yes,yes,yes,no,yes
13616,1078919721674784768,no,no,yes,no,no
13617,1079020674377629701,no,no,no,no,yes
13618,1079086885648031745,no,no,no,no,yes
13619,1079723333548093440,no,no,no,no,yes
