In [1]:
import ast
import io, os
import json
import re
import socket
import sys
import time
import zipfile
from glob import glob

import numpy as np
import pandas as pd
import requests
import reverse_geocoder as rg

In [2]:
# Labeling run configuration
country_code = "US"
model = 'BERT'
iteration = '1'

# Folder name is ITER_<iteration>, followed by _<model> when a model is set
iter_and_model = '_'.join(part for part in ('ITER_' + iteration, model) if part)
print('Folder:', iter_and_model)

hostname = socket.gethostname()
print('Hostname:', hostname)

# Relative data folder on hosts whose name contains "samuel", scratch folder otherwise
path_to_data = '../../data' if 'samuel' in hostname.lower() else '/scratch/spf248/twitter/data'

# Setting user Parameters
# The first line of the token file is parsed as a Python literal (presumably a
# quoted string -- confirm against the key file). ast.literal_eval only accepts
# literals, so the file's content can never be executed as code, unlike eval().
with open(os.path.join(path_to_data,'keys/qualtrics/apiToken'),'r') as f:
    apiToken = ast.literal_eval(f.readline().strip())
dataCenter = "nyu.ca1"
surveyId = "SV_24RoQ3TAAnEpaN7"
fileFormat = "csv"

Folder: ITER_1_BERT
Hostname: Samuels-MBP.home


# Export Survey

In [3]:
def exportSurvey(apiToken,surveyId,dataCenter,fileFormat,pollInterval=1.0):
    """Export a Qualtrics survey's responses and unzip them into the labeling folder.

    Creates a response export through the ``responseexports`` endpoint, polls
    it until the export reports ``complete`` or ``failed``, downloads the
    resulting zip archive and extracts it into
    ``<path_to_data>/classification/<country_code>/labeling/<iter_and_model>/qualtrics/<surveyId>``.
    ``path_to_data``, ``country_code`` and ``iter_and_model`` are read from the
    notebook's global namespace.

    Parameters
    ----------
    apiToken : str
        Value sent in the ``x-api-token`` header.
    surveyId : str
        Survey to export; also used as the name of the output folder.
    dataCenter : str
        Sub-domain of ``qualtrics.com`` the requests are sent to.
    fileFormat : str
        Export format passed to the API (e.g. ``"csv"``).
    pollInterval : float, optional
        Seconds to wait between two progress checks.

    Raises
    ------
    RuntimeError
        If the export ends with status ``failed``.
    requests.HTTPError
        If any of the API calls returns an HTTP error status.
    """
    baseUrl = "https://{0}.qualtrics.com/API/v3/responseexports/".format(dataCenter)
    headers = {
        "content-type": "application/json",
        "x-api-token": apiToken,
    }

    # Step 1: Creating Data Export
    # json.dumps escapes the values properly, unlike manual string concatenation
    downloadRequestPayload = json.dumps({"format": fileFormat, "surveyId": surveyId})
    downloadRequestResponse = requests.request("POST", baseUrl, data=downloadRequestPayload, headers=headers)
    # Print before checking the status so that an API error message stays visible
    print(downloadRequestResponse.text)
    downloadRequestResponse.raise_for_status()
    progressId = downloadRequestResponse.json()["result"]["id"]

    # Step 2: Checking on Data Export Progress and waiting until export is ready
    finalStatuses = ("complete", "failed")
    progressStatus = "inProgress"
    requestCheckUrl = baseUrl + progressId
    while progressStatus not in finalStatuses:
        print("progressStatus=", progressStatus)
        requestCheckResponse = requests.request("GET", requestCheckUrl, headers=headers)
        requestCheckResponse.raise_for_status()
        result = requestCheckResponse.json()["result"]
        print("Download is " + str(result["percentComplete"]) + " complete")
        progressStatus = result["status"]
        if progressStatus not in finalStatuses:
            # Pause between checks instead of sending requests in a tight loop
            time.sleep(pollInterval)

    # Step 2.1: Check for error
    # Compare by value: `is` tests object identity and does not reliably match
    # a string that was parsed from the JSON response
    if progressStatus == "failed":
        raise RuntimeError("export failed")

    # Step 3: Downloading file
    requestDownloadUrl = baseUrl + progressId + '/file'
    requestDownload = requests.request("GET", requestDownloadUrl, headers=headers, stream=True)
    requestDownload.raise_for_status()

    # Step 4: Unzipping the file
    outputDir = os.path.join(
        path_to_data,"classification",country_code,"labeling",iter_and_model,'qualtrics',surveyId)
    with zipfile.ZipFile(io.BytesIO(requestDownload.content)) as archive:
        archive.extractall(outputDir)
    print('Complete')

In [4]:
# Export only when the survey folder is not on disk yet
survey_dir = os.path.join(
    path_to_data, "classification", country_code, "labeling", iter_and_model, 'qualtrics', surveyId)
if not os.path.exists(survey_dir):
    if re.match('^SV_.*', surveyId):
        exportSurvey(apiToken, surveyId, dataCenter, fileFormat)
    else:
        print("survey Id must match ^SV_.*")

{"result":{"id":"ES_9th9qum4q358invkcdtbdqvr5c"},"meta":{"httpStatus":"200 - OK","requestId":"06de4d69-3ff5-4605-9db5-703a1ed07136"}}
progressStatus= inProgress
Download is 0.0 complete
progressStatus= in progress
Download is 0.0 complete
progressStatus= in progress
Download is 0.0 complete
progressStatus= in progress
Download is 0.0 complete
progressStatus= in progress
Download is 0.0 complete
progressStatus= in progress
Download is 100.0 complete
Complete


# Analyse Results

In [5]:
# Load the exported survey responses
df = pd.read_csv(
    os.path.join(
        path_to_data, "classification", country_code, "labeling", iter_and_model,
        'qualtrics', surveyId, "labor-market-tweets.csv"),
    low_memory=False,
)

In [6]:
# First two rows contain metadata
df = df.drop(index=[0, 1])

# Keep the first response of each worker and index the frame by worker ID
df = df.loc[df['QIDWorker'].dropna().drop_duplicates().index].set_index('QIDWorker').copy()

# Reverse-geocode the responses that have both coordinates
# NOTE(review): `places` is not used in the cells shown here -- confirm it is still needed
places = rg.search([
    tuple(x)
    for x in df[['LocationLatitude', 'LocationLongitude']].astype(float).dropna().values.tolist()
])

print('# of workers who refused the consent form:', (df.QIDConsent.astype(int)==0).sum())
print('# of workers who did not complete the survey:', (df.Finished.astype(int)==0).sum())

# Survey bookkeeping columns that are removed before the answers are reshaped
to_drop = [
    'ResponseID',
    'ResponseSet',
    'IPAddress',
    'StartDate',
    'EndDate',
    'RecipientLastName',
    'RecipientFirstName',
    'RecipientEmail',
    'ExternalDataReference',
    'Finished',
    'Status',
    'Random ID',
    'QIDConsent',
    'QIDDescription',
    'QIDCompletion',
    'LocationLatitude',
    'LocationLongitude',
    'LocationAccuracy',
]

# Use the `columns=` keyword: pandas >= 2.0 no longer accepts `axis` positionally
df = df.drop(columns=to_drop, errors='ignore')
df = df.drop(columns=[x for x in df.columns if 'BR-FL_' in x], errors='ignore')

Loading formatted geocoded file...
# of workers who refused the consent form: 0
# of workers who did not complete the survey: 0


In [7]:
# Checks: keep only the columns whose name contains 'check'
checks = df[[col for col in df.columns if 'check' in col]].copy()
checks.columns.name = 'QID'

# Rearrange Results: one row per (worker, question) with the answer in 'score'
checks = checks.stack().rename('score').to_frame()

check_qids = checks.index.get_level_values('QID')

# Extract Check ID (raw strings: '\d' is an invalid escape in a normal string literal)
checks['check_id'] = check_qids.map(lambda x: re.findall(r'check-(\d)', x)[0])

# Extract Class ID
checks['class_id'] = check_qids.map(lambda x: re.findall(r'_(\d)', x)[0])

# Sort Values
checks = (
    checks
    .reset_index(level='QIDWorker')
    .sort_values(by=['QIDWorker', 'check_id', 'class_id'])
    .set_index(['QIDWorker', 'check_id', 'class_id'])
)

# Bot=Fail to give a Yes to the 3 check questions
def is_bot(x):
    l=x.split('_')
    if len(l)==10:
        if l[0]=='1' and l[1]=='1' and l[8]=='1':
            return False
    return True

# One row per worker, one column per (check, class) answer; missing answers become ''
check_grid = checks.unstack(level='check_id').unstack(level='class_id').fillna('')

# Join each worker's answers with '_' and flag the workers that fail is_bot
bot_flags = check_grid.apply(lambda row: '_'.join(row), axis=1).apply(is_bot)
bots = bot_flags[bot_flags.astype(bool)].index

print('# Workers who failed the check questions (= bots?):', bots.shape[0])

# Workers who failed the check questions (= bots?): 3


In [8]:
# Remove checks (`columns=` keyword: pandas >= 2.0 no longer accepts `axis` positionally)
df = df.drop(columns=[col for col in df.columns if 'check' in col])
df.columns.name = 'QID'

# Rearrange Results: one row per (worker, question) with the answer in 'score'
df = df.stack().rename('score').to_frame()


def _split_qid(qid):
    """Strip 'ID_', '.1' and '-v<digit>' from a column name and split the rest on '_'."""
    return re.sub(r'-v\d', '', qid.replace('ID_', '').replace('.1', '')).split('_')


label_qids = df.index.get_level_values('QID')

# Extract Tweets ID (Removing Extra Indexing)
df['tweet_id'] = label_qids.map(lambda x: _split_qid(x)[0])

# Extract Classes (Removing Extra Indexing)
df['class_id'] = label_qids.map(lambda x: _split_qid(x)[1])

# Sort Values
df = (
    df
    .reset_index(level='QIDWorker')
    .sort_values(by=['tweet_id', 'class_id', 'QIDWorker'])
    .set_index(['tweet_id', 'class_id', 'QIDWorker'])
)

# Drop Bots
df = df.drop(bots, level='QIDWorker', errors='ignore')

# Convert Scores (a score outside this mapping raises KeyError)
score_labels = {'1': 'yes', '2': 'no', '3': 'unsure'}
df['score'] = df['score'].apply(lambda x: score_labels[x])

# Count number of labels and number of unique labels
counts = df.groupby(['tweet_id', 'class_id'])['score'].agg(['count', 'nunique'])

# Among (tweet, class) pairs with more than one label, share whose labels all agree
multi_labeled = counts[counts['count'] > 1]
unanimous = multi_labeled[multi_labeled['nunique'] == 1]

print('% Observations with at least two identical labels:')
if multi_labeled.shape[0]:
    print(unanimous.shape[0] / multi_labeled.shape[0] * 100)
else:
    # Avoid a ZeroDivisionError when no pair has more than one label
    print('n/a (no observation has more than one label)')

% Observations with at least two identical labels:
43.913043478260875


In [9]:
# Save the labels table in the survey's qualtrics folder
df.to_csv(
    os.path.join(
        path_to_data, "classification", country_code, "labeling", iter_and_model,
        'qualtrics', surveyId, 'labels.csv')
)

In [10]:
# Preview the labels table (pandas truncates the display of long frames)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score
tweet_id,class_id,QIDWorker,Unnamed: 3_level_1
1070419306096164864,1,A11YS0T8MV3Q7C,no
1070419306096164864,2,A11YS0T8MV3Q7C,no
1070419306096164864,3,A11YS0T8MV3Q7C,no
1070419306096164864,4,A11YS0T8MV3Q7C,yes
1070419306096164864,5,A11YS0T8MV3Q7C,unsure
...,...,...,...
976046480447418368,3,A2UYZFH5VT5R3H,no
976046480447418368,4,A2HRAAY4QRTM7S,no
976046480447418368,4,A2UYZFH5VT5R3H,no
976046480447418368,5,A2HRAAY4QRTM7S,yes


- Cohen's kappa (inter-annotator agreement):
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html

- Send back to labeling if there is disagreement or fewer than 2 labels