In [1]:
import ast
import io, os
import json
import re
import socket
import sys
import time
import zipfile

import pandas as pd
import requests
import reverse_geocoder as rg

In [2]:
# Country whose labeled tweets are processed; used to build the data folder paths.
country_code="US"

# Pick the data root from the machine the notebook runs on: a relative path when the
# hostname contains "samuel", the cluster scratch path otherwise.
hostname = socket.gethostname()
print('Hostname:', hostname)
if 'samuel' in hostname.lower():
    path_to_data='../../data'
else:
    path_to_data='/scratch/spf248/twitter/data'

# Setting user Parameters
with open(os.path.join(path_to_data,'keys/qualtrics/apiToken'),'r') as f:
    # SECURITY: the first line used to be passed to eval(), which would execute any
    # code stored in the key file. ast.literal_eval only accepts Python literals.
    # NOTE(review): presumably the file holds the token as a quoted string — confirm.
    apiToken = ast.literal_eval(f.readline())
dataCenter = "nyu.ca1"
surveyId = "SV_8uMuwiJVgsGDPjn"
fileFormat = "csv" # ["csv", "tsv", "spss"]

Hostname: Samuels-MacBook-Pro.local


# Export Survey

In [3]:
def exportSurvey(apiToken,surveyId,dataCenter,fileFormat,pollInterval=1.0):
    """Export the responses of a Qualtrics survey and unzip them on disk.

    Creates a response export through the ``responseexports`` endpoint, polls it
    until the reported status is "complete" or "failed", downloads the resulting
    zip archive and extracts it into
    ``<path_to_data>/classification/<country_code>/labeled/<surveyId>``.
    ``path_to_data`` and ``country_code`` are read from the notebook's globals.

    Parameters
    ----------
    apiToken : str
        Value sent in the ``x-api-token`` header.
    surveyId : str
        Identifier of the survey to export; also the name of the output folder.
    dataCenter : str
        Data-center prefix used to build ``https://<dataCenter>.qualtrics.com``.
    fileFormat : str
        Export format sent to the API (the notebook lists "csv", "tsv", "spss").
    pollInterval : float, optional
        Seconds to wait between two progress checks (default 1.0).

    Raises
    ------
    Exception
        If the export status reported by the API is "failed".
    requests.HTTPError
        If one of the HTTP calls returns an error status code.
    """
    # Setting static parameters
    progressStatus = "inProgress"
    baseUrl = "https://{0}.qualtrics.com/API/v3/responseexports/".format(dataCenter)
    headers = {
        "content-type": "application/json",
        "x-api-token": apiToken,
    }

    # Step 1: Creating Data Export
    # json.dumps escapes the values properly, unlike manual string concatenation.
    downloadRequestPayload = json.dumps({"format": fileFormat, "surveyId": surveyId})
    downloadRequestResponse = requests.request("POST", baseUrl, data=downloadRequestPayload, headers=headers)
    downloadRequestResponse.raise_for_status()
    progressId = downloadRequestResponse.json()["result"]['id']
    print(downloadRequestResponse.text)

    # Step 2: Checking on Data Export Progress and waiting until export is ready
    requestCheckUrl = baseUrl + progressId
    while progressStatus not in ("complete", "failed"):
        print ("progressStatus=", progressStatus)
        requestCheckResponse = requests.request("GET", requestCheckUrl, headers=headers)
        requestCheckResponse.raise_for_status()
        result = requestCheckResponse.json()["result"]
        print("Download is " + str(result["percentComplete"]) + " complete")
        progressStatus = result["status"]
        if progressStatus not in ("complete", "failed"):
            # Pause between polls instead of busy-looping against the API.
            time.sleep(pollInterval)

    # Step 2.1: Check for error
    # Compare by value: `is` tests object identity and is not reliable for a
    # string decoded from a JSON response.
    if progressStatus == "failed":
        raise Exception("export failed")

    # Step 3: Downloading file
    requestDownloadUrl = baseUrl + progressId + '/file'
    requestDownload = requests.request("GET", requestDownloadUrl, headers=headers, stream=True)
    # Fail here rather than trying to unzip an error page.
    requestDownload.raise_for_status()

    # Step 4: Unzipping the file
    outputDir = os.path.join(path_to_data,"classification",country_code,"labeled",surveyId)
    zipfile.ZipFile(io.BytesIO(requestDownload.content)).extractall(outputDir)
    print('Complete')

In [4]:
# Only call the Qualtrics API when this survey has no export folder on disk yet.
if not os.path.exists(
    os.path.join(path_to_data, "classification", country_code, "labeled", surveyId)
):
    # The export is attempted only for identifiers starting with "SV_".
    if re.match('^SV_.*', surveyId):
        exportSurvey(apiToken, surveyId, dataCenter, fileFormat)
    else:
        print("survey Id must match ^SV_.*")

# Analyse Results

In [5]:
# Load the raw survey responses from the export folder of this survey.
df = pd.read_csv(
    os.path.join(
        path_to_data,
        "classification",
        country_code,
        "labeled",
        surveyId,
        "labor-market-tweets.csv",
    ),
    low_memory=False,
)

In [6]:
# First two rows contain metadata
df.drop([0,1],inplace=True)

# Index the responses by worker only when every worker appears exactly once.
# NOTE(review): the cells below address the index level 'QIDWorker', so they
# presumably fail when the warning branch is taken — confirm.
if df.QIDWorker.is_unique:
    df.set_index('QIDWorker',inplace=True)
else:
    print('Some workers took the survey twice, hence indexing by worker might be invalid.')

# Reverse-geocode the (latitude, longitude) pairs of the responses that have both values.
places=rg.search([tuple(x) for x in df[['LocationLatitude','LocationLongitude']].astype(float).dropna().values.tolist()])

print('# of workers who refused the consent form:', (df.QIDConsent.astype(int)==0).sum())
print('# of workers who did not complete the survey:', (df.Finished.astype(int)==0).sum())

# Survey bookkeeping columns that are not tweet labels.
to_drop=[
'ResponseID',
'ResponseSet', 
'IPAddress', 
'StartDate', 
'EndDate',
'RecipientLastName', 
'RecipientFirstName', 
'RecipientEmail',
'ExternalDataReference',
'Finished',
'Status', 
'Random ID',
'QIDConsent', 
'QIDDescription',
'QIDCompletion',
'LocationLatitude',
'LocationLongitude',
'LocationAccuracy']

# `columns=` replaces the positional axis argument (`df.drop(labels, 1)`), which
# pandas >= 2.0 rejects with a TypeError. Missing columns are still ignored.
df.drop(columns=to_drop,inplace=True,errors='ignore')
df.drop(columns=[x for x in df.columns if 'BR-FL_' in x],inplace=True,errors='ignore')

Loading formatted geocoded file...
# of workers who refused the consent form: 0
# of workers who did not complete the survey: 0


In [7]:
# Checks: keep only the columns whose name contains 'check'
checks=df[[col for col in df.columns if 'check' in col]].copy()
checks.columns.name='QID'

# Rearrange Results: one row per (worker, check column) with the answer in 'score'
checks=checks.stack().rename('score').to_frame()

# Extract Check ID (raw string: '\d' in a normal string literal is an invalid escape)
checks['check_id']=checks.index.get_level_values('QID').map(
lambda x:re.findall(r'check-(\d)',x)[0])

# Extract Class ID
checks['class_id']=checks.index.get_level_values('QID').map(
lambda x:re.findall(r'_(\d)',x)[0])

# Sort Values
checks=checks.reset_index(level='QIDWorker').sort_values(
by=['QIDWorker','check_id','class_id']).set_index(
['QIDWorker','check_id','class_id'])

# List check sequence filling missing values:
# join the answers of each (worker, check) across classes into one string ...
checks=checks.unstack(level='class_id').apply(lambda x:'_'.join(x),axis=1)
# ... then put one column per check, with '' where a worker has no such check.
checks=checks.unstack(level='check_id').fillna('')

# Select workers who failed the check sequence, i.e. whose answers differ from
# the reference sequence for check '0' or for check '1'.
checks=checks.where(lambda x:(x['0']!='1_1_2_2_2')|(x['1']!='2_2_2_1_2')).dropna()

# print('Workers who failed the check questions (= bots?):')
checks

check_id,0,1
QIDWorker,Unnamed: 1_level_1,Unnamed: 2_level_1
A1SWV4X4PD25S1,1_1_2_2_2,2_3_2_1_2
A1USDMJVT10CE4,1_1_2_2_2,2_2_2_2_2
A2APPZDU0VS9LN,1_1_2_2_2,2_2_2_2_2
A2F0X4LN9N4O4C,1_2_2_2_2,2_2_2_1_2
A2FUMA4UR6S920,1_2_2_2_2,2_2_2_1_2
A2GR3333S2F53G,1_1_3_2_2,2_2_2_1_2
A2SYRFPPV9WDEG,1_1_2_2_2,2_1_2_1_2
A35D31QHYQUF9V,1_1_3_2_2,2_2_2_1_2
A38USYKE9P7Z9O,1_1_2_2_2,2_1_1_1_2
A3LRZX8477TYYZ,1_1_3_2_2,2_2_2_1_2


In [8]:
# Remove checks
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
df.drop(columns=[col for col in df.columns if 'check' in col],inplace=True)
df.columns.name='QID'

# Rearrange Results: one row per (worker, question column) with the answer in 'score'
df=df.stack().rename('score').to_frame()

def _split_qid(qid):
    """Strip the 'ID_' prefix, '.1' and '-v<digit>' markers from a column name and split it on '_'."""
    return re.sub(r'-v\d','',qid.replace('ID_','').replace('.1','')).split('_')

# Extract Tweets ID (Removing Extra Indexing)
df['tweet_id']=df.index.get_level_values('QID').map(lambda x:_split_qid(x)[0])

# Extract Classes (Removing Extra Indexing)
df['class_id']=df.index.get_level_values('QID').map(lambda x:_split_qid(x)[1])

# Sort Values
df=df.reset_index(level='QIDWorker').sort_values(
by=['tweet_id','class_id','QIDWorker']).set_index(
['tweet_id','class_id','QIDWorker'])

# Drop Bots: remove the workers listed in `checks` (those who failed the check questions)
df.drop(checks.index,level='QIDWorker',inplace=True)

# Convert Scores (an unexpected code raises KeyError instead of being silently kept)
df.score=df.score.apply(lambda x:{'1':'yes','2':'no','3':'unsure'}[x])

# Count number of labels and number of unique labels
counts=df.groupby(['tweet_id','class_id'])['score'].agg(['count','nunique'])

print('% Observations with at least two identical labels:')
# Among (tweet, class) pairs labeled more than once, share whose labels all agree.
n_multi=counts[(counts['count']>1)].shape[0]
n_agree=counts[(counts['count']>1)&(counts['nunique']==1)].shape[0]
if n_multi:
    print(n_agree/n_multi*100)
else:
    # Avoid a ZeroDivisionError when no pair has more than one label.
    print('n/a (no observation has more than one label)')

% Observations with at least two identical labels:
90.0


In [9]:
# Write the labels next to the raw survey export of this survey.
df.to_csv(
    os.path.join(
        path_to_data,
        "classification",
        country_code,
        'labeled',
        surveyId,
        'labels.csv',
    )
)

In [10]:
# Display the final labels table, indexed by (tweet_id, class_id, QIDWorker).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score
tweet_id,class_id,QIDWorker,Unnamed: 3_level_1
1005925892621783041,1,AMA18W8F60Y2J,yes
1005925892621783041,2,AMA18W8F60Y2J,yes
1005925892621783041,3,AMA18W8F60Y2J,no
1005925892621783041,4,AMA18W8F60Y2J,no
1005925892621783041,5,AMA18W8F60Y2J,no
...,...,...,...
998726197856100352,4,A1CFMY4CEYOM8Y,yes
998726197856100352,4,AETIZKQNUSBLB,yes
998726197856100352,5,A100Y89FZO4J0B,no
998726197856100352,5,A1CFMY4CEYOM8Y,no


- Cohen's kappa (to measure agreement between annotators):
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html

- Send a tweet back for labeling if its labels disagree or if it has fewer than two labels.