In [6]:
import requests
import zipfile
import json
import io, os
import sys
import re
import socket
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import reverse_geocoder as rg

In [7]:
def exportSurvey(apiToken,surveyId,dataCenter,fileFormat,fileName,pollInterval=1.0):
    """Export a Qualtrics survey's responses and unzip them to disk.

    The export is requested, polled until it reports "complete" or "failed",
    downloaded as a zip archive, and extracted into
    ``os.path.join(path_to_data, "classification", country, fileName)``.

    NOTE: ``path_to_data`` and ``country`` are module-level globals read at
    call time; they must be defined before this function is called.

    Parameters
    ----------
    apiToken : str
        Value sent in the ``x-api-token`` request header.
    surveyId : str
        Survey identifier inserted into the export URL.
    dataCenter : str
        Sub-domain placed in front of ``.qualtrics.com``.
    fileFormat : str
        Value of the ``format`` field in the export request payload.
    fileName : str
        Name of the directory (under the data path) to extract the archive into.
    pollInterval : float, optional
        Seconds to wait between two progress checks (default 1.0).

    Raises
    ------
    requests.HTTPError
        If any of the HTTP requests returns an error status.
    Exception
        If the export finishes with status "failed".
    """
    import time  # local import: only needed for the delay between polls

    # Setting static parameters
    progressStatus = "inProgress"
    baseUrl = "https://{0}.qualtrics.com/API/v3/surveys/{1}/export-responses/".format(dataCenter, surveyId)
    headers = {
        "content-type": "application/json",
        "x-api-token": apiToken,
    }

    # Step 1: Creating Data Export
    # json.dumps escapes the value properly instead of splicing it into a string.
    downloadRequestPayload = json.dumps({"format": fileFormat})
    downloadRequestResponse = requests.request("POST", baseUrl, data=downloadRequestPayload, headers=headers)
    # Print before checking the status so the error body stays visible on failure.
    print(downloadRequestResponse.text)
    downloadRequestResponse.raise_for_status()
    progressId = downloadRequestResponse.json()["result"]["progressId"]

    # Step 2: Checking on Data Export Progress and waiting until export is ready
    requestCheckUrl = baseUrl + progressId
    while progressStatus not in ("complete", "failed"):
        print ("progressStatus=", progressStatus)
        requestCheckResponse = requests.request("GET", requestCheckUrl, headers=headers)
        requestCheckResponse.raise_for_status()
        # Parse the body once and reuse it for every field.
        requestCheckResult = requestCheckResponse.json()["result"]
        print("Download is " + str(requestCheckResult["percentComplete"]) + " complete")
        progressStatus = requestCheckResult["status"]
        if progressStatus not in ("complete", "failed"):
            # Avoid hammering the API while the export is still running.
            time.sleep(pollInterval)

    # Step 2.1: Check for error
    # Must be an equality test: `is` compares object identity and is not
    # reliable for strings decoded from a JSON response.
    if progressStatus == "failed":
        raise Exception("export failed")

    fileId = requestCheckResult["fileId"]

    # Step 3: Downloading file
    requestDownloadUrl = baseUrl + fileId + '/file'
    requestDownload = requests.request("GET", requestDownloadUrl, headers=headers, stream=True)
    requestDownload.raise_for_status()

    # Step 4: Unzipping the file
    outputDir = os.path.join(path_to_data,"classification",country,fileName)
    with zipfile.ZipFile(io.BytesIO(requestDownload.content)) as archive:
        archive.extractall(outputDir)
    print('Complete')

In [8]:
# Country code used as a path component for the downloaded data.
country = "US"

# Pick the data root from the machine name: a relative path on hosts whose
# name contains 'samuel', the scratch directory everywhere else.
print('Hostname:', socket.gethostname())
path_to_data = (
    '../../data'
    if 'samuel' in socket.gethostname().lower()
    else '/scratch/spf248/twitter/data'
)

# Setting user Parameters
dataCenter = "nyu.ca1"
surveyId = "SV_0dB80s8q5OhAV8x"
fileFormat = "csv" # ["csv", "tsv", "spss"]

# NOTE(review): eval() executes whatever expression is on the first line of
# the token file. If the file only ever holds a quoted string, consider
# ast.literal_eval (or storing the raw token and using .strip()) instead.
with open(os.path.join(path_to_data,'keys/qualtrics/apiToken'),'r') as f:
    apiToken = eval(f.readline())

# Only start the export when the survey id has the expected "SV_" prefix.
r = re.compile('^SV_.*')
m = r.match(surveyId)
if m:
    exportSurvey(apiToken, surveyId, dataCenter, fileFormat,"qualtrics-download-v1")
else:
    print("survey Id must match ^SV_.*")

Hostname: Samuels-MBP
{"result":{"progressId":"ES_bKoA6nD5Gz5KmBn","percentComplete":0.0,"status":"inProgress"},"meta":{"requestId":"1e600583-06b9-4cd6-bfed-265b57899105","httpStatus":"200 - OK"}}
progressStatus= inProgress
Download is 0.0 complete
progressStatus= inProgress
Download is 100.0 complete
Complete


# Analyse Results

In [328]:
# Load the extracted survey export and label the column axis 'question',
# so that stacked columns get a named index level.
df = pd.read_csv(
    os.path.join(
        path_to_data,
        "classification",
        country,
        "qualtrics-download-v1",
        "labor-market-tweets.csv",
    )
)
df = df.rename_axis('question', axis='columns')

In [329]:
# Rows labelled 0 and 1 are excluded before the integer cast
# (presumably non-response header rows of the export — confirm).
duration_sec = df['Duration (in seconds)'].drop(index=[0, 1]).astype(int)
print('Median Duration:', duration_sec.median(), 'sec')

Median Duration: 727.0 sec


In [330]:
# Rows labelled 0 and 1 are excluded before the integer cast
# (presumably non-response header rows of the export — confirm).
duration_sec = df['Duration (in seconds)'].drop(index=[0, 1]).astype(int)
print('Min Duration:', duration_sec.min(), 'sec')

Min Duration: 201 sec


In [331]:
# Rows labelled 0 and 1 are excluded before the integer cast
# (presumably non-response header rows of the export — confirm).
duration_sec = df['Duration (in seconds)'].drop(index=[0, 1]).astype(int)
print('Max Duration:', duration_sec.max(), 'sec')

Max Duration: 21888 sec


In [332]:
# True when at least one QIDWorker value occurs more than once
# (missing values are counted as a value, all rows included).
worker_ids = df['QIDWorker']
print('Non-unique ID:', worker_ids.nunique(dropna=False) != len(worker_ids))

Non-unique ID: False


In [333]:
# Count the rows (labels 0 and 1 excluded) whose QIDConsent value is 0.
consent = df['QIDConsent'].drop(index=[0, 1]).astype(int)
print('# Workers Who Refused the Consent Form:', consent.eq(0).sum())

# Workers Who Refused the Consent Form: 0


In [334]:
# Count the rows (labels 0 and 1 excluded) whose Finished value is 0.
finished = df['Finished'].drop(index=[0, 1]).astype(int)
print('# Workers Who Did Not Complete the Survey:', finished.eq(0).sum())

# Workers Who Did Not Complete the Survey: 3


In [335]:
# Reverse-geocode every response that has both a latitude and a longitude.
coordinates = (
    df[['LocationLatitude', 'LocationLongitude']]
    .drop(index=[0, 1])
    .astype(float)
    .dropna()
)
rg.search(list(map(tuple, coordinates.values.tolist())))

[OrderedDict([('lat', '42.57952'),
              ('lon', '1.65362'),
              ('name', 'El Tarter'),
              ('admin1', 'Canillo'),
              ('admin2', ''),
              ('cc', 'AD')]),
 OrderedDict([('lat', '42.57952'),
              ('lon', '1.65362'),
              ('name', 'El Tarter'),
              ('admin1', 'Canillo'),
              ('admin2', ''),
              ('cc', 'AD')]),
 OrderedDict([('lat', '42.57952'),
              ('lon', '1.65362'),
              ('name', 'El Tarter'),
              ('admin1', 'Canillo'),
              ('admin2', ''),
              ('cc', 'AD')]),
 OrderedDict([('lat', '42.57952'),
              ('lon', '1.65362'),
              ('name', 'El Tarter'),
              ('admin1', 'Canillo'),
              ('admin2', ''),
              ('cc', 'AD')]),
 OrderedDict([('lat', '42.57952'),
              ('lon', '1.65362'),
              ('name', 'El Tarter'),
              ('admin1', 'Canillo'),
              ('admin2', ''),
              (

In [459]:
def _workers_failing_check(column_regex, expected_answers):
    """Return the QIDWorker labels whose joined answers differ from `expected_answers`.

    The columns matching `column_regex` are stacked per worker (stack() drops
    missing answers) and the remaining answers are joined with '_'.
    """
    joined_answers = (
        df.drop([0,1])
        .set_index('QIDWorker')
        .filter(regex=column_regex)
        .stack()
        .groupby('QIDWorker')
        .apply(lambda x: '_'.join(x))
    )
    return joined_answers[joined_answers != expected_answers].index


# Workers to exclude: anyone whose answers to either set of check columns
# do not match the expected answer string.
to_drop = _workers_failing_check('check-0', '1_1_2_2_2').union(
    _workers_failing_check('check-1', '2_2_2_1_2'))
to_drop

Index(['A1PUHCEBSOWETV', 'A2DNLHS1RSTF5R', 'A2UO3QJZNC2VOE'], dtype='object', name='QIDWorker')

In [460]:
# Question columns: everything from position 19 up to (not including) the
# last column, minus the 'check' columns, in sorted order.
candidate_columns = df.columns[19:-1]
questions = sorted(col for col in candidate_columns if 'check' not in col)

In [461]:
# One label string per (worker, question): column names are cut at the first
# '_' so columns sharing a prefix belong to the same question, and each
# worker's answers for a question are joined with '_'. Workers listed in
# `to_drop` are removed.
answers = (
    df.drop([0,1])
    .set_index('QIDWorker')[questions]
    .rename(columns=lambda x: x.split('_')[0])
    .stack()
    .groupby(['QIDWorker','question'])
    .apply(lambda x: '_'.join(x))
    .drop(to_drop, level='QIDWorker')
)

# Per question: number of labels collected and number of distinct labels.
# The pipeline above is computed once instead of twice, and `axis` is passed
# by keyword: positional `axis` was deprecated in pandas 1.3 and raises a
# TypeError in pandas 2.0+.
answers_by_question = answers.groupby('question')
results = pd.concat(
    [
        answers_by_question.count().rename('n_count'),
        answers_by_question.nunique().rename('n_unique'),
    ],
    axis=1,
)

In [462]:
# Share of all questions that received more than one label and on which
# every label is identical.
unanimous = (results['n_count'] > 1) & (results['n_unique'] == 1)
len(results[unanimous]) / len(results)

0.5348837209302325