## This file analyzes the crowdsourcing data

In [1]:
import itertools
import json
import random
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import os
import math
import toolz
import glob as glob
import copy

In [15]:
## Anonymize an image url: keep only its file name under a placeholder host
def makeUrlAnonymous(url):
    """Return `url` with everything up to its last '/' replaced by a fixed prefix.

    The text after the last '/' is kept unchanged. A string without any '/'
    is kept whole and simply appended to the prefix.
    """
    fileName = url.rsplit('/', 1)[-1]
    return 'https://anonymouslink/folder/' + fileName

## Rewrite every batch-results CSV in place with anonymized image urls.
## Columns are checked explicitly instead of wrapping the work in a bare `except`:
## the old handler also swallowed the AttributeError raised by makeUrlAnonymous on an
## empty (NaN) cell, which silently left that whole column un-anonymized.
for batchResultsFile in glob.glob(os.path.join('MTurkData', '*batch_results.csv')):
    batchResultsFileDf = pd.read_csv(batchResultsFile)

    for idx in range(1, 11):
        for suffix in ('_1', '_2'):
            column = 'Input.img' + str(idx) + suffix
            if column not in batchResultsFileDf.columns:
                ## Presumably some batch files carry fewer image pairs; skip absent columns
                continue
            ## na_action='ignore' leaves missing cells as NaN instead of calling the function on them
            batchResultsFileDf[column] = batchResultsFileDf[column].map(makeUrlAnonymous, na_action='ignore')

    ## Overwrite the source file; index=False avoids adding an extra index column
    batchResultsFileDf.to_csv(batchResultsFile, index=False)

In [21]:
## Load the three batch-result files of each experiment and stack them row-wise.
## The original row labels of each file are kept (no ignore_index), so labels repeat across files.
exp1 = pd.concat([pd.read_csv(path) for path in (
    'MTurkData/exp1_e1_batch_results.csv',
    'MTurkData/exp1_e2_batch_results.csv',
    'MTurkData/exp1_e3_batch_results.csv',
)])
exp2 = pd.concat([pd.read_csv(path) for path in (
    'MTurkData/exp2_e1_batch_results.csv',
    'MTurkData/exp2_e2_batch_results.csv',
    'MTurkData/exp2_e3_batch_results.csv',
)])
## Both experiments together
expAll = pd.concat([exp1, exp2])

In [22]:
## Report the number of distinct WorkerId values per experiment and overall
for label, frame in (('Participants in Exp 1: ', exp1),
                     ('Participants in Exp 2: ', exp2),
                     ('Total: ', expAll)):
    print(label, frame['WorkerId'].nunique())

Participants in Exp 1:  280
Participants in Exp 2:  176
Total:  416


In [23]:
## There is one duplicate question per HIT. The HIT will be rejected if the answers are not consistent
def getQuality(row):
    """Return False when the stored answers for img10_1 and img1_1 are equal, True otherwise.

    `row` must provide 'Answer.taskAnswers': a JSON string holding a list whose
    first element maps 'imgK_1' -> {'imgK_1': answer}.

    NOTE(review): equal answers are treated as the failing case. That matches the
    "reject if inconsistent" rule only if the duplicate question shows the same
    pair in swapped order -- confirm against the HIT design.
    """
    taskAnswers = json.loads(row['Answer.taskAnswers'])[0]
    repeatAnswer = taskAnswers['img10_1']['img10_1']
    firstAnswer = taskAnswers['img1_1']['img1_1']
    return not (repeatAnswer == firstAnswer)

## Flag every HIT with the result of getQuality, then show how many HITs got each flag
expAll['quality'] = expAll.apply(getQuality, axis=1)
expAll['quality'].value_counts()

True     1726
False     752
Name: quality, dtype: int64

In [24]:
## Keep only the HITs whose quality flag is True
passedCheck = expAll['quality'] == True
resultDf = expAll[passedCheck]


(1726, 49)

In [25]:
## Unpack each kept HIT into one record per image pair: [first image, second image, chosen index]
pairs = []
answers = []  ## never filled in this cell
for _, hit in resultDf.iterrows():
    hitAnswers = json.loads(hit['Answer.taskAnswers'])[0]
    ## Only questions 1-9 are unpacked; question 10 is not read here
    for pairNo in range(1, 10):
        firstKey = 'img{}_1'.format(pairNo)
        secondKey = 'img{}_2'.format(pairNo)

        ## 1 when the stored answer for the first image equals True, otherwise 2
        chosen = 1 if hitAnswers[firstKey][firstKey] == True else 2
        pairs.append([hit['Input.' + firstKey], hit['Input.' + secondKey], chosen])

pairDf = pd.DataFrame(pairs, columns=['img1', 'img2', 'wIdx'])

(15534, 3)

In [26]:
## Remove, in place, every pair record that contains a missing value, then show the new shape
pairDf.dropna(axis=0, how='any', inplace=True)
pairDf.shape

(15500, 3)

In [31]:
## Compute the winning counts per pair
def _filePrefix(url):
    """Return the part of the file name (last '/'-segment of `url`) before its first '_'."""
    fileName = url.split('/')[-1]
    return fileName.split('_')[0]

pairDf['cIdx'] = pairDf['img1'].apply(_filePrefix)

## One row per (img1, img2); columns 1 and 2 hold how often each index was chosen.
## A choice that never occurred for a pair is filled with 0.
voteCounts = pairDf.groupby(['img1', 'img2', 'wIdx']).size()
pairCountDf = voteCounts.unstack().fillna(0).reset_index()
pairCountDf['cIdx'] = pairCountDf['img1'].apply(_filePrefix)

In [32]:
## Label the image chosen more often in each pair as 'good' and the other one as 'bad',
## and record the two vote counts. Columns 1 and 2 of pairCountDf hold the number of
## times the first / second image was chosen.
## Vectorized replacement for the row-by-row `.loc` writes: same values, no Python-level
## loop, and the four columns are always created (even when no pair has a majority).
firstWins = pairCountDf[1] > pairCountDf[2]
secondWins = pairCountDf[1] < pairCountDf[2]

## Series.where keeps the value where the mask is True and takes `other` elsewhere.
## The inner .where(secondWins) leaves NaN for tied pairs, so ties stay unlabelled.
pairCountDf['good'] = pairCountDf['img1'].where(firstWins, pairCountDf['img2'].where(secondWins))
pairCountDf['bad'] = pairCountDf['img2'].where(firstWins, pairCountDf['img1'].where(secondWins))
pairCountDf['goodCount'] = pairCountDf[1].where(firstWins, pairCountDf[2].where(secondWins))
pairCountDf['badCount'] = pairCountDf[2].where(firstWins, pairCountDf[1].where(secondWins))

(5343, 9)

In [36]:
## Select the reporting columns with a fresh 0..n-1 index
keptColumns = ['img1', 'img2', 'good', 'bad', 'cIdx', 'goodCount', 'badCount']
finaldf = pairCountDf[keptColumns].reset_index(drop=True)
## Drop rows with any missing value (e.g. tied pairs, which never get good/bad assigned)
finaldf.dropna(inplace=True)
## Pairs where nobody chose the losing image
agreedf = finaldf[finaldf['badCount'] == 0]
print('Total aggrement: ', agreedf.shape[0] / finaldf.shape[0])

Total aggrement:  0.4555182543013009
