In [1]:
from collections import defaultdict
import sys
import os
import pickle
import csv 
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
sys.path.append('../')

from sklearn import metrics

survey_path='/nfs/juhu/data/rakhasan/bystander-detection/pilot3_coco/'
survey_photo_path = survey_path+'/photos/'

In [2]:
mapping = pickle.load(open(survey_path +'mappings_common.pkl','rb'))

In [3]:
agree_disagree = {'Strongly disagree' : -3, 'Disagree': -2, 'Somewhat disagree' : -1, 'Neither agree nor disagree' : 0,
                  'Somewhat agree': 1, 'Agree':2, 'Strongly agree' : 3}

photo_place = {'A public place':-2, 'A semi-public place.':-1, 'A semi-private place':1, 'A private place':2, 'Not sure':0}

awareness= {'Not at all aware':0, 'Slightly aware':1, 'Somewhat aware':2,'Moderately aware':3, 'Extremely aware':4}

willingness = {'Completely unwilling':-2,'Somewhat unwilling':-1, 'Neither unwilling nor willing':0,
               'Somewhat willing':1, 'Completely willing':2}

comfortable = {'Highly uncomfortable':-3, 'Uncomfortable':-2, 'Somewhat uncomfortable':-1, 'Neither uncomfortable nor comfortable':0,
              'Somewhat comfortable':1, 'Comfortable':2, 'Highly comfortable':3}

subject = {'Definitely subject':2, 'Most probably subject':1, 'Not sure':0, 
           'Most probably bystander':-1, 'Definitely bystander':-2}


In [4]:
meta_data = ['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
              'Finished',
             'RecordedDate', 'ResponseId', 'RecipientLastName',
             'RecipientFirstName', 'RecipientEmail', 'ExternalReference', 'LocationLatitude', 'LocationLongitude',
             'DistributionChannel', 'UserLanguage', 
             'count', 
             #'path', 'num_of_check', 'img_w', 'img_h', 'total_count', 'img_w2', 'img_h2', 
             'RandomCode',
             'IsMobile', 'test',
            ]
test_questions = []

In [178]:
def remove_invalid_responses(df,                             
                             tolerance = 0):
    '''Remove unfinished or responses with incorrect attention check > tolerance'''
    rows_to_remove = []
    not_finished = 0
    no_id=0
    duplicate=0
    questions_missed=0
    no_sequence=0
    for index, row in df.iterrows():
        if int(row['duplicate'])==1:
            rows_to_remove.append(index)
            duplicate+=1
            continue
            
        if row['Finished'] =='False':
            rows_to_remove.append(index)
            not_finished +=1
            continue
        workerid = row['workerid']
        if type(workerid)==float or len(workerid.strip())==0:
            rows_to_remove.append(index)
            no_id +=1
            continue
        if row['sequence']=='s':
            no_sequence+=1
            rows_to_remove.append(index)
            continue
        if (isinstance(row['attnwrong'], float) and np.isnan(row['attnwrong'])) or \
            int(row['attnwrong']) > tolerance:
                rows_to_remove.append(index)
                questions_missed+=1
    print('duplicate:{}, unfinished:{}, no workerid: {}, no sequence: {}, questions missed:{}'.format(
        duplicate, not_finished, no_id, no_sequence, questions_missed))    
    #print('rows removed:',rows_to_remove)
    removed_rows =  df.loc[rows_to_remove]
    df= df.drop(rows_to_remove, axis=0, inplace=False)
    return (df, removed_rows)


In [284]:
%%bash
python  /data/rakhasan/download_qualtrics_data.py SV_6Ya2ALuITeOYWvX /data/rakhasan/bystander-detection/pilot3_coco/survey-response.csv
python  /data/rakhasan/download_qualtrics_data.py SV_e5O28IS76M968GV /data/rakhasan/bystander-detection/pilot3_coco/survey-response2.csv 
python  /data/rakhasan/download_qualtrics_data.py SV_3f4VyiHf6GlLJrf /data/rakhasan/bystander-detection/pilot3_coco/fourth-response.csv 
python  /data/rakhasan/download_qualtrics_data.py SV_8qRBx0kNIaSmc3b /data/rakhasan/bystander-detection/pilot3_coco/fourth-response2.csv 

{"result":{"progressId":"ES_cvX6DEnkVLZjDvL","percentComplete":0.0,"status":"inProgress"},"meta":{"requestId":"4b8cff65-41b6-4242-b57b-6b9557d2156d","httpStatus":"200 - OK"}}
progressStatus= inProgress
Download is 0.0 complete
progressStatus= inProgress
Download is 0.0 complete
progressStatus= inProgress
Download is 0.0 complete
progressStatus= inProgress
Download is 0.0 complete
progressStatus= inProgress
Download is 100.0 complete
Complete
{"result":{"progressId":"ES_bllADrFbR0KbJkh","percentComplete":0.0,"status":"inProgress"},"meta":{"requestId":"c6e82575-3f94-4aa0-8384-d2d512f27d64","httpStatus":"200 - OK"}}
progressStatus= inProgress
Download is 0.0 complete
progressStatus= inProgress
Download is 0.0 complete
progressStatus= inProgress
Download is 100.0 complete
Complete
{"result":{"progressId":"ES_aXWVUqW6c1yiYiF","percentComplete":0.0,"status":"inProgress"},"meta":{"requestId":"7f572c6b-9eea-462c-b25f-92a3d3eba09b","httpStatus":"200 - OK"}}
progressStatus= inProgress
Download i

In [285]:
data_file = os.path.join(survey_path,'survey-response.csv')
data_file2 = os.path.join(survey_path,'survey-response2.csv')

data1=pd.read_csv(data_file).reset_index(inplace=False)
data1.drop([0,1], inplace=True, axis=0) #drop first row

data2=pd.read_csv(data_file2).reset_index(inplace=False)
data2.drop([0,1], inplace=True, axis=0) #drop first row

## A hack to keep one data point
i=data2[data2.photo_set=='14'].workerid.index
data2.loc[i,'workerid']='temp'

data = pd.concat([data1,data2])
data.index = range(len(data))

print('Total number of responses: ',len(data))
for t in range(3):
    df, removed = remove_invalid_responses(data, tolerance=t) 
    print(len(df),len(removed))
#     print('Tolerance: {}, valid response: {}({:.2f}%) (not finished: {})'.format(
#         t, len(validated_data),len(validated_data)*100/(len(data)-not_finished), not_finished))

Total number of responses:  104
duplicate:27, unfinished:0, no workerid: 2, no sequence: 7, questions missed:11
57 47
duplicate:27, unfinished:0, no workerid: 2, no sequence: 7, questions missed:1
67 37
duplicate:27, unfinished:0, no workerid: 2, no sequence: 7, questions missed:0
68 36


In [286]:
#fourth response
data_file = os.path.join(survey_path,'fourth-response.csv')
data_file2 = os.path.join(survey_path,'fourth-response2.csv')

data1=pd.read_csv(data_file).reset_index(inplace=False)
data1.drop([0,1], inplace=True, axis=0) #drop first row

data2=pd.read_csv(data_file2).reset_index(inplace=False)
data2.drop([0,1], inplace=True, axis=0) #drop first row

fourth = pd.concat([data1,data2])
fourth.index = range(len(fourth))

print('Total number of responses: ',len(fourth))
fourth, removed = remove_invalid_responses(fourth, tolerance=0) 
fourth.shape

Total number of responses:  4
duplicate:0, unfinished:0, no workerid: 0, no sequence: 0, questions missed:0


(4, 1832)

In [287]:
t=0
validated_data, removed = remove_invalid_responses(data, tolerance=t) 
validated_data.index.rename('pid', inplace = True)
#validated_data.photo_set_offset.fillna(0, inplace=True)
#validated_data['photo_set']=validated_data.photo_set.astype(int)#.apply(lambda row: int(row.photo_set), axis=1)
print('Final valid response with {} tolerance: {}'.format(t, len(validated_data)))
print('Number of photo sets: ', len(validated_data.photo_set.unique()))
photo_set_sizes = validated_data.groupby('photo_set').size()

print('\nRemoved mturk ids:')
print(len(removed[(removed.duplicate=='0')][['workerid','attnwrong']]))
removed[(removed.duplicate=='0')][['workerid','attnwrong','photo_set','duplicate']]

duplicate:27, unfinished:0, no workerid: 2, no sequence: 7, questions missed:11
Final valid response with 0 tolerance: 57
Number of photo sets:  18

Removed mturk ids:
20


Unnamed: 0,workerid,attnwrong,photo_set,duplicate
18,A2615YW1YERQBO,1,8,0
23,A2MCNGY62MPRI5,1,11,0
45,A19PN52BDA462L,1,2,0
46,A2U0JT7TSIIXPS,1,9,0
48,AG9LWKO86TNHG,1,0,0
49,A2VKACLXTMOQWO,1,9,0
50,A2K4OJDQPXIU5T,0,4,0
51,A28GFEMPMLUU14,0,9,0
54,A2O2Y99RA9GFUJ,1,0,0
57,A2RY24DUOWXUWU,0,2,0


In [288]:
print('Total:',len(set(validated_data['photo_set'])))
validated_data.groupby('photo_set').size()

Total: 18


photo_set
0     3
1     3
10    3
11    3
12    4
13    3
14    3
15    3
16    3
17    3
2     3
3     3
4     3
5     3
6     3
7     5
8     3
9     3
dtype: int64

In [289]:
validated_data['duration'] = validated_data['Duration (in seconds)'].astype(float)//60
validated_data.duration.describe()

count     57.000000
mean     105.947368
std       93.793546
min        3.000000
25%       56.000000
50%       76.000000
75%      127.000000
max      548.000000
Name: duration, dtype: float64

In [290]:
photo_ques_headers = ['contains_human', 'photo_place', 'was_aware', 'posing','comfort', 
                      'will', 'photographer_intention', 'replacable', 'subject_bystander', 'why_subject', 
                      'why_subject_text','why_bystander', 'why_bystander_text', 'why_neither', 'why_neither_text']

high_level_concepts = ['was_aware', 'posing','comfort', 'will', 'photographer_intention', 'replacable', 'photo_place']
high_level_concepts_num = [c+'_num' for c in high_level_concepts]

questions = list(validated_data.columns)
photo_questions = questions[questions.index('1_Q6.1'):questions.index('Q7.1')]
photos_question_start = questions.index('1_Q6.1')
questions_per_photo = 34 #including text responeses for last 3 questions

def create_photo_df(df):
    
    dicts = []
    for pid, row in df.iterrows():
        per_participant_img_num =int( row['per_participant_img_num'])
#         photo_set = int(row['photo_set'])    
        sequence = list(filter(None, row['sequence'].split(',')))
        if len(sequence)!=52:
            print('pid: {}, len(seq): {}'.format(pid, len(sequence)))
            continue
        for p in range(len(sequence)): #for each photo
            if sequence[p] == 'test1' or sequence[p] == 'test2':
                continue
            d = dict()
            d['photo_no'] = int(sequence[p])
            d['pid'] = row.name#['pid']
            for q in range(10):               
                d[photo_ques_headers[q]] = row[photo_questions[p * questions_per_photo + q]]    
            dicts.append(d)
    return pd.DataFrame.from_dict(dicts)
                
        
def text_to_numeric_series(df, col_name, conversion_dict):
    return df.apply(lambda row: conversion_dict[row[col_name]] if isinstance(row[col_name], str) else row[col_name], axis=1)

In [291]:
photo_df = create_photo_df(validated_data)
photo_df.set_index('photo_no', inplace=True, drop=False)
print('Total unique photos: {}'.format(len(set(photo_df.index.values))))

Total unique photos: 900


In [292]:
'''Fourth response'''
# fourth = create_photo_df(fourth)
# fourth.set_index('photo_no', inplace=True, drop=False)
# print('Total unique photos: {}'.format(len(set(fourth.index.values))))

'Fourth response'

In [293]:
def remove_responses_non_person(df, tolerance= 0):
    rows_to_remove = defaultdict(int)
    for photo_no, row in df.iterrows():
        if row['contains_human'].strip() =='I don\'t see any box.'\
        or row['contains_human'].strip() =='There is just a depiction/representation of a person but not a real person (e.g. a poster/photo/sculpture of a person).'\
        or row['contains_human'].strip()=='I don\'t see any box.'\
        or row['contains_human'].strip()=='There is something else inside the box.':
            rows_to_remove[photo_no] += 1
    to_remove = [k for k in rows_to_remove.keys() if rows_to_remove[k]>tolerance]
    return to_remove, df.drop(to_remove, axis=0, inplace=False)

for t in range(3):
    removed_photos, valid_photo_df = remove_responses_non_person(photo_df, tolerance=t)
    print('{} photos removed when {} participants found no person in the photo ({:.1f}%)'.format(
        len(removed_photos), t+1, len(removed_photos)*100/len(set(photo_df.index.values))))

removed_photos, valid_photo_df = remove_responses_non_person(photo_df, tolerance=1)
print('Total unique photos containing people: {}, rows: {}, removed:{}'.format(
    len(set(valid_photo_df.index.values)), len(valid_photo_df),len(removed_photos)))

290 photos removed when 1 participants found no person in the photo (32.2%)
116 photos removed when 2 participants found no person in the photo (12.9%)
54 photos removed when 3 participants found no person in the photo (6.0%)
Total unique photos containing people: 784, rows: 2480, removed:116


In [294]:
# '''Copy removed images '''
# import shutil
# removed_photos, _ = remove_responses_non_person(photo_df, tolerance=0)
# for r in removed_photos:
#     shutil.copyfile(os.path.join(survey_path,'photos/',r+'.jpg'),os.path.join(survey_path,'removed-photos',r+'.jpg'))
    

In [295]:
# '''Fourth'''
# removed_photos, fourth = remove_responses_non_person(fourth, tolerance=0)
# print('Total unique photos containing people: {}, rows: {}, removed:{}'.format(
#     len(set(fourth.index.values)), len(fourth),len(removed_photos)))
# fourth['subject_bystander_num'] = text_to_numeric_series(fourth, 'subject_bystander', subject)

In [296]:
# def remove_responses_non_person(df, tolerance= 0):
#     rows_to_remove = defaultdict(int)
#     for photo_no, row in df.iterrows():
#         if row['contains_human'].strip()!='There is a person with some of the major body parts visible (such as face, head, torso).':
#             rows_to_remove[photo_no] += 1
#     to_remove = [k for k in rows_to_remove.keys() if rows_to_remove[k]>tolerance]
#     return to_remove, df.drop(to_remove, axis=0, inplace=False)

# for t in range(3):
#     removed_photos, valid_photo_df = remove_responses_non_person(photo_df, tolerance=t)
#     print('{} photos removed when {} participants found no person in the photo ({:.1f}%)'.format(
#         len(removed_photos), t+1, len(removed_photos)*100/len(set(photo_df.index.values))))

# removed_photos, valid_photo_df = remove_responses_non_person(photo_df, tolerance=0)
# print('Total unique photos containing people: {}, removed:{}'.format(
#     len(set(valid_photo_df.index.values)),len(removed_photos)))

In [297]:
'''Convert text responses to numbers for each photo'''

valid_photo_df['subject_bystander_num'] = text_to_numeric_series(valid_photo_df, 'subject_bystander', subject)
valid_photo_df['posing_num'] = text_to_numeric_series(valid_photo_df, 'posing', agree_disagree)
valid_photo_df['photographer_intention_num'] = text_to_numeric_series(valid_photo_df, 'photographer_intention', agree_disagree)
valid_photo_df['photo_place_num'] = text_to_numeric_series(valid_photo_df, 'photo_place', photo_place)
valid_photo_df['will_num'] = text_to_numeric_series(valid_photo_df, 'will', willingness)
valid_photo_df['comfort_num'] = text_to_numeric_series(valid_photo_df, 'comfort', comfortable)
valid_photo_df['replacable_num'] = text_to_numeric_series(valid_photo_df, 'replacable', agree_disagree)
valid_photo_df['was_aware_num'] = text_to_numeric_series(valid_photo_df, 'was_aware', agree_disagree)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-

In [298]:
'''Remove responses for photos where one or more people indicated that it does not contain any person'''
print(len(valid_photo_df), len(valid_photo_df[~np.isnan(valid_photo_df.subject_bystander_num)]))
valid_photo_df = valid_photo_df[~np.isnan(valid_photo_df.subject_bystander_num)]
print('Total unique photos containing people: {}, rows:{}'.format(
    len(set(valid_photo_df.index.values)), len(valid_photo_df)))

2480 2306
Total unique photos containing people: 784, rows:2306


In [299]:
valid_photo_df.subject_bystander_num.isnull().any()

False

In [304]:
'''check if any photo has less than two responses'''
count=valid_photo_df.groupby('photo_no').subject_bystander_num.size()
valid_photo_df = valid_photo_df.drop(count[count<3].index, inplace=False)
valid_photo_df.to_pickle(os.path.join(survey_path,'3-response-df.pkl'))           
valid_photo_df.subject_bystander_num.isnull().any(), len(valid_photo_df)


(False, 1974)

In [301]:
#print(valid_photo_df.shape, fourth.shape)

validated_data.to_pickle(os.path.join(survey_path, 'validated_df.pkl'))
valid_photo_df.to_pickle(os.path.join(survey_path, 'photo_df.pkl'))
#fourth.to_pickle(os.path.join(survey_path, 'fourth_df.pkl'))

In [22]:
# '''Locations of the removed photos in the survey sequences'''
# positions = [0]*52
# position_dict = defaultdict(list)
# for r in removed_photos:
#     for s in list(validated_data.sequence):
#         seq = s.split(',')
#         if r in seq:
#             position_dict[r].append(seq.index(r))
#             positions[seq.index(r)]+=1

# import seaborn as sns
# plt.figure(figsize=(12,4))
# plt.bar(np.arange(len(positions)), positions)
# plt.xticks(range(52),range(52), rotation=50)
# plt.yticks(range(max(positions)+1))
# plt.show()

In [23]:
# '''A test to see if index in dataframe is same as the image ids'''
# helper.draw_photos_from_path([survey_photo_path+'100.jpg', 
#                               helper.find_img_path(openImg_path, mapping[100][0]+'.jpg')], col_size=1)

## Demographics

In [24]:
validated_data.groupby(['Q8.1']).ResponseId.count(),(221*100)/(221+164)

(Q8.1
 Female    17
 Male      26
 Name: ResponseId, dtype: int64, 57.4025974025974)

In [25]:
validated_data.groupby(['Q8.2']).ResponseId.count(),191/len(validated_data), 71/len(validated_data)

(Q8.2
 4th-8th grade                   1
 Associate’s degree              9
 Bachelor's degree              18
 High school graduate or GED     7
 Master's Degree                 1
 Some college, no degree         7
 Name: ResponseId, dtype: int64, 4.0638297872340425, 1.5106382978723405)

In [27]:
#validated_data.groupby(['Q8.3']).ResponseId.count(),242/len(validated_data)

In [None]:
validated_data.groupby(['Q4.1']).ResponseId.count(),(154*100)/len(validated_data)

In [None]:
validated_data.groupby(['Q7.1']).ResponseId.count(), 345/len(validated_data) #OSN account holder

In [None]:
validated_data.groupby(['Q7.2']).ResponseId.count(), 30/len(validated_data) #OSN account holder