In [10]:
from collections import defaultdict
import sys
import os
import pickle
import csv 
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
sys.path.append('../')

from sklearn import metrics

survey_path='/nfs/juhu/data/rakhasan/bystander-detection/pilot-study2/'
survey_photo_path = survey_path+'/photos/'

In [15]:
mapping = pickle.load(open(survey_path +'mappings_common.pkl','rb'))

In [41]:
# pd.DataFrame(mapping).rename(
#     columns={'imageId':'google_id', 'index':'survey_photo_no'})[['google_id', 'survey_photo_no','bbox']].to_csv(survey_path+'/mappings_common.csv')

In [16]:
agree_disagree = {'Strongly disagree' : -3, 'Disagree': -2, 'Somewhat disagree' : -1, 'Neither agree nor disagree' : 0,
                  'Somewhat agree': 1, 'Agree':2, 'Strongly agree' : 3}

photo_place = {'A public place':-2, 'A semi-public place.':-1, 'A semi-private place':1, 'A private place':2, 'Not sure':0}

awareness= {'Not at all aware':0, 'Slightly aware':1, 'Somewhat aware':2,'Moderately aware':3, 'Extremely aware':4}

willingness = {'Completely unwilling':-2,'Somewhat unwilling':-1, 'Neither unwilling nor willing':0,
               'Somewhat willing':1, 'Completely willing':2}

comfortable = {'Highly uncomfortable':-3, 'Uncomfortable':-2, 'Somewhat uncomfortable':-1, 'Neither uncomfortable nor comfortable':0,
              'Somewhat comfortable':1, 'Comfortable':2, 'Highly comfortable':3}

subject = {'Definitely subject':2, 'Most probably subject':1, 'Not sure':0, 
           'Most probably bystander':-1, 'Definitely bystander':-2}


In [17]:
meta_data = ['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
              'Finished',
             'RecordedDate', 'ResponseId', 'RecipientLastName',
             'RecipientFirstName', 'RecipientEmail', 'ExternalReference', 'LocationLatitude', 'LocationLongitude',
             'DistributionChannel', 'UserLanguage', 
             'count', 
             #'path', 'num_of_check', 'img_w', 'img_h', 'total_count', 'img_w2', 'img_h2', 
             'RandomCode',
             'IsMobile', 'test',
            ]
test_questions = []

In [18]:
def remove_invalid_responses(df, tolerance=0):
    wrong_response = []
    not_finished = []
    for index, row in df.iterrows():
        try:
            if row['Finished'] =='False' or row['Q3.1']!='Yes; I will provide my best answers.' or \
            row['Q4.1'] == 'Under 18 years' or row['Q4.2']!='5 years or more' \
            or int(row['IsMobile'])==1:
                not_finished.append(index)
            elif (isinstance(row['attnwrong'], float) and np.isnan(row['attnwrong'])) or int(row['attnwrong']) > tolerance:
                wrong_response.append(index)
        except Exception as e:
            pass

    #print('Not finished:',len(not_finished), ' wrong response:',len(wrong_response))
    
    return len(not_finished), df.drop(wrong_response+not_finished, axis=0, inplace=False)

In [19]:
data_file = os.path.join(survey_path,'bystander-pilot2-data.csv')
data_file2 = os.path.join(survey_path,'bystander-pilot2-copy-data.csv')
data = pd.concat([pd.read_csv(data_file),pd.read_csv(data_file2)]).reset_index(inplace=False)
data.index = range(len(data))
data.drop([0,1], inplace=True, axis=0) #drop first row

print('Total number of responses: ',len(data))
for t in range(3):
    not_finished, validated_data = remove_invalid_responses(data, tolerance=t) 
    print('Tolerance: {}, valid response: {}({:.2f}%) (not finished: {})'.format(
        t, len(validated_data),len(validated_data)*100/(len(data)-not_finished), not_finished))

validated_data.drop(set(meta_data+test_questions).intersection(set(validated_data.columns)), axis=1, inplace=True)
validated_data.shape

Total number of responses:  716
Tolerance: 0, valid response: 387(89.58%) (not finished: 284)
Tolerance: 1, valid response: 422(97.69%) (not finished: 284)
Tolerance: 2, valid response: 432(100.00%) (not finished: 284)


(432, 812)

In [20]:
t=0
_,validated_data = remove_invalid_responses(data, tolerance=t) 
validated_data.index.rename('pid', inplace = True)
validated_data.photo_set_offset.fillna(0, inplace=True)
validated_data['photo_set']=validated_data.apply(lambda row: int(row.photo_set_offset)+int(row.photo_set), axis=1)
print('Final valid response with {} tolerance: {}'.format(t, len(validated_data)))
print('Number of photo sets: ', len(validated_data.photo_set.unique()))
photo_set_sizes = validated_data.groupby('photo_set').size()

Final valid response with 0 tolerance: 387
Number of photo sets:  100


In [21]:
validated_data.groupby('Q9.2').size()

Q9.2
Maybe    119
No        42
Yes      224
dtype: int64

In [51]:
validated_data['duration'] = validated_data['Duration (in seconds)'].astype(float)//60
validated_data.duration.describe()

count     387.000000
mean      223.351421
std       440.118202
min         9.000000
25%        55.000000
50%        86.000000
75%       182.000000
max      5335.000000
Name: duration, dtype: float64

In [22]:
photo_ques_headers = ['contains_human', 'photo_place', 'was_aware', 'posing','comfort', 
                      'will', 'photographer_intention', 'replacable', 'subject_bystander', 'why_subject', 
                      'why_subject_text','why_bystander', 'why_bystander_text', 'why_neither', 'why_neither_text']

high_level_concepts = ['was_aware', 'posing','comfort', 'will', 'photographer_intention', 'replacable', 'photo_place']
high_level_concepts_num = [c+'_num' for c in high_level_concepts]

questions = list(validated_data.columns)
photo_questions = questions[questions.index('1_Q6.1'):questions.index('Q7.1')]
photos_question_start = questions.index('1_Q6.1')
questions_per_photo = 15 #including text responeses for last 3 questions

def create_photo_df(df):
    
    dicts = []
    for pid, row in df.iterrows():
        per_participant_img_num =int( row['per_participant_img_num'])
        photo_set = int(row['photo_set'])    
        sequence = list(filter(None, row['sequence'].split(',')))
        for p in range(len(sequence)): #for each photo
            if sequence[p] == 'test1' or sequence[p] == 'test2':
                continue
            d = dict()
            d['photo_no'] = sequence[p]
            d['pid'] = row.name#['pid']
            for q in range(questions_per_photo):               
                d[photo_ques_headers[q]] = row[photo_questions[p * questions_per_photo + q]]    
            dicts.append(d)
    return pd.DataFrame.from_dict(dicts)
                
        
def text_to_numeric_series(df, col_name, conversion_dict):
    return df.apply(lambda row: conversion_dict[row[col_name]] if isinstance(row[col_name], str) else row[col_name], axis=1)

In [23]:
photo_df = create_photo_df(validated_data)
photo_df.set_index('photo_no', inplace=True, drop=False)
print('Total unique photos: {}'.format(len(set(photo_df.index.values))))

Total unique photos: 5000


In [30]:
photo_df.to_csv(os.path.join(survey_path, 'photo_df.csv'))
photo_df.head(2)

Unnamed: 0_level_0,photo_no,pid,contains_human,photo_place,was_aware,posing,comfort,will,photographer_intention,replacable,subject_bystander,why_subject,why_subject_text,why_bystander,why_bystander_text,why_neither,why_neither_text
photo_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
165,165,2,There is a person with some of the major body ...,A public place,Agree,Somewhat agree,Somewhat comfortable,Somewhat willing,Somewhat agree,Agree,Not sure,,,,,It isn't clear which person or persons are the...,
180,180,2,There is a person but with no major body part ...,A public place,Disagree,Neither agree nor disagree,Uncomfortable,Somewhat unwilling,Somewhat disagree,Disagree,Most probably subject,This photo is focused on this person.,,,,,


In [29]:
def remove_responses_non_person(df, tolerance= 0):
    rows_to_remove = defaultdict(int)
    for photo_no, row in df.iterrows():
        if row['contains_human'].strip() =='I don\'t see any box.'\
        or row['contains_human'].strip() =='There is just a depiction/representation of a person but not a real person (e.g. a poster/photo/sculpture of a person).'\
        or row['contains_human'].strip()=='I don\'t see any box.'\
        or row['contains_human'].strip()=='There is something else inside the box.':
            rows_to_remove[photo_no] += 1
            
            
    to_remove = [k for k in rows_to_remove.keys() if rows_to_remove[k]>tolerance]
    return to_remove, df.drop(to_remove, axis=0, inplace=False)

for t in range(3):
    removed_photos, valid_photo_df = remove_responses_non_person(photo_df, tolerance=t)
    print('{} photos removed when {} participants found no person in the photo ({:.1f}%)'.format(
        len(removed_photos), t+1, len(removed_photos)*100/len(set(photo_df.index.values))))

removed_photos, valid_photo_df = remove_responses_non_person(photo_df, tolerance=1)
print('Total unique photos containing people: {}, removed:{}'.format(
    len(set(valid_photo_df.index.values)),len(removed_photos)))

1604 photos removed when 1 participants found no person in the photo (32.1%)
920 photos removed when 2 participants found no person in the photo (18.4%)
640 photos removed when 3 participants found no person in the photo (12.8%)
Total unique photos containing people: 4080, removed:920


In [43]:
'''Convert text responses to numbers for each photo'''

valid_photo_df['contains_full_body'] = valid_photo_df.apply(lambda row: 
            1 if row['contains_human'].strip()=='There is a person with some of the major body parts visible (such as face, head, torso).'
                                                           else 0, axis=1)
valid_photo_df['subject_bystander_num'] = text_to_numeric_series(valid_photo_df, 'subject_bystander', subject)
valid_photo_df['posing_num'] = text_to_numeric_series(valid_photo_df, 'posing', agree_disagree)
valid_photo_df['photographer_intention_num'] = text_to_numeric_series(valid_photo_df, 'photographer_intention', agree_disagree)
valid_photo_df['photo_place_num'] = text_to_numeric_series(valid_photo_df, 'photo_place', photo_place)
valid_photo_df['will_num'] = text_to_numeric_series(valid_photo_df, 'will', willingness)
valid_photo_df['comfort_num'] = text_to_numeric_series(valid_photo_df, 'comfort', comfortable)
valid_photo_df['replacable_num'] = text_to_numeric_series(valid_photo_df, 'replacable', agree_disagree)
valid_photo_df['was_aware_num'] = text_to_numeric_series(valid_photo_df, 'was_aware', agree_disagree)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [47]:
count=valid_photo_df.groupby('photo_no').subject_bystander.count()
len(count[count==3]),len(count[count==4]),len(count[count==5]),len(count[count==6]),len(count[count==7]),len(count[count==8])

(2337, 656, 240, 210, 59, 120)

In [50]:
'''Copy removed images '''
import shutil
removed_photos, _ = remove_responses_non_person(photo_df, tolerance=1)
print(len(removed_photos))
for r in removed_photos:
    shutil.copyfile(os.path.join(survey_path,'photos/',r+'.jpg'),os.path.join(survey_path,'removed-photos',r+'.jpg'))


920


In [30]:
pickle.dump(validated_data, open(os.path.join(survey_path, 'validated_df.pkl'), 'wb'))
pickle.dump(valid_photo_df, open(os.path.join(survey_path, 'photo_df.pkl'), 'wb'))

In [31]:
# valid_photo_df = pickle.load(open(os.path.join(survey_path, 'photo_df.pkl'), 'rb'))
survey_photo_path

'/nfs/juhu/data/rakhasan/bystander-detection/pilot-study2//photos/'

In [None]:
'''Locations of the removed photos in the survey sequences'''
positions = [0]*52
position_dict = defaultdict(list)
for r in removed_photos:
    for s in list(validated_data.sequence):
        seq = s.split(',')
        if r in seq:
            position_dict[r].append(seq.index(r))
            positions[seq.index(r)]+=1

import seaborn as sns
plt.figure(figsize=(12,4))
plt.bar(np.arange(len(positions)), positions)
plt.xticks(range(52),range(52), rotation=50)
plt.yticks(range(max(positions)+1))
plt.show()

In [None]:
'''A test to see if index in dataframe is same as the image ids'''
helper.draw_photos_from_path([survey_photo_path+'100.jpg', 
                              helper.find_img_path(openImg_path, mapping[100][0]+'.jpg')], col_size=1)

### Annotations

In [None]:
'''Extract annotations of all objects for the photos used in the survey'''

anno_file = openImg_path+'train-annotations-bbox.csv'
human_part_file = openImg_path+'class-ids-human-body-parts-and-mammal.txt'
object_classes_file = openImg_path + 'class-descriptions-boxable.csv'

person_class = '/m/01g317'

In [None]:
'''Create dictionaries for object classes'''

csvreader = csv.reader(open(object_classes_file,'r'))
human_parts = dict() #dictionary to contain human_body_part_id: human_body_part_name (n=12)
for h in open(human_part_file,'r'):
    human_parts[h.strip()]=''

object_classes=dict() #object_class_id:object_class_name (n=588), except human parts
for c in csvreader:
    cid = c[0]
    name = c[1]
    
    if cid in human_parts:
        human_parts[cid]=name
    else:
        object_classes[cid]=name


human_parts.pop('/m/04rky',0) #remove 'mammal'

pickle.dump((human_parts,object_classes), open(survey_path+'human-object-classes.pkl','wb'))

In [None]:
human_parts, object_classes

In [None]:
'''Save the annotations of all objects for all photos used in the survey'''

survey_photo_ids = set([mapping[int(i)][0] for i in valid_photo_df.index.values])

'''Load all annotations'''
csvreader = csv.reader(open(anno_file,'r'))
all_annotations = []
for anno in csvreader:
    all_annotations.append(anno)
del all_annotations[0] # delete header

In [None]:
'''Group all annotations for a specific photo'''
survey_photo_annotations = defaultdict(list)
for i in tqdm(range(len(all_annotations))):
    anno = all_annotations[i]
    if anno[0] in survey_photo_ids:
        survey_photo_annotations[anno[0]].append(anno)

In [None]:
pickle.dump(survey_photo_annotations, open(survey_path+'survey_photo_annotations.pkl','wb'))

In [None]:
1515+2287

In [None]:
helper.draw_photos_from_path([survey_photo_path+i+'.jpg' for i in removed_photos[10:19]], col_size=3)

## Demographics

In [None]:
validated_data.groupby(['Q8.1']).ResponseId.count(),(221*100)/(221+164)

In [None]:
validated_data.groupby(['Q8.2']).ResponseId.count(),191/len(validated_data), 71/len(validated_data)

In [None]:
validated_data.groupby(['Q8.3']).ResponseId.count(),242/len(validated_data)

In [None]:
validated_data.groupby(['Q4.1']).ResponseId.count(),(154*100)/len(validated_data)

In [None]:
validated_data.groupby(['Q7.1']).ResponseId.count(), 345/len(validated_data) #OSN account holder

In [None]:
validated_data.groupby(['Q7.2']).ResponseId.count(), 30/len(validated_data) #OSN account holder

### Agreement

In [26]:
photo_df.loc['160']

Unnamed: 0_level_0,comfort,contains_human,photo_no,photo_place,photographer_intention,pid,posing,replacable,subject_bystander,was_aware,why_bystander,why_bystander_text,why_neither,why_neither_text,why_subject,why_subject_text,will
photo_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
160,Highly uncomfortable,There is a person with some of the major body ...,160,A private place,Somewhat disagree,2,Disagree,Disagree,Definitely subject,Disagree,,,,,This photo is about what this person was doing.,,Somewhat unwilling
160,Neither uncomfortable nor comfortable,There is a person with some of the major body ...,160,A public place,Strongly disagree,8,Disagree,Disagree,Definitely subject,Neither agree nor disagree,,,,,"This photo is focused on this person.,This pho...",,Somewhat willing
160,Somewhat comfortable,There is a person with some of the major body ...,160,A private place,Agree,611,Agree,Strongly agree,Most probably bystander,Agree,Object(s) other than people are the subject(s)...,,,,,,Completely willing


In [31]:
len(list(set(photo_df.index.astype(int))))

5000

In [1]:
import pickle

In [4]:
anno = pickle.load(open(survey_path+'survey_photo_annotations.pkl','rb'))

In [8]:
anno['000967eb6b45a22b']

[['000967eb6b45a22b',
  'xclick',
  '/m/01g317',
  '1',
  '0.020000',
  '0.053750',
  '0.484991',
  '0.663227',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['000967eb6b45a22b',
  'xclick',
  '/m/01g317',
  '1',
  '0.206250',
  '0.230000',
  '0.488743',
  '0.611632',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['000967eb6b45a22b',
  'xclick',
  '/m/01g317',
  '1',
  '0.431250',
  '0.446875',
  '0.496248',
  '0.593809',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['000967eb6b45a22b',
  'xclick',
  '/m/01g317',
  '1',
  '0.688750',
  '0.730000',
  '0.513133',
  '0.617261',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['000967eb6b45a22b',
  'xclick',
  '/m/01g317',
  '1',
  '0.850625',
  '0.882500',
  '0.495310',
  '0.619137',
  '0',
  '0',
  '0',
  '0',
  '0']]

In [7]:
anno.keys()

dict_keys(['000967eb6b45a22b', '000999c5837b0342', '000be572efea5716', '000bf6ec68aedaed', '000c034c59effe6d', '000c1395e58070e5', '000c2995db80cbe4', '000c9121b17ee0ff', '000d1eae99343db8', '000d2108aa6cf36b', '000d28725962d4a9', '000d2e3d33b18463', '000d3c1733354f93', '000d9fb563fa7df2', '000da253e3b41fa8', '000db814e722951d', '000dc470e568ef7c', '000e08d38bf9334a', '000e156e0a22fafb', '000e884273a66cfb', '000eadf236641863', '000eb4abd9cee093', '000ed3c4d2e19808', '000efb7df8aa950c', '000efee7b4ef4273', '000f220c9a9533cc', '000f31e71b56641e', '000f7d292c338123', '000f939674892476', '000fa2817277baf5', '000fa30892ace474', '000fcf404455b8dc', '000fd38f35d0ad0c', '000fe35c654124c5', '001006b6c4f8aa9d', '00100fe58a324932', '001015dc32bc9c0b', '00102eb3057668c2', '0010383f11a3d639', '001065dbc46c29c8', '0010729972da6b48', '00107c9045200c13', '0010f4c10f7ab07e', '0011205b216aea05', '00112c426578e854', '00112e1d6c62af6b', '0011383248470b57', '001139535799c155', '001147fdb389b8ed', '00117ed3