In [None]:
import json, datetime, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
from collections import Counter

# Load utility files
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
# `with` closes each file handle as soon as its contents have been read.
with open('attr_names.pkl', 'rb') as f:
    attr_names = pickle.load(f)
with open('name_to_attr.pkl', 'rb') as f:
    name2attr = pickle.load(f)

# header=1: the second line of the CSV holds the column names.
scene_hierarchy = pd.read_csv('places_scene_hierarchy.csv', header=1)

# Scene names are the raw entries of the 'category' column.
# NOTE(review): this cell used to first build shortened names (quotes removed,
# last '/'-separated component kept) for 365 rows and then immediately
# overwrite them with the raw column, so the raw column is the only value that
# was ever visible to later cells. That final value is kept here; restore the
# shortened form if it was the intended one.
scene_names = list(scene_hierarchy['category'])

# Maps the recorded 'tradeoff' answer (1-5) to a number of concepts (4-64).
tradeoff_convert = {1: 4, 2: 8, 3: 16, 4: 32, 5: 64}

In [None]:
# Load explanations
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('commbuildings_towns_explanations_final.pkl', 'rb') as f:
    sgroup1_expl = pickle.load(f)
with open('home_hotel_explanations_final.pkl', 'rb') as f:
    sgroup2_expl = pickle.load(f)

# Adjust the bias term in the explanations: shift every classifier's
# intercepts so that the smallest one becomes 0.
K_lst = [4, 8, 16, 32, 64]
# The loop variable is deliberately not called `K`: `K` is the notebook-wide
# study parameter, and re-running this cell must not overwrite it.
for n_concepts in K_lst:
    for sgroup in [sgroup1_expl, sgroup2_expl]:
        clf = sgroup[n_concepts]['clf']
        clf.intercept_ = clf.intercept_ - np.min(clf.intercept_)

In [None]:
# Load results and inputs files
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
# `with` closes the file handle as soon as the contents have been read.
with open('all_results_inputs.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
# Choose which study to analyze.
# Keep exactly one (study, K) pair active; the alternatives stay commented out.
study, K = 'example', 0
# study, K = '8concept', 8
# study, K = '16concept', 16
# study, K = '32concept', 32

In [None]:
# Define based on the chosen study
chosen_study = data[study]
results = chosen_study['results']
inputs = chosen_study['inputs']

# Union of the keys found in the first output entry of every result
study_keys = {key for r in results for key in r['output'][0].keys()}

In [None]:
# Get the list of concepts used in the explanations
# (concept names are stored with their spaces removed)
if study != 'example':
    sgroup1_attr, sgroup2_attr = (
        [name.replace(' ', '') for name in expl[K]['imp_attr']]
        for expl in (sgroup1_expl, sgroup2_expl)
    )

In [None]:
# Create a dictionary of explanation coefficients
# coef['sgroup1' | 'sgroup2'][concept] is a list with one coefficient per class;
# a concept that a class's explanation does not use keeps the value 0.
if study != 'example':
    n_classes = 4
    n_attrs = 112  # number of coefficient entries scanned per class

    coef = {
        'sgroup1': {attr: [0] * n_classes for attr in sgroup1_attr},
        'sgroup2': {attr: [0] * n_classes for attr in sgroup2_attr},
    }

    # Set to True to print every explanation while the dictionary is built.
    # Deliberately not named `display`: assigning to that name would shadow
    # IPython's built-in display() function for every later cell.
    verbose = False

    # Loop over scene groups
    for g, sgroup_expl in ((1, sgroup1_expl), (2, sgroup2_expl)):
        clf = sgroup_expl[K]['clf']
        group_coef = coef['sgroup{}'.format(g)]

        # Loop over 4 classes
        for optionid in range(n_classes):
            cls = clf.classes_[optionid]
            if verbose: print('\nExplanation for scene class: {} ({})'.format(scene_names[cls], cls))

            # Keep only the concepts with a non-zero coefficient for this class
            coefs = clf.coef_[optionid]
            coefs_local = []; attrs_local = []
            for a in range(n_attrs):
                if coefs[a] != 0.:
                    coefs_local.append(coefs[a])
                    attrs_local.append(attr_names[a].replace(' ', ''))
            if verbose: print('{} attributes in total\n'.format(len(coefs_local)))

            # Sort attributes by coefficient magnitude (largest first)
            order = np.argsort(-np.abs(coefs_local))
            coefs_local = np.array(coefs_local)[order]
            attrs_local = np.array(attrs_local)[order]
            for value, name in zip(coefs_local, attrs_local):
                if verbose: print('{:.2f} {}'.format(value, name))
                group_coef[name][optionid] = value
            if verbose: print('{:.2f}'.format(clf.intercept_[optionid]))


## Analysis: Basics

In [None]:
# Generate a summary of results
# results_summary[key] collects every participant's answer to question `key`:
# floats for numeric questions, non-empty strings for free-text questions.
float_keys = ['ml_exp', 'subjective_expl', 'subjective', 'subjective2', 'subjective3', 'tradeoff', 'correctcount', 'correctcount2']
string_keys = ['reasons', 'descriptions', 'feedback', 'gender', 'gender_written', 'race']

# In the '32concept' study 'correctcount' is read only from results with index
# below this value and 'correctcount2' only from the remaining results.
split_at = 25

print('Variable: (Mean \u00B1 Standard deviation) [Q1, Q2, Q3]\n')

results_summary = {}
for key in float_keys+string_keys:
    # Skip questions that were not asked in the chosen study
    if key not in study_keys:
        continue

    values = results_summary[key] = []
    for i, r in enumerate(results):
        # The answer is looked up only inside the branch that needs it,
        # because a result from one half of the '32concept' study is never
        # asked for the other half's key.
        answers = r['output'][0]
        if key in float_keys:
            if key == 'tradeoff':
                # Convert the recorded answer to a number of concepts
                values.append(tradeoff_convert[float(answers[key])])
            elif study == '32concept' and key == 'correctcount':
                if i < split_at: values.append(float(answers[key]))
            elif study == '32concept' and key == 'correctcount2':
                if i >= split_at: values.append(float(answers[key]))
            else:
                values.append(float(answers[key]))
        else:
            # Free-text answer: keep it only when something was written
            if answers[key] != '': values.append(answers[key])

    if key in float_keys:
        print('{}: ({:.1f} \u00B1 {:.1f})'.format(key, np.mean(values), np.std(values)),
              np.quantile(values, [0.25, 0.5, 0.75]))
    else:
        print('\n{}:'.format(key), values)

In [None]:
# Total duration of the study
# `loc` is the index of the output entry that holds the timing record.
loc = 2 if study != 'example' else 6

min_lst = []
for r in results:
    timing = r['output'][loc]['timing']
    elapsed_ms = timing['submit'] - timing['start']
    # milliseconds -> seconds -> minutes
    min_lst.append((elapsed_ms/1000)/60)

print('time per study (minutes) - {} data points'.format(len(min_lst)))
print('({:.1f} \u00B1 {:.1f})'.format(np.mean(min_lst), np.std(min_lst)))
print(np.round(np.quantile(np.array(min_lst), [0.25, 0.50, 0.75]), 1))

In [None]:
# Time spent on each photo
sec_lst = []
is_split_study = (study == '32concept')
for i, r in enumerate(results):
    timing = r['output'][loc]['timing']

    # Timestamp lists this participant contributes: both lists normally, but
    # only one of them in the '32concept' study (chosen by result index).
    if is_split_study:
        stamp_keys = ['nextphoto'] if i < 25 else ['nextphoto2']
    else:
        stamp_keys = ['nextphoto', 'nextphoto2']

    # k = 1, 2, 3, 4: difference between consecutive timestamps
    for k in range(1, 5):
        for stamp_key in stamp_keys:
            stamps = timing[stamp_key]
            sec_lst.append((stamps[k] - stamps[k-1])/1000.)

    # k = 5: difference between a 'pg_*' timestamp and the last list entry
    if study == 'example':
        page_keys = ['pg_task2task2', 'pg_task22post']
    elif is_split_study:
        timing = r['output'][2]['timing']
        page_keys = ['pg_task2agree']
    else:
        page_keys = ['pg_task2agree', 'pg_task22agree2']
    for page_key, stamp_key in zip(page_keys, stamp_keys):
        sec_lst.append((timing[page_key] - timing[stamp_key][4])/1000.)

print('time per photo (seconds) - {} data points'.format(len(sec_lst)))
print('({:.1f} \u00B1 {:.1f})'.format(np.mean(sec_lst), np.std(sec_lst)))
print(np.round(np.quantile(np.array(sec_lst), [0.25, 0.50, 0.75]), 1))

# One row per participant: 5 timings each in the '32concept' study, 10 otherwise
timings_per_person = 5 if is_split_study else 10
sec_person = np.array(sec_lst).reshape((len(results), timings_per_person)).mean(1)
print('\ntime per photo per person (seconds) - {} data points'.format(len(sec_person)))
print('({:.1f} \u00B1 {:.1f})'.format(np.mean(sec_person), np.std(sec_person)))
print(np.round(np.quantile(np.array(sec_person), [0.25, 0.50, 0.75]), 1))

In [None]:
# simplicity-correctness tradeoff
if study != 'example':
    print('percentage of participants who prefer each explanation')
    tradeoff_votes = np.array(results_summary['tradeoff'])
    for v in (4, 8, 16, 32, 64):
        # Share of participants whose converted answer equals v
        share = np.mean(tradeoff_votes == v)*100
        print('{} concepts: {:.0f}%'.format(v, share))

## Analysis: Concept recognition

In [None]:
# Avg number of concepts recognized per photo
if study != 'example':
    nconcept_lst = []
    for i, r in enumerate(results):
        # (output index, answers key) pairs to read for this participant
        if study in ['8concept', '16concept']:
            sources = [(3, 'individual_answers'), (7, 'individual_answers2')]
        elif study in ['32concept']:
            sources = [(3, 'individual_answers')] if i < 25 else [(3, 'individual_answers2')]
        else:
            sources = []

        for page, answers_key in sources:
            nconcept_lst.extend(len(a['selected']) for a in r['output'][page][answers_key])

    print('concepts clicked per photo - {} data points'.format(len(nconcept_lst)))
    print('({:.1f} \u00B1 {:.1f})'.format(np.mean(nconcept_lst), np.std(nconcept_lst)))
    print(np.quantile(nconcept_lst, [0.25, 0.50, 0.75]))

    # One row per participant: 10 photos each, or 5 in the '32concept' study
    if study in ['8concept', '16concept']:
        photos_per_person = 10
    elif study in ['32concept']:
        photos_per_person = 5
    nconcept_person = np.array(nconcept_lst).reshape((len(results), photos_per_person)).mean(1)
    print('\nconcepts clicked per photo per person - {} data points'.format(len(nconcept_person)))
    print('({:.1f} \u00B1 {:.1f})'.format(np.mean(nconcept_person), np.std(nconcept_person)))
    print(np.round(np.quantile(nconcept_person, [0.25, 0.50, 0.75]), 1))

In [None]:
# Reformat results as a dictionary organized by input type
# (input id 0-4 -> list of the results recorded for that input)
if study in ['8concept', '16concept', 'example']:
    results_by_inputid = {inputid: [] for inputid in range(5)}
    for r in results:
        results_by_inputid[r['output'][1]['input']].append(r)

elif study in ['32concept']:
    # Results with index below 25 go to the first dictionary, the rest to the second
    results_by_inputid1 = {inputid: [] for inputid in range(5)}
    results_by_inputid2 = {inputid: [] for inputid in range(5)}
    for i, r in enumerate(results):
        bucket = results_by_inputid1 if i < 25 else results_by_inputid2
        bucket[r['output'][1]['input']].append(r)

In [None]:
# Create attribute labels for each photo
# attr_targets[inputid][group] is a list with one entry per photo: the set of
# that group's explanation concepts that are labelled as present in the photo.
if study != 'example':
    attr_name_arr = np.array(attr_names)
    # (key in attr_targets, key of the image list in `inputs`, concepts of the group)
    group_specs = [('sgroup1', 'input_imagename', sgroup1_attr),
                   ('sgroup2', 'input_imagename2', sgroup2_attr)]

    attr_targets = {}
    for inputid in range(5):
        attr_targets[inputid] = {}
        for group, image_key, group_attr in group_specs:
            targets = []
            for img in inputs[inputid][image_key]:
                # Builtin `bool` instead of the `np.bool` alias, which was
                # removed from NumPy in 1.24.
                # assumes name2attr values are 0/1 vectors aligned with attr_names - TODO confirm
                present = attr_name_arr[name2attr['ade20k/'+img].astype(bool)]
                present = [a.replace(' ', '') for a in present]
                targets.append(set(present).intersection(group_attr))
            attr_targets[inputid][group] = targets

In [None]:
# Create a dictionary to store performance
# attr_acc[group][concept] holds two zero-initialised arrays of length 5*5,
# 'label' and 'correct'.
if study != 'example':
    attr_acc = {}
    for group, group_attr in (('sgroup1', sgroup1_attr), ('sgroup2', sgroup2_attr)):
        attr_acc[group] = {
            attr: {'label': np.zeros(5*5), 'correct': np.zeros(5*5)}
            for attr in group_attr
        }

In [None]:
# Calculate concept recognition performance
if study != 'example':

    def _score_photo(group, prefix, selected, target, slot):
        """Score one participant's concept clicks for one photo.

        Marks every concept in `target` as present at position `slot` of the
        'label' tallies in `attr_acc[group]` and adds one to the 'correct'
        tally of each clicked concept that is in `target`.

        Args:
            group: 'sgroup1' or 'sgroup2', the key into `attr_acc`.
            prefix: text removed from each clicked id ('g1_' or 'g2_').
            selected: concept ids the participant clicked for this photo.
            target: set of concepts labelled as present in the photo.
            slot: position of the photo in the tally arrays.

        Returns:
            (accuracy, recall, precision) in percent. Recall is NaN when
            `target` is empty and precision is NaN when nothing was clicked.
        """
        for attr in target:
            attr_acc[group][attr]['label'][slot] = 1

        tp = 0; fp = 0
        for s in [c.replace(prefix, '') for c in selected]:
            if s in target:
                tp += 1
                attr_acc[group][s]['correct'][slot] += 1
            else:
                fp += 1

        accuracy = tp/K*100  # true positives as a share of the K concepts
        recall = tp/len(target)*100 if len(target) > 0 else np.nan
        precision = tp/(tp+fp)*100 if tp+fp > 0 else np.nan
        return accuracy, recall, precision

    # Start from clean tallies: the 'correct' counts are incremented in place,
    # so without this reset re-running the cell would count every click twice.
    for group_acc in attr_acc.values():
        for tally in group_acc.values():
            tally['label'][:] = 0
            tally['correct'][:] = 0

    recall1 = []; recall2 = []
    precision1 = []; precision2 = []
    acc1 = []; acc2 = []
    for inputid in range(5):
        for personid in range(5):

            if study in ['8concept', '16concept']:
                r = results_by_inputid[inputid][personid]
                selections1 = [a['selected'] for a in r['output'][3]['individual_answers']]
                selections2 = [a['selected'] for a in r['output'][7]['individual_answers2']]
            elif study in ['32concept']:
                r1 = results_by_inputid1[inputid][personid]
                r2 = results_by_inputid2[inputid][personid]
                selections1 = [a['selected'] for a in r1['output'][3]['individual_answers']]
                selections2 = [a['selected'] for a in r2['output'][3]['individual_answers2']]

            # Loop over 5 images
            for imageid in range(5):
                slot = inputid*5+imageid

                # Sgroup1
                acc, rec, prec = _score_photo('sgroup1', 'g1_', selections1[imageid],
                                              attr_targets[inputid]['sgroup1'][imageid], slot)
                acc1.append(acc); recall1.append(rec); precision1.append(prec)

                # Sgroup2
                acc, rec, prec = _score_photo('sgroup2', 'g2_', selections2[imageid],
                                              attr_targets[inputid]['sgroup2'][imageid], slot)
                acc2.append(acc); recall2.append(rec); precision2.append(prec)

    # Recall and precision use the NaN-aware statistics so that photos where
    # the metric is undefined are skipped instead of turning the summary into NaN.
    print('recall (both):                 {:.1f}% \u00B1 {:.1f}%'.format(np.nanmean(recall1+recall2), np.nanstd(recall1+recall2)))
    print('recall (1. commbuildings):     {:.1f}% \u00B1 {:.1f}%'.format(np.nanmean(recall1), np.nanstd(recall1)))
    print('recall (2. home/hotel):        {:.1f}% \u00B1 {:.1f}%'.format(np.nanmean(recall2), np.nanstd(recall2)))

    print()
    print('precision (both):              {:.1f}% \u00B1 {:.1f}%'.format(np.nanmean(precision1+precision2), np.nanstd(precision1+precision2)))
    print('precision (1. commbuildings):  {:.1f}% \u00B1 {:.1f}%'.format(np.nanmean(precision1), np.nanstd(precision1)))
    print('precision (2. home/hotel):     {:.1f}% \u00B1 {:.1f}%'.format(np.nanmean(precision2), np.nanstd(precision2)))

    print()
    print('accuracy (both):               {:.1f}% \u00B1 {:.1f}%'.format(np.mean(acc1+acc2), np.std(acc1+acc2)))
    print('accuracy (1. commbuildings):   {:.1f}% \u00B1 {:.1f}%'.format(np.mean(acc1), np.std(acc1)))
    print('accuracy (2. home/hotel):      {:.1f}% \u00B1 {:.1f}%'.format(np.mean(acc2), np.std(acc2)))


In [None]:
# Calculate individual concept recall
# Builds one table row per concept (first K concepts of each group), writes
# the table to 'concept_recall_<K>concept.csv' and shows it below the cell.
if study != 'example':
    # Divisor that turns a 'correct' tally into a percentage.
    # assumes every photo was shown to 5 participants - TODO confirm
    n_raters = 5

    rows = []
    for group, group_attr in (('sgroup1', sgroup1_attr), ('sgroup2', sgroup2_attr)):
        for conceptid in range(K):
            attr = group_attr[conceptid]
            tally = attr_acc[group][attr]

            # Recall is computed only over the photos labelled with this concept
            recall = tally['correct'][tally['label'].astype(bool)]/n_raters*100
            if len(recall) > 0:
                mean_std = str(np.round(np.mean(recall), 1)) + " \u00B1 " + str(np.round(np.std(recall), 1))
                quantile = np.quantile(recall, [0.25, 0.5, 0.75])
            else:
                # The concept appears in no photo: leave the statistics blank
                mean_std = ''; quantile = ''

            rows.append({'Group': group,
                         'Concepts': attr,
                         'N_photos': int(tally['label'].sum()),
                         'Mean and Std': mean_std,
                         'Quantile': quantile})

    # Build the frame once from the collected rows; the default 0..2K-1 index
    # gives sgroup1 the labels 0..K-1 and sgroup2 the labels K..2K-1.
    df = pd.DataFrame(rows, columns=['Group', 'Concepts', 'N_photos', 'Mean and Std', 'Quantile'])
    df.to_csv('concept_recall_{}concept.csv'.format(K))

# IPython only renders the value of the LAST TOP-LEVEL expression of a cell;
# a bare `df` nested inside the `if` body above would be silently discarded.
df if study != 'example' else None

## Analysis: Check explanation score calculation

In [None]:
# Initialize scores dictionary 
# Purpose of this cell: recompute the four per-class explanation scores of
# every photo from the participant's selected concepts and assert that they
# equal the scores stored in the results.
if study != 'example':
    # scores maps concept name -> 0./1 indicator; it is reset before each photo below
    scores = {}
    for attr in sgroup1_attr: scores[attr] = 0.
    for attr in sgroup2_attr: scores[attr] = 0.

    # Check that the recorded explanation scores matches the calculated explanation scores
    for inputid in range(5):
        for personid in range(5):        

            # 8/16-concept studies read both scene groups from the same result;
            # the 32-concept study reads group 1 from r1 and group 2 from r2.
            if study in ['8concept', '16concept']:
                r = results_by_inputid[inputid][personid]
                selections1 = [a['selected'] for a in r['output'][3]['individual_answers']]
                selections2 = [a['selected'] for a in r['output'][7]['individual_answers2']]
            elif study in ['32concept']:
                r1 = results_by_inputid1[inputid][personid]
                r2 = results_by_inputid2[inputid][personid]
                selections1 = [a['selected'] for a in r1['output'][3]['individual_answers']]
                selections2 = [a['selected'] for a in r2['output'][3]['individual_answers2']]

            # Loop over images
            for imageid in range(5):

                # Sgroup1
                # Reset every indicator to 0, then set the selected concepts
                # (with their 'g1_' prefix removed) to 1.
                scores = dict.fromkeys(scores, 0.)
                for attr in [s.replace('g1_', '') for s in selections1[imageid]]: 
                    scores[attr] = 1

                # Hand-typed linear scores, one expression per class, selected by K.
                # NOTE(review): the constants look like the explanation
                # coefficients and intercepts rounded to two decimals - confirm.
                # If K is none of 8/16/32, s1..s4 are not (re)assigned here.
                if K == 8:
                    s1 = -0.12*scores['sidewalk'] + 0.00
                    s2 = -1.44*scores['skyscraper'] -1.03*scores['sky'] + 0.69*scores['grass'] -0.23*scores['car'] + 0.23*scores['plant'] + 1.04
                    s3 = 1.54*scores['skyscraper'] - 1.11*scores['car'] - 1.04*scores['road'] - 1.00*scores['sidewalk'] -0.75*scores['person'] + 1.04
                    s4 = -1.90*scores['skyscraper'] + 0.27*scores['car'] - 0.19*scores['grass'] + 0.04*scores['sidewalk'] + 0.61
                if K == 16:
                    s1 = 0.89*scores['skyscraper'] + 0.85*scores['flag'] - 0.79*scores['awning'] + 0.59*scores['wall'] + 0.58*scores['car'] + 0.47*scores['trafficlight'] + 0.39*scores['streetlight'] - 0.37*scores['sidewalk'] + 0.27*scores['truck'] + 0.24*scores['sky'] + 0.23*scores['grass'] - 0.12*scores['person'] + 0.09*scores['road'] + 0.04*scores['palm'] + 0.00
                    s2 = -3.07*scores['skyscraper'] + 1.64*scores['stairway'] + 1.58*scores['grass'] -1.20*scores['sky'] + 0.78*scores['palm'] + 0.76*scores['plant'] -0.69*scores['truck'] - 0.57*scores['car'] + 0.28*scores['flag'] + 0.26*scores['trafficlight'] - 0.24*scores['streetlight'] + 0.16*scores['sidewalk'] - 0.13*scores['awning'] + 0.08*scores['road'] + 2.40
                    s3 = 2.10*scores['skyscraper'] - 2.08*scores['person'] - 1.64*scores['car'] - 1.33*scores['road'] - 1.28*scores['sidewalk'] + 0.22*scores['sky'] - 0.08*scores['streetlight'] - 0.07*scores['wall'] + 2.40
                    s4 = - 2.44*scores['skyscraper'] - 1.57*scores['grass'] - 0.80*scores['flag'] + 0.73*scores['road'] - 0.65*scores['trafficlight'] + 0.64*scores['car'] + 0.56*scores['sidewalk'] + 0.53*scores['awning'] - 0.53*scores['plant'] + 0.40*scores['person'] - 0.23*scores['stairway'] - 0.19*scores['wall'] + 0.14*scores['sky'] + 0.56
                if K == 32:
                    s1 = 1.15*scores['flag'] + 1.12*scores['skyscraper'] - 0.99*scores['awning'] + 0.86*scores['earth'] + 0.83*scores['floor'] + 0.80*scores['car'] - 0.76*scores['pot'] + 0.63*scores['tradename'] + 0.57*scores['trafficlight'] + 0.55*scores['wall'] + 0.55*scores['streetlight'] - 0.53*scores['sidewalk'] + 0.51*scores['stairs'] + 0.50*scores['sky'] + 0.43*scores['truck'] + 0.40*scores['pedestal'] - 0.39*scores['ashcan'] + 0.37*scores['grass'] + 0.32*scores['road'] + 0.32*scores['flowerpot'] + 0.32*scores['tree'] + 0.26*scores['bag'] - 0.26*scores['van'] + 0.26*scores['palm'] + 0.24*scores['bucket'] - 0.19*scores['person'] - 0.04*scores['spotlight'] + 0.00
                    s2 = -3.74*scores['skyscraper'] + 2.13*scores['stairway'] + 1.73*scores['grass'] - 1.37*scores['sky'] + 1.26*scores['palm'] - 0.90*scores['truck'] + 0.89*scores['rock'] + 0.89*scores['plant'] - 0.84*scores['box'] - 0.79*scores['car'] - 0.48*scores['flowerpot'] + 0.44*scores['flag'] + 0.40*scores['trafficlight'] - 0.34*scores['streetlight'] + 0.30*scores['road'] - 0.28*scores['van'] - 0.26*scores['mountain'] + 0.20*scores['sidewalk'] - 0.19*scores['spotlight'] - 0.16*scores['awning'] - 0.15*scores['bag'] + 0.13*scores['ashcan'] - 0.07*scores['stairs'] - 0.01*scores['tradename'] + 3.04
                    s3 = -2.69*scores['person'] + 2.11*scores['skyscraper'] - 1.71*scores['car'] - 1.42*scores['road'] - 1.41*scores['sidewalk'] + 0.56*scores['sky'] - 0.48*scores['wall'] - 0.31*scores['tree'] - 0.30*scores['streetlight'] - 0.26*scores['flag'] + 3.04
                    s4 = -2.73*scores['skyscraper'] - 1.88*scores['grass'] - 1.07*scores['flag'] + 1.01*scores['road'] - 0.92*scores['stairway'] - 0.78*scores['trafficlight'] + 0.69*scores['sidewalk'] + 0.68*scores['car'] + 0.68*scores['awning'] - 0.60*scores['plant'] + 0.48*scores['person'] + 0.41*scores['van'] + 0.40*scores['sky'] - 0.38*scores['palm'] - 0.30*scores['wall'] - 0.23*scores['earth'] + 0.19*scores['spotlight'] - 0.11*scores['tradename'] + 0.07*scores['mountain'] + 0.63

                calculated = [s1, s2, s3, s4]
                if study in ['8concept', '16concept']: recorded = list(r['output'][6]['finalscores'][imageid].values())
                elif study in ['32concept']: recorded = list(r1['output'][6]['finalscores'][imageid].values())
                # NOTE(review): exact float equality - this only passes when the
                # recorded scores are bit-identical to the values computed here.
                assert calculated == recorded

                # Sgroup2
                # Same procedure as above, with the 'g2_' prefix removed.
                scores = dict.fromkeys(scores, 0.)
                for attr in [s.replace('g2_', '') for s in selections2[imageid]]: 
                    scores[attr] = 1

                if K == 8:
                    s1 = -0.70*scores['chair'] -0.10*scores['floor'] + 0.08
                    s2 = -1.41*scores['bed'] -0.20*scores['cushion'] + 0.16
                    s3 = -0.52*scores['windowpane'] -0.33*scores['floor'] -0.04*scores['wall'] + 0.00
                    s4 = -1.05*scores['bed'] -0.31*scores['table'] + 0.14*scores['sofa'] + 0.16
                if K == 16:
                    s1 = 1.88*scores['bed'] - 0.95*scores['chair'] - 0.60*scores['sofa'] - 0.28*scores['armchair'] - 0.04*scores['table'] - 0.03*scores['sconce'] + 0.00
                    s2 = -3.20*scores['bed'] + 1.47*scores['chair'] - 1.38*scores['sofa'] - 0.80*scores['cushion'] - 0.39*scores['coffeetable'] - 0.14*scores['armchair'] - 0.14*scores['lamp'] + 1.40
                    s3 = 1.36*scores['bed'] - 1.02*scores['windowpane'] - 0.92*scores['wall'] - 0.31*scores['plant'] - 0.24*scores['carpet'] + 0.19*scores['sconce'] - 0.18*scores['floor'] - 0.15*scores['cushion'] - 0.11*scores['vase'] + 1.16
                    s4 = 2.00*scores['sofa'] - 1.73*scores['bed'] - 0.88*scores['table'] + 0.68*scores['coffeetable'] - 0.52*scores['chair'] - 0.38*scores['wall'] + 0.30*scores['armchair'] + 0.20*scores['fireplace'] + 0.17*scores['cushion'] + 1.40
                if K == 32:
                    s1 = 3.57*scores['bed'] - 1.02*scores['sofa'] - 0.97*scores['coffeetable'] - 0.86*scores['chair'] - 0.80*scores['sconce'] + 0.64*scores['windowpane'] - 0.60*scores['armchair'] - 0.60*scores['television'] - 0.58*scores['drinkingglass'] + 0.52*scores['fan'] - 0.42*scores['switch'] + 0.32*scores['cushion'] - 0.28*scores['table'] + 0.26*scores['box'] - 0.25*scores['curtain'] + 0.24*scores['blind'] + 0.23*scores['chestofdrawers'] + 0.12*scores['clock'] - 0.10*scores['telephone'] - 0.01*scores['chandelier'] + 0.00
                    s2 = -4.39*scores['bed'] + 2.70*scores['chair'] - 1.98*scores['sofa'] - 1.01*scores['coffeetable'] - 1.00*scores['cushion'] - 0.94*scores['fireplace'] + 0.75*scores['table'] - 0.75*scores['pillow'] - 0.73*scores['armchair'] + 0.65*scores['chandelier'] + 0.46*scores['plate'] - 0.45*scores['clock'] - 0.42*scores['lamp'] - 0.29*scores['curtain'] + 0.22*scores['wallsocket'] - 0.19*scores['ottoman'] - 0.18*scores['book'] - 0.14*scores['television'] - 0.05*scores['sconce'] + 0.01*scores['drinkingglass'] + 2.23
                    s3 = 2.20*scores['bed'] - 2.09*scores['wall'] - 1.21*scores['windowpane'] + 0.96*scores['television'] - 0.96*scores['box'] - 0.94*scores['chandelier'] - 0.86*scores['carpet'] - 0.77*scores['plant'] - 0.71*scores['blind'] + 0.69*scores['desk'] + 0.64*scores['sconce'] + 0.50*scores['armchair'] + 0.50*scores['curtain'] + 0.47*scores['telephone'] - 0.46*scores['cushion'] - 0.42*scores['chestofdrawers'] + 0.38*scores['switch'] + 0.29*scores['pillow'] - 0.22*scores['book'] - 0.08*scores['clock'] + 0.07*scores['coffeetable'] - 0.07*scores['wallsocket'] - 0.05*scores['fireplace'] - 0.03*scores['fan'] - 0.03*scores['table'] + 2.68
                    s4 = -2.41*scores['bed'] + 2.13*scores['sofa'] + 1.21*scores['fireplace'] + 1.17*scores['coffeetable'] - 1.07*scores['wall'] - 0.98*scores['chair'] - 0.95*scores['table'] + 0.86*scores['cushion'] + 0.83*scores['ottoman'] + 0.75*scores['armchair'] + 0.65*scores['seat'] + 0.49*scores['book'] + 0.28*scores['carpet'] - 0.28*scores['wallsocket'] - 0.28*scores['fan'] - 0.25*scores['chandelier'] - 0.24*scores['plate'] + 0.15*scores['plant'] - 0.11*scores['telephone'] + 0.10*scores['box'] + 0.03*scores['sconce'] + 0.02*scores['windowpane'] + 2.77

                calculated = [s1, s2, s3, s4]
                if study in ['8concept', '16concept']: recorded = list(r['output'][10]['finalscores2'][imageid].values())
                elif study in ['32concept']: recorded = list(r2['output'][6]['finalscores2'][imageid].values())
                # NOTE(review): exact float equality, as for scene group 1 above.
                assert calculated == recorded


## Analysis: Task accuracy

In [None]:
# Do participants' scene choices match the explanation?
# For every participant (5 inputs x 5 participants) and each scene group this
# computes, in percent over the 5 photos:
#   selectedcorrect - the participant's chosen class equals inputs[...]['predicted'/'predicted2']
#   maxcorrect      - the class with the highest recorded score equals that same value
#   match           - the participant's chosen class equals the highest-scoring class
# The last two need recorded scores and are skipped for the 'example' study.
match1_lst = []; match2_lst = []
maxcorrect1_lst = []; maxcorrect2_lst = []
selectedcorrect1_lst = []; selectedcorrect2_lst = []
for inputid in range(5):
    for personid in range(5):

        # Locate the answers (and, where available, the recorded scores);
        # the output indices differ between studies.
        if study in ['8concept', '16concept']:
            r = results_by_inputid[inputid][personid]
            answers1 = r['output'][3]['individual_answers']
            answers2 = r['output'][7]['individual_answers2']
            finalscores1 = r['output'][6]['finalscores']
            finalscores2 = r['output'][10]['finalscores2']
        elif study in ['32concept']:
            # Scene group 1 and scene group 2 come from different results
            r1 = results_by_inputid1[inputid][personid]
            r2 = results_by_inputid2[inputid][personid]
            answers1 = r1['output'][3]['individual_answers']
            answers2 = r2['output'][3]['individual_answers2']
            finalscores1 = r1['output'][6]['finalscores']
            finalscores2 = r2['output'][6]['finalscores2']
        elif study in ['example']:
            r = results_by_inputid[inputid][personid]
            answers1 = r['output'][2]['individual_answers']
            answers2 = r['output'][4]['individual_answers2']

        # Recorded classes are 1-based; shift to 0-based so that they are
        # comparable with np.argmax below.
        selected1 = np.array([a['predictedclass']-1 for a in answers1])
        selected2 = np.array([a['predictedclass']-1 for a in answers2])
        predicted1 = np.array([p-1 for p in inputs[inputid]['predicted']])
        predicted2 = np.array([p-1 for p in inputs[inputid]['predicted2']])

        selectedcorrect1_lst.append(np.mean(predicted1==selected1)*100)
        selectedcorrect2_lst.append(np.mean(predicted2==selected2)*100)

        if study != 'example':
            # Index of the highest recorded score for each of the 5 photos
            maxscore1 = np.array([np.argmax(list(finalscores1[imageid].values())) for imageid in range(5)])
            maxscore2 = np.array([np.argmax(list(finalscores2[imageid].values())) for imageid in range(5)])

            match1_lst.append(np.mean(selected1==maxscore1)*100)
            match2_lst.append(np.mean(selected2==maxscore2)*100)

            maxcorrect1_lst.append(np.mean(predicted1==maxscore1)*100)
            maxcorrect2_lst.append(np.mean(predicted2==maxscore2)*100)


print('selected is correct (both):                                {:.1f}% \u00B1 {:.1f}%'.format(np.mean(selectedcorrect1_lst+selectedcorrect2_lst),
                                                                                             np.std(selectedcorrect1_lst+selectedcorrect2_lst)))
print('selected is correct (1. commbuildings):                    {:.1f}% \u00B1 {:.1f}%'.format(np.mean(selectedcorrect1_lst),
                                                                                             np.std(selectedcorrect1_lst)))
print('selected is correct (2. home/hotel):                       {:.1f}% \u00B1 {:.1f}%'.format(np.mean(selectedcorrect2_lst),
                                                                                             np.std(selectedcorrect2_lst)))

if study != 'example':
    print()
    print('highest score is correct (both):                           {:.1f}% \u00B1 {:.1f}%'.format(np.mean(maxcorrect1_lst+maxcorrect2_lst),
                                                                                                 np.std(maxcorrect1_lst+maxcorrect2_lst)))
    print('highest score is correct (1. commbuildings):               {:.1f}% \u00B1 {:.1f}%'.format(np.mean(maxcorrect1_lst),
                                                                                                 np.std(maxcorrect1_lst)))
    print('highest score is correct (2. home/hotel):                  {:.1f}% \u00B1 {:.1f}%'.format(np.mean(maxcorrect2_lst),
                                                                                                 np.std(maxcorrect2_lst)))

    print()
    print('selects explanation with highest score (both):             {:.1f}% \u00B1 {:.1f}%'.format(np.mean(match1_lst+match2_lst),
                                                                                                 np.std(match1_lst+match2_lst)))
    print('selects explanation with highest score (1. commbuildings): {:.1f}% \u00B1 {:.1f}%'.format(np.mean(match1_lst),
                                                                                                 np.std(match1_lst)))
    # Label fixed: this line reports scene group 2 (it used to read "1. home/hotel")
    print('selects explanation with highest score (2. home/hotel):    {:.1f}% \u00B1 {:.1f}%'.format(np.mean(match2_lst),
                                                                                                 np.std(match2_lst)))
