In [None]:
import pandas as pd
import numpy as np

import seaborn as sns

import datetime

# Lift pandas' display truncation limits: passing None removes the cap on
# column width, number of columns and number of rows shown in cell output.
for _display_option in ('display.max_colwidth',
                        'display.max_columns',
                        'display.max_rows'):
    pd.set_option(_display_option, None)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas/seaborn — consider narrowing it.
warnings.filterwarnings('ignore')

# Survey figures

In [None]:
# Read the analysis CSV. `data_raw` keeps the frame exactly as loaded, while
# `data` is an independent (deep) copy that the following cells work on.
data_raw = pd.read_csv('analysis/analysis_data.csv')
data = data_raw.copy(deep=True)

# Quick sanity check: (rows, columns), then a preview of the first rows.
print(data.shape)
data.head()

In [None]:
def convert_to_photoshoot_names(x):
    """Translate an internal model/dataset code into its figure-ready label.

    Recognised codes are 'lr', 'dt', 'no_shap', 'shap', 'education' and
    'housing'. Any other value (including None or NaN) yields None.
    """
    # (code, display label) pairs, checked in order with plain equality.
    display_labels = (
        ('lr', "Linear Regression"),
        ('dt', "Decision Tree"),
        ('no_shap', "Black-box model"),
        ('shap', "Black-box model (with SHAP)"),
        ('education', "Education"),
        ('housing', "Housing"),
    )
    for code, label in display_labels:
        if x == code:
            return label
    return None

In [None]:
# Build a long-form table of the "understood system" answers: the `*_int`
# columns and the `*_bb` (black-box) columns are renamed to a shared
# `model` / `understood_system` schema and stacked on top of each other.
# NOTE(review): `int` presumably stands for "interpretable" — confirm.
temp1 = (data.filter(items=['dataset_name', 'int_model', 'understood_system_int'])
             .rename(columns={'int_model': 'model',
                              'understood_system_int': 'understood_system'}))
temp2 = (data.filter(items=['dataset_name', 'bb_model', 'understood_system_bb'])
             .rename(columns={'bb_model': 'model',
                              'understood_system_bb': 'understood_system'}))

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True renumbers the rows so the result does not
# carry every original index label twice.
temp = pd.concat([temp1, temp2], ignore_index=True)

# Swap the internal codes for the labels shown in the figures.
temp['model'] = temp['model'].apply(convert_to_photoshoot_names)
temp['dataset_name'] = temp['dataset_name'].apply(convert_to_photoshoot_names)

Understood_System_Data = temp.rename(columns={'dataset_name': "Domain",
                                              'model': "Model",
                                              'understood_system': "Understood System"})

In [None]:
# Horizontal box plot of the "Understood System" values, one row per model
# label and one coloured box per domain.
ax = sns.boxplot(
    data=Understood_System_Data,
    x="Understood System",
    y="Model",
    hue="Domain",
    orient="h",
    palette="Set2",
)
# Anchor the legend's upper-left corner just past the right edge of the axes
# so it sits outside the plotting area.
ax.legend(title="Domain", loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)

In [None]:
# Build a long-form table of the "system confusing" answers: the `*_int`
# columns and the `*_bb` (black-box) columns are renamed to a shared
# `model` / `system_confusing` schema and stacked on top of each other.
# NOTE(review): `int` presumably stands for "interpretable" — confirm.
temp1 = (data.filter(items=['dataset_name', 'int_model', 'system_confusing_int'])
             .rename(columns={'int_model': 'model',
                              'system_confusing_int': 'system_confusing'}))
temp2 = (data.filter(items=['dataset_name', 'bb_model', 'system_confusing_bb'])
             .rename(columns={'bb_model': 'model',
                              'system_confusing_bb': 'system_confusing'}))

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True renumbers the rows so the result does not
# carry every original index label twice.
temp = pd.concat([temp1, temp2], ignore_index=True)

# Swap the internal codes for the labels shown in the figures.
temp['model'] = temp['model'].apply(convert_to_photoshoot_names)
temp['dataset_name'] = temp['dataset_name'].apply(convert_to_photoshoot_names)

System_Confusing_Data = temp.rename(columns={'dataset_name': "Domain",
                                             'model': "Model",
                                             'system_confusing': "System Confusing"})

In [None]:
# Horizontal box plot of the "System Confusing" values, one row per model
# label and one coloured box per domain.
ax = sns.boxplot(
    data=System_Confusing_Data,
    x="System Confusing",
    y="Model",
    hue="Domain",
    orient="h",
    palette="Set2",
)
# Anchor the legend's upper-left corner just past the right edge of the axes
# so it sits outside the plotting area.
ax.legend(title="Domain", loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)

In [None]:
# Split the responses by domain; later cells reuse `edu_data` and `hous_data`.
edu_data = data[data['dataset_name'] == "education"]
hous_data = data[data['dataset_name'] == "housing"]

# For each model code: which column records the assigned model, and which
# column holds the task-1 correctness flag for that kind of model.
task1_specs = [('dt', 'int_model', 'q1_is_correct_int'),
               ('lr', 'int_model', 'q1_is_correct_int'),
               ('shap', 'bb_model', 'q1_is_correct_bb'),
               ('no_shap', 'bb_model', 'q1_is_correct_bb')]

# One [domain, model code, mean correctness] row per domain/model combination,
# Education rows first, then Housing.
# NOTE(review): the column is labelled "% correct", but the mean of a 0/1 (or
# boolean) flag is a fraction in [0, 1] — confirm the intended scale.
means = [[domain_label,
          model_code,
          domain_rows[domain_rows[model_col] == model_code][score_col].mean()]
         for domain_label, domain_rows in (('Education', edu_data), ('Housing', hous_data))
         for model_code, model_col, score_col in task1_specs]

Task_1_Performance_Data = pd.DataFrame(
    means,
    columns=['Domain', 'Model', 'Task 1 Performance (% correct)'],
)
# Replace the model codes with the labels used in the figures.
Task_1_Performance_Data['Model'] = Task_1_Performance_Data['Model'].apply(convert_to_photoshoot_names)

In [None]:
# Horizontal bar chart of the task-1 mean correctness, one row per model label
# and one coloured bar per domain.
ax = sns.barplot(
    data=Task_1_Performance_Data,
    x="Task 1 Performance (% correct)",
    y="Model",
    hue="Domain",
    orient="h",
    palette="Set2",
)
# Anchor the legend's upper-left corner just past the right edge of the axes
# so it sits outside the plotting area.
ax.legend(title="Domain", loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)

In [None]:
# For each model code: which column records the assigned model, and which
# column holds the task-2 correctness flag for that kind of model.
task2_specs = [('dt', 'int_model', 'q2_is_correct_int'),
               ('lr', 'int_model', 'q2_is_correct_int'),
               ('shap', 'bb_model', 'q2_is_correct_bb'),
               ('no_shap', 'bb_model', 'q2_is_correct_bb')]

# One [domain, model code, mean correctness] row per domain/model combination,
# Education rows first, then Housing. `edu_data` / `hous_data` are the
# per-domain slices of `data` created in an earlier cell.
# NOTE(review): the column is labelled "% correct", but the mean of a 0/1 (or
# boolean) flag is a fraction in [0, 1] — confirm the intended scale.
means2 = [[domain_label,
           model_code,
           domain_rows[domain_rows[model_col] == model_code][score_col].mean()]
          for domain_label, domain_rows in (('Education', edu_data), ('Housing', hous_data))
          for model_code, model_col, score_col in task2_specs]

Task_2_Performance_Data = pd.DataFrame(
    means2,
    columns=['Domain', 'Model', 'Task 2 Performance (% correct)'],
)
# Replace the model codes with the labels used in the figures.
Task_2_Performance_Data['Model'] = Task_2_Performance_Data['Model'].apply(convert_to_photoshoot_names)

In [None]:
# Horizontal bar chart of the task-2 mean correctness, one row per model label
# and one coloured bar per domain.
ax = sns.barplot(
    data=Task_2_Performance_Data,
    x="Task 2 Performance (% correct)",
    y="Model",
    hue="Domain",
    orient="h",
    palette="Set2",
)
# Anchor the legend's upper-left corner just past the right edge of the axes
# so it sits outside the plotting area.
ax.legend(title="Domain", loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)

# Model performance figures