# Attribute Analysis
This notebook analyzes a given attribute with any number of expected values (2+), and computes the metrics att_siz, att_cnt, att_dis, att_clu, and att_scn, each generalized to any attribute.

## Instructions
1. Fill in the dataset in section 1.1

2. Run all cells

3. Look at the generated summary PDF and/or explore each metric below.
    - All metrics are identified by a short keyword, and consist of a "Setup" and "Analyses" portion. The "Setup" portion contains code that does not need to be modified unless customization is needed, and the "Analyses" portion provides an interactive display of the results.
    
## Table of Contents
1. [Initial Setup](#setup) <br/>
    1.1 [Dataset](#dataset) <br/>
2. att_siz Metric: [Distance from center, size, attribute label inference](#att_siz)<br/>
    2.1 [Setup](#att_siz_setup)<br/>
    2.2 [Analyses](#att_siz_analyses)

# Initial Setup
<a id="setup"></a>

In [None]:
from __future__ import print_function
import argparse
import os
import os
try: 
    import datasets
except ModuleNotFoundError: # switch to parent directory on first run 
    os.chdir(os.pardir)
    import datasets
import pickle
import itertools
import torchvision.transforms as transforms
import torch.utils.data as data
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import TSNE
import seaborn as sns
import numpy as np
from scipy import stats
import PIL.Image
from scipy.cluster.hierarchy import dendrogram, linkage
from math import sqrt
import cv2
import matplotlib.patches as patches
from scipy.spatial.distance import squareform
import pycountry
from geonamescache import GeonamesCache
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import re
import plotly.graph_objects as go
import textwrap
import matplotlib.patches as mpatches
import operator
from matplotlib.font_manager import FontProperties
import imageio
from ipywidgets import interact, interactive, fixed, interact_manual, HBox, Layout
import ipywidgets as widgets
from IPython.display import Image
from IPython.core.display import HTML
from IPython.display import display
import time
import warnings
import random
from matplotlib.transforms import Bbox
from IPython.display import clear_output
import math

In [None]:
# Colour palette that the plots below index by attribute group.
COLORS = sns.color_palette('Set2')
SAME_EXTENT = (-0.5, 6.5, -0.5, 6.5)

# Silence NumPy divide-by-zero / invalid-value floating-point errors and all
# Python warnings for the rest of the notebook.
np.seterr(invalid='ignore', divide='ignore')
warnings.filterwarnings("ignore")

# Working directories the notebook reads from and writes to.
for _dirname in ("dataloader_files", "results", "checkpoints"):
    if not os.path.exists(_dirname):
        os.mkdir(_dirname)

In [None]:
%matplotlib inline

In [None]:
# https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
def hide_toggle(for_next=False, toggle_text='Toggle show/hide'):
    """Build an HTML snippet with a link that shows/hides a code cell's input.

    Args:
        for_next: When False, the link toggles the input of the currently
            selected cell. When True, it toggles the input of the cell after
            it, and the current cell's own input is hidden by the script.
        toggle_text: Text of the generated link.

    Returns:
        An ``HTML`` object containing a ``<script>`` that defines a uniquely
        named toggle function and an ``<a>`` link that calls it.
    """
    current = """$('div.cell.code_cell.rendered.selected')"""

    if for_next:
        # Control the following cell and hide the input of this one.
        selector = current + '.next()'
        hide_current_js = current + '.find("div.input").hide();'
    else:
        selector = current
        hide_current_js = ''

    # Random suffix so several toggles on one page do not share a function name.
    function_name = 'code_toggle_{}'.format(str(random.randint(1,2**64)))

    template = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """

    return HTML(template.format(
        f_name=function_name,
        cell_selector=selector,
        js_hide_current=hide_current_js,
        toggle_text=toggle_text,
    ))

In [None]:
# Hide this cell's input and show a link that toggles the input of the next
# cell (the helper functions). The returned HTML is the cell's displayed value.
hide_toggle(for_next=True, toggle_text='Show/hide helper functions')

In [None]:
def folder(num, folder):
    """Ensure ``results/<folder>/<num>/`` exists and open its ``results.txt``.

    Args:
        num: Name of the sub-directory placed under ``results/<folder>/``.
        folder: Name of the directory placed directly under ``results/``.

    Returns:
        A file object for ``results/<folder>/<num>/results.txt`` opened in
        ``"w"`` mode (an existing file is truncated). The caller is
        responsible for closing it.
    """
    target_dir = "results/{0}/{1}".format(folder, num)
    # makedirs also creates missing parents (e.g. results/<folder>), which a
    # plain mkdir would fail on, and tolerates the directory already existing.
    os.makedirs(target_dir, exist_ok=True)
    return open("{0}/results.txt".format(target_dir), "w")

def project(features, dim):
    """Project a set of features into a lower-dimensional subspace with PCA.

    The features are standardized with ``StandardScaler`` before a PCA with
    ``dim`` components is fitted to them.

    Args:
        features: Feature matrix accepted by ``StandardScaler.fit_transform``.
        dim: Number of principal components to keep.

    Returns:
        The features transformed onto the first ``dim`` principal components.
    """
    scaled = StandardScaler().fit_transform(features)
    reducer = PCA(n_components=dim)
    return reducer.fit_transform(X=scaled)

def wilson(p, n, z = 1.96):
    """Return the Wilson score confidence interval for a binomial proportion.

    Args:
        p: Observed proportion.
        n: Number of observations (must be non-zero).
        z: Normal quantile for the desired confidence level (1.96 by default).

    Returns:
        A ``(lower_bound, upper_bound)`` tuple.
    """
    z_squared = z*z
    scale = 1 + z**2/n
    midpoint = p + z_squared / (2*n)
    spread = sqrt((p*(1 - p) + z_squared / (4*n)) / n)
    margin = z*spread

    return ((midpoint - margin) / scale, (midpoint + margin) / scale)

def country_to_iso3(country):
    """Map a '+'-separated country name to its ISO 3166-1 alpha-3 code.

    The name is first looked up with ``pycountry``'s fuzzy search (with '+'
    replaced by spaces). If that search raises ``LookupError``, a small table
    of hand-written overrides is consulted using the original string.

    Args:
        country: Country name, with '+' in place of spaces.

    Returns:
        The three-letter code, or ``None`` when neither lookup succeeds.
    """
    overrides = {
        'South+Korea': 'KOR',
        'North+Korea': 'PRK',
        'Laos': 'LAO',
        'Caribbean+Netherlands': 'BES',
        'St.+Lucia': 'LCA',
        'East+Timor': 'TLS',
        'Democratic+Republic+of+Congo': 'COD',
        'Swaziland': 'SWZ',
        'Cape+Verde': 'CPV',
        'C%C3%B4te+d%C2%B4Ivoire': 'CIV',
        'Ivory+Coast': 'CIV',
        'Channel+Islands': 'GBR',
    }
    query = country.replace('+', ' ')
    try:
        return pycountry.countries.search_fuzzy(query)[0].alpha_3
    except LookupError:
        # Names the fuzzy search does not resolve fall back to the table.
        return overrides.get(country)

def full_extent(ax, pad=0.0):
    """Return a bounding box covering an axes plus its tick labels and title.

    Args:
        ax: The matplotlib axes to measure.
        pad: Fractional padding; the box is expanded by ``1.0 + pad`` in both
            directions.

    Returns:
        The union ``Bbox`` of the window extents of the axes, its x/y tick
        labels and its title, expanded by the padding factor.
    """
    # Text extents are undefined until the figure has been drawn once.
    ax.figure.canvas.draw()

    artists = [*ax.get_xticklabels(), *ax.get_yticklabels(), ax, ax.title]
    extents = [artist.get_window_extent() for artist in artists]

    factor = 1.0 + pad
    return Bbox.union(extents).expanded(factor, factor)

def display_filepaths(filepaths, width=100, height=100):
    """Display the images at ``filepaths`` side by side in the notebook.

    Args:
        filepaths: Iterable of image file paths.
        width: Width, in pixels, given to each image widget.
        height: Height, in pixels, given to each image widget and to the row.

    If any path does not exist, a message is printed and nothing is shown.
    """
    try:
        image_bytes = []
        for filepath in filepaths:
            # Read under a context manager so every handle is closed promptly.
            with open(filepath, 'rb') as handle:
                image_bytes.append(handle.read())
    except FileNotFoundError:
        print('Filepath not found. If using CocoDatasetNoImages Class, some functionality is not available.')
        return

    images = [widgets.Image(value=payload, format='png', width=width, height=height) for payload in image_bytes]
    sidebyside = widgets.HBox(images, layout=Layout(height='{}px'.format(height)))
    display(sidebyside)

def dec_to_show(p):
    """Format a number for display.

    Values below 0.001 are returned as a string in scientific notation with
    three decimals; all other values are returned rounded to three decimals.
    """
    return '{:0.3e}'.format(p) if p < .001 else round(p, 3)

## Dataset
Fill in below with dataset and file path names
<a id="dataset"></a>

In [None]:
# Transform handed to the dataset: only converts each image to a tensor.
transform_train = transforms.Compose([ 
        transforms.ToTensor()
        ])
# Dataset to analyze, and the sub-folder name used under results/ and
# checkpoints/ for its files.
dataset = datasets.CoCoDataset(transform_train)
folder_name = 'coco_example'

# Alternative configuration: uncomment (and comment out the two lines above)
# to analyze OpenImages instead.
# dataset = datasets.OpenImagesDataset(transform_train)
# folder_name = 'openimages_supp'

In [None]:
import shutil

# Sub-folder of results/<folder_name>/ that receives the summary output.
save_loc = '1_pager_gen'
# Start from a clean output folder. rmtree avoids building a shell command
# from variables and works on every platform; a missing folder is ignored,
# just as the failed `rm -r` was before.
shutil.rmtree("results/{0}/{1}".format(folder_name, save_loc), ignore_errors=True)
# Open results.txt inside the (re-created) output folder.
file = folder(save_loc, folder_name)
# first_pass gates the summary bookkeeping in the metric functions;
# to_write collects the summary statements they produce, keyed by integer.
first_pass = True
to_write = {}
os.makedirs("checkpoints/{}".format(folder_name), exist_ok=True)

In [None]:
# Set this to the folder holding your images if they are not at the paths
# recorded in the result files.
data_folder = None

# NOTE: unpickling can execute arbitrary code; only load result files that
# were produced by a trusted run of the measurement scripts.
with open("results/{}/att_dis.pkl".format(folder_name), "rb") as handle:
    distances = pickle.load(handle)
# One file path taken from the results, used to probe whether the recorded
# image paths exist on this machine.
sample_file = distances[0][0][0][3]
print(sample_file)
if not os.path.exists(sample_file):
    assert data_folder is not None, "initialize data_folder with folder path of your data"
    dataset.init_folder_path(data_folder)
    print("overwriting from_path() function")
    dataset.from_path = dataset.from_path_prerun

In [None]:
# Names of the attribute groups, as exposed by the dataset object, and how
# many groups there are. Per-group lists below are indexed 0..num_attrs-1.
attr_names = dataset.attribute_names
num_attrs = len(attr_names)

# att_siz Metric: Distance from center, size, attribute label inference
<a id="att_siz"></a>

## Setup
<a id="att_siz_setup"></a>

In [None]:
# Hide this cell's input and show a link that toggles the input of the next
# cell (the att_siz setup code). The returned HTML is the cell's displayed value.
hide_toggle(for_next=True, toggle_text='Show/hide M1 code')

In [None]:
# NOTE: unpickling can execute arbitrary code; only load result files that
# were produced by a trusted run of the measurement scripts.
with open("results/{}/att_siz.pkl".format(folder_name), "rb") as handle:
    info = pickle.load(handle)
sizes = info['sizes']
dists = info['distances']
tiny_sizes = info['tiny_sizes']
no_faces = info['noface_sizes']

# Per group, gather the scene labels stored with the discarded images
# (chunk[2] of each no-face entry, chunk[1] of each too-small entry), then keep
# only chunk[0] of every entry -- presumably the person size; verify against
# the script that writes att_siz.pkl.
scenes = [None]*num_attrs
for attr in range(num_attrs):
    try:
        scenes[attr]=np.array(list(itertools.chain.from_iterable([chunk[2] for chunk in no_faces[attr]])) + list(itertools.chain.from_iterable([chunk[1] for chunk in tiny_sizes[attr]])))
    except TypeError:
        # NOTE(review): this tests the whole tiny_sizes list rather than
        # tiny_sizes[attr]; confirm which one was intended.
        if len(tiny_sizes)==0:
            print("There are no images with faces too small to label for group: {0}".format(attr_names[attr]))
        else:
            print("There are no images where a face is not detected for group: {0}".format(attr_names[attr]))
    tiny_sizes[attr] = [chunk[0] for chunk in tiny_sizes[attr]]
    no_faces[attr] = [chunk[0] for chunk in no_faces[attr]]

with open('util_files/places_scene_info.pkl', 'rb') as handle:
    info = pickle.load(handle)
idx_to_scene = info['idx_to_scene']
idx_to_scenegroup = info['idx_to_scenegroup']
sceneidx_to_scenegroupidx = info['sceneidx_to_scenegroupidx']

# Scene-group names, wrapped to 30 characters, used as bar labels.
xaxis = [idx_to_scenegroup[i] for i in range(len(idx_to_scenegroup))]
xaxis = ['\n'.join(textwrap.wrap(chunk, width=30)) for chunk in xaxis]
barWidth = .4
fontsize = 15

# Base position of each scene group's cluster of bars: one bar per attribute
# group plus a 0.2 gap.
r1 = np.arange(len(idx_to_scenegroup))
r1 = r1 * ((barWidth * num_attrs) + .2)

# Count discarded images per scene for every group. minlength pads each
# histogram to the number of scene groups so all groups share one length (the
# plot and np.sum below need that). Casting to int lets an empty label list,
# which NumPy would otherwise turn into a float array, be counted as zeros;
# a group whose labels could not be gathered above is counted as empty too.
num_scenegroups = len(idx_to_scenegroup)
scenes = [
    np.bincount(
        np.asarray(scenes[i] if scenes[i] is not None else [], dtype=int),
        minlength=num_scenegroups,
    )
    for i in range(num_attrs)
]
total_images = np.sum(scenes)
scenes_ratio = [scenes[i]/total_images for i in range(num_attrs)]

# Sizes of every labelled person per group, discarded or not.
all_sizes = [tiny_sizes[i]+no_faces[i]+sizes[i] for i in range(num_attrs)]

def _print_group_counts(counts):
    """Print one '<count> were <group>' line for every group with a non-zero count."""
    for i in range(num_attrs):
        if counts[i] > 0:
            print("{0} were {1}".format(counts[i], attr_names[i]))


def _print_most_common_label(counts, total, message):
    """Print ``message`` for the group holding the largest share of ``total``.

    ``message`` is formatted with the group name and its share as a
    percentage. Nothing is printed when no group has a positive share.
    ``total`` is expected to be a NumPy scalar, so a zero total yields NaN
    shares (which are skipped) instead of raising ZeroDivisionError.
    """
    best_share = 0
    best_attr = -1
    for attr in range(num_attrs):
        share = counts[attr] / total
        if not math.isnan(share) and share > best_share:
            best_share = share
            best_attr = attr
    if best_attr > -1:
        # Scale before rounding: rounding first and then multiplying by 100
        # printed floating-point artifacts such as 56.99999999999999.
        print(message.format(attr_names[best_attr], round(best_share * 100, 2)))


def numbers_where_attribute_inferred():
    """Print how often each group was labelled overall and among discarded images.

    Three sections are printed: all labelled images, images discarded because
    the person was too small, and images discarded because no face was
    detected. Each section gives the total, the per-group counts and the most
    common group. Finally the share of all discarded images held by the most
    common group is printed; on the first pass it is also stored in
    ``to_write[0]`` when it lies outside [0.45, 0.55].
    """
    tiny = [len(tiny_sizes[i]) for i in range(num_attrs)]
    noface = [len(no_faces[i]) for i in range(num_attrs)]
    original = [tiny[i]+noface[i]+len(sizes[i]) for i in range(num_attrs)]

    total_original = np.sum(original)
    if total_original > 0:
        print("Total labelled images: {0},".format(total_original))
    _print_group_counts(original)
    _print_most_common_label(
        original, total_original,
        "{0} is assigned to {1}% labelled images in the dataset, and is the most commonly assigned label")
    print()

    tiny_total = np.sum(tiny)
    if tiny_total > 0:
        print("Discarded {0} images for being too small,".format(tiny_total))
    _print_group_counts(tiny)
    _print_most_common_label(
        tiny, tiny_total,
        "{0} is assigned to {1}% labelled images where a person is too small to properly see, and is the most commonly assigned label among such images")
    print()

    noface_total = np.sum(noface)
    if noface_total > 0:
        print("Discarded {0} images for having no face detected,".format(noface_total))
    _print_group_counts(noface)
    _print_most_common_label(
        noface, noface_total,
        "{0} is assigned to {1}% labelled images where a face is not detected, and is the most commonly assigned label among such images")

    # Share of all discarded images that went to the most common group.
    labelled = [tiny[i]+noface[i] for i in range(num_attrs)]
    max_labelled = np.argmax(labelled)
    labelled_others = np.sum(labelled)
    prob = labelled[max_labelled] / labelled_others
    prob_statement = "Probability image is labeled {0} when it should not be, i.e. given there's no face detected or person is too small: {1}".format(attr_names[max_labelled], round(prob, 4))
    # NOTE(review): the 0.45/0.55 band is centred on 0.5, which looks like a
    # two-group assumption; with more groups an even split is 1/num_attrs --
    # confirm the intended threshold.
    if (prob < .45 or prob > .55) and first_pass:
        to_write[0] = ["(att_siz) " + prob_statement]
    print()
    print(prob_statement)
    
def scenes_where_no_face():
    """Plot, per scene, the share of discarded images belonging to each group.

    Scenes are ordered by the ratio between the largest and the smallest
    group share in that scene. On the first pass, the scene with the largest
    ratio is recorded in ``to_write[1]`` when that ratio exceeds 1.
    """
    plt.figure(figsize=(12, 6))

    # r[i] holds the bar positions of group i: each group is shifted by one
    # bar width from the previous one.
    r = [r1]
    for _ in range(num_attrs):
        r.append([x + barWidth for x in r[-1]])

    num_scenes = len(scenes[0])
    scene_attr_ratios = [0] * num_scenes
    max_scenes = [0] * num_scenes
    for scene in range(num_scenes):
        max_scene = 0
        min_scene = 1
        max_attr = -1
        for attr in range(num_attrs):
            ratio = scenes_ratio[attr][scene]
            if ratio > max_scene:
                max_scene = ratio
                max_attr = attr
            if ratio < min_scene:
                min_scene = ratio
        # Largest share over smallest share; inf/NaN when a share is zero
        # (NumPy division, errors silenced at notebook setup).
        scene_attr_ratios[scene] = max_scene / min_scene
        max_scenes[scene] = max_attr

    order = np.argsort(scene_attr_ratios)
    # The ratio is max/min, so it can never be below 1; only the scene with
    # the largest ratio is worth reporting.
    if first_pass and scene_attr_ratios[order[-1]] > 1.:
        to_write[1] = ["{0} is the scene where the label of {1} is most likely to be picked over that of others".format(xaxis[order[-1]], attr_names[max_scenes[order[-1]]])]

    # Draw groups in reverse so the legend lists them in that order.
    for i in range(num_attrs-1, -1, -1):
        plt.barh(r[i], scenes_ratio[i][order], height=barWidth, color=COLORS[i], edgecolor='white', label=attr_names[i])
    ticks = r[0]+(num_attrs/2)*barWidth
    plt.yticks(ticks, np.array(xaxis)[order], fontsize=fontsize)
    plt.xticks(fontsize=fontsize)
    plt.ylabel('Scene', fontsize=fontsize)
    plt.xlabel('Proportion of Labelled Images Discarded with this Scene', fontsize=fontsize)
    plt.legend(loc='best', prop={'size': fontsize})
    plt.title("Scenes where image was labeled when it should not have been", fontsize=fontsize)
    plt.tight_layout()
    plt.gcf().subplots_adjust(bottom=0.18)
    plt.gcf().subplots_adjust(left=0.4)
    plt.show()
    
# Dropdown selecting which per-group distribution compare_sizedist shows; each
# option string matches a branch at the end of that function.
comparisons_widget = widgets.Dropdown(options=['Sizes', 'Distances', 'All sizes', 'Sizes where no face was detected'], value='Sizes')

def compare_sizedist(metric):
    """Compare a per-group distribution across the attribute groups.

    Args:
        metric: One of the dropdown options ('Sizes', 'Distances',
            'All sizes', 'Sizes where no face was detected') to plot that
            distribution and print its statistics, or 'first_pass' to run the
            size and distance comparisons silently for the summary (only done
            while the global ``first_pass`` flag is set). Any other value
            does nothing.
    """
    def mean_and_std(data, data_type):
        """Plot normalized histograms of ``data`` per group and report statistics.

        ``data`` is indexed by attribute group; ``data_type`` selects the axis
        label and the wording of the summary statement.
        """
        # Find the pair of groups with the smallest t-test p-value.
        min_p = 100
        one = -1
        two = -1
        for i in range(num_attrs):
            for a in range(i+1, num_attrs):
                _, p = stats.ttest_ind(data[i], data[a])
                if p < min_p:
                    min_p = p
                    one = i
                    two = a
        p = min_p

        to_save = False
        if metric == 'first_pass' and p < .05 and first_pass:
            data_descrip = ''
            if data_type == 'dists':
                data_descrip = 'Distance from center'
            if data_type == 'sizes':
                data_descrip = 'Fraction of image taken up by a person'
            # NOTE(review): the sizes and distances passes both write
            # to_write[2] and 0.png, so the later pass overwrites the earlier
            # one -- confirm this is intended.
            to_write[2] = ["(att_siz) {0} is different between the attributes with a p-value of {1} for the most significant pair ({2} and {3}), distribution shown below".format(data_descrip, dec_to_show(p), attr_names[one], attr_names[two])]
            to_save = True

        # np.trapz is deprecated in NumPy 2 in favour of np.trapezoid; use
        # whichever this NumPy provides.
        trapezoid = getattr(np, 'trapezoid', None) or np.trapz
        for i in range(num_attrs):
            histogram, bin_edges = np.histogram(data[i], bins='auto')
            bin_centers = 0.5*(bin_edges[1:] + bin_edges[:-1])
            # Normalize each curve by its area so groups of different sizes
            # are comparable.
            area = trapezoid(histogram, x=bin_centers)
            plt.plot(bin_centers, histogram/area, alpha=.75, label=attr_names[i], color=COLORS[i])

        plt.legend(loc='upper right')
        plt.xlabel('Distances' if data_type == 'dists' else 'Sizes')
        plt.ylabel('Frequency')
        if to_save:
            # to_save is only set during the first pass: keep the figure for
            # the summary instead of showing it.
            plt.savefig("results/{0}/{1}/0.png".format(folder_name, save_loc))
            plt.close()
        elif metric == 'first_pass':
            plt.close()
        else:
            plt.show()

        if metric != 'first_pass':
            mean = [np.mean(data[i]) for i in range(num_attrs)]
            std = [np.std(data[i]) for i in range(num_attrs)]
            for i in range(num_attrs):
                print("{0}: {1} +- {2}\n".format(attr_names[i], round(mean[i], 4), round(std[i], 4)))
            print("The smallest P value, which is between the groups {} and {}: {}\n".format(attr_names[one],attr_names[two], '{:0.3e}'.format(p)))

    if metric == 'Sizes':
        mean_and_std(sizes, 'sizes')
    elif metric == 'All sizes':
        mean_and_std(all_sizes, 'all_sizes')
    elif metric == 'Sizes where no face was detected':
        mean_and_std(no_faces, 'no_faces')
    elif metric == 'Distances':
        mean_and_std(dists, 'dists')
    elif metric == 'first_pass' and first_pass:
        mean_and_std(sizes, 'sizes')
        mean_and_std(dists, 'dists')


## Analyses 
<a id="att_siz_analyses"></a>

Statistics on how many attribute labels were inferred when they shouldn't have been because the person was either too small, or no face was detected. The scenes where this happens are shown to investigate if perhaps annotators are relying on contextual clues to make this assumption.

In [None]:
# Print the label counts for all and for discarded images, then plot the
# scenes in which the discarded images occur.
numbers_where_attribute_inferred()
scenes_where_no_face()

Distribution by attribute of sizes and distances after removing images where the attribute was unlikely to be able to be labeled, of all sizes before any images were removed, and of the sizes of people where no face was detected.

In [None]:
# On the first pass, run the size and distance comparisons once in
# 'first_pass' mode, which records summary output instead of showing plots.
if first_pass:
    compare_sizedist('first_pass')
# Show the dropdown and re-run compare_sizedist whenever its value changes.
all_things = [comparisons_widget]
ui = HBox(all_things)
out = widgets.interactive_output(compare_sizedist, {'metric': comparisons_widget})
display(ui, out)