In [None]:
import pandas as pd
from collections import defaultdict
import pickle
import gzip
import numpy as np
from collections import defaultdict
from multilabelexplanations import distance_functions
from scipy.spatial import distance
import seaborn as sns

In [None]:
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
%pylab inline

In [None]:
# Precomputed pairwise feature-space distance matrix of every dataset, keyed by
# dataset name (presumably in square form, one row/column per instance -- TODO confirm).
feat_matrices = defaultdict(list)
for dataset in ('yeast', 'woman', 'medical'):
    pdist_path = "../dataset/%s_featspace_pdist.csv" % dataset
    pdist_matrix_squared = pd.read_csv(pdist_path, header=None).values
    feat_matrices[dataset] = pdist_matrix_squared

In [None]:
# Name prefix that identifies the label (Y) columns in each dataset's CSV file
columns_ylist = {
    'woman': 'service',
    'yeast': 'Class',
    'medical': 'Class',
}

## Statistical dispersion

For each instance to be explained, I select its $k$ nearest neighbors and measure their dispersion as the SSE (the sum of the squared differences between each observation and the group mean) divided by $k$, i.e. the mean squared distance from the neighborhood mean $\bar{v}$:
$$\frac{1}{k}\sum_{i=1}^k\lVert\vec{v}_i-\bar{v}\rVert^2$$

In [None]:
def neigh_sse(dataset):
    """
    Compute the dispersion of the k-nearest-neighbour group of every instance.

    For each instance of ``../dataset/<dataset>_2e.csv`` the k rows with the
    smallest value in the corresponding row of ``feat_matrices[dataset]`` are
    selected, and the mean squared Euclidean distance between those rows and
    their mean vector is computed. k is ``int(0.5 * sqrt(n))`` with n the number
    of instances, but never less than 1.

    :param dataset: str, dataset name; used to build the CSV path and as key of
        ``columns_ylist`` and ``feat_matrices``
    :return: list with one dispersion value per instance, in index order
    """
    df_2e = pd.read_csv('../dataset/%s_2e.csv' % dataset)
    # Feature columns are all the columns that do not carry the label prefix
    label_prefix = columns_ylist[dataset]
    cols_X = [col for col in df_2e.columns if not col.startswith(label_prefix)]
    X2E = df_2e[cols_X]
    # max(1, ...) keeps the neighbourhood non-empty (and the average defined)
    # when the dataset has fewer than 4 instances
    k = max(1, int(0.5 * np.sqrt(len(X2E))))

    # Wrap the distance matrix once instead of once per instance
    pdist = pd.DataFrame(feat_matrices[dataset])

    sse_list = []
    for instance in X2E.index.values:
        neigh_idx = pdist.loc[instance].sort_values().index.values[:k]
        sampleKnn_feat_space = X2E.loc[neigh_idx]
        deviations = sampleKnn_feat_space - sampleKnn_feat_space.mean()
        squared_dist = np.linalg.norm(deviations, axis=1) ** 2
        sse_list.append(np.sum(squared_dist) / len(sampleKnn_feat_space))
    return sse_list

In [None]:
# Per-neighbourhood dispersion of the yeast instances: average value and histogram
yeast_sse_list = neigh_sse('yeast')
print(np.mean(yeast_sse_list))
plt.hist(yeast_sse_list)
plt.title('Yeast')
plt.show()

In [None]:
# Dispersion of the whole yeast dataset around its global mean vector
df_2e = pd.read_csv('../dataset/yeast_2e.csv')
label_prefix = columns_ylist['yeast']
cols_Y = [col for col in df_2e.columns if col.startswith(label_prefix)]
cols_X = [col for col in df_2e.columns if col not in cols_Y]
X2E_yeast = df_2e[cols_X]
squared_norms = np.linalg.norm(X2E_yeast - X2E_yeast.mean(), axis=1) ** 2
mean_squared_dist = np.sum(squared_norms) / len(X2E_yeast)
print(mean_squared_dist)

In [None]:
# Per-neighbourhood dispersion of the woman instances: average value and histogram
woman_sse_list = neigh_sse('woman')
plt.hist(woman_sse_list)
print(np.mean(woman_sse_list))
plt.title('Woman')
plt.show()

In [None]:
# Dispersion of the whole woman dataset around its global mean vector
df_2e = pd.read_csv('../dataset/woman_2e.csv')
label_prefix = columns_ylist['woman']
cols_Y = [col for col in df_2e.columns if col.startswith(label_prefix)]
cols_X = [col for col in df_2e.columns if col not in cols_Y]
X2E_woman = df_2e[cols_X]
squared_norms = np.linalg.norm(X2E_woman - X2E_woman.mean(), axis=1) ** 2
mean_squared_dist = np.sum(squared_norms) / len(X2E_woman)
print(mean_squared_dist)

In [None]:
# Per-neighbourhood dispersion of the medical instances: average value and histogram
medical_sse_list = neigh_sse('medical')
plt.hist(medical_sse_list)
print(np.mean(medical_sse_list))
plt.title('Medical')
plt.show()

In [None]:
# Dispersion of the whole medical dataset around its global mean vector
df_2e = pd.read_csv('../dataset/medical_2e.csv')
label_prefix = columns_ylist['medical']
cols_Y = [col for col in df_2e.columns if col.startswith(label_prefix)]
cols_X = [col for col in df_2e.columns if col not in cols_Y]
X2E_medical = df_2e[cols_X]
squared_norms = np.linalg.norm(X2E_medical - X2E_medical.mean(), axis=1) ** 2
mean_squared_dist = np.sum(squared_norms) / len(X2E_medical)
print(mean_squared_dist)

## Mean mixed distance

For each instance to be explained, I select its $k$ nearest neighbors and measure their dispersion as the mean *mixed distance* (which takes values between 0 and 1) between each observation and the group mean. We do this to obtain results that are comparable across all the datasets.
$$\frac{1}{k}\sum_{i=1}^k d_{mix}(\vec{v}_i,\bar{v})$$

In [None]:
def mixed_distance(x, y, n_var_cont, cdist, ddist):
    """
    Return the mixed distance between instance x and instance y.

    The first ``n_var_cont`` entries of each instance are compared with ``cdist``
    and the remaining entries with ``ddist``; the two results are combined with
    weights equal to the fraction of entries in each group. A group with no
    entries contributes 0 and its distance function is not called.

    :param x: sliceable sequence (e.g. np.array), instance 1
    :param y: sliceable sequence (e.g. np.array), instance 2
    :param n_var_cont: int, number of leading entries treated as continuous
    :param cdist: function, distance function for continuous variables
    :param ddist: function, distance function for discrete variables
    :return: weighted sum of the continuous and the discrete distance
    """
    n_var_disc = len(x[n_var_cont:])
    n_var_tot = n_var_cont + n_var_disc

    # Continuous part: leading n_var_cont entries
    cont_term = 0.
    if n_var_cont != 0:
        cont_term = (n_var_cont / n_var_tot) * cdist(x[:n_var_cont], y[:n_var_cont])

    # Discrete part: everything after the continuous entries
    disc_term = 0.
    if n_var_disc != 0:
        disc_term = (n_var_disc / n_var_tot) * ddist(x[n_var_cont:], y[n_var_cont:])

    return disc_term + cont_term

In [None]:
# Dictionary keyed by dataset name whose value is a list of lists:
# list[0] = names of the continuous variables, list[1] = names of the discrete variables.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file from a trusted source.
with open('../dataset/dict_names.pickle', 'rb') as handle:
    columns_type_dataset = pickle.load(handle)

In [None]:
def create_dist_func(dataset):
    """
    Build the two-argument mixed distance function of a dataset.

    The returned callable forwards to ``mixed_distance`` with the number of
    continuous variables listed in ``columns_type_dataset[dataset][0]``,
    ``distance_functions.normalized_euclidean_distance`` for the continuous part
    and ``scipy.spatial.distance.hamming`` for the discrete part.

    :param dataset: str, key of ``columns_type_dataset``
    :return: function (x, y) -> mixed distance between x and y
    """
    def mydist(x, y):
        # Looked up at call time, exactly like the keyword arguments of a lambda
        continuous_names = columns_type_dataset[dataset][0]
        return mixed_distance(x, y,
                              n_var_cont=len(continuous_names),
                              cdist=distance_functions.normalized_euclidean_distance,
                              ddist=distance.hamming)

    return mydist

In [None]:
def mixed_distance_from_mean(dataset):
    """
    Compute the mean mixed distance from the mean vector in the
    k-nearest-neighbour group of every instance.

    For each instance of ``../dataset/<dataset>_2e.csv`` the k rows with the
    smallest value in the corresponding row of ``feat_matrices[dataset]`` are
    selected; their columns are ordered as continuous variables followed by
    discrete variables (from ``columns_type_dataset``), and the distance built
    by ``create_dist_func`` between each row and the group mean vector is
    averaged. k is ``int(0.5 * sqrt(n))`` with n the number of instances, but
    never less than 1.

    :param dataset: str, dataset name; used to build the CSV path and as key of
        ``columns_ylist``, ``feat_matrices`` and ``columns_type_dataset``
    :return: list with one mean mixed distance per instance, in index order
    """
    df_2e = pd.read_csv('../dataset/%s_2e.csv' % dataset)
    # Feature columns are all the columns that do not carry the label prefix
    label_prefix = columns_ylist[dataset]
    cols_X = [col for col in df_2e.columns if not col.startswith(label_prefix)]
    X2E = df_2e[cols_X]
    # max(1, ...) keeps the neighbourhood non-empty (and the average defined)
    # when the dataset has fewer than 4 instances
    k = max(1, int(0.5 * np.sqrt(len(X2E))))
    mydist = create_dist_func(dataset)

    # Loop invariants: wrap the distance matrix and reorder the columns
    # (continuous variables first, then discrete ones) only once
    pdist = pd.DataFrame(feat_matrices[dataset])
    sorted_X2E = X2E[columns_type_dataset[dataset][0] + columns_type_dataset[dataset][1]]

    mixed_distances_list = []
    for instance in X2E.index.values:
        # select the first k neighbours
        neigh_idx = pdist.loc[instance].sort_values().index.values[:k]
        sorted_sampleKnn_feat_space = sorted_X2E.loc[neigh_idx]
        # mean vector of the neighbourhood
        mean_values = sorted_sampleKnn_feat_space.mean().values
        # mixed distance between every neighbour and the mean vector
        dist_to_mean = sorted_sampleKnn_feat_space.apply(
            lambda row: mydist(row.values, mean_values), axis=1)
        mixed_distances_list.append(np.sum(dist_to_mean) / len(sorted_sampleKnn_feat_space))
    return mixed_distances_list

In [None]:
# Mean mixed distance from the neighbourhood mean, one value per instance, for each dataset
mean_mixed_distance_from_mean_yeast = mixed_distance_from_mean('yeast')
mean_mixed_distance_from_mean_woman = mixed_distance_from_mean('woman')
mean_mixed_distance_from_mean_medical = mixed_distance_from_mean('medical')

In [None]:
# Overlaid histograms of the mean mixed distance for the three datasets
fig = plt.figure(figsize=(9, 4))
ax = fig.add_subplot(111)

# Same bin edges for every dataset so that the bars are directly comparable
hist_bins = np.linspace(0, 0.65, 20)
hist_specs = [
    ('Medical mean value: %.2f', mean_mixed_distance_from_mean_medical, 'green'),
    ('Yeast mean value: %.2f', mean_mixed_distance_from_mean_yeast, 'r'),
    ('Woman mean value: %.2f', mean_mixed_distance_from_mean_woman, 'b'),
]
for label_fmt, values, colour in hist_specs:
    ax.hist(values, label=label_fmt % np.mean(values), alpha=0.7, color=colour, bins=hist_bins)

ax.legend(bbox_to_anchor=(0.8, 1), loc=2, borderaxespad=0., fancybox=True, shadow=True, fontsize=15)
ax.set_xlabel('Mean mixed distance in the core real neighborhood', fontsize=20)
ax.tick_params(axis='both', labelsize=14)

plt.savefig('../fig/dispersion_hist.png', format='png', bbox_inches='tight')
plt.show()

In [None]:
# Yeast vs. Woman histograms of the mean mixed distance, each on its own y-axis
# (red ticks for Yeast on ax1, blue ticks for Woman on the twin axis ax2).

# The size is passed at creation time: calling figsize(10, 7) after plt.subplots()
# only changes the default for figures created later, not this one.
fig, ax1 = plt.subplots(figsize=(10, 7))
hist_bins = np.linspace(0, 0.65, 20)

yeast_label = 'Yeast mean value: %.2f' % np.mean(mean_mixed_distance_from_mean_yeast)
n, bins, yeast_hist = ax1.hist(mean_mixed_distance_from_mean_yeast, label=yeast_label,
                               alpha=0.7, color='r', bins=hist_bins)
ax1.set_xlabel('Mean mixed distance in the core real neighborhood', fontsize=20)
ax1.tick_params(axis='y', colors='red', size=12, labelsize=14)
ax1.tick_params(axis='x', size=12, labelsize=14)

ax2 = ax1.twinx()
woman_label = 'Woman mean value: %.2f' % np.mean(mean_mixed_distance_from_mean_woman)
n, bins, woman_hist = ax2.hist(mean_mixed_distance_from_mean_woman, label=woman_label,
                               alpha=0.7, color='b', bins=hist_bins)
ax2.tick_params(axis='y', colors='blue', size=12, labelsize=14)

# One legend entry per histogram: pass a single bar of each as handle instead of
# every bar patch (the remaining bars carry '_nolegend_' labels and relying on
# legend() to drop them is deprecated).
lns = [yeast_hist[0], woman_hist[0]]
labs = [yeast_label, woman_label]
ax1.legend(lns, labs, loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=2,
           fancybox=True, shadow=True, fontsize=20)

for axis in (ax1, ax2):
    axis.spines["top"].set_visible(False)

plt.savefig('../fig/dispersion_hist_woman_yeast.png', format='png', bbox_inches='tight')
plt.show()