# Implementing analogical explanations

In [1]:
import numpy as np
from math import *
import random
from random import randint
from scipy.spatial.distance import hamming
import itertools
from itertools import combinations
from sklearn.utils import shuffle
from functools import reduce
#FOR CHI-SQUARE - MUTUAL INFORMATION
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from matplotlib import pyplot

## ACTION 1:  Dataset generation

In [2]:
'''Create initial artificial dataset of 1024 elements a = a binary CSV file for Boolean function'''
'''The class of a is the last element in a raw. The class is given as a Boolean function f.'''
# NOTE: in the string above, "raw" means "row": each CSV row holds the
# attribute values of one element followed by its class.
# Path of the CSV file that create_dataset() writes.
dataset_csv= "dataset.csv"
#FUNCTIONS TO BE TESTED
def xor(x,y):
    """Exclusive-or on truthiness: truthy exactly when one of x, y is truthy.

    The returned object is whatever the short-circuit evaluation lands on
    (for instance 0, 1, True or False); it is not normalized to bool.
    """
    only_y = (not x) and y
    if only_y:
        return only_y
    return x and (not y)
        
def g1(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Class function x1 AND (x2 OR x3); returns 1 or 0.

    x4..x10 are accepted so that every class function shares the same
    signature, but they do not influence the result.
    """
    return 1 if x1 and (x2 or x3) else 0

def g2(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Class function x1 XOR x2; returns 1 or 0.

    x3..x10 are part of the common signature but are ignored.
    """
    # Exactly one of the two inputs is truthy <=> their truth values differ.
    return 1 if bool(x1) != bool(x2) else 0

def g3(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Class function x1 XOR x2 XOR x3 (built from xor); returns 1 or 0.

    x4..x10 are part of the common signature but are ignored.
    """
    parity = xor(xor(x1, x2), x3)
    return 1 if parity else 0

def g4(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Class function that is 1 when the ten inputs sum to exactly 3, else 0."""
    total = sum((x1, x2, x3, x4, x5, x6, x7, x8, x9, x10))
    return 1 if total == 3 else 0

def g5(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Class function (x1 OR x2 OR x3) AND (x4 OR NOT x5 OR x6); returns 1 or 0.

    x7..x10 are part of the common signature but are ignored.
    """
    # First clause fails: no need to look at the second one.
    if not (x1 or x2 or x3):
        return 0
    if x4 or x6 or not x5:
        return 1
    return 0

def g6(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Class function x1 AND (x2 OR NOT x3); returns 1 or 0.

    x4..x10 are part of the common signature but are ignored.
    """
    if not x1:
        return 0
    # With x1 set, the only failing case is x2 false together with x3 true.
    if x3 and not x2:
        return 0
    return 1

def g7(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Class function that is 1 when x1 + x2 + x3 equals exactly 2, else 0.

    x4..x10 are part of the common signature but are ignored.
    """
    return 1 if sum((x1, x2, x3)) == 2 else 0
    
def create_dataset(f):
    """Write the full truth table of f over 10 Boolean inputs to dataset_csv.

    Each of the 2**10 input combinations becomes one CSV line: the ten
    attribute values followed by the class f(...) as the last field.
    f must accept ten positional arguments.
    """
    with open(dataset_csv, 'w') as outfile:
        for bits in itertools.product(range(2), repeat=10):
            label = f(*bits)
            fields = [str(value) for value in bits]
            fields.append(str(label))
            outfile.write(",".join(fields) + "\n")
    print("Dataset successfully created with the class function you have chosen.")

## ACTION 2: Sample dataset, compute pairs from S, build  full profile for all pairs

In [3]:
#GENERATE RANDOM SAMPLE SET OF size ELEMENTS FROM A DATASET
def generate_sample_set(dataset,size):
    """Return the first `size` entries of a shuffled version of `dataset`.

    The shuffling is delegated to sklearn.utils.shuffle; the result is the
    leading slice of whatever that call returns.
    """
    return shuffle(dataset)[:size]

def pick_a_random_element(sample_set): #return a random (a,cl(a))
    """Return one row of `sample_set` chosen uniformly at random.

    sample_set must expose `.shape[0]` (its number of rows) and support
    integer indexing; the returned row still carries its class as last
    component.

    Raises ValueError (from randint) if sample_set has no rows.
    """
    # randint includes BOTH endpoints, so the upper bound must be the last
    # valid index, not the number of rows.
    n = randint(0, sample_set.shape[0] - 1)
    return sample_set[n]

#COMPUTE ALL PAIRS
def compute_all_pairs(sample_set):
    """Return the list of all unordered pairs (as 2-tuples) of rows of sample_set.

    Rows are taken in index order, so pair (i, j) always has i < j.
    sample_set must expose `.shape[0]` and integer indexing.
    """
    rows = (sample_set[index] for index in range(sample_set.shape[0]))
    return list(combinations(rows, 2))

def mask_item(a,mask):  # a = [1,0,...,cl(a)]
    """Return a list holding `x and y` for each aligned pair of a and mask.

    With a 0/1 mask this zeroes the components whose mask entry is 0 and
    keeps the others. The result is as long as the shorter of the inputs.
    """
    masked = []
    for value, keep in zip(a, mask):
        masked.append(value and keep)
    return masked

def build_profile(a,b,dim): #  A profile is a vector of dim 10 but len(a) = 11
    """Build the profile of the ordered pair (a, b) over the first dim positions.

    Position i holds -1 where a and b agree, and int(a[i]) where they
    differ, so the profile of (a, b) generally differs from that of (b, a).
    """
    return [-1 if a[i] == b[i] else int(a[i]) for i in range(dim)]
    
# full profile =[a,b,profile(a,b),xor,masked(a),masked(b),masked_profile]
def build_pair_with_full_profile(a,b,dim,mask): #a = [a, cl(a)] and b  2 vectors of same dim with class included
    """Assemble the 7-slot "full profile" record for the pair (a, b).

    Returned list layout:
      [0] a, [1] b, [2] profile of (a, b) on the first dim positions,
      [3] xor of the two classes (read at index dim),
      [4] masked a, [5] masked b, [6] profile of the masked pair.
    """
    hidden_a = mask_item(a, mask)
    hidden_b = mask_item(b, mask)
    class_differs = xor(a[dim], b[dim])
    return [
        a,
        b,
        build_profile(a, b, dim),
        class_differs,
        hidden_a,
        hidden_b,
        build_profile(hidden_a, hidden_b, dim),
    ]
        
#profile(a,b) != profile(b,a)
def create_all_pairs_and_profile(set_of_pairs,dim,mask): #[a,b,[-1 agreement, a_i where disag],xor(cl(a),cl(b))]
    """Return the full-profile record of every pair in set_of_pairs, in order.

    Each record is produced by build_pair_with_full_profile(a, b, dim, mask).
    """
    return [
        build_pair_with_full_profile(first, second, dim, mask)
        for (first, second) in set_of_pairs
    ]

#Filtering full profiles
def opposite(bool):
    """Return 1 - bool, i.e. swap 0 and 1 (note that -1 maps to 2)."""
    flipped = 1 - bool
    return flipped

def are_matching_profiles(p1,p2): # take care of the fact that (a,b) should be equiv to (b,a) in term of profile
    """Tell whether two profiles describe the same pair up to its orientation.

    Profiles match when they are equal, or when at every position either
    both hold -1 (agreement) or the value in p1 is the opposite (1 - v) of
    the value in p2, which is what swapping a and b produces.
    """
    if p1 == p2:
        return True
    for position in range(len(p1)):
        both_agree = p1[position] == -1 and p2[position] == -1
        if not (both_agree or p1[position] == opposite(p2[position])):
            return False
    return True
            
def get_all_pairs_from_profile(profile,all_pairs_and_profiles,masked=False):
    """Select the full-profile records whose profile matches `profile`.

    Records have the layout [a, b, profile(a,b), xor, masked(a), masked(b),
    masked_profile]. The plain profile (slot 2) is compared unless `masked`
    is true, in which case the masked profile (slot 6) is used.
    """
    slot = 6 if masked else 2
    return [
        record
        for record in all_pairs_and_profiles
        if are_matching_profiles(record[slot], profile)
    ]

def get_all_pairs_from_profile_with_class_info(cl,profile,all_pairs_and_profiles,masked):
    """Split the records matching `profile` by whether the class changes.

    Only records whose first element a has class `cl` are kept. The class
    is read from the LAST component of a, so the function works for any
    number of attributes (it used to be hard-wired to index 10).

    Args:
        cl: class value that a must have.
        profile: profile to match (see get_all_pairs_from_profile).
        all_pairs_and_profiles: records laid out as
            [a, b, profile, xor, masked(a), masked(b), masked_profile].
        masked: compare masked profiles (slot 6) instead of plain ones.

    Returns:
        (same_class, class_change): records where slot 3 (xor of the two
        classes) equals 0, and records where it does not.
    """
    matching_profiles = get_all_pairs_from_profile(profile,all_pairs_and_profiles,masked)
    matching_profiles_same_class=[]
    matching_profiles_class_change=[]
    for p in matching_profiles:
        if p[0][-1] != cl:  # cl(a) is the last component of a
            continue
        if p[3] == 0:
            matching_profiles_same_class.append(p)
        else:
            matching_profiles_class_change.append(p)
    return matching_profiles_same_class, matching_profiles_class_change

## ACTION 3: Define utilities

In [4]:
def Ag(a,b,list_of_attributes): #agreement set between Boolean vectors of same dimension  - attribute list should be parameter
    """Return the attribute indices (from list_of_attributes, in order) where a and b agree."""
    return [index for index in list_of_attributes if a[index] == b[index]]

#Dis is never empty because a is always distinct from b
def Dis(a,b,list_of_attributes):  #disagreement set between Boolean vectors of same dimension
    """Return the attribute indices (from list_of_attributes, in order) where a and b differ."""
    return [index for index in list_of_attributes if a[index] != b[index]]

# Hamming distance between Boolean vectors of same dimension - whatever the dimension
def H(a,b):
    """Hamming distance: number of aligned positions where a and b differ.

    Comparison stops at the end of the shorter sequence.
    """
    mismatches = 0
    for left, right in zip(a, b):
        if left != right:
            mismatches += 1
    return mismatches

# k-nn
def get_neighbors(X,dim,a,k):  #the k nearest neighbors of a in dataset X : a= a1...a10 class
    """Return the nearest neighbours of `a` in X by Hamming distance.

    The distance is computed on the first `dim` components only, so the
    class label stored after them does not influence it. (Previously `dim`
    was overwritten with X.shape[1], which silently included the class.)

    Args:
        X: 2-D array whose rows are [attributes..., class].
        dim: number of leading attribute columns used for the distance.
        a: query vector with the same layout as a row of X.
        k: rank bound; the rows ranked 1 .. k-1 are returned.

    Returns:
        (nn_distances, nn_neighbors): two parallel lists sorted by
        increasing distance (ties keep the row order of X). The row ranked
        0 is dropped on the assumption that it is `a` itself, so at most
        k-1 neighbours are returned; fewer if X has fewer rows.
    """
    features = np.asarray(X)[:, :dim]
    target = np.asarray(a)[:dim]
    distances = (features != target).sum(axis=1)
    # A stable sort keeps X's row order among equally distant rows.
    order = np.argsort(distances, kind="stable")
    # Skip rank 0 (assumed to be a itself); slicing also copes with k
    # larger than the number of rows. max() keeps k <= 1 an empty result.
    chosen = order[1:max(k, 1)]
    nn_distances = [int(distances[i]) for i in chosen]
    nn_neighbors = [X[i] for i in chosen]
    return nn_distances, nn_neighbors

def get_neighbors_masked(X,dim,a,k,mask):  #the k nearest neighbors of a in dataset X : a= a1...a10 class
    """Return the nearest neighbours of `a` in X after masking attributes.

    Every row of X and the query are passed through mask_item, and the
    Hamming distance is computed on the first `dim` masked components only,
    so the class label does not influence it. (Previously `dim` was
    overwritten with X.shape[1], which silently included the class.)

    Args:
        X: 2-D array whose rows are [attributes..., class].
        dim: number of leading attribute columns used for the distance.
        a: query vector with the same layout as a row of X.
        k: rank bound; the rows ranked 1 .. k-1 are returned.
        mask: per-component mask applied with mask_item.

    Returns:
        (nn_distances, nn_neighbors): two parallel lists sorted by
        increasing distance (ties keep the row order of X). The neighbours
        are the MASKED rows (lists). The entry ranked 0 is dropped, so at
        most k-1 neighbours are returned; fewer if X has fewer rows.
    """
    # The masked query does not depend on the row: compute it once.
    masked_target = mask_item(a[:dim], mask)
    distances=[]
    for row in X:
        masked_row = mask_item(row, mask)
        distances.append((masked_row, H(masked_row[:dim], masked_target)))  #masked_row contains its class as last component
    distances.sort(key=lambda pair: pair[1])
    # Skip rank 0; slicing also copes with k larger than the number of
    # rows. max() keeps k <= 1 an empty result.
    nearest = distances[1:max(k, 1)]
    nn_distances = [distance for _, distance in nearest]
    nn_neighbors = [masked_row for masked_row, _ in nearest]
    return nn_distances, nn_neighbors

#get both first with same class and different class    
def get_both_first_same_different_class(a,dim,list_of_nn):
    """Find the first neighbour sharing a's class and the first one that does not.

    Args:
        a: reference vector; its class is read at index dim.
        dim: index of the class component in a and in every neighbour.
        list_of_nn: neighbours in the order they should be scanned
            (typically by increasing distance).

    Returns:
        (first_same, first_different) as numpy arrays.

    Raises:
        IndexError: if list_of_nn contains no neighbour of a's class or no
            neighbour of another class. The exception type is the one the
            scan used to raise when running off the list; it now carries an
            explicit message.
    """
    cl_a=a[dim]
    first_nn_same_class=None
    first_nn_different_class=None
    for x in list_of_nn:
        if x[dim]==cl_a:
            if first_nn_same_class is None:
                first_nn_same_class=x
        elif first_nn_different_class is None:
            first_nn_different_class=x
        # Stop scanning as soon as both representatives are known.
        if first_nn_same_class is not None and first_nn_different_class is not None:
            break
    if first_nn_same_class is None or first_nn_different_class is None:
        raise IndexError(
            "list_of_nn must contain at least one neighbour of the same class "
            "and one neighbour of a different class"
        )
    return np.array(first_nn_same_class),np.array(first_nn_different_class)

## ACTION 4 
Create a sample S, pick a random element d in S, then compute c as the nearest neighbor of d that belongs to a different class. Compute the profile of (c,d), then count the pairs (a,b) of S that have the same profile and derive a confidence from them. Consider c together with this confidence as an explanation.

In [25]:
def prepare_latex(a):
    """Format the items of a as ' & item' cells followed by one backslash.

    NOTE(review): the Python literal "\\" is a SINGLE backslash, whereas a
    LaTeX table row normally ends with two - confirm what is intended.
    """
    cells = [" & " + str(a[i]) for i in range(len(a))]
    return "".join(cells) + "\\"
        

#CREATE FULL DATASET
f=g5  # Boolean class function used to label the artificial dataset
dim=10  # number of attributes; the class is stored at index dim
mask = [1,1,1,1,1,1,0,0,0,0,1] # last 1 is to keep the class when we mask an element a
create_dataset(f)
# Read back the file create_dataset() has just written.
dataset = np.loadtxt(dataset_csv, delimiter=",")
list_of_attributes=list(range(dim))
#dim=8
#mask = [0,1,1,0,0,1,0,1,1] #
#filename='datasets/HIV.csv'
#dataset = np.loadtxt(filename, delimiter=",")

# astype returns a new array: the result has to be kept, otherwise the
# conversion is silently lost and the data stay floats.
dataset = dataset.astype(int)
n= np.sum(dataset, axis = 0)[dim]  # number of elements of class 1 in the dataset
#CREATE SAMPLE
sample_size = 500
sample_set = generate_sample_set(dataset,sample_size)
m= np.sum(sample_set, axis = 0)[dim]  # number of elements of class 1 in the sample
#print("********SOME INFORMATION ON OUR ARTIFICIAL INITIAL DATA*********")
#print("We have a total of ",dataset.shape[0]," candidate elements with",n," in class 1.")
#print("We have created a sample set S of size", sample_set.shape[0]," with ",m," elements in class 1.")
#print("###################################################################")

#START WORKING TO EXPLAIN
# Profiles of every pair of the sample, used below as the evidence base.
set_of_pairs=compute_all_pairs(sample_set)
all_pairs_and_profiles=create_all_pairs_and_profile(set_of_pairs,dim,mask)
average_matching_masked_profiles_class_change=0
total_matching_masked_profile=0
# d is the element to explain; c is its first neighbour (masked distance)
# that belongs to another class.
d=pick_a_random_element(sample_set)
cl_d = d[dim]
k_nn=sample_size
nn_distances, nn_neighbors = get_neighbors_masked(sample_set,dim,d,k_nn,mask)
first_nn_same_class, c = get_both_first_same_different_class(d,dim,nn_neighbors)
cl_c=c[dim]
full_profile_cd=build_pair_with_full_profile(c,d,dim,mask)  #profile of (c,d)
masked_c=full_profile_cd[4]
masked_d=full_profile_cd[5]
# Attributes on which the masked c and d disagree, reported 1-based.
att=Dis(masked_c,masked_d,list_of_attributes)
att = [z+1 for z in att]
masked_profile_cd=build_profile(masked_c,masked_d,dim) #masked profile of (c,d) - only on relevant attributes
matching_masked_profiles_same_class, matching_masked_profiles_with_class_change = get_all_pairs_from_profile_with_class_info(cl_d,full_profile_cd[6],all_pairs_and_profiles,masked=True)
total_matching_masked_profile+= len(matching_masked_profiles_same_class)+len(matching_masked_profiles_with_class_change)
average_matching_masked_profiles_class_change+=len(matching_masked_profiles_with_class_change)
#print("We have ",total_matching_masked_profile,"pairs having the same profile as (c,d)")
# Confidence = share of the matching pairs in which the class changes.
alpha =len(matching_masked_profiles_with_class_change)
beta =len(matching_masked_profiles_with_class_change)+len(matching_masked_profiles_same_class)
# No matching pair at all means no evidence: report a confidence of 0
# instead of dividing by zero.
confidence = alpha/beta if beta else 0
# 1-based indices of the attributes kept by the mask.
relevant_attribute=[i+1 for i in range(dim) if mask[i]==1]

print("Our explanations why vector D:",d,"is in class",int(cl_d),":")
print("1) C",c,"is one of the nearest neighbours of D.")
print("2) C is in class",int(cl_c),"and the relevant attributes are",relevant_attribute,".")
print("3) We have",int(100*confidence),"% of confidence that attribute(s)",att,"cause(s) the change of class.")

Dataset successfully created with the class function you have chosen.
Our explanations why vector D: [0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1.] is in class 1 :
1) C [0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0.] is one of the nearest neighbours of D.
2) C is in class 0 and the relevant attributes are [1, 2, 3, 4, 5, 6] .
3) We have 23 % of confidence that attribute(s) [4, 6] cause(s) the change of class.
