# Implementing analogical explanations

In [1]:
import os
import numpy as np
from math import *
import warnings

warnings.filterwarnings("ignore")

# FOR CSV
from pandas import read_csv

# FOR utilities
from utils import *

## MAIN PROCESS
Create a sample S, pick a random d in S, then compute c as the nearest neighbor of d in a different class. Compute the profile of (c,d) then look for the number of pairs (a,b) with same profile and compute confidence. Consider c and confidence as an explanation.

In [4]:
# HYPER PARAMETERS
max_distance = 5

# CREATE AND LOAD SYNTHETIC DATASET ACCORDING TO 4 PARAMETERS
f = g_example
built_in_dimension = 5
sample_size = 2 ** (built_in_dimension - 1)  # half size
sample_size = min(1000, sample_size)
categorical_range = 1
# dataset0 = create_categorical_dataset(f, built_in_dimension, sample_size, categorical_range)
dataset_example = "example/sampleSize10-dim5WithNoise.csv"

# LOADING REAL DATASETS
folder_real_data = "realDatasets/"
data_filename = "data.csv"
filename1 = folder_real_data + "1-adult/" + data_filename
filename2 = folder_real_data + "2-bach/" + data_filename
filename3 = folder_real_data + "3-cars/" + data_filename
filename4 = folder_real_data + "4-chess/" + data_filename
filename5 = folder_real_data + "5-contraception/" + data_filename
filename6 = folder_real_data + "6-dmft/" + data_filename
filename7 = folder_real_data + "7-dress/" + data_filename
filename8 = folder_real_data + "8-mushrooms/" + data_filename
filename9 = folder_real_data + "9-phishing/" + data_filename
filename10 = folder_real_data + "10-portugalBank/" + data_filename
filename11 = folder_real_data + "11-school_grade/" + data_filename
filename12 = folder_real_data + "12-vote/" + data_filename

# MAIN PROGRAM
filename_list = [
    filename1,
    filename2,
    filename3,
    filename4,
    filename5,
    filename6,
    filename7,
    filename8,
    filename9,
    filename10,
    filename11,
    filename12,
]
filename_list = [filename12]
number_of_test = 1
dimension_list = []
average_num_of_features_list = []
average_confidence_list = []
for dataset in filename_list:
    print("***************************************************************")
    number_of_explanatory_features_list = []
    total_confidence = 0
    for i in range(number_of_test):
        data, X, y, dimension, initial_size = load_dataset(dataset)
        if initial_size > 1000:
            data = generate_sample_set(data, 1000)
        row_number = data.shape[0]
        (
            len_set_of_pairs,
            global_list_of_relevant_attributes,
            d,
            c,
            alpha,
            beta,
            actual_relevant_attributes,
        ) = explanation_loop(data, max_distance)
        try:
            confidence = alpha / beta
        except:
            confidence = 0
        total_confidence += confidence
        number_of_explanatory_features_list.append(len(actual_relevant_attributes))
    average_confidence_list.append(total_confidence / number_of_test)
    dimension_list.append(dimension)
    average_num_of_features_list.append(
        sum(number_of_explanatory_features_list) / number_of_test
    )
    print(
        "Dataset:",
        dataset,
        "-dimension:",
        dimension,
        "-number of rows:",
        row_number,
        "-initial size:",
        initial_size,
    )

    # PRINTING EXPLANATIONS
    print(
        "Dataset:",
        dataset,
        "-dimension:",
        dimension,
        "-number of rows:",
        row_number,
        "-initial size:",
        initial_size,
    )
    print(
        "Total pairs (a,b):",
        len_set_of_pairs,
        " -matching pairs i.e. such that a:b::c:d",
        beta,
        " -matching pairs with different classes:",
        alpha,
    )

    print(
        "1) We have found",
        len(global_list_of_relevant_attributes),
        "relevant attributes among",
        dimension,
        "which are:",
        global_list_of_relevant_attributes,
        ".",
    )
    print("Our explanations why vector D:", d[:-1], "is in class", d[-1], ":")
    print(
        "2) C", c[:-1], "is one of the nearest neighbours of D and C is in class", c[-1]
    )
    try:
        confidence = alpha / beta
    except:
        confidence = 0
    if beta != 0:
        print(
            "3) We have",
            int(100 * confidence),
            "% of confidence that attribute(s)",
            actual_relevant_attributes,
            "cause(s) the change of class.",
        )
    else:
        print(
            "3) We do not have any matching profile in the dataset: we cannot conclude."
        )
    print(number_of_explanatory_features_list)
    print("***************************************************************")

"""
x_labels = [
    "Adul.",
    "Bach",
    "Car",
    "Ches.",
    "Cont.",
    "Dmft",
    "Dres.",
    "Mush.",
    "Phis.",
    "Port.",
    "Scho.",
    "Vote",
]
bar1 = dimension_list
color1 = "blue"
label1 = "dataset dimension"
bar2 = average_num_of_features_list
color2 = "red"
label2 = "average used features"
y_label = "Num of features"
show_bar(bar1, color1, label1, bar2, color2, label2, x_labels, y_label, legend=True)

bar1 = average_confidence_list
label1 = "Confidence between 0 and 1"
bar2 = average_confidence_list
y_label = "Confidence between 0 and 1"
show_bar(bar1, color1, label1, bar2, color1, label1, x_labels, y_label, legend=False)

# EXPERIMENT SAMPLE SIZE IMPACT ON HIGH DIMENSIONAL DATASET ADULT - PHISHING - PORTUGAL BANK
#dataset=filename1
dataset=filename9
#dataset=filename10
number_of_test=10
data_init, _, _, _, _ = load_dataset(dataset)
average_confidence_for_size_list=[]
size_list=[500,1000,1500,2000]
for size in size_list: #           #,1500,2000,3000,5000]:
    print("SIZE:",size)
    total_confidence_for_size=0
    for i in range(number_of_test):
        #print("TEST:",i)
        data = generate_sample_set(data_init, size)
        row_number = data.shape[0]
        (
            len_set_of_pairs,
            global_list_of_relevant_attributes,
            d,
            c,
            alpha,
            beta,
            actual_relevant_attributes,
        ) = explanation_loop(data,max_distance)
        try:
            confidence = alpha / beta
        except:
            confidence = 0
        total_confidence_for_size += confidence
        print(total_confidence_for_size)
    average_confidence_for_size= total_confidence_for_size / number_of_test
    average_confidence_for_size_list.append(average_confidence_for_size)
    
print(dataset,average_confidence_for_size_list)

t = [500,1000,1500,2000]:
a = [0.7085887796414112, 0.5210526315789473, 0.7606590452661688, 0.9487870619946092]
b = [0.6545698924731183, 0.8443478260869565, 0.5950847293887664, 0.308874439211239 ]
c = [0.6675924096854329, 0.5203873482467506, 0.3847218357931196, 0.49149181819953736 ]

plt.plot(t, list(zip(a, b, c)),label=['Adul.', 'Phis.', 'Port.']);
#plt.plot(t, a, t, b, t, c, label=['Adul', 'Phis', 'Port'])
plt.legend()
plt.show()
"""

***************************************************************
Dataset: realDatasets/12-vote/data.csv -dimension: 16 -number of rows: 435 -initial size: 435
Dataset: realDatasets/12-vote/data.csv -dimension: 16 -number of rows: 435 -initial size: 435
Total pairs (a,b): 94395  -matching pairs i.e. such that a:b::c:d 1877  -matching pairs with different classes: 1549
1) We have found 3 relevant attributes among 16 which are: [3, 4, 12] .
Our explanations why vector D: ["'n'" '?' "'y'" "'n'" "'n'" "'n'" "'y'" "'y'" "'y'" "'y'" "'y'" "'n'"
 "'n'" "'y'" "'y'" "'y'"] is in class 'democrat' :
2) C ["'y'" "'y'" "'y'" "'y'" "'n'" "'n'" "'y'" "'y'" "'y'" "'y'" "'y'" "'n'"
 "'n'" "'y'" "'n'" "'y'"] is one of the nearest neighbours of D and C is in class 'republican'
3) We have 82 % of confidence that attribute(s) [4] cause(s) the change of class.
[1]
***************************************************************


'\nx_labels = [\n    "Adul.",\n    "Bach",\n    "Car",\n    "Ches.",\n    "Cont.",\n    "Dmft",\n    "Dres.",\n    "Mush.",\n    "Phis.",\n    "Port.",\n    "Scho.",\n    "Vote",\n]\nbar1 = dimension_list\ncolor1 = "blue"\nlabel1 = "dataset dimension"\nbar2 = average_num_of_features_list\ncolor2 = "red"\nlabel2 = "average used features"\ny_label = "Num of features"\nshow_bar(bar1, color1, label1, bar2, color2, label2, x_labels, y_label, legend=True)\n\nbar1 = average_confidence_list\nlabel1 = "Confidence between 0 and 1"\nbar2 = average_confidence_list\ny_label = "Confidence between 0 and 1"\nshow_bar(bar1, color1, label1, bar2, color1, label1, x_labels, y_label, legend=False)\n\n# EXPERIMENT SAMPLE SIZE IMPACT ON HIGH DIMENSIONAL DATASET ADULT - PHISHING - PORTUGAL BANK\n#dataset=filename1\ndataset=filename9\n#dataset=filename10\nnumber_of_test=10\ndata_init, _, _, _, _ = load_dataset(dataset)\naverage_confidence_for_size_list=[]\nsize_list=[500,1000,1500,2000]\nfor size in size_list: