# Implementing AIR and comparing with chi2 and mutual information

In [1]:
import numpy as np
from math import *
from random import randint
import itertools
from itertools import combinations
from scipy.spatial.distance import hamming

#FOR CHI-SQUARE - MUTUAL INFORMATION
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#FOR PLOT
from matplotlib import pyplot

# evaluation of a model using all input features
from pandas import read_csv
 

In [2]:
#FUNCTIONS TO BE TESTED
def xor(x,y): #utility
    """Exclusive-or of two values, judged by their truthiness.

    The result is not coerced to ``bool``: it is a truthy value when exactly
    one argument is truthy and a falsy value otherwise (the returned object
    may be ``y``, ``x`` or a ``bool``).

    NOTE: truthiness is used, so non-empty strings such as ``"0"`` count as
    true; string-encoded bits therefore always yield a falsy result.
    """
    # Left operand of the original "or": (not x) and y.
    first = y if not x else False
    if first:
        return first
    # Right operand of the original "or": x and (not y).
    return (not y) if x else x
        
def g1(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Boolean class function x1 AND (x2 OR x3); x4..x10 are ignored.

    Returns 1 when the condition holds, 0 otherwise.
    """
    if not x1:
        return 0
    return 1 if (x2 or x3) else 0

def g2(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Boolean class function x1 XOR x2; x3..x10 are ignored.

    Returns 1 when exactly one of x1, x2 is truthy, 0 otherwise.
    """
    only_second = not x1 and x2
    only_first = x1 and not x2
    return 1 if (only_second or only_first) else 0

def g3(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Boolean class function x1 XOR x2 XOR x3; x4..x10 are ignored.

    Returns 1 when the nested ``xor`` of the first three inputs is truthy,
    0 otherwise.
    """
    parity = xor(xor(x1, x2), x3)
    return 1 if parity else 0

def g4(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Boolean class function: 1 when the ten inputs add up to exactly 3."""
    total = x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10
    return 1 if total == 3 else 0

def g5(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Boolean class function (x1 OR x2 OR x3) AND (x4 OR NOT x5 OR x6).

    x7..x10 are ignored. Returns 1 when the condition holds, 0 otherwise.
    """
    # First clause fails: no need to look at the second one.
    if not (x1 or x2 or x3):
        return 0
    if x4 or not x5 or x6:
        return 1
    return 0

def g6(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Boolean class function x1 AND (x2 OR NOT x3); x4..x10 are ignored.

    Returns 1 when the condition holds, 0 otherwise.
    """
    if not x1:
        return 0
    return 1 if (x2 or not x3) else 0

def g7(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10):
    """Boolean class function: 1 when x1 + x2 + x3 equals exactly 2.

    x4..x10 are ignored.
    """
    ones = x1 + x2 + x3
    return 1 if ones == 2 else 0

## ACTION 1:  Dataset generation and sampling

In [3]:
'''Create the initial artificial dataset of 1024 elements: a binary CSV file for a Boolean function.'''
'''The class of an element a is the last entry of its row. The class is given by a Boolean function f.'''
dataset_csv = "dataset.csv"


def create_csv_dataset(f):
    """Write every 10-bit Boolean vector and its class to ``dataset_csv``.

    Each line holds the ten bits followed by ``f(bit1, ..., bit10)``,
    comma-separated. The file is overwritten on every call and a
    confirmation message is printed once writing is finished.

    f -- callable taking ten positional 0/1 arguments and returning the class.
    """
    with open(dataset_csv, 'w') as outfile:
        # product(..., repeat=10) enumerates the 2**10 vectors in the same
        # order as ten nested range(2) loops.
        for bits in itertools.product(range(2), repeat=10):
            cl = f(*bits)
            outfile.write(",".join(str(value) for value in (*bits, cl)) + "\n")
    print("Dataset successfully created with the class function you have chosen.")
    
#GENERATE RANDOM SAMPLE SET OF size ELEMENTS FROM A DATASET
def generate_sample_set(dataset,size):
    """Return the first ``size`` rows of a shuffled version of ``dataset``.

    ``shuffle`` is called without a random_state argument here, so
    reproducibility depends on how the caller seeds the global random state.
    """
    # For UCI data a string conversion (.astype(str)) was once considered here.
    return shuffle(dataset)[:size]

# LOADING METHODS
def load_dataset_np(filename): #WHEN DATA ARE NUMBERS
    """Load a comma-separated file with numpy, keeping every cell as a string.

    The former ``data.astype(str)`` statement was removed: ``astype`` returns
    a new array, its result was discarded, and ``loadtxt`` already produces a
    string array because of ``dtype="str"``.

    Returns (data, X, y): the full 2-D array, every column but the last,
    and the last column (the class).
    """
    data = np.loadtxt(filename, dtype="str", delimiter=",")
    X = data[:, :-1]
    y = data[:, -1]
    return data, X, y

def load_dataset_panda(filename):  #WHEN DATA ARE STRINGS LIKE A, A, C, etc...
    """Load a header-less CSV file with pandas and split off the class column.

    Values keep whatever dtype pandas infers. The former ``data.astype(str)``
    statement was removed: ``astype`` returns a new array and its result was
    discarded, so no conversion ever took place.

    Returns (data, X, y): the full 2-D array, every column but the last,
    and the last column (the class).
    """
    data = read_csv(filename, header=None).values
    X = data[:, :-1]
    y = data[:, -1]
    return data, X, y
 

## ACTION 2: Compute pairs from S, build  full profile for all pairs.

In [4]:
#COMPUTE ALL PAIRS FROM SAMPLE SET
def all_pairs(sample_set):
    """Return every unordered pair of rows of ``sample_set`` as a list of tuples.

    Rows are read through ``sample_set.shape[0]`` and integer indexing; pairs
    come out in the order produced by ``itertools.combinations``.
    """
    rows = [sample_set[i] for i in range(sample_set.shape[0])]
    return list(combinations(rows, 2))

def mask_item(a,m):  # a = [1,0,...,cl(a)]
    """Combine ``a`` and mask ``m`` position by position with ``and``.

    For each position the result is ``a``'s value when it is falsy, otherwise
    the mask's value. The output is as long as the shorter of the two inputs.
    """
    masked_a = []
    for value, keep in zip(a, m):
        masked_a.append(keep if value else value)
    return masked_a

def build_profile(a,b,dim): #  A profile is a vector of dim 10 but len(a) = 11
    """Build the profile of the ordered pair (a, b) over its first ``dim`` positions.

    A position holds -1 where ``a`` and ``b`` agree, and ``int(a[i])`` where
    they differ, so the profile of (a, b) differs from that of (b, a).
    """
    return [-1 if a[i] == b[i] else int(a[i]) for i in range(dim)]
    
# full profile =[a,b,profile(a,b),xor,masked(a),masked(b),masked_profile]
def build_pair_with_full_profile(a,b,dim,mask): #a = [a, cl(a)] and b  2 vectors of same dim with class included
    """Assemble the full-profile record of the pair (a, b).

    The record is the list
    [a, b, profile(a, b), xor(cl(a), cl(b)), masked(a), masked(b), masked profile],
    where the class of a vector is read at index ``dim``.
    """
    plain_profile = build_profile(a, b, dim)
    cl_a, cl_b = a[dim], b[dim]
    hidden_a = mask_item(a, mask)
    hidden_b = mask_item(b, mask)
    hidden_profile = build_profile(hidden_a, hidden_b, dim)
    class_differs = xor(cl_a, cl_b)
    return [a, b, plain_profile, class_differs, hidden_a, hidden_b, hidden_profile]
        
#profile(a,b) != profile(b,a)
def create_all_pairs_and_profile(set_of_pairs,dim,mask): #[a,b,[-1 agreement, a_i where disag],xor(cl(a),cl(b))]
    """Return the full-profile record of every pair in ``set_of_pairs``, in order."""
    return [
        build_pair_with_full_profile(a, b, dim, mask)
        for (a, b) in set_of_pairs
    ]

#Filtering full profiles
def opposite(bool):
    """Return ``1 - bool``, i.e. flip a 0/1 value (0 -> 1, 1 -> 0).

    The parameter name shadows the builtin ``bool`` inside this function; it
    is kept so that keyword callers keep working.
    """
    flipped = 1 - bool
    return flipped

def are_matching_profiles(p1,p2): # take care of the fact that (a,b) should be equiv to (b,a) in term of profile
    """Tell whether two profiles describe the same pair up to swapping its members.

    Profiles match when they are equal, or when at every position either both
    hold -1 (agreement) or the value in ``p1`` is the opposite (1 - v) of the
    value in ``p2``.
    """
    if p1 == p2:
        return True
    matched = True
    for idx, left in enumerate(p1):
        right = p2[idx]
        both_agree = (left == -1) and (right == -1)
        matched = matched and (both_agree or (left == opposite(right)))
    return matched
            
def get_all_pairs_from_profile(profile,all_pairs_and_profiles,masked=False):
    """Return the records whose profile matches ``profile``.

    Each record is [a, b, profile(a, b), xor, masked(a), masked(b), masked profile].
    The plain profile (slot 2) is compared by default, the masked profile
    (slot 6) when ``masked`` is true.
    """
    slot = 6 if masked else 2
    return [
        record
        for record in all_pairs_and_profiles
        if are_matching_profiles(record[slot], profile)
    ]

def get_all_pairs_from_profile_with_class_info(cl,profile,all_pairs_and_profiles,masked):
    """Split the records matching ``profile`` whose first element has class ``cl``.

    Each record p is [a, b, profile(a, b), xor, masked(a), masked(b), masked profile].
    Records whose first element does not have class ``cl`` are dropped.

    The class index is no longer hard-coded to 10. It is taken from the
    length of the stored profile p[2], which has one entry per attribute;
    build_pair_with_full_profile reads the class at that same index. For
    10-attribute data this is index 10, as before.

    Returns (same_class, class_change): the records where the class
    indicator p[3] equals 0, and those where it does not.
    """
    matching_profiles_same_class = []
    matching_profiles_class_change = []
    for p in get_all_pairs_from_profile(profile, all_pairs_and_profiles, masked):
        class_index = len(p[2])  # the class sits right after the attributes
        if p[0][class_index] == cl:
            if p[3] == 0:
                matching_profiles_same_class.append(p)
            else:
                matching_profiles_class_change.append(p)
    return matching_profiles_same_class, matching_profiles_class_change

## ACTION 3: Define utilities for chi2 and mutual information

In [5]:
# prepare input data
def prepare_inputs(X_train, X_test):
    """Ordinal-encode the input features.

    The encoder is fitted on ``X_train`` only and then applied to both sets.
    Returns (X_train_enc, X_test_enc).
    """
    encoder = OrdinalEncoder()
    encoder.fit(X_train)
    return encoder.transform(X_train), encoder.transform(X_test)
    
# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode the target values.

    The encoder is fitted on ``y_train`` only and then applied to both sets.
    Returns (y_train_enc, y_test_enc).
    """
    encoder = LabelEncoder()
    encoder.fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

def prepare_all(X, y, test_size, random_state=1):
    """Split (X, y) into train/test parts and encode them.

    The ``random_state`` argument is now forwarded to ``train_test_split``.
    It used to be ignored in favour of a hard-coded 1, so callers could not
    vary the split. The default of 1 keeps existing calls unchanged.

    Returns (X_train_enc, y_train_enc, X_test_enc); the encoded test targets
    are not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
    # y_test is still passed through the encoder, as before; its encoded
    # form is discarded.
    y_train_enc, _ = prepare_targets(y_train, y_test)
    return X_train_enc, y_train_enc, X_test_enc
 
#TESTING WITH ALL FEATURES
# feature selection chi2
def select_all_features_chi2(X_train, y_train, X_test):
    """Score every feature with chi2 and keep them all (k='all').

    The selector is fitted on the training data only.
    Returns (X_train_fs, X_test_fs, fs) where ``fs`` is the fitted selector.
    """
    selector = SelectKBest(score_func=chi2, k='all')
    selector.fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test), selector

# feature selection mutual information
def select_all_features_mutual(X_train, y_train, X_test):
    """Score every feature with mutual information and keep them all (k='all').

    The selector is fitted on the training data only.
    Returns (X_train_fs, X_test_fs, fs) where ``fs`` is the fitted selector.
    """
    selector = SelectKBest(score_func=mutual_info_classif, k='all')
    selector.fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test), selector

#TESTING WITH 4 BEST FEATURES
# feature selection
def select_k_features_chi2(X_train, y_train, X_test, k=4):
    """Keep the ``k`` best features according to the chi2 score.

    ``k`` used to be hard-coded to 4. It is now a parameter whose default of
    4 keeps existing calls unchanged.

    The selector is fitted on the training data only.
    Returns (X_train_fs, X_test_fs, fs) where ``fs`` is the fitted selector.
    """
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

# feature selection mutual information
def select_k_features_mutual(X_train, y_train, X_test, k=4):
    """Keep the ``k`` best features according to mutual information.

    ``k`` used to be hard-coded to 4. It is now a parameter whose default of
    4 keeps existing calls unchanged.

    The selector is fitted on the training data only.
    Returns (X_train_fs, X_test_fs, fs) where ``fs`` is the fitted selector.
    """
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
 

## ACTION 4: Define AIR

In [6]:
# Dis is non-empty whenever a and b differ on at least one listed attribute.
def Dis(a,b,list_of_attributes):  #disagreement set between Boolean vectors of same dimension
    """Return the attributes of ``list_of_attributes`` on which ``a`` and ``b`` differ.

    The order of ``list_of_attributes`` is preserved in the result.
    """
    return [i for i in list_of_attributes if a[i] != b[i]]

def air(dim,attribute,list_of_attributes,set_of_pairs):
    """Compute the AIR ratio of one attribute over a set of pairs.

    Only pairs whose disagreement set (restricted to ``list_of_attributes``)
    is exactly ``[attribute]`` are counted. The ratio is the share of those
    pairs whose classes, read at index ``dim``, are equal.

    Returns -1 when no pair differs on ``attribute`` alone.
    """
    only_this_attribute = [attribute]
    differing_pairs = 0
    same_class_pairs = 0
    for (a, b) in set_of_pairs:
        if Dis(a[0:dim], b[0:dim], list_of_attributes) != only_this_attribute:
            continue
        differing_pairs += 1
        if a[dim] == b[dim]:
            same_class_pairs += 1
    if differing_pairs == 0:
        return -1
    return same_class_pairs / differing_pairs
        
def select_features_air(dim,list_of_attributes,set_of_pairs):
    """Return the AIR ratio of every attribute, in the order of ``list_of_attributes``."""
    return [
        air(dim, attribute, list_of_attributes, set_of_pairs)
        for attribute in list_of_attributes
    ]

## ACTION 5: Comparing AIR - chi-square - mutual information

In [8]:
# EXPERIMENT: average normalized AIR, chi-square and mutual-information scores
# over several runs, then print one LaTeX-style table row per method.
# Choose the Boolean class function and (re)write dataset.csv for it.
f=g5
create_csv_dataset(f)
# Alternative datasets (uncomment one to use it instead of the generated file):
#filename='datasets/breast-cancer-out.csv'
#filename='datasets/HIV.csv'
#filename='datasets/mushroom.csv'
#filename='datasets/HIV-reduced-air.csv'
filename="dataset.csv"
dataset, X, y = load_dataset_panda(filename)
#INFO
dataset_size=dataset.shape[0]
# The last column is the class, so the number of attributes is columns - 1.
dimension=dataset.shape[1] - 1
# Sum of the class column = number of class-1 rows when classes are 0/1.
positive= np.sum(dataset, axis = 0)[dimension]
print("****INFORMATION ON INITIAL DATA *******")
print("dataset:",filename,"size",dataset_size,"dimension:",dimension," - with",positive,"elements in class 1.")

#INIT
# Attributes are identified by their column index 0, 1, ..., dimension-1.
list_of_attributes=[]
for i in range(dimension):
    list_of_attributes.append(i)

# Fraction of the dataset drawn as the AIR sample; also used as test_size below.
sample_ratio=0.33
sample_size = int(dataset_size*sample_ratio)

number_of_test=10
# Accumulators for the per-attribute normalized scores, summed over all runs.
mean_air_scores = [0]*dimension
mean_chi_scores = [0]*dimension
mean_mut_scores = [0]*dimension
for u in range(number_of_test):
    #MAIN TESTING LOOP
    sample_set = generate_sample_set(dataset,sample_size)
    list_of_pairs = all_pairs(sample_set)
    air_scores = select_features_air(dimension,list_of_attributes,list_of_pairs)
    # Turn the "same class" ratio into a relevance score (1 - ratio).
    # NOTE(review): air() returns -1 when no pair differs on the attribute
    # alone; such an attribute gets 1 - (-1) = 2 here - confirm this is intended.
    air_scores=[1-k for k in air_scores]
    # NOTE(review): chi2 / MI are computed on a train/test split of the full
    # (X, y), not on sample_set, and prepare_all receives the same arguments
    # (random_state=1) on every iteration, so the split does not vary between
    # runs - confirm this is intended.
    X_train_enc, y_train_enc, X_test_enc=prepare_all(X, y, test_size=sample_ratio, random_state=1)
    X_train_chi, X_test_chi, fs_chi = select_all_features_chi2(X_train_enc, y_train_enc, X_test_enc)
    X_train_mut, X_test_mut, fs_mut = select_all_features_mutual(X_train_enc, y_train_enc, X_test_enc)
    # All scores are normalized.
    # NORMALIZATION FACTORS: sum of the scores of each method over all attributes.
    Z_air,Z_chi,Z_mi=0,0,0
    for i in range(dimension):
        Z_air+=air_scores[i]
        Z_chi+=fs_chi.scores_[i]
        Z_mi+=fs_mut.scores_[i]
    #UPDATE MEAN SCORES BY ADDING NORMALIZED SCORES IN [0,1]  +0.01 to avoid division by 0
    for j in range(dimension):
        mean_air_scores[j]+= air_scores[j]/(Z_air + 0.01)
        mean_chi_scores[j]+= fs_chi.scores_[j]/(Z_chi + 0.01)
        mean_mut_scores[j]+= fs_mut.scores_[j]/(Z_mi + 0.01)

# Divide the accumulated sums by the number of runs to obtain the means.
mean_air_scores = [a*(1/number_of_test) for a in mean_air_scores]
mean_chi_scores = [a*(1/number_of_test) for a in mean_chi_scores]
mean_mut_scores = [a*(1/number_of_test) for a in mean_mut_scores]

# The plotting code below is disabled: it sits inside a bare string literal.
'''
pyplot.title("AIR")
pyplot.bar([i for i in range(len(air_scores))], air_scores)
pyplot.show()

pyplot.title("CHI-SQUARE")
pyplot.bar([i for i in range(len(fs_chi.scores_))], fs_chi.scores_*(1/Z_chi))
pyplot.show()

pyplot.title("MUTUAL INFORMATION")
pyplot.bar([i for i in range(len(fs_mut.scores_))], fs_mut.scores_*(1/Z_mi))
pyplot.show()
'''  

#PREPARE FOR LATEX
# Each row is "<method> & s0 & s1 & ...", with mean scores rounded to 2 decimals.
# NOTE(review): "\\" prints a single backslash; a LaTeX row terminator is
# two backslashes - confirm which one is wanted.
#AIR
latex_line=""
for i in range(dimension):
    latex_line+=" & " +str(round(mean_air_scores[i],2))
print("air"+latex_line+"\\")

#CHI_SQUARE
latex_line=""
for i in range(len(fs_chi.scores_)):
    latex_line+=" & " +str(round(mean_chi_scores[i],2))
print("chi2"+latex_line+"\\")

#MUTUAL INF
latex_line=""
for i in range(len(fs_mut.scores_)):
    latex_line+=" & " +str(round(mean_mut_scores[i],2))
print("mi"+latex_line+"\\")

Dataset successfully created with the class function you have chosen.
****INFORMATION ON INITIAL DATA *******
dataset: dataset.csv size 1024 dimension: 10  - with 784 elements in class 1.
air & 0.17 & 0.18 & 0.17 & 0.16 & 0.16 & 0.16 & 0.0 & 0.0 & 0.0 & 0.0\
chi2 & 0.17 & 0.14 & 0.17 & 0.18 & 0.18 & 0.16 & 0.0 & 0.0 & 0.0 & 0.0\
mi & 0.15 & 0.12 & 0.16 & 0.18 & 0.15 & 0.13 & 0.01 & 0.02 & 0.01 & 0.03\


## ACTION 6: Comparing the attribute relevance methods