In [None]:
import sys
# Extend the module search path with two directories under ./clamp/lux.
# The paths are relative, so they resolve against the current working directory.
# NOTE(review): presumably this makes the pyuid3 and lux packages importable for clamp — confirm.
sys.path.append('./clamp/lux/pyuid3')
sys.path.append('./clamp/lux/lux')

In [None]:
from clamp.clamp import *
import sklearn
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import re
import math
import warnings
import os
import math
import matplotlib.pyplot as plt
import socket
import tempfile
warnings.filterwarnings('ignore')

In [None]:
'''
In console run:
java -jar HMRServer.jar <port_number> <thread_count> e.g.
java -jar HMRServer.jar 9999 24
'''

HOST = "127.0.0.1"  # The server's hostname or IP address
PORT = 9999  # The port used by the server

def queryHRTDServer(query,max_msg_size=1024):
    """Send one newline-terminated query to the server at (HOST, PORT) and return the reply.

    Args:
        query: Text of the query; a trailing newline is appended before sending.
        max_msg_size: Maximum number of bytes requested from the socket per recv() call.

    Returns:
        The complete reply, decoded as UTF-8. The reply is considered complete
        once the server closes the connection.
    """
    line = query + '\n'
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn:
        conn.connect((HOST, PORT))
        conn.sendall(line.encode('UTF8'))
        # recv() yields b'' when the peer has closed the connection; keep reading
        # fixed-size chunks until that sentinel shows up.
        reply = b''.join(iter(lambda: conn.recv(max_msg_size), b''))
    return reply.decode('UTF8')

def rem_hmr_files(directory='.'):
    """Delete every non-directory entry whose name ends in ".hmr" from `directory`.

    Args:
        directory: Folder to clean. Defaults to the current working directory,
            which is what a call without arguments has always cleaned.

    Only the top level of `directory` is inspected. Sub-directories whose name
    happens to end in ".hmr" are left alone, because os.remove() cannot delete
    a directory and would raise instead.
    """
    for name in os.listdir(directory):
        if not name.endswith(".hmr"):
            continue
        path = os.path.join(directory, name)
        if os.path.isdir(path):
            continue
        os.remove(path)

# Example based on the Iris dataset, without cross-validation

In [None]:
from sklearn import datasets

iris = datasets.load_iris()
features = ['sepal_length','sepal_width','petal_length','petal_width']
target = 'class'

# Create a DataFrame with string column names
# (LUX accepts only DataFrames whose column names are strings).
dataset = pd.DataFrame(iris.data, columns=features)
dataset[target] = iris.target


In [None]:

# The labels are removed because one of CLAMP's features is to perform the clustering itself.
# If necessary, labels can be passed to the fit function, in which case the clustering stage is omitted.
data = dataset.drop(columns=[target])

In [None]:
# Preview the first rows of the feature-only frame.
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Parameters to adjust. The clustering algorithm and the classification model
# can also be replaced with other estimators.
# Available bounding_box_selection values: centroids, outliers, tree_query, random.
# NOTE(review): the keyword is spelled `clusterng_algorithm` (sic); the same spelling
# is used in the grid-search cell, so it is presumably what CLAMP expects — confirm.
clamp = CLAMP(
    clusterng_algorithm=KMeans(n_clusters=3),
    classification_model=xgb.XGBClassifier(),
    description_points_ratio=5,
    test_size=0.1,
    metric='minkowski',
    thresh=0.9,
    bounding_box_selection='random',
    explainer_type='lux',
    approach='other',
)

In [None]:
# clamp.fit(data, labels)  -- use this form for data with labels
clamp.fit(data)  # -- this form is for data without labels

# The dataset is split into a train and a test set: the train set is given to the
# explainer to generate rules based on the bounding-box method, and the test set is
# kept on the estimator so that predict can be run on it (see below).

Data without labels, clustering stage implementation
Brute approach, choosen data description skipped. All cases will be checked.
Method:  random
Lux explainer
Accuracy: 0.7333333333333333
Method:  centroids
Lux explainer
Accuracy: 0.6
Method:  outliers
Lux explainer
Accuracy: 1.0
Method:  tree_query
Lux explainer
Accuracy: 0.6666666666666666


In [None]:
# Generate labels for the held-out test set using the rules generated in the previous step.
predict = clamp.predict(clamp.X_test, clamp.y_test)

Accuracy: 0.6666666666666666


In [None]:
# Score calculation: compares the labels of the test set with those predicted by
# CLAMP in the previous step.
# Available attributes: explainer_f1, explainer_accuracy, explainer_classification_report, explainer_recall
print(clamp.explainer_accuracy) 

0.6666666666666666


In [None]:
# Build the rule table. It is shown in the next cell: a bare `r` in the middle of
# this cell would not be displayed, since only a cell's last expression is rendered.
r = clamp.justify()
rem_hmr_files()  # remove temp hmr files


In [None]:
# Display the generated rules
r

Unnamed: 0,Rule,Precision,Coverage,Cluster
0,petal_width >= 1.0 AND petal_width >= 1.3 AND ...,1,0.9954535961151124,1
1,petal_width >= 1.0 AND petal_width >= 1.5 AND ...,1,0.9948582649230956,2
2,petal_length < 1.9 AND petal_length < 1.7 AND ...,1,0.9940306544303894,0
3,petal_length < 1.9 AND petal_length >= 1.7 AND...,1,0.9940306544303894,0
4,petal_length < 1.9 AND petal_length >= 1.7 AND...,1,0.9940306544303894,0
5,petal_length >= 1.9 AND petal_width < 1.0 AND ...,1,0.9940306544303894,0
6,petal_length >= 1.9 AND petal_width < 1.0 AND ...,1,0.9940306544303894,0
7,petal_width >= 1.0 AND petal_width < 1.3 AND s...,1,0.993997298181057,1
8,petal_width >= 1.0 AND petal_width < 1.5 AND p...,1,0.9882804155349731,1
9,petal_width >= 1.0 AND sepal_length < 6.4 AND ...,1,0.9873427152633668,1


# Grid Search CV

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import Birch

In [14]:
def scorer(clamp, *args):
    """Scoring callable that reports the estimator's explainer metrics.

    The estimator's predict method is run on the estimator's own `X_test` /
    `y_test` attributes, then its `explainer_*` attributes are read back.

    Args:
        clamp: Estimator exposing `predict`, `X_test`, `y_test` and the
            `explainer_f1`, `explainer_accuracy`, `explainer_precision` attributes.
        *args: Accepted and ignored, so the function can be called with extra
            positional arguments (e.g. the data a grid search passes in).

    Returns:
        dict with the keys 'f1', 'accuracy' and 'precision'.
    """
    clamp.predict(clamp.X_test, clamp.y_test)
    metric_names = ('f1', 'accuracy', 'precision')
    return {name: getattr(clamp, 'explainer_' + name) for name in metric_names}

In [14]:
# Three alternative parameter grids; each dict is expanded independently.
parameters = [
    # Grid 1: 'lux' explainer, sweeping every bounding_box_selection value.
    {
        'bounding_box_selection': ['random', 'centroids', 'outliers', 'tree_query'],
        'description_points_ratio': [2, 4, 8, 13],
        'test_size': [0.2],
        'thresh': [0.9],
        'explainer_type': ['lux'],
        'neighborhood_size': [20],
        'max_depth': [2, 4, 5],
    },
    # Grid 2: 'global' explainer, only the description ratio is swept.
    {
        'explainer_type': ['global'],
        'description_points_ratio': [2, 4, 8, 13],
        'test_size': [0.25],
    },
    # Grid 3: 'lux' explainer with approach='other' (no bounding_box_selection sweep).
    {
        'description_points_ratio': [2, 4, 8, 13],
        'test_size': [0.2],
        'thresh': [0.9],
        'explainer_type': ['lux'],
        'approach': ['other'],
        'neighborhood_size': [20],
        'max_depth': [2, 4, 5],
    },
]


In [15]:
# CSV file names of the datasets used in the grid search; the cells below read
# each of them from the synthetic_datasets/ directory.
list_of_choosen_datasets = [
    'ecoli.csv',
    'lymphography.csv',
    'glass.csv',
    'balance.csv',
    'breast_tissue.csv',
    'primary_tumor.csv',
    'vote.csv',
    'ionosphere.csv',
    'cmc.csv',
    'bupa.csv']

In [16]:
# Number of distinct values in the 'y' column of every dataset, in the same order
# as list_of_choosen_datasets. A comprehension keeps the loop variable local, so
# the notebook-level `data` variable is not rebound to a file-name string.
list_of_clusters = [
    len(pd.read_csv('synthetic_datasets/' + csv_name)['y'].unique())
    for csv_name in list_of_choosen_datasets
]

In [None]:
%%time
from sklearn.preprocessing import LabelEncoder

# Grid search per dataset, using the parameter grids described above.
cv_restuls = []
cv_datasets = []

for dataset, cluster_number in zip(list_of_choosen_datasets, list_of_clusters):
    # Load the dataset, drop incomplete rows and renumber the index.
    data = (
        pd.read_csv('synthetic_datasets/' + dataset)
        .dropna()
        .reset_index(drop=True)
    )
    le = LabelEncoder()
    labels = le.fit_transform(data['y'])  # label encode it
    data = data.drop('y', axis=1)
    print(f'Dataset: {dataset}')

    clamp = CLAMP(
        clusterng_algorithm=KMeans(n_clusters=cluster_number),
        classification_model=xgb.XGBClassifier(),
    )

    clf = GridSearchCV(
        clamp,
        parameters,
        scoring=scorer,
        cv=10,
        refit='accuracy',
        n_jobs=20,
        error_score='raise',
    )
    try:
        clf.fit(data, labels)

        cv_restuls.append(clf.cv_results_)
        cv_datasets.append(dataset)
        # Checkpoint after every dataset. The whole accumulated lists are dumped,
        # so each file also contains the results of the datasets processed before it.
        with open('art_results-' + dataset.split('.')[0] + '.pickle', 'wb') as f:
            pickle.dump([cv_restuls, cv_datasets], f)
    finally:
        # With error_score='raise' a failing fit aborts the loop; remove the
        # temporary .hmr files in that case too instead of leaving them behind.
        rem_hmr_files()

Dataset: ecoli.csv


ValueError: Expected n_neighbors <= n_samples,  but n_samples = 2, n_neighbors = 20

In [19]:
# Remove any .hmr files still present in the current working directory.
rem_hmr_files()