# PSyKE's demo for CREAM and ORCHiD

Some imports.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier

from psyke import Extractor
from psyke.regression import FeatureRanker
from psyke.utils.logic import pretty_theory
from psyke.utils import Target

Import Iris dataset separating features and class.

In [2]:
# Load Iris as pandas objects: `x` holds the features, `y` the class column.
x, y = load_iris(as_frame=True, return_X_y=True)

Rename the features.

In [3]:
# Assign short CamelCase names to the four feature columns.
x.columns = [
    'SepalLength',
    'SepalWidth',
    'PetalLength',
    'PetalWidth',
]

Replace integer indices with the corresponding string class.

In [4]:
# Swap the integer codes in the "target" column for the species names.
y = pd.DataFrame(y).replace(
    to_replace={"target": {0: 'setosa', 1: 'versicolor', 2: 'virginica'}}
)
y

Unnamed: 0,target
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa
...,...
145,virginica
146,virginica
147,virginica
148,virginica


The final dataset:

In [5]:
# Put the class column next to the features, then relabel that last column as 'iris'.
dataset = x.join(y)
dataset = dataset.rename(columns={dataset.columns[-1]: 'iris'})
dataset

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,iris
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


Split between train and test set in a reproducible way.

In [6]:
# Hold out half of the rows for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(
    dataset,
    test_size=0.5,
    random_state=0,
)

We use CREAM to perform an interpretable clustering on the Iris dataset

In [7]:
# Build a CREAM extractor for a classification target and run it on the training split.
cream = Extractor.cream(depth=2, error_threshold=0.1, output=Target.CLASSIFICATION)
# The result of extract() is not needed here. It is deliberately NOT bound to `_`:
# in IPython `_` is the last-output variable, and assigning to it stops IPython
# from refreshing `_`, `__` and `___`. Only a cell's final expression is echoed,
# so a bare call produces no extra output.
cream.extract(train)
# Show the clusters found, one "Output is <class> if: ..." block per class.
cream.print()

Output is setosa if:
    SepalLength is in [4.40, 5.80]
    SepalWidth is in [2.30, 4.10]
    PetalLength is in [1.20, 1.70]
    PetalWidth is in [0.10, 0.50]
Output is versicolor if:
    SepalLength is in [4.90, 6.70]
    SepalWidth is in [2.20, 3.20]
    PetalLength is in [3.00, 5.00]
    PetalWidth is in [1.00, 1.80]
Output is virginica if:
    SepalLength is in [4.40, 7.90]
    SepalWidth is in [2.20, 4.10]
    PetalLength is in [1.20, 6.90]
    PetalWidth is in [0.10, 2.50]


To perform knowledge extraction via ORCHiD, we need a black-box classifier.
We use a KNN with K = 7 and we train it.

In [8]:
# The last column of each split is the class label; every column before it is a feature.
train_features, train_labels = train.iloc[:, :-1], train.iloc[:, -1]
test_features, test_labels = test.iloc[:, :-1], test.iloc[:, -1]

# Fit the 7-nearest-neighbours black box and report its score on the held-out half.
predictor = KNeighborsClassifier(n_neighbors=7).fit(train_features, train_labels)
predictor.score(test_features, test_labels)

0.96

We use ORCHiD to extract Prolog rules out of the KNN.
Since ORCHiD relies on CREAM, the extracted rules will be built upon the aforementioned clusters.

In [9]:
# Extract a theory from the trained KNN with ORCHiD, using the training split.
orchid = Extractor.orchid(predictor, depth=2, error_threshold=0.1, output=Target.CLASSIFICATION)
theory_from_orchid = orchid.extract(train)

# Both figures come from orchid.accuracy on the test split; the call that also
# receives the predictor is the one reported as fidelity.
print(f'ORCHiD performance ({orchid.n_rules} rules):\n'
      f'Accuracy = {orchid.accuracy(test):.2f}\n'
      f'Fidelity = {orchid.accuracy(test, predictor):.2f}\n')
print('ORCHiD extracted rules:\n\n' + pretty_theory(theory_from_orchid))

ORCHiD performance (3 rules):
Accuracy = 0.83
Fidelity = 0.87

ORCHiD extracted rules:

iris(PetalLength, PetalWidth, SepalLength, SepalWidth, setosa) :-
    SepalLength in [4.39, 5.80], SepalWidth in [2.29, 4.10], PetalLength in [1.19, 1.70], PetalWidth in [0.09, 0.50].
iris(PetalLength, PetalWidth, SepalLength, SepalWidth, versicolor) :-
    SepalLength in [4.89, 6.70], SepalWidth in [2.19, 3.10], PetalLength in [2.99, 5.00], PetalWidth in [0.99, 1.70].
iris(PetalLength, PetalWidth, SepalLength, SepalWidth, virginica) :-
    SepalLength in [4.39, 7.90], SepalWidth in [2.19, 4.10], PetalLength in [1.19, 6.90], PetalWidth in [0.09, 2.50].


We can refine the rules produced by ORCHiD by removing the antecedents corresponding to non-relevant features.
To do so, we use a FeatureRanker to measure feature relevance.

In [10]:
# Rank the features by relevance with respect to the trained predictor, using
# every row of the feature table `x`.
ranker = FeatureRanker(x.columns)
ranked = ranker.fit(predictor, x).rankings()
ranked

[('SepalLength', 0.10256657767944181),
 ('SepalWidth', 0.04329335731659775),
 ('PetalLength', 1.0),
 ('PetalWidth', 0.8207964847109807)]

We choose to consider only the 2 most relevant features (petal length and petal width).

In [11]:
# Same ORCHiD extraction, now given the feature rankings and an ignore threshold of 0.8.
orchid = Extractor.orchid(
    predictor,
    depth=2,
    error_threshold=0.1,
    output=Target.CLASSIFICATION,
    ranks=ranked,
    ignore_threshold=.8,
)
theory_from_orchid = orchid.extract(train)

# Both figures come from orchid.accuracy on the test split; the call that also
# receives the predictor is the one reported as fidelity.
print(f'ORCHiD performance ({orchid.n_rules} rules):\n'
      f'Accuracy = {orchid.accuracy(test):.2f}\n'
      f'Fidelity = {orchid.accuracy(test, predictor):.2f}\n')
print('ORCHiD extracted rules:\n\n' + pretty_theory(theory_from_orchid))

ORCHiD performance (3 rules):
Accuracy = 0.92
Fidelity = 0.96

ORCHiD extracted rules:

iris(PetalLength, PetalWidth, SepalLength, SepalWidth, setosa) :-
    PetalLength in [1.19, 1.70], PetalWidth in [0.09, 0.50].
iris(PetalLength, PetalWidth, SepalLength, SepalWidth, versicolor) :-
    PetalLength in [2.99, 5.00], PetalWidth in [0.99, 1.70].
iris(PetalLength, PetalWidth, SepalLength, SepalWidth, virginica) :-
    PetalLength in [1.19, 6.90], PetalWidth in [0.09, 2.50].


In the following cell we consider only the most relevant input feature

In [12]:
# Same ORCHiD extraction, now given the feature rankings and an ignore threshold of 0.99.
orchid = Extractor.orchid(
    predictor,
    depth=2,
    error_threshold=0.1,
    output=Target.CLASSIFICATION,
    ranks=ranked,
    ignore_threshold=.99,
)
theory_from_orchid = orchid.extract(train)

# Both figures come from orchid.accuracy on the test split; the call that also
# receives the predictor is the one reported as fidelity.
print(f'ORCHiD performance ({orchid.n_rules} rules):\n'
      f'Accuracy = {orchid.accuracy(test):.2f}\n'
      f'Fidelity = {orchid.accuracy(test, predictor):.2f}\n')
print('ORCHiD extracted rules:\n\n' + pretty_theory(theory_from_orchid))

ORCHiD performance (3 rules):
Accuracy = 0.86
Fidelity = 0.90

ORCHiD extracted rules:

iris(PetalLength, PetalWidth, SepalLength, SepalWidth, setosa) :-
    PetalLength in [1.19, 1.70].
iris(PetalLength, PetalWidth, SepalLength, SepalWidth, versicolor) :-
    PetalLength in [2.99, 5.00].
iris(PetalLength, PetalWidth, SepalLength, SepalWidth, virginica) :-
    PetalLength in [1.19, 6.90].
