In [1]:
import pandas as pd
import numpy as np

#  plot confusion matrices
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

import sys
sys.path.insert(1, '../../')
from py_oqat.config_algorithms import ACOConfig
from py_oqat.classifier import OQATClassifier

In [2]:
# Random seed passed as `random_state` to train_test_split so the split is reproducible.
SEED = 42

def pretty_print_confusion_matrix(confusion_matrix):
    """Print a confusion matrix to stdout, one row per line.

    Args:
        confusion_matrix: Any iterable of rows (e.g. a list of lists or a
            2-D numpy array). Each row is printed using its own `str()` form.
    """
    # Iterating row by row keeps every class on its own output line.
    for matrix_row in confusion_matrix:
        print(matrix_row)

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated seaborn heatmap.

    Args:
        cm: Square confusion matrix (array-like); rows are true labels and
            columns are predicted labels.
        classes: Class labels used for both the row and column ticks.
        normalize: If True, divide each row by its sum so every row shows
            per-true-class fractions instead of raw counts.
        title: Title drawn above the heatmap.
        cmap: Matplotlib colormap passed to the heatmap.
    """
    # Accept plain lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row sum; leave that row at 0
        # rather than dividing 0/0, which would yield NaN and a RuntimeWarning.
        cm = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

    # Integer counts are annotated as integers; seaborn's default '.2g' would
    # show counts >= 100 in scientific notation (e.g. 1.2e+02).
    annot_fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2g'

    labels = list(classes)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)

    fig, ax = plt.subplots(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt=annot_fmt, cmap=cmap, ax=ax)

    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    plt.show()


In [3]:
# Load the wine dataset from sklearn
from sklearn.datasets import load_wine
wine = load_wine()
X = wine.data
y = wine.target

# Define the name and feature type for each column (discrete or continuous).
# All columns are declared numeric ("num"); both lists are derived from the
# number of features so they always match the width of X.
n_features = X.shape[1]
column_names = [f"a{idx}" for idx in range(1, n_features + 1)]
column_types = ["num"] * n_features

print(X.shape)
print(y.shape)

# Split the data into training and testing sets.
# Note: test_size=0.6 holds out 60% of the samples, leaving 40% for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=SEED)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Training data distribution
print("Distribution of training data")
print(pd.Series(y_train).value_counts())


(178, 13)
(178,)
(71, 13)
(71,)
(107, 13)
(107,)
Distribution of training data
1    28
0    23
2    20
dtype: int64


In [5]:
# Hypothesis: the most concise models classify better.
# We can measure how concise a model is by counting the number of cliques (disjunctive clauses).

n_experiments = 10
target_class = 1  # the only class a model is learned for in this experiment

classifiers = []
# Experiment k (1-based) runs the ACO heuristic with 2*k cycles and 2*k ants.
n_cycles = [2 * k for k in range(1, n_experiments + 1)]
n_ants = [2 * k for k in range(1, n_experiments + 1)]

# NOTE(review): no seed is passed to ACOConfig/OQATClassifier here; if the
# heuristic is stochastic the scores may differ between runs -- confirm.
for i in range(n_experiments):
    aco_config = ACOConfig(
        algorithm="vertex-ac",
        cycles=n_cycles[i],
        ants=n_ants[i],
        alpha=1,
        rho=0.99,
        tau_max=6.,
        tau_min=0.01,
    )
    classifier = OQATClassifier(
        collision_strategy="best_score",
        null_strategy="weighted",
        heuristic="aco",
        heuristic_config=aco_config,
    )
    classifier.fit(
        X_train,
        y_train,
        column_names,
        column_types,
        n_discrete_bins=3,
        learn_classes=[target_class],
    )
    classifiers.append(classifier)
    # The length of the per-class 'cnf_weights' entry is reported as the clause count.
    print(f"Model {i + 1}, number of clauses: {len(classifier.model[target_class]['cnf_weights'])}")

Model for class 1 created
Score: 0.9545454545454546
Model 1, number of clauses: 3
Model for class 1 created
Score: 0.9090909090909091
Model 2, number of clauses: 3
Model for class 1 created
Score: 0.9090909090909091
Model 3, number of clauses: 3
Model for class 1 created
Score: 0.9545454545454546
Model 4, number of clauses: 3
Model for class 1 created
Score: 0.9545454545454546
Model 5, number of clauses: 3
Model for class 1 created
Score: 0.9545454545454546
Model 6, number of clauses: 3
Model for class 1 created
Score: 0.9545454545454546
Model 7, number of clauses: 3
Model for class 1 created
Score: 0.9090909090909091
Model 8, number of clauses: 3
Model for class 1 created
Score: 0.9090909090909091
Model 9, number of clauses: 3
Model for class 1 created
Score: 0.9090909090909091
Model 10, number of clauses: 3
