In [None]:
import json
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import cohen_kappa_score, accuracy_score, balanced_accuracy_score, recall_score, mutual_info_score

from tqdm import tqdm
from rmatrix.classification import RMatrixClassifier
from decision_rules.serialization.utils import JSONSerializer
from decision_rules.classification.ruleset import ClassificationRuleSet
from decision_rules.measures import c2, precision

from scipy.stats import kendalltau, spearmanr


In [None]:
def mutual_inclusion(R, M):
    """
    Return the Jaccard index of two collections: |R n M| / |R u M|.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
      R: An iterable of hashable elements.
      M: An iterable of hashable elements.

    Returns:
      A float in [0, 1]: the number of distinct elements shared by R and M
      divided by the number of distinct elements present in either one.
      Returns 0.0 when both inputs are empty (the ratio is undefined there
      and would otherwise raise ZeroDivisionError).
    """
    set_r, set_m = set(R), set(M)
    union = set_r | set_m

    # Two empty inputs have an empty union; avoid dividing by zero.
    if not union:
        return 0.0

    return len(set_r & set_m) / len(union)

def rmatrix_unique_rules(rule, feature_names=None):
    """
    List the distinct feature names used in a rule's premise, in order of
    first appearance.

    The premise is rendered with ``rule.premise.to_string(feature_names)`` and
    parsed as text: it is split on " AND " into conditions, and the part of
    each condition before the first " = " is taken as the feature name.
    NOTE(review): this assumes every condition is printed as
    "<feature> = <value>"; a condition using another operator would be kept
    whole as its own "feature" -- confirm against the rule library's format.

    Args:
      rule: Object exposing ``premise.to_string(feature_names)``.
      feature_names: Passed through unchanged to ``to_string``.

    Returns:
      A list of ``str`` feature names without duplicates.
    """
    rule_str = rule.premise.to_string(feature_names)
    features = [condition.split(" = ")[0] for condition in rule_str.split(" AND ")]

    # dict.fromkeys de-duplicates while preserving first-seen order; calling
    # pd.unique on a plain list is deprecated in recent pandas releases.
    return list(dict.fromkeys(features))

def extract_shap_fi(shap_df_row, shap_df, shap_length, stat):
    """
    Return the column names of one row of ``shap_df`` ordered by decreasing
    absolute value, optionally truncated.

    Args:
      shap_df_row: Positional index of the row to rank.
      shap_df: DataFrame whose columns are the candidates being ranked.
      shap_length: Indexable collection of integer lengths; used only for
        ``stat`` "max" and "len".
      stat: Truncation mode:
        "all" -- keep every column;
        "max" -- keep the first ``max(shap_length)`` columns;
        "len" -- keep the first ``shap_length[shap_df_row]`` columns.

    Returns:
      A list of column names.

    Raises:
      ValueError: If ``stat`` is not one of "all", "max" or "len".
    """
    row_values = np.abs(shap_df.iloc[shap_df_row, :].values)
    col_names = np.array(shap_df.columns)
    # Negating the magnitudes makes argsort yield a descending order.
    ranked = list(col_names[np.argsort(-row_values)])

    if stat == "all":
        return ranked
    if stat == "max":
        return ranked[:np.max(shap_length)]
    if stat == "len":
        return ranked[:shap_length[shap_df_row]]

    # Previously an unknown mode fell through to an UnboundLocalError.
    raise ValueError(f"Unknown stat {stat!r}; expected 'all', 'max' or 'len'.")

def precision(c) -> float:
    """
    Return ``c.p / (c.p + c.n)``, or 0 when that denominator is zero.

    NOTE: this definition rebinds the name ``precision`` that the notebook
    also imports from ``decision_rules.measures``; code below uses this one.
    """
    denominator = c.p + c.n
    return c.p / denominator if denominator != 0 else 0

def coverage(c) -> float:
    """
    Return ``c.p / c.P``, or 0.0 when ``c.P`` is zero.

    The zero guard mirrors the one in ``precision`` so that an empty
    denominator yields 0 instead of raising ZeroDivisionError.
    """
    if c.P == 0:
        return 0.0
    return c.p / c.P

def calculate_correlations(fi_rule, fi_bb):
    """
    Correlate the order of attributes in ``fi_rule`` with their importance
    rank in ``fi_bb``.

    Args:
      fi_rule: Sequence of attribute names; position i gets rule rank i + 1.
      fi_bb: DataFrame with "attribute" and "importance" columns. It is not
        modified; ranking is done on a sorted copy.

    Returns:
      A ``(kendall_tau, spearman_rho)`` tuple. A statistic that comes out as
      NaN is replaced by 0.
    """
    # Rank 1 = highest importance; ties share the smallest rank of the group.
    ranked_bb = fi_bb.sort_values(by="importance", ascending=False).assign(
        bb_rank=lambda frame: frame["importance"].rank(method="min", ascending=False)
    )

    rule_order = pd.DataFrame({'attribute': fi_rule})
    rule_order["rule_rank"] = rule_order.index + 1

    # Left join keeps every rule attribute, in rule order.
    paired = rule_order.merge(ranked_bb, how="left", on="attribute")
    rule_ranks, bb_ranks = paired["rule_rank"], paired["bb_rank"]

    tau_stat = kendalltau(rule_ranks, bb_ranks)[0]
    rho_stat = spearmanr(rule_ranks, bb_ranks)[0]

    if np.isnan(tau_stat):
        tau_stat = 0
    if np.isnan(rho_stat):
        rho_stat = 0
    return tau_stat, rho_stat

def calculate_correlations2(fi_rule, fi_bb):
    """
    Correlate attribute positions in ``fi_rule`` with precomputed ranks.

    Args:
      fi_rule: List of attribute names; position i is given rank i + 1.
      fi_bb: DataFrame indexed by attribute name with a "bb_rank" column.
        Every name in ``fi_rule`` is looked up with ``.loc``.

    Returns:
      A ``(kendall_tau, spearman_rho)`` tuple. A statistic that comes out as
      NaN is replaced by 0.
    """
    selected_rows = fi_bb.loc[fi_rule]
    bb_ranks = selected_rows["bb_rank"].values
    rule_positions = list(range(1, len(fi_rule) + 1))

    tau_stat = kendalltau(rule_positions, bb_ranks)[0]
    rho_stat = spearmanr(rule_positions, bb_ranks)[0]

    if np.isnan(tau_stat):
        tau_stat = 0
    if np.isnan(rho_stat):
        rho_stat = 0
    return tau_stat, rho_stat


In [None]:
# Table of selected models; its "dataset" and "model" columns drive the loops below.
bb_models = pd.read_csv("results/selected_bb_models.csv")
# Distinct dataset names, in order of first appearance.
datasets = pd.unique(bb_models["dataset"])

# RMatrix without ranking

In [None]:
# Ruleset variant(s) to evaluate; the suffix selects `ruleset{class_type}.json`.
class_types = ["_filterFF_precision_approx"]

# One frame per (dataset, model) is collected here and concatenated once after
# the loops, instead of re-copying `results_all` with pd.concat every iteration.
corr_frames = []
results_detailed = pd.DataFrame()  # defined for parity with the other cells; not filled here

for class_type in class_types:

    # Value written to the "rmatrix" column of the output.
    rmatrix_label = "c2" if "c2" in class_type else "precision"

    for sel_dataset in tqdm(datasets):

        models = np.unique(bb_models[bb_models["dataset"] == sel_dataset]["model"])

        # A "target" column is renamed to "name", split off as string labels,
        # and removed from the feature frame.
        x_train_df = pd.read_csv(f"../results_all/{sel_dataset}/train.csv")
        x_train_df = x_train_df.rename(columns={'target': 'name'})
        y_train = x_train_df["name"].squeeze().astype(str)
        x_train_df = x_train_df.drop(columns=["name"])

        # The test split gets the same preprocessing; the correlation
        # computation below does not read it.
        x_test_df = pd.read_csv(f"../results_all/{sel_dataset}/test.csv")
        x_test_df = x_test_df.rename(columns={'target': 'name'})
        y_test = x_test_df["name"].squeeze().astype(str)
        x_test_df = x_test_df.drop(columns=["name"])

        # Columns holding only 0/1 in the training data are cast to strings in both splits.
        binary_columns = list(x_train_df.columns[x_train_df.isin([0, 1]).all()])
        if len(binary_columns) > 0:
            x_train_df[binary_columns] = x_train_df[binary_columns].astype(str)
            x_test_df[binary_columns] = x_test_df[binary_columns].astype(str)

        feature_names = x_train_df.columns

        for sel_model in models:

            fi_path = f"../results_new/{sel_dataset}/{sel_model}/fi_test.csv"
            file_path = f"../results_all/{sel_dataset}/{sel_model}/ruleset{class_type}.json"

            # Skip models for which either input file is missing.
            if not (os.path.exists(fi_path) and os.path.exists(file_path)):
                continue

            # Global importance table: index = attribute, rank 1 = most important.
            fi = pd.read_csv(fi_path)
            fi_bb = fi.sort_values(by="importance", ascending=False).set_index("attribute")
            fi_bb["bb_rank"] = fi_bb["importance"].rank(method="min", ascending=False)

            with open(file_path, 'r') as json_file:
                ruleset_json_read = json.load(json_file)

            classifier = JSONSerializer.deserialize(ruleset_json_read, target_class=ClassificationRuleSet)
            # `precision` resolves to the function defined in this notebook,
            # which shadows the one imported from decision_rules.measures.
            if "c2" in class_type:
                classifier.update(x_train_df, y_train, measure=c2)
            elif "precision" in class_type:
                classifier.update(x_train_df, y_train, measure=precision)

            # Ordered, de-duplicated feature names of each rule's premise.
            rules_features = [rmatrix_unique_rules(rule, feature_names) for rule in classifier.rules]

            # One (tau, spearman) pair per rule, against the global ranking.
            corr_all = [calculate_correlations2(features, fi_bb) for features in rules_features]

            corr_df = pd.DataFrame({
                'corr_tau_all': [cor[0] for cor in corr_all],
                'corr_sro_all': [cor[1] for cor in corr_all],
            })
            corr_df["dataset"] = sel_dataset
            corr_df["model"] = sel_model
            corr_df["rmatrix"] = rmatrix_label

            corr_frames.append(corr_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
results_all = pd.concat(corr_frames) if corr_frames else pd.DataFrame()

In [None]:
# Write the per-rule correlations collected in `results_all` to CSV, without the index column.
results_all.to_csv(path_or_buf="results/corr_without_fi.csv", index=False)

# RMatrix with global ranking

In [None]:
# Ruleset variant(s) to evaluate; the suffix selects `ruleset{class_type}.json`.
class_types = ["_filterFF_precision_global"]

# One frame per (dataset, model) is collected here and concatenated once after
# the loops, instead of re-copying `results_all` with pd.concat every iteration.
corr_frames = []
results_detailed = pd.DataFrame()  # defined for parity with the other cells; not filled here

for class_type in class_types:

    # Value written to the "rmatrix" column of the output.
    rmatrix_label = "c2" if "c2" in class_type else "precision"

    for sel_dataset in tqdm(datasets):

        models = np.unique(bb_models[bb_models["dataset"] == sel_dataset]["model"])

        # A "target" column is renamed to "name", split off as string labels,
        # and removed from the feature frame.
        x_train_df = pd.read_csv(f"../results_all/{sel_dataset}/train.csv")
        x_train_df = x_train_df.rename(columns={'target': 'name'})
        y_train = x_train_df["name"].squeeze().astype(str)
        x_train_df = x_train_df.drop(columns=["name"])

        # The test split gets the same preprocessing; the correlation
        # computation below does not read it.
        x_test_df = pd.read_csv(f"../results_all/{sel_dataset}/test.csv")
        x_test_df = x_test_df.rename(columns={'target': 'name'})
        y_test = x_test_df["name"].squeeze().astype(str)
        x_test_df = x_test_df.drop(columns=["name"])

        # Columns holding only 0/1 in the training data are cast to strings in both splits.
        binary_columns = list(x_train_df.columns[x_train_df.isin([0, 1]).all()])
        if len(binary_columns) > 0:
            x_train_df[binary_columns] = x_train_df[binary_columns].astype(str)
            x_test_df[binary_columns] = x_test_df[binary_columns].astype(str)

        feature_names = x_train_df.columns

        for sel_model in models:

            fi_path = f"../results_new/{sel_dataset}/{sel_model}/fi_test.csv"
            file_path = f"../results_all/{sel_dataset}/{sel_model}/ruleset{class_type}.json"

            # Skip models for which either input file is missing.
            if not (os.path.exists(fi_path) and os.path.exists(file_path)):
                continue

            # Global importance table: index = attribute, rank 1 = most important.
            fi = pd.read_csv(fi_path)
            fi_bb = fi.sort_values(by="importance", ascending=False).set_index("attribute")
            fi_bb["bb_rank"] = fi_bb["importance"].rank(method="min", ascending=False)

            with open(file_path, 'r') as json_file:
                ruleset_json_read = json.load(json_file)

            classifier = JSONSerializer.deserialize(ruleset_json_read, target_class=ClassificationRuleSet)
            # `precision` resolves to the function defined in this notebook,
            # which shadows the one imported from decision_rules.measures.
            if "c2" in class_type:
                classifier.update(x_train_df, y_train, measure=c2)
            elif "precision" in class_type:
                classifier.update(x_train_df, y_train, measure=precision)

            # Ordered, de-duplicated feature names of each rule's premise.
            rules_features = [rmatrix_unique_rules(rule, feature_names) for rule in classifier.rules]

            # One (tau, spearman) pair per rule, against the global ranking.
            corr_all = [calculate_correlations2(features, fi_bb) for features in rules_features]

            corr_df = pd.DataFrame({
                'corr_tau_all': [cor[0] for cor in corr_all],
                'corr_sro_all': [cor[1] for cor in corr_all],
            })
            corr_df["dataset"] = sel_dataset
            corr_df["model"] = sel_model
            corr_df["rmatrix"] = rmatrix_label

            corr_frames.append(corr_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
results_all = pd.concat(corr_frames) if corr_frames else pd.DataFrame()

In [None]:
# Write the per-rule correlations collected in `results_all` to CSV, without the index column.
results_all.to_csv(path_or_buf="results/corr_with_fi_global.csv", index=False)

# RMatrix with local ranking

In [None]:
def shap_rank(shap_df, shap_df_row):
    """
    Rank the columns of ``shap_df`` by the absolute value they take in one row.

    Args:
      shap_df: DataFrame whose columns are the attributes to rank.
      shap_df_row: Positional index of the row to use.

    Returns:
      A DataFrame indexed by "attribute", sorted by decreasing "importance"
      (the absolute row value), with a "bb_rank" column where rank 1 is the
      largest importance and ties share the smallest rank of their group.
    """
    abs_importance = np.abs(shap_df.iloc[shap_df_row, :].values)
    ranking = pd.DataFrame({
        'attribute': np.array(shap_df.columns),
        'importance': abs_importance,
    })

    return (
        ranking
        .sort_values(by="importance", ascending=False)
        .set_index("attribute")
        .assign(bb_rank=lambda frame: frame["importance"].rank(method="min", ascending=False))
    )

In [None]:
# Ruleset variant(s) to evaluate; the suffix selects `ruleset{class_type}.json`.
class_types = ["_filterFF_precision_local"]

# One frame per (dataset, model) is collected here and concatenated once after
# the loops, instead of re-copying `results_all` with pd.concat every iteration.
corr_frames = []
results_detailed = pd.DataFrame()  # defined for parity with the other cells; not filled here

for class_type in class_types:

    # Value written to the "rmatrix" column of the output.
    rmatrix_label = "c2" if "c2" in class_type else "precision"

    for sel_dataset in tqdm(datasets):

        models = np.unique(bb_models[bb_models["dataset"] == sel_dataset]["model"])

        # A "target" column is renamed to "name", split off as string labels,
        # and removed from the feature frame.
        x_train_df = pd.read_csv(f"../results_all/{sel_dataset}/train.csv")
        x_train_df = x_train_df.rename(columns={'target': 'name'})
        y_train = x_train_df["name"].squeeze().astype(str)
        x_train_df = x_train_df.drop(columns=["name"])

        # The test split gets the same preprocessing; the correlation
        # computation below does not read it.
        x_test_df = pd.read_csv(f"../results_all/{sel_dataset}/test.csv")
        x_test_df = x_test_df.rename(columns={'target': 'name'})
        y_test = x_test_df["name"].squeeze().astype(str)
        x_test_df = x_test_df.drop(columns=["name"])

        # Columns holding only 0/1 in the training data are cast to strings in both splits.
        binary_columns = list(x_train_df.columns[x_train_df.isin([0, 1]).all()])
        if len(binary_columns) > 0:
            x_train_df[binary_columns] = x_train_df[binary_columns].astype(str)
            x_test_df[binary_columns] = x_test_df[binary_columns].astype(str)

        feature_names = x_train_df.columns

        for sel_model in models:

            # Models whose name starts with "SVC_" (or is exactly "SVC") are skipped.
            if sel_model.split("_")[0] == 'SVC':
                continue

            shap_path = f"../results_all/{sel_dataset}/{sel_model}/shap.csv"
            file_path = f"../results_all/{sel_dataset}/{sel_model}/ruleset{class_type}.json"

            # Skip models for which either input file is missing.
            if not (os.path.exists(shap_path) and os.path.exists(file_path)):
                continue

            shap = pd.read_csv(shap_path)

            with open(file_path, 'r') as json_file:
                ruleset_json_read = json.load(json_file)

            classifier = JSONSerializer.deserialize(ruleset_json_read, target_class=ClassificationRuleSet)
            # `precision` resolves to the function defined in this notebook,
            # which shadows the one imported from decision_rules.measures.
            if "c2" in class_type:
                classifier.update(x_train_df, y_train, measure=c2)
            elif "precision" in class_type:
                classifier.update(x_train_df, y_train, measure=precision)

            # Ordered, de-duplicated feature names of each rule's premise.
            rules_features = [rmatrix_unique_rules(rule, feature_names) for rule in classifier.rules]

            # Rule i is compared against the ranking built from row i of shap.csv.
            # NOTE(review): this assumes rules and SHAP rows are aligned one-to-one
            # and that shap.csv has at least as many rows as there are rules -- confirm.
            corr_all = [
                calculate_correlations2(features, shap_rank(shap, obs))
                for obs, features in enumerate(rules_features)
            ]

            corr_df = pd.DataFrame({
                'corr_tau_all': [cor[0] for cor in corr_all],
                'corr_sro_all': [cor[1] for cor in corr_all],
            })
            corr_df["dataset"] = sel_dataset
            corr_df["model"] = sel_model
            corr_df["rmatrix"] = rmatrix_label

            corr_frames.append(corr_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
results_all = pd.concat(corr_frames) if corr_frames else pd.DataFrame()

In [None]:
# Write the per-rule correlations collected in `results_all` to CSV, without the index column.
results_all.to_csv(path_or_buf="results/corr_with_fi_local.csv", index=False)