In [None]:
import json
import os
import re

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

from scipy.stats import kendalltau, spearmanr

In [None]:
def mutual_inclusion(R, M, empty_value=1.0):
    """
    Compute the mutual inclusion (Jaccard index) of two collections.

    The score is

        |F(R) n F(M)| / |F(R) U F(M)|

    where n is the intersection and U is the union of the sets built from
    R and M. Duplicates and ordering inside R and M are ignored.

    Args:
      R: An iterable of hashable elements.
      M: An iterable of hashable elements.
      empty_value: Value returned when both R and M are empty. The ratio is
        undefined in that case (0/0); the default of 1.0 treats two empty
        sets as identical.

    Returns:
      A float between 0.0 (no common element) and 1.0 (identical sets), or
      ``empty_value`` when both inputs are empty.
    """
    set_r = set(R)
    set_m = set(M)

    union = set_r | set_m
    if not union:
        # Both inputs are empty: avoid the ZeroDivisionError of 0 / 0.
        return empty_value

    return len(set_r & set_m) / len(union)


def rmatrix_unique_rules(rule_str, feature_names=None):
    """
    Extract the distinct feature names used in an rmatrix rule string.

    The rule is split into conditions on " AND "; within each condition the
    text before the first " = " is taken as the feature name. A condition
    without " = " is kept whole.

    Args:
      rule_str: Rule text such as "a = 1 AND b = 2".
      feature_names: Unused; accepted so the signature matches
        ``anchor_unique_rules``.

    Returns:
      A list of feature names in order of first appearance, without
      duplicates.
    """
    features = [condition.split(" = ")[0] for condition in rule_str.split(" AND ")]

    # dict.fromkeys de-duplicates while keeping first-occurrence order. It
    # replaces pd.unique(list): newer pandas releases deprecate passing a plain
    # list to pd.unique, and this notebook silences warnings globally, so the
    # deprecation would go unnoticed until it becomes an error.
    return list(dict.fromkeys(features))

def anchor_unique_rules(rule_str, feature_names=None):
    """
    Extract the distinct known feature names mentioned in an anchor rule.

    The rule is split into conditions on " AND ". A feature is reported for a
    condition when its name occurs there as a whole token, i.e. not directly
    preceded or followed by a letter, digit or underscore. Plain substring
    matching would report "A1" for a condition such as "A15 <= 3.00".

    NOTE(review): this assumes anchor conditions spell feature names exactly
    as they appear in ``feature_names``. If conditions use derived names
    (e.g. "name_value"), the token boundary will not match them - confirm
    against the rule files.

    Args:
      rule_str: Rule text such as "A15 <= 3.00 AND 1.00 < A2 <= 2.00".
      feature_names: Iterable of candidate feature names (strings). Required
        despite the ``None`` default, which is kept for signature
        compatibility.

    Returns:
      A list of matched feature names in order of first appearance
      (condition order first, then ``feature_names`` order), without
      duplicates. Empty when nothing matches.

    Raises:
      TypeError: If ``feature_names`` is None.
    """
    if feature_names is None:
        raise TypeError("feature_names is required to parse an anchor rule")

    # One compiled whole-token pattern per feature, reused for every condition.
    patterns = [
        (feature, re.compile(rf"(?<!\w){re.escape(feature)}(?!\w)"))
        for feature in feature_names
    ]

    features = [
        feature
        for condition in rule_str.split(" AND ")
        for feature, pattern in patterns
        if pattern.search(condition)
    ]

    # Order-preserving de-duplication without relying on pd.unique(list).
    return list(dict.fromkeys(features))

def calculate_correlations(fi_rule, fi_bb):
    """
    Correlate the order of features in a rule with their black-box ranks.

    The features in ``fi_rule`` are given positions 1..n in the order they
    appear; those positions are compared with the matching ``bb_rank`` values
    looked up in ``fi_bb``.

    Args:
      fi_rule: List of feature names; each must be a label in ``fi_bb``'s index.
      fi_bb: DataFrame indexed by feature name with a "bb_rank" column.

    Returns:
      A tuple ``(kendall_tau, spearman_rho)``. A statistic that comes back as
      NaN is replaced by 0.
    """
    def _nan_as_zero(value):
        # NaN statistics are reported as 0.
        if np.isnan(value):
            return 0
        return value

    bb_ranks = fi_bb.loc[fi_rule]["bb_rank"].values
    positions = list(range(1, len(fi_rule) + 1))

    tau_result = kendalltau(positions, bb_ranks)
    rho_result = spearmanr(positions, bb_ranks)

    return _nan_as_zero(tau_result[0]), _nan_as_zero(rho_result[0])

In [None]:
# Dataset names; each one selects the input files
# results_anchor/<name>_mincov3_train.csv and results_anchor/<name>_mincov3_fi.csv.
datasets = [
    "car",
    "credit-a",
    "breast-w",
]

In [None]:
# Per-dataset result frames are collected in a list and concatenated once after
# the loop, instead of growing a DataFrame with pd.concat on every iteration.
results_per_dataset = []

for sel_dataset in tqdm(datasets):

    # One row per observation, holding the rmatrix rule and the anchor rule.
    rules_df = pd.read_csv(f"results_anchor/{sel_dataset}_mincov3_train.csv", sep=";")

    # Black-box feature importances: sorted from most to least important and
    # ranked so that rank 1 is the most important feature (ties share the
    # smallest rank).
    fi = pd.read_csv(f"results_anchor/{sel_dataset}_mincov3_fi.csv")
    fi = fi.sort_values("fi", ascending=False).set_index("vars")
    fi["bb_rank"] = fi["fi"].rank(method="min", ascending=False)
    features = list(fi.index)

    rmatrix_rules = rules_df["rmatrix_rule"].values
    anchor_rules = rules_df["anchor_rule"].values

    # Parse every rule exactly once; all metrics below reuse these lists.
    rmatrix_feats = [rmatrix_unique_rules(rule, features) for rule in rmatrix_rules]
    anchor_feats = [anchor_unique_rules(rule, features) for rule in anchor_rules]

    # Agreement between the two explanations of the same observation.
    mi_rmatrix_anchor = [mutual_inclusion(r, a) for r, a in zip(rmatrix_feats, anchor_feats)]

    # Agreement of each rule with all features ("_all") and with the top-k
    # most important features, k being the number of features in the rule
    # ("_len").
    mi_rmatrix_fi_all = [mutual_inclusion(r, features) for r in rmatrix_feats]
    mi_rmatrix_fi_len = [mutual_inclusion(r, features[:len(r)]) for r in rmatrix_feats]

    mi_anchor_fi_all = [mutual_inclusion(a, features) for a in anchor_feats]
    mi_anchor_fi_len = [mutual_inclusion(a, features[:len(a)]) for a in anchor_feats]

    # Rank correlation between feature order in the rule and black-box rank;
    # each entry is a (kendall_tau, spearman_rho) pair.
    corr_rmatrix = [calculate_correlations(r, fi) for r in rmatrix_feats]
    corr_anchor = [calculate_correlations(a, fi) for a in anchor_feats]

    results = pd.DataFrame({'mi_rmatrix_anchor': mi_rmatrix_anchor,
                            'mi_rmatrix_fi_all': mi_rmatrix_fi_all, 'mi_rmatrix_fi_len': mi_rmatrix_fi_len,
                            'mi_anchor_fi_all': mi_anchor_fi_all, 'mi_anchor_fi_len': mi_anchor_fi_len,
                            'corr_tau_rmatrix': [tau for tau, _ in corr_rmatrix],
                            'corr_tau_anchor': [tau for tau, _ in corr_anchor],
                            # Spearman values were previously computed and then
                            # dropped; they are stored as additional columns.
                            'corr_sro_rmatrix': [rho for _, rho in corr_rmatrix],
                            'corr_sro_anchor': [rho for _, rho in corr_anchor],
                           })

    results["dataset"] = sel_dataset
    results_per_dataset.append(results)

# pd.concat raises on an empty list, so fall back to an empty frame.
results_all = pd.concat(results_per_dataset) if results_per_dataset else pd.DataFrame()


In [None]:
# Write the combined results to a single CSV file, without the row index.
results_all.to_csv(
    "results_anchor/anchor_mi_corr.csv",
    index=False,
)