In [25]:
import pandas as pd

def process_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    - Integer dtypes -> float64
    - For object/string/categorical columns:
        * If unique non-null values <= 15: keep as object
        * If unique non-null values > 15: keep top 15, others -> "Unknown"
    - Other dtypes left unchanged
    """
    out = df.copy()

    for col in out.columns:
        s = out[col]

        # 1) Integers -> float64
        if pd.api.types.is_integer_dtype(s):
            out[col] = s.astype("float64")
            continue

        # 2) Non-numeric categoricals/strings
        is_cat_like = (
            pd.api.types.is_object_dtype(s)
            or isinstance(s.dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(s)
        )
        if is_cat_like:
            n_unique = s.nunique(dropna=True)
            if n_unique > 10:
                top15 = s.value_counts(dropna=True).index[:10]
                out[col] = s.where(s.isna() | s.isin(top15), "Unknown").astype("object")
            else:
                out[col] = s.astype("object")

    return out

In [26]:
df = process_df(pd.read_csv("../assets/online_news.csv"))

In [27]:
import numpy as np
import FIRM.base.fuzzy_data as fuzzy_data
from FIRM.methods.AARFI import AARFI

In [28]:
dataset = df.copy()
dataset = dataset.iloc[:, :20]
# find integer columns and convert them to float
int_cols = dataset.select_dtypes(include=['int']).columns
dataset[int_cols] = dataset[int_cols].astype(float)
fuzzy_dataset = fuzzy_data.FuzzyDataQuantiles('symmetric', dataset, 3, ['L', 'M', 'H'])

In [29]:
dataset.info

<bound method DataFrame.info of         n_tokens_title   n_tokens_content   n_unique_tokens  \
0                 12.0              219.0          0.663594   
1                  9.0              255.0          0.604743   
2                  9.0              211.0          0.575130   
3                  9.0              531.0          0.503788   
4                 13.0             1072.0          0.415646   
...                ...                ...               ...   
39639             11.0              346.0          0.529052   
39640             12.0              328.0          0.696296   
39641             10.0              442.0          0.516355   
39642              6.0              682.0          0.539493   
39643             10.0              157.0          0.701987   

        n_non_stop_words   n_non_stop_unique_tokens   num_hrefs  \
0                    1.0                   0.815385         4.0   
1                    1.0                   0.791946         3.0   
2         

In [30]:
import FIRM.base.operators.implications as implications
import FIRM.base.operators.tnorms as tnorms
I = lambda x, y: 1 - x + x * (y**0.01)
T = lambda x, y: np.maximum(x + y - 1, 0)
#I = implications.ImplicationsExamples.get_fuzzy_implication(implications.ImplicationsExamples.IGNORE)
#T = tnorms.TnormsExamples.get_tnorm(tnorms.TnormsExamples.PRODUCT)
rules = AARFI(dataset, fuzzy_dataset, T, I, min_cov=0.2, min_supp=0.2, min_conf=0.7, max_feat=3)
measures = rules.measures(fuzzy_dataset)
measures

KeyboardInterrupt: 

In [298]:
import FIRM.base.ct_fuzzy_rule as fuzzy_rule
rule1 = fuzzy_rule.CRFuzzyRule([(2, 0), (1, 0)])
rule2 = fuzzy_rule.CRFuzzyRule([(1, 0), (2, 0)])
rule1.evaluate_rule_database(dataset, fuzzy_dataset, T, I)
rule2.evaluate_rule_database(dataset, fuzzy_dataset, T, I)
print(rule1.sentence_rule(fuzzy_dataset))
print("  Support:", rule1.fsupport())
print("  Confidence:", rule1.fconfidence())

print(rule2.sentence_rule(fuzzy_dataset))
print("  Support:", rule2.fsupport())
print("  Confidence:", rule2.fconfidence())

IF ( HouseMedianAge IS L ) THEN Latitude IS L
  Support: 0.16615977883338928
  Confidence: 0.4448797546619245
IF ( Latitude IS L ) THEN HouseMedianAge IS L
  Support: 0.1868894100189209
  Confidence: 0.4817239778529963


In [299]:
# Extract measures
num_rules = len(measures['num_features'])
fcoverage_mean, fcoverage_std = np.mean(measures['fcoverage']), np.std(measures['fcoverage'])
fsupport_mean, fsupport_std = np.mean(measures['fsupport']), np.std(measures['fsupport'])
fconfidence_mean, fconfidence_std = np.mean(measures['fconfidence']), np.std(measures['fconfidence'])
fwracc_mean, fwracc_std = np.mean(measures['fwracc']), np.std(measures['fwracc'])

# Print results
print('num rules:', num_rules)
print(f'fcoverage: mean={fcoverage_mean:.4f}, std={fcoverage_std:.4f}')
print(f'fsupport: mean={fsupport_mean:.4f}, std={fsupport_std:.4f}')
print(f'fconfidence: mean={fconfidence_mean:.4f}, std={fconfidence_std:.4f}')

num rules: 36
fcoverage: mean=0.3381, std=0.0575
fsupport: mean=0.3021, std=0.0512
fconfidence: mean=0.8972, std=0.0757


In [31]:
data = dataset.copy()

for i in range(len(fuzzy_dataset.fv_list)):
    labels = getattr(fuzzy_dataset.fv_list[i], "get_labels")
    labels = labels() if callable(labels) else labels
    
    data[dataset.columns[i]] = dataset[dataset.columns[i]].map(
        lambda x: fuzzy_dataset.fv_list[i].eval_max_fuzzy_set(x)
    )

df_encoded = pd.get_dummies(data,columns=data.columns)

In [32]:
from mlxtend.frequent_patterns import apriori, association_rules

df = apriori(df_encoded, min_support=0.2, use_colnames=True, verbose=1)
df_ar = association_rules(df, metric="confidence", min_threshold=0.7)
# Keep only rules with <=3 items in antecedent and <=1 in consequent
df_rules_filtered = df_ar[
    (df_ar['antecedents'].apply(len) <= 3) &
    (df_ar['consequents'].apply(len) <= 1)
].reset_index(drop=True)

rules_sorted = df_rules_filtered.sort_values(by="confidence", ascending=False).reset_index(drop=True)
rules_sorted


Processing 13 combinations | Sampling itemset size 131210


  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,( n_tokens_title_H),( num_imgs_L),0.294572,1.000000,0.294572,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.294572,0.000000,0.647286
1,"( num_videos_L, num_hrefs_L, data_channel_is...",( data_channel_is_entertainment_L),0.429422,1.000000,0.429422,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.429422,0.000000,0.714711
2,"( num_hrefs_L, average_token_length_L, data_...",( num_videos_L),0.204823,1.000000,0.204823,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.204823,0.000000,0.602411
3,"( num_videos_L, num_hrefs_L, average_token_l...",( data_channel_is_tech_L),0.204823,1.000000,0.204823,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.204823,0.000000,0.602411
4,"( data_channel_is_tech_L, num_hrefs_L, avera...",( num_videos_L),0.204823,1.000000,0.204823,1.000000,1.000000,1.0,0.000000,inf,0.000000,0.204823,0.000000,0.602411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14737,"( n_tokens_content_L, data_channel_is_socmed_...",( num_hrefs_L),0.338185,0.429422,0.240440,0.710972,1.655649,1.0,0.095216,1.974127,0.598365,0.456098,0.493447,0.635444
14738,"( data_channel_is_tech_L, n_tokens_content_L,...",( num_hrefs_L),0.338185,0.429422,0.240440,0.710972,1.655649,1.0,0.095216,1.974127,0.598365,0.456098,0.493447,0.635444
14739,"( n_tokens_content_L, data_channel_is_world_L...",( num_hrefs_L),0.338185,0.429422,0.240440,0.710972,1.655649,1.0,0.095216,1.974127,0.598365,0.456098,0.493447,0.635444
14740,"( n_tokens_content_L, kw_min_min_L, n_non_st...",( num_hrefs_L),0.338185,0.429422,0.240440,0.710972,1.655649,1.0,0.095216,1.974127,0.598365,0.456098,0.493447,0.635444


In [302]:
# Number of rules (rows)
num_rules = len(rules_sorted)

# Compute mean and std only for support and confidence
support_mean, support_std = rules_sorted['support'].mean(), rules_sorted['support'].std()
confidence_mean, confidence_std = rules_sorted['confidence'].mean(), rules_sorted['confidence'].std()

# Print results
print(f'num rules: {num_rules}')
print(f'support: mean={support_mean:.4f}, std={support_std:.4f}')
print(f'confidence: mean={confidence_mean:.4f}, std={confidence_std:.4f}')


num rules: 89
support: mean=0.3019, std=0.0521
confidence: mean=0.9151, std=0.0931


In [303]:
rules_sorted.iloc[0]['antecedents']

frozenset({'Latitude_L', 'Longitude_H'})

In [304]:
import pandas as pd
import math

def _tokens_from_group(group):
    """Return a list of 'Var_Label' tokens from frozenset/set/list/tuple or string."""
    if group is None or (isinstance(group, float) and math.isnan(group)):
        return []
    if isinstance(group, (set, frozenset, list, tuple)):
        return [str(x).strip() for x in group if str(x).strip()]
    if isinstance(group, str):
        s = group.strip()
        if not s:
            return []
        if s.startswith("(") and s.endswith(")"):
            s = s[1:-1].strip()
        return [t.strip() for t in s.split(",")] if "," in s else [s]
    return [str(group).strip()]

def _build_var_and_label_maps(dataset_columns, fuzzy_dataset):
    """
    Returns:
      var_to_idx: {var_name -> position i}
      label_maps: {var_name -> {label_string -> label_index}}
    Labels come from fuzzy_dataset.fv_list[i].get_labels()
    """
    cols_list = list(dataset_columns.tolist()) if hasattr(dataset_columns, "tolist") else list(dataset_columns)
    var_to_idx = {name: i for i, name in enumerate(cols_list)}

    label_maps = {}
    for var_name, i in var_to_idx.items():
        # CALL the method to get the actual label list
        labels = list(fuzzy_dataset.fv_list[i].get_labels)
        # keep original strings; also add a case-insensitive shim
        per_var = {str(lab): j for j, lab in enumerate(labels)}
        label_maps[var_name] = per_var
    return var_to_idx, label_maps

def _resolve_label_idx(var: str, lab: str, label_maps: dict) -> int:
    """
    Map a label string to its index using label_maps[var].
    Tries exact, then case-insensitive exact, then first-letter (if unique).
    """
    per_var = label_maps[var]
    if lab in per_var:
        return per_var[lab]
    # case-insensitive exact
    for k in per_var.keys():
        if k.lower() == lab.lower():
            return per_var[k]
    # First-letter fallback, only if unique
    fl = lab[:1].lower()
    candidates = [name for name in per_var.keys() if name[:1].lower() == fl]
    if len(candidates) == 1:
        return per_var[candidates[0]]
    raise KeyError(f"Unknown label {lab!r} for variable {var!r}. Known labels: {list(per_var.keys())}")

def _pair_from_token(token: str, var_to_idx: dict, label_maps: dict) -> tuple[int, int]:
    """Map 'Var_Label' -> (var_idx, label_idx) using dataset column order + fuzzy_dataset labels."""
    if "_" not in token:
        raise ValueError(f"Expected 'Var_Label', got: {token!r}")
    var, lab = token.rsplit("_", 1)  # last underscore splits var from label
    var = var.strip()
    lab = lab.strip()
    if var not in var_to_idx:
        raise KeyError(f"Variable {var!r} not found in dataset columns: {list(var_to_idx.keys())}")
    var_idx = var_to_idx[var]
    lab_idx = _resolve_label_idx(var, lab, label_maps)
    return (var_idx, lab_idx)

def _group_to_pairs(group, var_to_idx, label_maps):
    tokens = _tokens_from_group(group)
    pairs = [_pair_from_token(t, var_to_idx, label_maps) for t in tokens]
    # Deterministic order (since sets/frozensets are unordered)
    pairs.sort(key=lambda x: x[0])
    return pairs

def _pick_consequent(con_pairs):
    """
    Choose one consequent tuple from con_pairs.
    - If exactly one, use it.
    - If multiple, pick the one with the highest variable index (you can change to lowest).
    """
    if not con_pairs:
        raise ValueError("Rule has no consequent.")
    if len(con_pairs) == 1:
        return con_pairs[0]
    # pick by highest var index (change to min if you prefer)
    return max(con_pairs, key=lambda x: x[0])

def df_to_crfuzzyrules(df: pd.DataFrame, dataset_columns, fuzzy_dataset):
    """
    Build CRFuzzyRule objects from df['antecedents'] and df['consequents'].
    Each CRFuzzyRule receives [*antecedents, consequent] (consequent appended last).
    Returns (rules, consequents_as_singleton_lists) to keep backward compatibility.
    """
    var_to_idx, label_maps = _build_var_and_label_maps(dataset_columns, fuzzy_dataset)


    rules, consequents = [], []
    for _, row in df.iterrows():
        ant_pairs = _group_to_pairs(row["antecedents"], var_to_idx, label_maps)
        con_pairs = _group_to_pairs(row["consequents"], var_to_idx, label_maps)
        c = _pick_consequent(con_pairs)
        combined = ant_pairs + [c]          # consequent at the end
        rules.append(fuzzy_rule.CRFuzzyRule(combined))
        consequents.append([c])             # keep a reference to the chosen consequent
    return rules, consequents

# ---------------------------
# Usage (unchanged for your call site)
dataset_columns = dataset.columns
rules_crisp, cons = df_to_crfuzzyrules(rules_sorted, dataset_columns, fuzzy_dataset)

In [305]:
from FIRM.base.ct_set_fuzzy_rules import SetFuzzyRules

In [306]:
rules1 = SetFuzzyRules(rules.rule_list[:max(1, math.ceil(0.20 * len(rules.rule_list)))])
rules2 = SetFuzzyRules(rules_crisp[:max(1, math.ceil(0.20 * len(rules_crisp)))])
rules1.jaccard_similarity(rules2)

0.0