In [250]:
from scipy.io import arff
import pandas as pd
import numpy as np
from itertools import combinations


In [251]:
def load_arff(path):    
    data = arff.loadarff(path)
    df = pd.DataFrame(data[0])
    df.dropna()
    df = pd.get_dummies(df)
    return df

In [252]:
def generate_new_combinations(old_combinations):
    items_types_in_previous_step = np.unique(old_combinations.flatten())
    for old_combination in old_combinations:
        max_combination = old_combination[-1]
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        for item in valid_items:
            yield from old_tuple
            yield item


def apriori(df, min_support=0.5, use_colnames=True):
    def _support(_x):
        return _x.mean(0).reshape(-1)

    if min_support <= 0.0:
        raise ValueError(
            "`min_support` doit etre positive "
            "il doit etre compris dans l'interval `(0, 1]`. "
            "minsup %s." % min_support
        )
    
    X = df.values
    support = _support(X)
    ary_col_idx = np.arange(X.shape[1])
    support_dict = {1: support[support >= min_support]}
    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
    max_itemset = 1

    while True :
        next_max_itemset = max_itemset + 1
        combin = generate_new_combinations(itemset_dict[max_itemset])
        combin = np.fromiter(combin, dtype=int)
        combin = combin.reshape(-1, next_max_itemset)
        
        if combin.size == 0:
            break

        _bools = np.all(X[:, combin], axis=2)
        support = _support(np.array(_bools))
        _mask = (support >= min_support).reshape(-1)
        
        if any(_mask):
            itemset_dict[next_max_itemset] = np.array(combin[_mask])
            support_dict[next_max_itemset] = np.array(support[_mask])
            max_itemset = next_max_itemset
        else:
            # Exit condition
            break
    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]], dtype="object")

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ["support", "itemsets"]

    res_df = res_df.reset_index(drop=True)

    return res_df


In [253]:
df = pd.read_csv('./profexamle1.csv')
df

Unnamed: 0,A,B,C,D,E
0,1,0,1,0,1
1,1,1,1,0,0
2,1,0,0,1,1
3,0,1,1,0,1
4,0,1,1,0,0


In [254]:
itemsets = apriori(df, min_support=0.4)
itemsets

Unnamed: 0,support,itemsets
0,0.6,(0)
1,0.6,(1)
2,0.8,(2)
3,0.6,(4)
4,0.4,"(0, 2)"
5,0.4,"(0, 4)"
6,0.6,"(1, 2)"
7,0.4,"(2, 4)"


In [255]:
def association_rules(df, metric="confidence", min_threshold=0.8, support_only=False):
    
    # metrics for association rules
    metric_dict = {
        "antecedent support": lambda _, sA, __: sA,
        "consequent support": lambda _, __, sC: sC,
        "support": lambda sAC, _, __: sAC,
        "confidence": lambda sAC, sA, _: sAC / sA,
        "lift": lambda sAC, sA, sC: metric_dict["confidence"](sAC, sA, sC) / sC,
        "leverage": lambda sAC, sA, sC: metric_dict["support"](sAC, sA, sC) - sA * sC,
    }

    columns_ordered = [
        "antecedent support",
        "consequent support",
        "support",
        "confidence",
        "lift",
        "leverage",
    ]

    # check for metric compliance
    if support_only:
        metric = "support"
    else:
        if metric not in metric_dict.keys():
            raise ValueError(
                "Metric must be 'confidence' or 'lift', got '{}'".format(metric)
            )

    # get dict of {frequent itemset} -> support
    keys = df["itemsets"].values
    values = df["support"].values
    frozenset_vect = np.vectorize(lambda x: frozenset(x))
    frequent_items_dict = dict(zip(frozenset_vect(keys), values))

    # prepare buckets to collect frequent rules
    rule_antecedents = []
    rule_consequents = []
    rule_supports = []

    # iterate over all frequent itemsets
    for k in frequent_items_dict.keys():
        sAC = frequent_items_dict[k]
        # to find all possible combinations
        for idx in range(len(k) - 1, 0, -1):
            # of antecedent and consequent
            for c in combinations(k, r=idx):
                antecedent = frozenset(c)
                consequent = k.difference(antecedent)

                if support_only:
                    # support doesn't need these,
                    # hence, placeholders should suffice
                    sA = None
                    sC = None

                else:
                    try:
                        sA = frequent_items_dict[antecedent]
                        sC = frequent_items_dict[consequent]
                    except KeyError as e:
                        s = (
                            str(e) + "You are likely getting this error"
                            " because the DataFrame is missing "
                            " antecedent and/or consequent "
                            " information."
                            " You can try using the "
                            " `support_only=True` option"
                        )
                        raise KeyError(s)
                    # check for the threshold

                score = metric_dict[metric](sAC, sA, sC)
                if score >= min_threshold:
                    rule_antecedents.append(antecedent)
                    rule_consequents.append(consequent)
                    rule_supports.append([sAC, sA, sC])

    # check if frequent rule was generated
    if not rule_supports:
        return pd.DataFrame(columns=["antecedents", "consequents"] + columns_ordered)

    else:
        # generate metrics
        rule_supports = np.array(rule_supports).T.astype(float)
        df_res = pd.DataFrame(
            data=list(zip(rule_antecedents, rule_consequents)),
            columns=["antecedents", "consequents"],
        )

        if support_only:
            sAC = rule_supports[0]
            for m in columns_ordered:
                df_res[m] = np.nan
            df_res["support"] = sAC

        else:
            sAC = rule_supports[0]
            sA = rule_supports[1]
            sC = rule_supports[2]
            for m in columns_ordered:
                df_res[m] = metric_dict[m](sAC, sA, sC)

        return df_res

In [256]:
association_rules(itemsets, metric="confidence", min_threshold=0.4, support_only=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage
0,(0),(2),0.6,0.8,0.4,0.666667,0.833333,-0.08
1,(2),(0),0.8,0.6,0.4,0.5,0.833333,-0.08
2,(0),(4),0.6,0.6,0.4,0.666667,1.111111,0.04
3,(4),(0),0.6,0.6,0.4,0.666667,1.111111,0.04
4,(1),(2),0.6,0.8,0.6,1.0,1.25,0.12
5,(2),(1),0.8,0.6,0.6,0.75,1.25,0.12
6,(2),(4),0.8,0.6,0.4,0.5,0.833333,-0.08
7,(4),(2),0.6,0.8,0.4,0.666667,0.833333,-0.08
