In [982]:
from scipy.io import arff
import pandas as pd
import numpy as np
from itertools import combinations



In [983]:
def load_arff(path):
    """Read an ARFF file into a DataFrame, decoding byte-string columns to text.

    Parameters
    ----------
    path : str or file-like
        Forwarded unchanged to ``scipy.io.arff.loadarff``.

    Returns
    -------
    pandas.DataFrame
        One column per attribute; every object-dtype column is decoded
        from UTF-8 bytes to ``str``, other columns are left as loaded.
    """
    records, _meta = arff.loadarff(path)
    frame = pd.DataFrame(records)
    # Collect the names first so the frame is not modified while scanning dtypes.
    text_columns = [name for name, kind in frame.dtypes.items() if kind == object]
    for name in text_columns:
        frame[name] = frame[name].str.decode('utf-8')
    return frame

In [984]:
def discretize(df, bins=4, precision=0):
    """Replace every float/int column of ``df`` with equal-width interval bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to discretize. It is modified IN PLACE; the same object is
        returned so the call can be used in an assignment.
    bins : int or sequence, default 4
        Forwarded to ``pandas.cut`` (number of equal-width bins, or explicit
        bin edges).
    precision : int, default 0
        Forwarded to ``pandas.cut``; precision used for the interval labels.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with numeric columns turned into categorical intervals.
        Columns of any other dtype are left untouched.
    """
    for col, dtype in df.dtypes.items():
        # Only the platform default float/int dtypes are binned, as before.
        if dtype == float or dtype == int:
            df[col] = pd.cut(df[col], bins, precision=precision)
    return df


In [985]:
# Load the numeric weather dataset and bin its numeric columns in a single step.
df = discretize(load_arff("./../profdata/weather.numeric.arff"))
df

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,"(80.0, 85.0]","(80.0, 88.0]",False,no
1,sunny,"(80.0, 85.0]","(88.0, 96.0]",True,no
2,overcast,"(80.0, 85.0]","(80.0, 88.0]",False,yes
3,rainy,"(69.0, 74.0]","(88.0, 96.0]",False,yes
4,rainy,"(64.0, 69.0]","(73.0, 80.0]",False,yes
5,rainy,"(64.0, 69.0]","(65.0, 73.0]",True,no
6,overcast,"(64.0, 69.0]","(65.0, 73.0]",True,yes
7,sunny,"(69.0, 74.0]","(88.0, 96.0]",False,no
8,sunny,"(64.0, 69.0]","(65.0, 73.0]",False,yes
9,rainy,"(74.0, 80.0]","(73.0, 80.0]",False,yes


In [986]:
# One-hot encode the categorical columns into sparse boolean indicator columns.
df = pd.get_dummies(data=df, sparse=True, dtype=bool)
df

Unnamed: 0,outlook_overcast,outlook_rainy,outlook_sunny,"temperature_(64.0, 69.0]","temperature_(69.0, 74.0]","temperature_(74.0, 80.0]","temperature_(80.0, 85.0]","humidity_(65.0, 73.0]","humidity_(73.0, 80.0]","humidity_(80.0, 88.0]","humidity_(88.0, 96.0]",windy_FALSE,windy_TRUE,play_no,play_yes
0,False,False,True,False,False,False,True,False,False,True,False,True,False,True,False
1,False,False,True,False,False,False,True,False,False,False,True,False,True,True,False
2,True,False,False,False,False,False,True,False,False,True,False,True,False,False,True
3,False,True,False,False,True,False,False,False,False,False,True,True,False,False,True
4,False,True,False,True,False,False,False,False,True,False,False,True,False,False,True
5,False,True,False,True,False,False,False,True,False,False,False,False,True,True,False
6,True,False,False,True,False,False,False,True,False,False,False,False,True,False,True
7,False,False,True,False,True,False,False,False,False,False,True,True,False,True,False
8,False,False,True,True,False,False,False,True,False,False,False,True,False,False,True
9,False,True,False,False,False,True,False,False,True,False,False,True,False,False,True


In [987]:
from itertools import combinations

def support(df, frequent_itemsets: list):
    """Compute the support of each itemset over the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Indicator frame; columns are addressed by integer position.
    frequent_itemsets : iterable
        Each element is a collection of column positions forming one itemset.

    Returns
    -------
    numpy.ndarray
        One value per itemset: the mean over rows of the row-wise product of
        the selected columns (the fraction of rows where all of them are set).
    """
    fractions = []
    for column_positions in frequent_itemsets:
        selected = df.iloc[:, column_positions]
        # The row-wise product is 1 only when every selected column is truthy.
        all_present = selected.prod(axis=1)
        fractions.append(all_present.mean())
    return np.array(fractions)

def generate_candidates1(df):
    """Return every column position of ``df`` as a 1-item candidate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_columns, 1)`` holding ``0 .. n_columns - 1``.
    """
    n_columns = df.shape[1]
    return np.arange(n_columns)[:, np.newaxis]

def generate_candidates2(frequent_itemsets):
    """Pair up frequent 1-itemsets into all 2-item candidates.

    Parameters
    ----------
    frequent_itemsets : sequence
        Frequent 1-itemsets; only the first element of each entry is used.

    Returns
    -------
    numpy.ndarray
        One row ``[a, b]`` per unordered pair, in the order the items were
        given (``a`` always precedes ``b`` in the input). Empty when fewer
        than two items are supplied.
    """
    singles = [itemset[0] for itemset in frequent_itemsets]
    pairs = []
    for position, first in enumerate(singles):
        for second in singles[position + 1:]:
            pairs.append([first, second])
    return np.array(pairs)
   
def generate_candidatesK(df, frequent_itemsets, k):
    """Join (k-1)-itemsets that share their first ``k - 2`` items into k-candidates.

    Parameters
    ----------
    df : pandas.DataFrame
        Unused; kept so existing callers keep working.
    frequent_itemsets : sequence of sequences
        The (k-1)-itemsets to join. Each one is expected to list its items in
        ascending order, because the join compares positional prefixes.
    k : int
        Size of the candidates to produce.

    Returns
    -------
    numpy.ndarray
        One row per candidate, items in ascending order. Empty when no two
        itemsets share a prefix.
    """
    prefix_len = k - 2
    candidates = []
    for left, right in combinations(frequent_itemsets, 2):
        if np.array_equal(left[:prefix_len], right[:prefix_len]):
            # Sort the union: set iteration order is not ascending in general
            # (e.g. {1, 8} iterates as 8, 1), and unsorted rows would break the
            # prefix comparison when these candidates are joined at level k+1.
            candidates.append(sorted(set(left) | set(right)))
    return np.array(candidates)

def apriori(df, min_support, use_colnames=True):
    """Mine frequent itemsets level by level (Apriori).

    Parameters
    ----------
    df : pandas.DataFrame
        Indicator frame with one column per item.
    min_support : float
        Minimum support (fraction of rows) an itemset needs to be kept.
    use_colnames : bool, default True
        Currently unused; kept for interface compatibility.

    Returns
    -------
    (dict, dict)
        ``itemset_dict[k]`` is an array with one row of column positions per
        frequent k-itemset; ``support_dict[k]`` is the matching column vector
        of supports (shape ``(n, 1)``). Only levels with at least one frequent
        itemset are present, so both dicts are empty when nothing is frequent.
    """
    itemset_dict = {}
    support_dict = {}

    k = 1
    candidates = generate_candidates1(df)
    while True:
        supparr = support(df, candidates)
        is_frequent = supparr >= min_support
        frequent = candidates[is_frequent]
        if frequent.shape[0] == 0:
            # No frequent k-itemsets: larger ones cannot exist either.
            return itemset_dict, support_dict

        itemset_dict[k] = frequent
        support_dict[k] = supparr[is_frequent].reshape(-1, 1)

        # Build the next level from the FREQUENT itemsets only, so infrequent
        # ones are pruned before any larger candidate is formed.
        k += 1
        if k == 2:
            candidates = generate_candidates2(frequent)
        else:
            candidates = generate_candidatesK(df, frequent, k)




In [988]:
def aprioriclose(df, min_support):
    """Mine the closed frequent itemsets of ``df``.

    A frequent itemset is closed when no proper superset has the same support.
    Adding an item can never increase support, so it is enough to compare each
    k-itemset with the frequent (k+1)-itemsets: if any larger superset had
    equal support, some (k+1)-superset would too, and it would be frequent.

    Parameters
    ----------
    df : pandas.DataFrame
        Indicator frame with one column per item.
    min_support : float
        Minimum support passed to ``apriori``.

    Returns
    -------
    (dict, dict)
        Same layout as the result of ``apriori`` but restricted to closed
        itemsets. Levels without any closed itemset are omitted.

    Notes
    -----
    Non-closed itemsets are dropped, so the result does not hold the support
    of every subset. Consumers that look up subset supports (such as
    confidence-based rule generation) need the full ``apriori`` output.
    """
    itemset_dict, support_dict = apriori(df, min_support)
    closed_itemset_dict = {}
    closed_support_dict = {}
    for k in itemset_dict:
        # Frequent itemsets one level up are the only possible absorbers.
        larger_sets = [frozenset(row) for row in itemset_dict.get(k + 1, [])]
        larger_supports = np.ravel(support_dict.get(k + 1, []))

        is_closed = []
        for row, row_support in zip(itemset_dict[k], np.ravel(support_dict[k])):
            items = frozenset(row)
            absorbed = any(
                items < bigger and row_support == bigger_support
                for bigger, bigger_support in zip(larger_sets, larger_supports)
            )
            is_closed.append(not absorbed)
        is_closed = np.array(is_closed, dtype=bool)

        if is_closed.any():
            closed_itemset_dict[k] = itemset_dict[k][is_closed]
            closed_support_dict[k] = support_dict[k][is_closed]
    return closed_itemset_dict, closed_support_dict

In [989]:
def dict_to_df(df, itemset_dict, support_dict, use_colnames=False):
    """Flatten per-level itemset/support dicts into one result table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the itemsets index into; only its column labels are used, and
        only when ``use_colnames`` is true.
    itemset_dict : dict
        Maps a level ``k`` to an iterable of itemsets (column positions).
    support_dict : dict
        Maps the same levels to the matching supports.
    use_colnames : bool, default False
        Replace column positions by the column labels of ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``support`` (float) and ``itemsets`` (frozenset), ordered by
        ascending level, with a fresh 0..n-1 index. An empty table with the
        same two columns is returned when the dicts are empty.
    """
    supports = []
    itemsets = []
    for k in sorted(itemset_dict):
        supports.extend(np.asarray(support_dict[k], dtype=float).ravel())
        itemsets.extend(frozenset(items) for items in itemset_dict[k])

    if use_colnames:
        labels = list(df.columns)
        itemsets = [frozenset(labels[i] for i in items) for items in itemsets]

    # Build the table once; explicit dtypes keep the empty case well-typed and
    # stop pandas from trying to unpack the frozensets.
    return pd.DataFrame({
        "support": pd.Series(supports, dtype=float),
        "itemsets": pd.Series(itemsets, dtype="object"),
    })

In [990]:
# Rule generation looks up the support of every antecedent and consequent, so it
# needs the complete frequent-itemset collection, which `apriori` provides.
# The supports are bound to `itemset_supports`, not `support`: rebinding `support`
# would overwrite the `support()` helper that `apriori` calls and make a re-run
# of this cell fail with "'dict' object is not callable".
itemsets, itemset_supports = apriori(df, min_support=0.1)
itemsss = dict_to_df(df, itemsets, itemset_supports, use_colnames=True)

In [991]:
def association_rules1(df, metric="confidence", min_threshold=0.8, support_only=False):
    """Generate association rules from a table of frequent itemsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``itemsets`` (iterables of items) and
        ``support``. Unless ``support_only`` is set, it has to contain the
        support of every antecedent and consequent of every candidate rule.
    metric : str, default "confidence"
        Metric compared against ``min_threshold``. One of "antecedent support",
        "consequent support", "support", "confidence", "lift", "leverage".
        Ignored (forced to "support") when ``support_only`` is true.
    min_threshold : float, default 0.8
        A rule is kept when its ``metric`` value is ``>= min_threshold``.
    support_only : bool, default False
        Only compute rule support; every other metric column is NaN. Use this
        when the supports of sub-itemsets are not available in ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``antecedents``, ``consequents`` followed by the six metric
        columns. The table is empty (columns only) when no rule qualifies,
        including when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``metric`` is not a supported metric name.
    KeyError
        If the support of an antecedent or consequent is missing from ``df``
        and ``support_only`` is false.
    """
    # metrics for association rules; each takes (sAC, sA, sC)
    metric_dict = {
        "antecedent support": lambda _, sA, __: sA,
        "consequent support": lambda _, __, sC: sC,
        "support": lambda sAC, _, __: sAC,
        "confidence": lambda sAC, sA, _: sAC / sA,
        "lift": lambda sAC, sA, sC: metric_dict["confidence"](sAC, sA, sC) / sC,
        "leverage": lambda sAC, sA, sC: metric_dict["support"](sAC, sA, sC) - sA * sC,
    }

    # Output column order follows the insertion order of the metric table.
    columns_ordered = list(metric_dict)

    # check for metric compliance
    if support_only:
        metric = "support"
    elif metric not in metric_dict:
        raise ValueError(
            "Metric must be one of {}, got '{}'".format(
                ", ".join("'{}'".format(name) for name in columns_ordered), metric
            )
        )

    # {frequent itemset} -> support. A plain comprehension (rather than
    # np.vectorize) also works when the table has no rows.
    frequent_items_dict = {
        frozenset(itemset): itemset_support
        for itemset, itemset_support in zip(df["itemsets"].values, df["support"].values)
    }

    # buckets to collect the qualifying rules
    rule_antecedents = []
    rule_consequents = []
    rule_supports = []

    for itemset, sAC in frequent_items_dict.items():
        # Every non-empty proper subset acts once as the antecedent,
        # largest antecedents first.
        for antecedent_size in range(len(itemset) - 1, 0, -1):
            for combo in combinations(itemset, r=antecedent_size):
                antecedent = frozenset(combo)
                consequent = itemset.difference(antecedent)

                if support_only:
                    # support doesn't need these, placeholders suffice
                    sA = None
                    sC = None
                else:
                    try:
                        sA = frequent_items_dict[antecedent]
                        sC = frequent_items_dict[consequent]
                    except KeyError as e:
                        s = (
                            str(e) + "You are likely getting this error"
                            " because the DataFrame is missing "
                            " antecedent and/or consequent "
                            " information."
                            " You can try using the "
                            " `support_only=True` option"
                        )
                        raise KeyError(s) from e

                # check for the threshold
                score = metric_dict[metric](sAC, sA, sC)
                if score >= min_threshold:
                    rule_antecedents.append(antecedent)
                    rule_consequents.append(consequent)
                    rule_supports.append([sAC, sA, sC])

    # no rule passed the threshold
    if not rule_supports:
        return pd.DataFrame(columns=["antecedents", "consequents"] + columns_ordered)

    # rows become (sAC, sA, sC) vectors; None placeholders turn into NaN
    rule_supports = np.array(rule_supports).T.astype(float)
    df_res = pd.DataFrame(
        data=list(zip(rule_antecedents, rule_consequents)),
        columns=["antecedents", "consequents"],
    )

    sAC = rule_supports[0]
    if support_only:
        for m in columns_ordered:
            df_res[m] = np.nan
        df_res["support"] = sAC
    else:
        sA = rule_supports[1]
        sC = rule_supports[2]
        for m in columns_ordered:
            df_res[m] = metric_dict[m](sAC, sA, sC)

    return df_res

In [992]:
# Keep rules whose confidence reaches 0.8 (the function's default settings, spelled out).
association_rules1(itemsss, metric="confidence", min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage
0,(outlook_overcast),(play_yes),0.285714,0.642857,0.285714,1.0,1.555556,0.102041
1,"(temperature_(69.0, 74.0])","(humidity_(88.0, 96.0])",0.285714,0.357143,0.285714,1.0,2.8,0.183673
2,"(temperature_(74.0, 80.0])",(play_yes),0.142857,0.642857,0.142857,1.0,1.555556,0.05102
3,"(humidity_(80.0, 88.0])","(temperature_(80.0, 85.0])",0.142857,0.285714,0.142857,1.0,3.5,0.102041
4,"(humidity_(73.0, 80.0])",(windy_FALSE),0.214286,0.571429,0.214286,1.0,1.75,0.091837
5,"(humidity_(73.0, 80.0])",(play_yes),0.214286,0.642857,0.214286,1.0,1.555556,0.076531
6,"(humidity_(80.0, 88.0])",(windy_FALSE),0.142857,0.571429,0.142857,1.0,1.75,0.061224
7,"(windy_FALSE, outlook_overcast)","(temperature_(80.0, 85.0])",0.142857,0.285714,0.142857,1.0,3.5,0.102041
8,"(temperature_(80.0, 85.0], outlook_overcast)",(windy_FALSE),0.142857,0.571429,0.142857,1.0,1.75,0.061224
9,"(play_yes, temperature_(80.0, 85.0])",(outlook_overcast),0.142857,0.285714,0.142857,1.0,3.5,0.102041


In [None]:
def association_rules():
    