In [71]:
import pandas as pd
import warnings
# NOTE(review): called with only the action argument, this installs an
# "ignore" filter that matches every warning category for the rest of the
# session, so library deprecation notices are hidden as well. Consider
# narrowing it (category=/module=) — TODO confirm which warnings were meant
# to be silenced.
warnings.filterwarnings('ignore')

In [72]:
# Toy transaction database from the lecture notes:
# transaction id -> comma-separated item names.
txn_dict = {
    '10': 'apple,carrot,durian',
    '20': 'banana,carrot,edamame',
    '30': 'apple,banana,carrot,edamame',
    '40': 'banana,edamame',
}

# Load the mapping as a one-column frame (column label 0), pick that column
# and split every transaction string on commas. The result is a Series of
# item lists indexed by transaction id.
trans_df = (
    pd.DataFrame.from_dict(txn_dict, orient='index')[0]
    .str.split(',')
)

trans_df

10             [apple, carrot, durian]
20           [banana, carrot, edamame]
30    [apple, banana, carrot, edamame]
40                   [banana, edamame]
Name: 0, dtype: object

In [77]:
import pandas as pd
from preprocessor import preprocess

def prune(data, supp):
    """Drop itemsets whose support count is below the threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``supp_count`` column.
    supp : number
        Minimum support count; rows with ``supp_count >= supp`` are kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels. ``data`` itself
        is not modified.
    """
    meets_threshold = data['supp_count'] >= supp
    return data[meets_threshold]

def count_itemset(transaction_df, itemsets):
    """Count how many transactions contain each candidate itemset.

    Parameters
    ----------
    transaction_df : iterable of iterables
        One entry per transaction, each an iterable of hashable items (the
        notebook passes a Series of lists of strings). It is consumed exactly
        once, so a one-shot iterator is fine.
    itemsets : iterable of tuple
        Candidate itemsets. Each candidate is used as a dictionary key and
        must therefore be hashable.

    Returns
    -------
    pandas.DataFrame
        Columns ``item_sets`` and ``supp_count`` with a default 0..n-1 index,
        in the order the candidates were supplied. Candidates contained in no
        transaction are omitted, so the frame is empty when nothing matches.
    """
    # Build each transaction's set once, not once per candidate. This also
    # avoids re-iterating `transaction_df`, which would silently yield nothing
    # on the second pass if a generator/iterator were passed in.
    baskets = [set(row) for row in transaction_df]

    support = {}
    for item_set in itemsets:
        wanted = set(item_set)
        hits = sum(1 for basket in baskets if wanted <= basket)
        if hits:
            # .get() keeps the accumulate-on-repeat behaviour should the same
            # candidate appear more than once in `itemsets`.
            support[item_set] = support.get(item_set, 0) + hits

    return pd.DataFrame({
        'item_sets': list(support.keys()),
        'supp_count': list(support.values()),
    })

def count_item(trans_items):
    """Compute the support count of every individual item.

    Parameters
    ----------
    trans_items : iterable of iterables
        One entry per transaction, each an iterable of hashable items (the
        notebook passes a Series of lists of strings).

    Returns
    -------
    pandas.DataFrame
        Columns ``item_sets`` (the item) and ``supp_count`` (number of
        transactions containing it), sorted by ``item_sets``. The index is not
        reset after sorting, so its labels reflect first-seen order.
    """
    support = {}
    for row in trans_items:
        # Support is the number of transactions containing the item, so an
        # item repeated inside one transaction must count once. dict.fromkeys
        # de-duplicates while keeping first-seen order, which keeps the
        # resulting index labels deterministic.
        for item in dict.fromkeys(row):
            support[item] = support.get(item, 0) + 1

    data = pd.DataFrame({
        'item_sets': list(support.keys()),
        'supp_count': list(support.values()),
    })
    return data.sort_values('item_sets')

def join(list_of_items):
    """Generate size-(k+1) candidate itemsets from frequent size-k itemsets.

    Every entry is paired with each entry that follows it:

    * two single items (strings) that differ become the pair ``(a, b)``;
    * two tuples that share everything but their last element are merged
      into the first tuple plus the last element of the second.

    Parameters
    ----------
    list_of_items : iterable
        Frequent itemsets of one size, either all strings or all tuples
        (the notebook passes the ``item_sets`` column of a DataFrame).

    Returns
    -------
    list of tuple or None
        The candidates in generation order, or ``None`` when no candidate
        can be formed (callers test for ``None`` to stop iterating).
    """
    # Work on a plain list so that pairing is purely positional and does not
    # depend on the index labels of a pandas Series.
    items = list(list_of_items)

    itemsets = []
    for pos, entry in enumerate(items):
        for item in items[pos + 1:]:
            if isinstance(item, str):
                if entry != item:
                    itemsets.append((entry, item))
            elif entry[:-1] == item[:-1] and entry != item:
                # Append only the last element of `item`. Using item[1:]
                # is correct for pairs only; for longer itemsets it would
                # re-append the shared prefix, e.g. (a, b, c, b, d).
                itemsets.append(entry + item[-1:])

    if not itemsets:
        return None
    return itemsets

def apriori(trans_data, supp=10, verbose=True):
    """Run level-wise Apriori and return the largest frequent itemsets found.

    Starting from single-item counts, each round prunes the current level to
    the itemsets meeting ``supp``, joins the survivors into next-size
    candidates and counts those candidates against the transactions.

    Parameters
    ----------
    trans_data : iterable of iterables
        Transactions, each an iterable of items (the notebook passes a Series
        of lists of strings).
    supp : number, default 10
        Minimum support count.
    verbose : bool, default True
        When true, print the progress labels and ``display`` the intermediate
        frames exactly as before. ``display`` is not defined in this block (in
        a notebook it is supplied by IPython); pass ``False`` to skip all
        tracing, which also avoids that dependency.

    Returns
    -------
    pandas.DataFrame
        The pruned frame (``item_sets``, ``supp_count``) of the last level
        that still had at least one frequent itemset. Earlier levels are
        overwritten, so smaller frequent itemsets are not included. An empty
        frame without columns is returned when nothing is frequent.
    """
    freq = pd.DataFrame()

    df = count_item(trans_data)
    if verbose:
        print('count_item')
        display(df)

    while len(df) != 0:
        df = prune(df, supp)
        if verbose:
            print('pruned df')
            display(df)

        # prune() has already removed every row below `supp`, so a non-empty
        # frame is frequent by construction. This replaces a check that called
        # int() on a Series, which pandas has deprecated.
        if len(df) > 0:
            freq = df
            if verbose:
                print('set freq to df')

        itemsets = join(df.item_sets)
        if verbose:
            print('itemsets', itemsets)

        if itemsets is None:
            if verbose:
                print('itemsets none => ending')
            return freq

        df = count_itemset(trans_data, itemsets)
        if verbose:
            print('count_itemset')
            display(df)

    return freq


def _convert_apriori_to_set(apriori_fi_df):
    freq_itemsets = set()
    for _ , row in apriori_fi_df.iterrows():
        freq_itemsets.add(frozenset(row['item_sets']))
    return freq_itemsets


def freq_itemset_partition_db(dataset_csv_file, num_partitions, min_supp, preprocess_dataset=True, **kwargs):
    """Run the level-wise itemset search chunk by chunk over a CSV file.

    The file is read with ``pd.read_csv(..., header=None)`` in chunks. Each
    chunk is optionally passed through ``preprocess``, indexed by ``tid``, and
    its ``items`` column is split on commas. The prune / join / count loop is
    then run on that chunk, and the counted candidates are accumulated across
    chunks.

    Parameters
    ----------
    dataset_csv_file : path or buffer
        Forwarded to ``pd.read_csv``.
    num_partitions : int
        Forwarded to ``pd.read_csv`` as ``chunksize``.
        NOTE(review): that makes it the number of rows per chunk, not the
        number of partitions the name suggests — confirm intent.
    min_supp : number
        Minimum support count applied within each chunk.
    preprocess_dataset : bool, default True
        Whether to pass every chunk through ``preprocess`` (imported from the
        project's ``preprocessor`` module; its behaviour is not visible here).
    **kwargs
        Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns
    -------
    dict
        Candidate itemset (tuple) -> support count summed over all chunks.
        NOTE(review): counts are recorded right after counting, before the
        next prune, so candidates below ``min_supp`` are included (the
        recorded run returns a count of 2 with min_supp=3), and single items
        are never recorded — confirm this is intended.
    """
    support_by_itemset = {}
    chunk_reader = pd.read_csv(dataset_csv_file, chunksize=num_partitions, low_memory=True, engine='c', header=None, **kwargs)

    for chunk in chunk_reader:
        if preprocess_dataset:
            print('preprocessing')
            chunk = preprocess(chunk)
            print('preprocessed df')
            display(chunk)

        # Series of item lists, indexed by transaction id.
        transactions = chunk.set_index('tid')['items'].str.split(',')
        print(type(transactions))

        level_counts = count_item(transactions)
        print('partition df count')
        display(level_counts)

        while len(level_counts) != 0:
            level_counts = prune(level_counts, min_supp)
            print('pruned df')
            display(level_counts)

            candidates = join(level_counts.item_sets)
            print('itemsets', candidates)

            if candidates is None:
                print('none ending')
                break

            level_counts = count_itemset(transactions, candidates)
            print('count itemset')
            display(level_counts)

            print('now adding to pi dict')
            # Label lookups 0..n-1 assume the default integer index that
            # count_itemset puts on the frame it builds.
            for row_label in range(len(level_counts)):
                itemset = level_counts.item_sets[row_label]
                count = level_counts.supp_count[row_label]
                if itemset in support_by_itemset:
                    support_by_itemset[itemset] += count
                else:
                    support_by_itemset[itemset] = count

    return support_by_itemset

In [78]:
# Run Apriori on the lecture example with a minimum support count of 3.
# The returned frame is the cell's last expression, so it is shown as output.
apriori(trans_data=trans_df, supp=3)

count_item


Unnamed: 0,item_sets,supp_count
0,apple,2
3,banana,3
1,carrot,3
2,durian,1
4,edamame,3


pruned df


Unnamed: 0,item_sets,supp_count
3,banana,3
1,carrot,3
4,edamame,3


set freq to df
itemsets [('banana', 'carrot'), ('banana', 'edamame'), ('carrot', 'edamame')]
count_itemset


Unnamed: 0,item_sets,supp_count
0,"(banana, carrot)",2
1,"(banana, edamame)",3
2,"(carrot, edamame)",2


pruned df


Unnamed: 0,item_sets,supp_count
1,"(banana, edamame)",3


set freq to df
itemsets None
itemsets none => ending


Unnamed: 0,item_sets,supp_count
1,"(banana, edamame)",3


In [79]:
# Run the chunked variant on the dummy CSV. The function reads the file with
# header=None, so `names` (forwarded to pd.read_csv through **kwargs) supplies
# the 'tid' and 'items' column labels it relies on.
# NOTE(review): num_partitions is handed to read_csv as `chunksize`, so 4 here
# means 4 rows per chunk — confirm that is what is wanted.
# NOTE(review): with preprocess_dataset=False the recorded output shows tokens
# such as "['apple'" and " 'edamame']", so the raw `items` strings appear to be
# list reprs that are split on commas without cleanup — confirm whether
# preprocessing should be enabled for this file.
p = freq_itemset_partition_db('data/dummy_data.csv', 
                            num_partitions=4, 
                            min_supp=3,
                            preprocess_dataset=False,
                            names=['tid', 'items'])

<class 'pandas.core.series.Series'>
partition df count


Unnamed: 0,item_sets,supp_count
5,'banana',1
1,'carrot',3
2,'durian'],1
4,'edamame'],3
0,['apple',2
3,['banana',2


pruned df


Unnamed: 0,item_sets,supp_count
1,'carrot',3
4,'edamame'],3


itemsets [(" 'carrot'", " 'edamame']")]
count itemset


Unnamed: 0,item_sets,supp_count
0,"( 'carrot', 'edamame'])",2


now adding to pi dict
pruned df


Unnamed: 0,item_sets,supp_count


itemsets None
none ending


{(" 'carrot'", " 'edamame']"): 2}