In [1]:
import pandas as pd
import itertools

In [2]:
products = pd.read_csv('products.csv')
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [3]:
data = pd.read_csv('order_products__train.csv')
data.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [4]:
products_id = data.product_id.unique()
min_support = 0.01
n = len(data.order_id.unique())

In [5]:
def find_l1(item_list, n, min_support):
    l1 = []
    counts = data['product_id'].value_counts()
    for id in item_list:
        support = counts[id]/n
        if(support >= min_support):
            l1.append([id, support])
    l1 = pd.DataFrame(l1, columns=['product_set', 'support'])
    return l1

In [6]:
def get_unique_itens_from_sets(product_sets, k):
    ids = []
    for tup in product_sets['product_set']:
        for x in range(k):
            ids.append(tup[x])

    return list(set(ids))

In [7]:
def get_support(item_set, n):
    counts_item_set = data[data['product_id'].isin(item_set)].groupby(['order_id']).count()#.drop_duplicates()
    counts_item_set = len(counts_item_set[counts_item_set['product_id'] == len(item_set)])
    return item_set, (counts_item_set / n)

In [8]:
def get_support_matrix(min_support, data):
    k = 2
    columns = ['product_set', 'support']
    support_matrix = find_l1(products_id, n, min_support)
    items_support = pd.DataFrame(support_matrix, columns = columns)
    k1 = items_support.copy()
    #support_matrix = pd.DataFrame(support_matrix, columns=['product_set', 'support'])
    super_set_not_empty = True
    set_for_test = support_matrix.product_set.unique()
    #calc support
    while super_set_not_empty:
        items_id = list(itertools.combinations(set_for_test,k))
        items_support_k = [get_support(item_set, n) for item_set in items_id]
        items_support_k = pd.DataFrame(items_support_k, columns=columns)
        items_support_k = items_support_k[items_support_k['support'] > min_support]
        items_support = pd.concat([items_support, items_support_k], axis=0)
        
        if len(items_support_k) > 0:
            set_for_test = get_unique_itens_from_sets(items_support_k, k)
            k+=1
        else:
            super_set_not_empty = False
    
    return items_support.reset_index(drop=True), k1.reset_index(drop=True)


In [9]:
support_matrix, k1 = get_support_matrix(0.01, data)

In [10]:
def trust_matrix(support_matrix, k1):
    trusts = pd.DataFrame(columns=['product_set', 'support'])
    for index, item_set in support_matrix.iterrows():
        trust = calc_trust(item_set)
        trusts = pd.concat([trusts,trust], ignore_index=True)
            
    trusts['trust'] = trusts.apply(lambda row:
    row['support'] / k1[k1['product_set'] == row['product_set'][0]]['support'].item() 
    if type(row['product_set']) is tuple else 1
    , axis=1)

    
    return trusts.reset_index(drop=True)

In [11]:
def calc_trust(item_row):
    PRODUCT_SET = 'product_set'
    SUPPORT = 'support'
    set_size = len(item_row[PRODUCT_SET]) if (type(item_row[PRODUCT_SET]) is tuple) else 1
    row_with_trust = []
    if set_size == 1:
        row_with_trust.append([item_row[PRODUCT_SET], item_row[SUPPORT]])
    else:
        for row in list(itertools.permutations(item_row[PRODUCT_SET],set_size)):
            row_with_trust.append([row, item_row[SUPPORT]])
    row_with_trust = pd.DataFrame(row_with_trust, columns=[PRODUCT_SET, SUPPORT])#, columns=[PRODUCT_SET])
    return row_with_trust
    

In [12]:
trust_m = trust_matrix(support_matrix, k1)

In [13]:
def lift_matrix(trust_m, k1):
    lift_m = trust_m
    lift_m['lift'] = lift_m.apply(lambda row:
    row['support'] / (k1[k1['product_set'] == row['product_set'][0]]['support'].item() 
    * k1[k1['product_set'] == row['product_set'][0]]['support'].item()) 
    if type(row['product_set']) is tuple else 1
    , axis=1)
    lift_m.reset_index(drop=True)
    return lift_m.sort_values(by='lift', ascending=False)

In [14]:
lift_m = lift_matrix(trust_m, k1)
lift_m = lift_m[lift_m['lift'] > 1]
lift_m['rule'] = lift_m.apply(lambda row: 
products[products['product_id'] == row['product_set'][0]]['product_name'].item() + '=>'+
products[products['product_id'] == row['product_set'][1]]['product_name'].item(), axis=1)
lift_m = lift_m[['rule', 'product_set', 'support', 'trust', 'lift']]

In [15]:
lift_m

Unnamed: 0,rule,product_set,support,trust,lift
107,Organic Raspberries=>Bag of Organic Bananas,"(27966, 13176)",0.013566,0.320952,7.593184
114,Organic Raspberries=>Organic Strawberries,"(27966, 21137)",0.012728,0.301118,7.123942
121,Strawberries=>Banana,"(16797, 24852)",0.014847,0.299969,6.060773
105,Organic Hass Avocado=>Bag of Organic Bananas,"(47209, 13176)",0.018444,0.331825,5.969893
131,Limes=>Large Lemon,"(26209, 47626)",0.012156,0.264379,5.749865
129,Organic Avocado=>Banana,"(47766, 24852)",0.016889,0.299096,5.296808
127,Limes=>Banana,"(26209, 24852)",0.010144,0.22062,4.798163
123,Large Lemon=>Banana,"(47626, 24852)",0.016447,0.265274,4.278583
112,Organic Hass Avocado=>Organic Strawberries,"(47209, 21137)",0.011729,0.211024,3.796556
109,Organic Strawberries=>Bag of Organic Bananas,"(21137, 13176)",0.023428,0.282174,3.398543
