In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
from itertools import combinations, groupby
from collections import Counter

In [None]:
# Record the interpreter version this notebook was run with.
import sys
print(sys.version)

In [None]:
# Record the numpy and pandas versions in use, one per line.
print(
    "numpy version " + str(np.__version__),
    "pandas version " + str(pd.__version__),
    sep="\n",
)

<h2>Persiapan Data</h2>

In [None]:
# Load the Instacart `order_products__prior` table from the Kaggle input directory.
op_prior = pd.read_csv('../input/instacart-market-basket-analysis/order_products__prior.csv')
# Preview the first rows to see which columns are available.
op_prior.head()

In [None]:
# Goal: recommend pairs of products that are highly likely to be bought together.
# Only order_id and product_id from op_prior are needed; the other attributes say
# nothing about which products were purchased in the same order.
# The product table is deliberately not joined here so processing uses less memory.
op_prior = op_prior.set_index('order_id').loc[:, 'product_id']
op_prior.head()

In [None]:
# NOTE(review): this repeats the preview shown at the end of the previous cell.
op_prior.head()

In [None]:
# Summarise the data: shape, number of distinct orders (index) and distinct items (values).
print(
    'Dimensi: {0}\nBanyak order: {1}\nBanyak item: {2}'.format(
        op_prior.shape,
        op_prior.index.nunique(dropna=False),
        op_prior.nunique(),
    )
)

<h2>Melakukan asosiasi</h2>

In [None]:
def dapat_pasangan_item(order):
    """Yield every unordered pair of distinct items bought in the same order.

    Parameters
    ----------
    order : pandas.Series
        One row per purchased item: the index holds the order id and the
        values hold the item id.

    Yields
    ------
    tuple
        ``(item_a, item_b)`` with ``item_a < item_b``, emitted once for each
        order that contains both items.
    """
    # itertools.groupby only merges *adjacent* rows with the same key, so all
    # rows of one order must be contiguous. Sort only when they are not, to
    # avoid copying an already-ordered (and potentially very large) Series.
    if not order.index.is_monotonic_increasing:
        order = order.sort_index()

    rows = order.reset_index().to_numpy()

    for _, order_rows in groupby(rows, key=lambda row: row[0]):
        # De-duplicate and sort the basket so that an itemset always maps to
        # the same tuple: otherwise (A, B) and (B, A) would be counted as two
        # different pairs depending on the order the items were added.
        basket = sorted({row[1] for row in order_rows})
        yield from combinations(basket, 2)

In [None]:
# Build the association-rule function
def fungsi_asosiasi(order, min_support):
    """Mine pairwise association rules (support, confidence, lift) from orders.

    Parameters
    ----------
    order : pandas.Series
        One row per purchased item: the index holds the order id and the
        values hold the item id.
    min_support : float
        Minimum support expressed as a **percentage of orders** (every support
        in this function is multiplied by 100). It is applied first to single
        items and then to item pairs.

    Returns
    -------
    pandas.DataFrame
        One row per item pair with the columns ``item_A``, ``item_B``,
        ``freqAB``, ``supportAB``, ``freqA``, ``supportA``, ``freqB``,
        ``supportB``, ``confidenceAtoB``, ``confidenceBtoA`` and ``lift``,
        sorted by ``lift`` in descending order. An empty frame with the same
        columns is returned when no order contains two qualifying items.
    """
    result_columns = ['item_A', 'item_B', 'freqAB', 'supportAB',
                      'freqA', 'supportA', 'freqB', 'supportB',
                      'confidenceAtoB', 'confidenceBtoA', 'lift']

    print("ukuran awal order item: {}".format(len(order)))

    # Item frequency and support (percent of all distinct orders).
    item_stats = order.value_counts().to_frame("freq")
    item_stats['support'] = item_stats['freq'] / order.index.nunique() * 100

    # Drop the items whose support is below the minimum.
    items_oke = item_stats[item_stats['support'] >= min_support].index
    order = order[order.isin(items_oke)]

    print("Item dengan support >= {}: {}".format(min_support, len(items_oke)))
    print("order item yang tersisa: {}".format(len(order)))

    # Drop the orders left with fewer than two items: they cannot form a pair.
    order_size = order.index.value_counts()
    order_oke = order_size[order_size >= 2].index
    order = order[order.index.isin(order_oke)]

    print("order tersisa dengan 2 item atau lebih: {}".format(len(order_oke)))
    print("order item yang tersisa : {}".format(len(order)))

    # After the filter above the remaining order ids are exactly `order_oke`,
    # so item and pair supports below share the same denominator.
    n_orders = len(order_oke)

    # Recompute item frequency and support on the surviving orders.
    item_stats = order.value_counts().to_frame("freq")
    item_stats['support'] = item_stats['freq'] / n_orders * 100

    # Count how many orders contain each item pair.
    pair_counts = Counter(dapat_pasangan_item(order))

    print("Banyak pasangan item: {}".format(len(pair_counts)))

    if not pair_counts:
        # An empty Series has no two-level index, so `reset_index` would not
        # produce the item_A/item_B columns and the merges below would raise
        # KeyError. Return an empty result with the documented columns instead.
        print("banyak pasangan item dengan support >= {}: {}\n".format(min_support, 0))
        return pd.DataFrame(columns=result_columns)

    # Pair frequency and support; the tuple keys become a two-level index.
    item_pairs = pd.Series(pair_counts).to_frame("freqAB")
    item_pairs['supportAB'] = item_pairs['freqAB'] / n_orders * 100

    # Drop the pairs whose support is below the minimum.
    item_pairs = item_pairs[item_pairs['supportAB'] >= min_support]

    print("banyak pasangan item dengan support >= {}: {}\n".format(min_support, len(item_pairs)))

    # Build the association-rule table: attach each item's own stats to the pair.
    item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    item_pairs = (
        item_pairs
        .merge(item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'}),
               left_on='item_A', right_index=True)
        .merge(item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'}),
               left_on='item_B', right_index=True)
    )

    # Confidence is a ratio of two percentages, so the factor 100 cancels out.
    item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
    item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
    # lift = P(AB) / (P(A) * P(B)). The supports are percentages, so the
    # denominator carries an extra factor of 100 that must be compensated;
    # without it every lift is 100x too small and `lift > 1` loses its meaning.
    item_pairs['lift'] = item_pairs['supportAB'] * 100 / (item_pairs['supportA'] * item_pairs['supportB'])

    return item_pairs.sort_values('lift', ascending=False)

In [None]:
# Mine the pairwise rules from the prior orders. fungsi_asosiasi multiplies every
# support by 100, so 0.01 here means 0.01 % of orders, not 1 %.
rules = fungsi_asosiasi(op_prior, 0.01)

In [None]:
# Load the product lookup table; it is used below to attach product names to the rules.
products   = pd.read_csv('../input/instacart-market-basket-analysis/products.csv')

In [None]:
# Final column layout of the rules table once item ids are replaced by product names.
columns = [
    'itemA', 'itemB', 'freqAB', 'supportAB', 'freqA', 'supportA', 'freqB', 'supportB',
    'confidenceAtoB', 'confidenceBtoA', 'lift',
]
# Look up the product name for each side of the pair (item_A -> itemA, item_B -> itemB),
# keep only the columns listed above and order the rules by descending lift.
rules = (
    rules
    .merge(products.rename(columns={'product_name': 'itemA'}), left_on='item_A', right_on='product_id')
    .merge(products.rename(columns={'product_name': 'itemB'}), left_on='item_B', right_on='product_id')
    .loc[:, columns]
    .sort_values('lift', ascending=False)
)

In [None]:
# Display the rules table, now labelled with product names and sorted by descending lift.
rules

In [None]:
# Keep only the pair names and their lift for the aisle/department analysis.
data_lift = rules.loc[:, ['itemA', 'itemB', 'lift']]

In [None]:
# Display the reduced (itemA, itemB, lift) table.
data_lift

In [None]:
# The joins further down match on product_name, so the numeric product id is dropped.
products = products.drop(columns=['product_id'])
products

In [None]:
# Load the aisle and department lookup tables; they are merged into `products` below.
aisles = pd.read_csv('../input/instacart-market-basket-analysis/aisles.csv')
departments = pd.read_csv('../input/instacart-market-basket-analysis/departments.csv')

In [None]:
# Attach the aisle and department information to every product via inner joins on their ids.
products = (
    products
    .merge(aisles, on='aisle_id', how='inner')
    .merge(departments, on='department_id', how='inner')
)

In [None]:
# The join keys are no longer needed once the lookup columns have been merged in.
products = products.drop(columns=['aisle_id', 'department_id'])

In [None]:
# Normalise the aisle column name. `rename` silently ignores labels that are not present.
# NOTE(review): 'aisle_x' would only exist if both merged frames had an 'aisle' column;
# this looks like a no-op with the frames merged above — confirm and remove if so.
products = products.rename(columns={'aisle_x': 'aisle'})

In [None]:
# Preview the product table after the merges and column clean-up above.
products.head()

In [None]:
# Attach the aisle and department of each side of the pair by matching on the product name.
# The second merge leaves two helper columns, product_name_x and product_name_y.
# NOTE(review): matching on names assumes product_name is unique in `products` — confirm.
produk_asosiasi = (
    data_lift
    .merge(
        products.rename(columns={'aisle': 'aisleA', 'department': 'departmentA'}),
        left_on='itemA',
        right_on='product_name',
    )
    .merge(
        products.rename(columns={'aisle': 'aisleB', 'department': 'departmentB'}),
        left_on='itemB',
        right_on='product_name',
    )
)

In [None]:
# Remove the duplicated name columns; itemA and itemB already hold the product names.
produk_asosiasi = produk_asosiasi.drop(columns=['product_name_x', 'product_name_y'])

In [None]:
# Preview the pairs together with the aisle and department of each item.
produk_asosiasi.head()

In [None]:
# Keep the pairs whose lift exceeds 1 (the conventional threshold for a positive
# association, assuming `lift` is on its standard scale) and preview them.
pa_lf1 = produk_asosiasi.loc[produk_asosiasi['lift'] > 1]
pa_lf1.head()

In [None]:
# Pairs whose two items belong to the same department.
pa_lf1.loc[pa_lf1['departmentA'].eq(pa_lf1['departmentB'])]

In [None]:
# Pairs whose two items come from different departments.
pa_lf1.loc[pa_lf1['departmentA'].ne(pa_lf1['departmentB'])]

In [None]:
# Pairs whose two items are shelved in the same aisle.
pa_lf1.loc[pa_lf1['aisleA'].eq(pa_lf1['aisleB'])]

In [None]:
# Pairs whose two items are shelved in different aisles.
pa_lf1.loc[pa_lf1['aisleA'].ne(pa_lf1['aisleB'])]