In [278]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import squarify
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [279]:
# Load the order line items: one row per product within an order.
df = pd.read_csv('order_details.csv')
df.head()

Unnamed: 0,orderID,productID,unitPrice,quantity,discount
0,10248,11,14.0,12,0.0
1,10248,42,9.8,10,0.0
2,10248,72,34.8,5,0.0
3,10249,14,18.6,9,0.0
4,10249,51,42.4,40,0.0


In [280]:
# Load the product catalogue, which carries the productName for each productID.
dc = pd.read_csv('products.csv')
dc.head(20)

Unnamed: 0,productID,productName,supplierID,categoryID,quantityPerUnit,unitPrice,unitsInStock,unitsOnOrder,reorderLevel,discontinued
0,1,Chai,1,1,10 boxes x 20 bags,18.0,39,0,10,0
1,2,Chang,1,1,24 - 12 oz bottles,19.0,17,40,25,0
2,3,Aniseed Syrup,1,2,12 - 550 ml bottles,10.0,13,70,25,0
3,4,Chef Anton's Cajun Seasoning,2,2,48 - 6 oz jars,22.0,53,0,0,0
4,5,Chef Anton's Gumbo Mix,2,2,36 boxes,21.35,0,0,0,1
5,6,Grandma's Boysenberry Spread,3,2,12 - 8 oz jars,25.0,120,0,25,0
6,7,Uncle Bob's Organic Dried Pears,3,7,12 - 1 lb pkgs.,30.0,15,0,10,0
7,8,Northwoods Cranberry Sauce,3,2,12 - 12 oz jars,40.0,6,0,0,0
8,9,Mishi Kobe Niku,4,6,18 - 500 g pkgs.,97.0,29,0,0,1
9,10,Ikura,4,8,12 - 200 ml jars,31.0,31,0,0,0


In [281]:
# The discount column is not used by the basket analysis, so drop it.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=["discount"], errors="ignore")

In [282]:
# Preview the order lines to confirm the discount column was removed.
df.head()

Unnamed: 0,orderID,productID,unitPrice,quantity
0,10248,11,14.0,12
1,10248,42,9.8,10
2,10248,72,34.8,5
3,10249,14,18.6,9
4,10249,51,42.4,40


In [283]:
# Only productID (join key) and productName (label) are needed from the catalogue;
# discard the remaining attributes. Rebinding with errors="ignore" keeps the cell
# idempotent, so a second run does not fail on already-removed columns.
dc = dc.drop(
    columns=[
        "supplierID",
        "categoryID",
        "quantityPerUnit",
        "unitPrice",
        "unitsInStock",
        "unitsOnOrder",
        "reorderLevel",
        "discontinued",
    ],
    errors="ignore",
)

In [284]:
# Preview the trimmed product table (productID and productName remain).
dc.head()

Unnamed: 0,productID,productName
0,1,Chai
1,2,Chang
2,3,Aniseed Syrup
3,4,Chef Anton's Cajun Seasoning
4,5,Chef Anton's Gumbo Mix


In [285]:
# Show the order lines once more before joining them with the product names.
df.head()

Unnamed: 0,orderID,productID,unitPrice,quantity
0,10248,11,14.0,12
1,10248,42,9.8,10
2,10248,72,34.8,5
3,10249,14,18.6,9
4,10249,51,42.4,40


In [286]:
# Attach the product name to every order line by joining on the shared productID key.
result = df.merge(dc, how='inner', on='productID')

In [287]:
# Inspect the merged frame: each order line now carries its productName.
result.head()

Unnamed: 0,orderID,productID,unitPrice,quantity,productName
0,10248,11,14.0,12,Queso Cabrales
1,10296,11,16.8,12,Queso Cabrales
2,10327,11,16.8,50,Queso Cabrales
3,10353,11,16.8,12,Queso Cabrales
4,10365,11,16.8,24,Queso Cabrales


In [288]:
# Trim surrounding whitespace from product names so identical products share one
# label, then discard any row that has no order identifier.
result = (
    result
    .assign(productName=result['productName'].str.strip())
    .dropna(subset=['orderID'])
)

In [299]:
# Pivot the order lines into an order x product matrix of summed quantities:
# one row per orderID, one column per productName, and 0 where the product
# does not appear in the order.
basket = (
    result
    .groupby(['orderID', 'productName'])['quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('orderID')
)
basket.head()

productName,Alice Mutton,Aniseed Syrup,Boston Crab Meat,Camembert Pierrot,Carnarvon Tigers,Chai,Chang,Chartreuse verte,Chef Anton's Cajun Seasoning,Chef Anton's Gumbo Mix,...,Teatime Chocolate Biscuits,Thüringer Rostbratwurst,Tofu,Tourtière,Tunnbröd,Uncle Bob's Organic Dried Pears,Valkoinen suklaa,Vegie-spread,Wimmers gute Semmelknödel,Zaanse koeken
orderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10252,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [298]:
def encode_units(x):
    """Collapse a purchased quantity into a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Quantity of one product within one order.

    Returns
    -------
    int
        1 when ``x`` is strictly positive, otherwise 0.
    """
    # The previous pair of checks (`x <= 0`, `x >= 1`) fell through and returned
    # None for fractional quantities in (0, 1) and for NaN. A single comparison
    # guarantees an integer result for every numeric input; NaN compares False
    # and therefore maps to 0.
    return 1 if x > 0 else 0
# One-hot encode the basket: every cell becomes 1 if the product was bought in
# that order and 0 otherwise.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use the new name when this pandas version has it and fall back otherwise.
if hasattr(pd.DataFrame, "map"):
    basket_sets = basket.map(encode_units)
else:
    basket_sets = basket.applymap(encode_units)

basket_sets.head()

(830, 77)

In [291]:
# Mine the itemsets whose support is at least 0.01 (min_support), keeping product
# names rather than column indices in the result.
# The 0/1 frame is cast to bool first: mlxtend expects boolean input, and recent
# releases emit a DeprecationWarning (and run slower) for non-boolean frames.
frequent_itemsets = apriori(basket_sets.astype(bool), min_support=0.01, use_colnames=True)
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.044578,(Alice Mutton)
1,0.014458,(Aniseed Syrup)
2,0.049398,(Boston Crab Meat)
3,0.061446,(Camembert Pierrot)
4,0.03253,(Carnarvon Tigers)


In [292]:
# Record how many items each frequent itemset contains.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Show the single-item itemsets (length == 1) whose support is at least 2%.
frequent_itemsets[(frequent_itemsets['length'] == 1) &
                  (frequent_itemsets['support'] >= 0.02)]

Unnamed: 0,support,itemsets,length
0,0.044578,(Alice Mutton),1
2,0.049398,(Boston Crab Meat),1
3,0.061446,(Camembert Pierrot),1
4,0.03253,(Carnarvon Tigers),1
5,0.045783,(Chai),1
6,0.053012,(Chang),1
7,0.036145,(Chartreuse verte),1
8,0.024096,(Chef Anton's Cajun Seasoning),1
10,0.028916,(Côte de Blaye),1
11,0.021687,(Escargots de Bourgogne),1
