In [1]:
#Imports
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import numpy as np
import altair as alt
import matplotlib.pyplot as plt

In [2]:
#Standard Settings:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
alt.renderers.enable('notebook')
alt.data_transformers.enable('default', max_rows=None)
%matplotlib inline 
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 1000)

In [3]:
#Support Code
i = 0
for key, item in basket:
    if (i > 10):
          break
    i=i+1
    print(str(key) + " : " + str(item))

NameError: name 'basket' is not defined

In [4]:
#Load Data
df = pd.read_excel('./data/online_retail.xlsx') #Why did a 20MB file take 1min to load??
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
#Data Cleanup (all at once):
df['Description'] = df['Description'].str.strip()
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df['InvoiceNo'].str.contains('C')]

In [7]:
#This is our 1-Hot encoding transformation. Lets 
#Examine it carefully.

#Select all elements in France. 2-Level grouping with Invoice and Desc, for target, select quantity only
#for mapping.
basket = (df[df['Country'] =="France"].groupby(['InvoiceNo', 'Description'])["Quantity"])

i = 0
for key, item in basket:
    if (i > 10):
          break
    i=i+1
    print(str(key) + " : " + str(item)) # for item, first item is row index? Second is the actual value.

df[df['Description'] =="ALARM CLOCK BAKELIKE GREEN"].head(5)
df.iloc[28] #Verified. 28 is just a row indicies. #Index   #value
    


('536370', 'ALARM CLOCK BAKELIKE GREEN') : 28    12
Name: Quantity, dtype: int64
('536370', 'ALARM CLOCK BAKELIKE PINK') : 26    24
Name: Quantity, dtype: int64
('536370', 'ALARM CLOCK BAKELIKE RED') : 27    24
Name: Quantity, dtype: int64
('536370', 'CHARLOTTE BAG DOLLY GIRL DESIGN') : 38    20
Name: Quantity, dtype: int64
('536370', 'CIRCUS PARADE LUNCH BOX') : 37    24
Name: Quantity, dtype: int64
('536370', 'INFLATABLE POLITICAL GLOBE') : 31    48
Name: Quantity, dtype: int64
('536370', 'LUNCH BOX I LOVE LONDON') : 36    24
Name: Quantity, dtype: int64
('536370', 'MINI JIGSAW CIRCUS PARADE') : 42    24
Name: Quantity, dtype: int64
('536370', 'MINI JIGSAW SPACEBOY') : 43    24
Name: Quantity, dtype: int64
('536370', 'MINI PAINT SET VINTAGE') : 44    36
Name: Quantity, dtype: int64
('536370', 'PANDA AND BUNNIES STICKER SHEET') : 29    12
Name: Quantity, dtype: int64


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
28,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,2010-12-01 08:45:00,3.75,12583.0,France
149,536382,22726,ALARM CLOCK BAKELIKE GREEN,4,2010-12-01 09:45:00,3.75,16098.0,United Kingdom
205,536389,22726,ALARM CLOCK BAKELIKE GREEN,4,2010-12-01 10:03:00,3.75,12431.0,Australia
274,536395,22726,ALARM CLOCK BAKELIKE GREEN,8,2010-12-01 10:47:00,3.75,13767.0,United Kingdom
1226,536531,22726,ALARM CLOCK BAKELIKE GREEN,9,2010-12-01 13:23:00,3.75,15485.0,United Kingdom


InvoiceNo                          536370
StockCode                           22726
Description    ALARM CLOCK BAKELIKE GREEN
Quantity                               12
InvoiceDate           2010-12-01 08:45:00
UnitPrice                            3.75
CustomerID                          12583
Country                            France
Name: 28, dtype: object

In [8]:
#By summing, for 2 level indicies we just grab the quantity on the invoice.
#By unstacking, we perform a cast/pivot and turn Descriptions into columns, with table cells as our summed values.
#We have moved from 1 to 2 D, so most of our rows are filled with NAs. Set them to zero.
#Reset our index, and them turn InvoiceNumber into a row index instead of a column.
basket = basket.sum().unstack().reset_index().fillna(0).set_index('InvoiceNo') 

In [9]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

basket_sets = basket.applymap(encode_units) #apply map applies a function to every cell of a dataframe, and returns
#a dataframe.
basket_sets.drop('POSTAGE', inplace=True, axis=1)
       

### Starting Analysis:

In [10]:
frequent_itemsets = apriori(basket_sets, min_support=0.03, use_colnames=True)


In [11]:
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.030612,(10 COLOUR SPACEBOY PEN)
1,0.035714,(3 PIECE SPACEBOY COOKIE CUTTER SET)
2,0.045918,(36 PENCILS TUBE RED RETROSPOT)
3,0.071429,(4 TRADITIONAL SPINNING TOPS)
4,0.096939,(ALARM CLOCK BAKELIKE GREEN)


In [13]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.sort_values(by="conviction",ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1068,"(PACK OF 6 SKULL PAPER PLATES, SET/6 RED SPOTT...",(SET/6 RED SPOTTY PAPER PLATES),0.033163,0.127551,0.033163,1.0,7.84,0.028933,inf
1099,"(PACK OF 6 SKULL PAPER CUPS, PACK OF 6 SKULL P...",(SET/6 RED SPOTTY PAPER PLATES),0.030612,0.127551,0.030612,1.0,7.84,0.026708,inf
987,"(PACK OF 6 SKULL PAPER PLATES, SET/6 RED SPOTT...",(SET/6 RED SPOTTY PAPER CUPS),0.035714,0.137755,0.035714,1.0,7.259259,0.030794,inf
886,"(MINI PAINT SET VINTAGE, SET/6 RED SPOTTY PAPE...",(SET/6 RED SPOTTY PAPER PLATES),0.035714,0.127551,0.035714,1.0,7.84,0.031159,inf
765,"(PACK OF 6 SKULL PAPER CUPS, SET/6 RED SPOTTY ...",(SET/6 RED SPOTTY PAPER CUPS),0.038265,0.137755,0.038265,1.0,7.259259,0.032994,inf


## Visualizing some simple rules.

In [None]:
import pyvis

In [None]:
rules.sort_values(by="support",ascending=False,inplace=True)
rules.reset_index(inplace=True,drop=True)

In [None]:
rules.head()
rules.shape
type(rules)


In [None]:
#Lets shorten the names, so they fit in our nodes better. Really, we just want to see what our network looks like.

def strends(x):
    for y in x: #Frozensets...a weird artifiact from our Apriori Algorithms. OK.
        thestr = y
    return thestr
    #return thestr[0:6] + thestr[(len(x) - 6):]
    

rules["antecedents"] = rules["antecedents"].apply(strends)
rules["consequents"] = rules["consequents"].apply(strends)








In [None]:
#Lets get a set of all antecedents and consqeuents.
aCSet = {-1}

hold = rules["antecedents"].apply(lambda x: aCSet.add(x))
hold = rules["consequents"].apply(lambda x: aCSet.add(x))
aCSet.remove(-1)

#aCSet


In [None]:
#With this set, we can make a graph now!
from pyvis.network import Network
net = Network()
nodeDict = {}

for i,item in enumerate(aCSet):
    net.add_node(i, label=item) 
    nodeDict[item] = i


In [None]:
#finally, lets add the edges to the network, and use the confidence score as network weight.
for index, row in rules.iterrows():
    net.add_edge(nodeDict[row["antecedents"]],nodeDict[row["consequents"]],weight=row["confidence"])

In [None]:
net.show("mygraph.html")