### Sample program for Association Analysis (Market Basket Analysis) using FP-Growth  

#### Import libraries  

In [1]:
import pandas as pd
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [2]:
csv_in = 'online_retail_small_cleaned.csv'

#### Read CSV file  

In [3]:
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
print(df.info())
display(df.head())

(59145, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59145 entries, 0 to 59144
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    59145 non-null  int64  
 1   StockCode    59145 non-null  object 
 2   Description  59145 non-null  object 
 3   Quantity     59145 non-null  int64  
 4   InvoiceDate  59145 non-null  object 
 5   UnitPrice    59145 non-null  float64
 6   CustomerID   59145 non-null  int64  
 7   Country      59145 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 3.6+ MB
None


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536386,84880,WHITE WIRE EGG HOLDER,36,2010-12-01 09:57:00,4.95,16029,United Kingdom
1,536386,85099C,JUMBO BAG BAROQUE BLACK WHITE,100,2010-12-01 09:57:00,1.65,16029,United Kingdom
2,536386,85099B,JUMBO BAG RED RETROSPOT,100,2010-12-01 09:57:00,1.65,16029,United Kingdom
3,536404,22297,HEART IVORY TRELLIS SMALL,24,2010-12-01 11:29:00,1.25,16218,United Kingdom
4,536404,22771,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN,12,2010-12-01 11:29:00,1.25,16218,United Kingdom


#### Check appearance of each StockCode  

In [4]:
top_sc = df['StockCode'].value_counts()
print(top_sc.size)
print(top_sc.head())

3140
85123A    315
85099B    246
22423     229
84879     213
47566     208
Name: StockCode, dtype: int64


#### Format transaction data  
Before:  
```
invoice1, stockcode1, ...  
invoice1, stockcode2, ...  
invoice2, stockcode1, ...  
invoice3, stockcode1, ...  
invoice3, stockcode3, ...  
```

After:  
```
[  
  [stockcode1, stockcode2],    
  [stockcode1],  
  [stockcode1, stockcode3],  
  ...  
]  
```

and stockcode should be interger.  

#### Assign an integer (ID) to each stockcode  

In [5]:
id2sc = sorted(list(set(df['StockCode'])))
sc2id = {}
for i in range(len(id2sc)):
    sc2id[id2sc[i]] = i

In [6]:
df['StockCode_ID'] = df['StockCode'].map(lambda x: sc2id[x])
display(df.head())

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,StockCode_ID
0,536386,84880,WHITE WIRE EGG HOLDER,36,2010-12-01 09:57:00,4.95,16029,United Kingdom,2775
1,536386,85099C,JUMBO BAG BAROQUE BLACK WHITE,100,2010-12-01 09:57:00,1.65,16029,United Kingdom,2907
2,536386,85099B,JUMBO BAG RED RETROSPOT,100,2010-12-01 09:57:00,1.65,16029,United Kingdom,2906
3,536404,22297,HEART IVORY TRELLIS SMALL,24,2010-12-01 11:29:00,1.25,16218,United Kingdom,1078
4,536404,22771,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN,12,2010-12-01 11:29:00,1.25,16218,United Kingdom,1517


#### Grouping transaction data by InvoiceNo  

In [19]:
invoices = []
for r in df.groupby('InvoiceNo'):
    # print(len(r))  # debug
    # print(r[0])  # debug
    # print(r[1])  # debug
    #break  # debug
    s1 = set(r[1]['StockCode_ID'])
    invoices.append(list(s1))
print(len(invoices))
print(invoices)

492, 1499, 1628, 1373, 229, 485, 1131, 2162, 888, 890], [1923, 1155, 137, 2830, 1807, 532, 1557, 1559, 2585, 1561, 1331, 1332, 1461, 1081, 1339, 2752, 1346, 3139, 1348, 842, 843, 1356, 1482, 846, 849, 3027, 3028, 3029, 855, 2269, 2015, 2275, 2022, 2023, 1018], [129, 1922, 1154, 1924, 2181, 1926, 2055, 135, 2185, 2186, 651, 2060, 138, 134, 1807, 2196, 2198, 1177, 922, 1697, 673, 2084, 1958, 2219, 2228, 2229, 2230, 1718, 1588, 824, 1722, 2876, 2240, 704, 1986, 1987, 2256, 2257, 2258, 1105, 213, 2264, 2265, 2266, 1886, 2271, 992, 1381, 1382, 2023, 1253, 2537, 1131, 1906, 1910, 2169], [2048, 899, 1156, 2437, 262, 2439, 2057, 2059, 399, 146, 916, 1685, 2073, 1185, 1186, 299, 1201, 946, 947, 2485, 1214, 2114, 2013, 2014, 2527, 2017, 229, 2024, 1641, 1642, 2032, 1393, 2293, 892, 2047], [1156, 1925, 137, 139, 140, 797, 1999, 2264, 2266, 2271, 992, 1121, 1122, 1124, 111, 112, 113, 114, 117, 1917], [1798, 7, 1288, 1194, 2099, 2100, 2101, 2102, 1332, 1331, 2105, 1221, 1222, 1349, 980, 981, 1116, 

#### Market Basket Analysis by FP-Growth  

In [8]:
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 40)

CPU times: user 181 ms, sys: 1.03 ms, total: 182 ms
Wall time: 180 ms


In [9]:
#print(patterns)

In [10]:
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.8)

CPU times: user 0 ns, sys: 365 µs, total: 365 µs
Wall time: 367 µs


In [11]:
print(rules)

{(1473,): ((1475,), 0.803921568627451), (1332,): ((1331,), 0.8243243243243243), (1474, 1478): ((1475,), 0.8035714285714286), (1475, 1478): ((1474,), 0.8490566037735849), (138, 1156): ((137,), 0.8333333333333334), (140, 1156): ((137,), 0.8448275862068966), (1154, 1156): ((137,), 0.828125)}


In [12]:
results = []
for x in rules:
    ret = [x, rules[x][0], rules[x][1]]
    results.append(ret)
df_res = pd.DataFrame(results)
df_res.columns = ['LHS', 'RHS', 'Conf']

In [13]:
display(df_res.sort_values(by='Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf
3,"(1475, 1478)","(1474,)",0.849057
5,"(140, 1156)","(137,)",0.844828
4,"(138, 1156)","(137,)",0.833333
6,"(1154, 1156)","(137,)",0.828125
1,"(1332,)","(1331,)",0.824324
0,"(1473,)","(1475,)",0.803922
2,"(1474, 1478)","(1475,)",0.803571


#### Get original StockCode  

In [14]:
print(id2sc[1474])
print(id2sc[1475])
print(id2sc[1478])

22726
22727
22730


#### Calculation of Lift  

In [15]:
n_all = len(invoices)
lift = []
for i in range(df_res.shape[0]):
    rhs = df_res.at[i, 'RHS']
    conf = df_res.at[i, 'Conf']
    n_rhs = 0
    for items in invoices:
        if set(items) >= set(rhs):
            n_rhs += 1
    lift1 = conf / (n_rhs / n_all)
    lift.append(lift1)
    
df_res['Lift'] = lift

In [16]:
display(df_res.sort_values(by='Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf,Lift
3,"(1475, 1478)","(1474,)",0.849057,18.287883
5,"(140, 1156)","(137,)",0.844828,11.883621
4,"(138, 1156)","(137,)",0.833333,11.721939
6,"(1154, 1156)","(137,)",0.828125,11.648677
1,"(1332,)","(1331,)",0.824324,28.057558
0,"(1473,)","(1475,)",0.803922,16.540386
2,"(1474, 1478)","(1475,)",0.803571,16.533182
