### Sample program for Association Analysis (Market Basket Analysis) using FP-Growth  

#### Import libraries  

In [4]:
import pandas as pd
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [5]:
csv_in = 'dm-12-quiz.csv'

#### Read CSV file  

In [6]:
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
print(df.info())
display(df.head())

(318, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   InvoiceNo  318 non-null    object
 1   ItemNo     318 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.1+ KB
None


Unnamed: 0,InvoiceNo,ItemNo
0,T001,5
1,T001,4
2,T001,6
3,T002,2
4,T003,2


In [19]:
set1 = set(df['ItemNo'].values.flatten())
print(len(set1))

7


#### Format transaction data  
Before:  
```
invoice1, stockcode1, ...  
invoice1, stockcode2, ...  
invoice2, stockcode1, ...  
invoice3, stockcode1, ...  
invoice3, stockcode3, ...  
```

After:  
```
[  
  [stockcode1, stockcode2],    
  [stockcode1],  
  [stockcode1, stockcode3],  
  ...  
]  
```

and stockcode should be interger.  

#### Assign an integer (ID) to each stockcode  

In [20]:
id2item = list(set1)
item2id = {}
for i in range(len(id2item)):
    item2id[id2item[i]] = i

In [17]:
df_id = df.applymap(lambda x: item2id[x])
display(df_id)

Unnamed: 0,InvoiceNo,ItemNo
0,62,5
1,62,4
2,62,6
3,127,2
4,55,2
...,...,...
313,110,2
314,102,4
315,72,1
316,72,2


#### Grouping transaction data by InvoiceNo  

In [22]:
invoices = []
for ser in df.groupby('InvoiceNo'):
    s = ser[ser>0]
    invoices.append(list(s))
print(len(invoices))

TypeError: '>' not supported between instances of 'tuple' and 'int'

#### Market Basket Analysis by FP-Growth  

In [11]:
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 15)

CPU times: user 287 µs, sys: 237 µs, total: 524 µs
Wall time: 527 µs


In [12]:
#print(patterns)

In [13]:
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.9)

CPU times: user 6 µs, sys: 5 µs, total: 11 µs
Wall time: 12.9 µs


In [14]:
print(rules)

{}


In [15]:
results = []
for x in rules:
    ret = [x, rules[x][0], rules[x][1]]
    results.append(ret)
df_res = pd.DataFrame(results)
df_res.columns = ['LHS', 'RHS', 'Conf']

ValueError: Length mismatch: Expected axis has 0 elements, new values have 3 elements

In [62]:
display(df_res.sort_values(by='Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf
1,"(5, 17, 67)","(82,)",1.0
2,"(46, 55, 104)","(82,)",0.9375
5,"(1, 67, 96, 158)","(82,)",0.9375
0,"(58, 119)","(76,)",0.904762
4,"(1, 5, 73)","(104,)",0.904762
3,"(1, 5, 99)","(82,)",0.9


#### Get original StockCode  

In [66]:
print(id2item[5])
print(id2item[17])
print(id2item[67])
print(id2item[82])

whipped/sour cream
flour
root vegetables
whole milk


#### Calculation of Lift  

In [64]:
n_all = len(invoices)
lift = []
for i in range(df_res.shape[0]):
    rhs = df_res.at[i, 'RHS']
    conf = df_res.at[i, 'Conf']
    n_rhs = 0
    for items in invoices:
        if set(items) >= set(rhs):
            n_rhs += 1
    lift1 = conf / (n_rhs / n_all)
    lift.append(lift1)
    
df_res['Lift'] = lift

In [65]:
display(df_res.sort_values(by='Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf,Lift
1,"(5, 17, 67)","(82,)",1.0,3.913649
2,"(46, 55, 104)","(82,)",0.9375,3.669046
5,"(1, 67, 96, 158)","(82,)",0.9375,3.669046
0,"(58, 119)","(76,)",0.904762,11.235269
4,"(1, 5, 73)","(104,)",0.904762,4.67595
3,"(1, 5, 99)","(82,)",0.9,3.522284
