# Association Analysis Using FP-Growth

Recommendation System

In [1]:
#pip install mlxtend

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

In [3]:
#Import the data
transactions=[]
with open ('E:/Dataset/practise/Groceries.csv') as f:
    for line in f:
        transaction=[item for item in line.strip().split(',') if item != 'NaN'] 
        #All cell value splited by , and checking if the value is not NaN
        transactions.append(transaction)

In [4]:
len(transactions)

9835

In [5]:
transactions[:3] #upto 3 transactions

[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],
 ['tropical fruit', 'yogurt', 'coffee'],
 ['whole milk']]

In [6]:
# create the object of TransactionEncoder
te = TransactionEncoder()
te_array=te.fit(transactions).transform(transactions)

In [7]:
te_array

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ...,  True, False, False],
       ...,
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [8]:
te.columns_

['Instant food products',
 'UHT-milk',
 'abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'baby food',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer',
 'bottled water',
 'brandy',
 'brown bread',
 'butter',
 'butter milk',
 'cake bar',
 'candles',
 'candy',
 'canned beer',
 'canned fish',
 'canned fruit',
 'canned vegetables',
 'cat food',
 'cereals',
 'chewing gum',
 'chicken',
 'chocolate',
 'chocolate marshmallow',
 'citrus fruit',
 'cleaner',
 'cling film/bags',
 'cocoa drinks',
 'coffee',
 'condensed milk',
 'cooking chocolate',
 'cookware',
 'cream',
 'cream cheese ',
 'curd',
 'curd cheese',
 'decalcifier',
 'dental care',
 'dessert',
 'detergent',
 'dish cleaner',
 'dishes',
 'dog food',
 'domestic eggs',
 'female sanitary products',
 'finished products',
 'fish',
 'flour',
 'flower (seeds)',
 'flower soil/fertilizer',
 'frankfurter',
 'frozen chicken',
 'frozen dessert',
 'frozen fish',
 'frozen fruits',
 'frozen m

In [9]:
#will transform into a dataframe
df = pd.DataFrame(te_array,columns=te.columns_)
df.head()

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [27]:
#Build the rules using FP Growth
rules_df = fpgrowth(df, min_support=0.01, use_colnames=True, max_len=2) #maxlen is maximum combination

In [28]:
rules_df

Unnamed: 0,support,itemsets
0,0.082766,(citrus fruit)
1,0.058566,(margarine)
2,0.017692,(semi-finished bread)
3,0.139502,(yogurt)
4,0.104931,(tropical fruit)
...,...,...
296,0.010168,"(frozen vegetables, rolls/buns)"
297,0.012405,"(yogurt, frozen vegetables)"
298,0.014235,"(other vegetables, onions)"
299,0.012100,"(whole milk, onions)"


In [12]:
rules_df = rules_df[rules_df.itemsets.map(len)==2] #extracting the combining products only

In [13]:
list(rules_df['itemsets'])[0]  #making a frozen set

frozenset({'citrus fruit', 'whole milk'})

In [14]:
tuple(list(rules_df['itemsets'])[0])[1]  #to drill down into frozen set

'citrus fruit'

In [15]:
#From
rules_df['From']=rules_df['itemsets'].map(lambda x:list(x)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules_df['From']=rules_df['itemsets'].map(lambda x:list(x)[0])


In [16]:
#To
rules_df['To']=rules_df['itemsets'].map(lambda x:list(x)[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules_df['To']=rules_df['itemsets'].map(lambda x:list(x)[1])


In [17]:
rules_df.head()

Unnamed: 0,support,itemsets,From,To
88,0.030503,"(whole milk, citrus fruit)",whole milk,citrus fruit
89,0.021657,"(yogurt, citrus fruit)",yogurt,citrus fruit
90,0.019929,"(tropical fruit, citrus fruit)",tropical fruit,citrus fruit
91,0.013523,"(citrus fruit, bottled water)",citrus fruit,bottled water
92,0.028876,"(other vegetables, citrus fruit)",other vegetables,citrus fruit


In [18]:
#N = Support * Total number of transactions
rules_df['N']= (rules_df['support']*len(transactions)).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules_df['N']= (rules_df['support']*len(transactions)).astype(int)


In [19]:
rules_df.head()

Unnamed: 0,support,itemsets,From,To,N
88,0.030503,"(whole milk, citrus fruit)",whole milk,citrus fruit,300
89,0.021657,"(yogurt, citrus fruit)",yogurt,citrus fruit,213
90,0.019929,"(tropical fruit, citrus fruit)",tropical fruit,citrus fruit,196
91,0.013523,"(citrus fruit, bottled water)",citrus fruit,bottled water,133
92,0.028876,"(other vegetables, citrus fruit)",other vegetables,citrus fruit,284


In [20]:
#Sorting the data based on N values 
rules_df.sort_values('N',ascending=False, inplace=True)
rules_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules_df.sort_values('N',ascending=False, inplace=True)


Unnamed: 0,support,itemsets,From,To,N
146,0.074835,"(other vegetables, whole milk)",other vegetables,whole milk,736
157,0.056634,"(whole milk, rolls/buns)",whole milk,rolls/buns,557
107,0.056024,"(yogurt, whole milk)",yogurt,whole milk,551
240,0.048907,"(whole milk, root vegetables)",whole milk,root vegetables,481
238,0.047382,"(other vegetables, root vegetables)",other vegetables,root vegetables,466


In [21]:
#lets take top 50 rules
rules_50=rules_df.head(50)
rules_50

Unnamed: 0,support,itemsets,From,To,N
146,0.074835,"(other vegetables, whole milk)",other vegetables,whole milk,736
157,0.056634,"(whole milk, rolls/buns)",whole milk,rolls/buns,557
107,0.056024,"(yogurt, whole milk)",yogurt,whole milk,551
240,0.048907,"(whole milk, root vegetables)",whole milk,root vegetables,481
238,0.047382,"(other vegetables, root vegetables)",other vegetables,root vegetables,466
110,0.043416,"(yogurt, other vegetables)",yogurt,other vegetables,427
156,0.042603,"(other vegetables, rolls/buns)",other vegetables,rolls/buns,419
118,0.042298,"(whole milk, tropical fruit)",whole milk,tropical fruit,416
191,0.040061,"(whole milk, soda)",whole milk,soda,394
190,0.038332,"(soda, rolls/buns)",soda,rolls/buns,377


In [22]:
#list of all items
items=set(rules_df['From']) | set(rules_50['To']) 

In [23]:
items

{'beef',
 'berries',
 'bottled beer',
 'bottled water',
 'brown bread',
 'butter',
 'canned beer',
 'chocolate',
 'citrus fruit',
 'coffee',
 'curd',
 'dessert',
 'domestic eggs',
 'frankfurter',
 'frozen vegetables',
 'fruit/vegetable juice',
 'long life bakery product',
 'margarine',
 'napkins',
 'newspapers',
 'other vegetables',
 'pastry',
 'pip fruit',
 'pork',
 'rolls/buns',
 'root vegetables',
 'sausage',
 'shopping bags',
 'sliced cheese',
 'soda',
 'sugar',
 'tropical fruit',
 'whipped/sour cream',
 'whole milk',
 'yogurt'}

In [24]:
#for i,itm in enumerate(items):
  #  print(i,itm)

In [25]:
#creating mapping of items to numbers
imap = {item: i for i, item in enumerate(items)}
#Maps the items to number and ads the numeric 'FromN' and 'ToN'
rules_50['FromN']=rules_50['From'].map(imap)
rules_50['ToN']=rules_50['To'].map(imap)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules_50['FromN']=rules_50['From'].map(imap)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules_50['ToN']=rules_50['To'].map(imap)


In [26]:
rules_50

Unnamed: 0,support,itemsets,From,To,N,FromN,ToN
146,0.074835,"(other vegetables, whole milk)",other vegetables,whole milk,736,0,21
157,0.056634,"(whole milk, rolls/buns)",whole milk,rolls/buns,557,21,12
107,0.056024,"(yogurt, whole milk)",yogurt,whole milk,551,19,21
240,0.048907,"(whole milk, root vegetables)",whole milk,root vegetables,481,21,7
238,0.047382,"(other vegetables, root vegetables)",other vegetables,root vegetables,466,0,7
110,0.043416,"(yogurt, other vegetables)",yogurt,other vegetables,427,19,0
156,0.042603,"(other vegetables, rolls/buns)",other vegetables,rolls/buns,419,0,12
118,0.042298,"(whole milk, tropical fruit)",whole milk,tropical fruit,416,21,10
191,0.040061,"(whole milk, soda)",whole milk,soda,394,21,34
190,0.038332,"(soda, rolls/buns)",soda,rolls/buns,377,34,12
