# Market basket analysis

### What items should I recommend based on previous purchases?

In [1]:
## import the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlxtend.preprocessing
import mlxtend.frequent_patterns

In [2]:
## load groceries data

groceries_df = pd.read_csv('grocery_assignment.csv')

## check data

groceries_df.head()

Unnamed: 0.1,Unnamed: 0,transaction_id,Member_number,Date,itemDescription,year,month,day,day_of_week
0,0,1,5000,2015-10-02,soda,2015,10,2,4
1,1,1,5000,2015-10-02,root vegetables,2015,10,2,4
2,2,1,5000,2015-10-02,semi-finished bread,2015,10,2,4
3,3,2,5000,2014-11-16,bottled beer,2014,11,16,6
4,4,2,5000,2014-11-16,other vegetables,2014,11,16,6


In [3]:
## Since every transaction is on individual rows groupby function is used to combine transaction_id and itemDescription
## into one row for each transaction
## Also creating the list of lists

grocery_list = groceries_df.groupby(['transaction_id'])['itemDescription'].apply(list).values.tolist()


## checking that the code works

grocery_list[:10]

[['soda', 'root vegetables', 'semi-finished bread'],
 ['bottled beer', 'other vegetables'],
 ['fruit/vegetable juice', 'onions'],
 ['bottled water', 'herbs'],
 ['butter milk', 'whipped/sour cream'],
 ['berries', 'onions'],
 ['tropical fruit',
  'berries',
  'other vegetables',
  'yogurt',
  'kitchen towels',
  'napkins'],
 ['semi-finished bread', 'newspapers'],
 ['other vegetables', 'detergent'],
 ['rolls/buns', 'curd']]

In [4]:
# Creating the transactional format 

## the encoder

encoder = mlxtend.preprocessing.TransactionEncoder().fit(grocery_list)

# Then the transactional format
encoded_data = encoder.transform(grocery_list)

# Convert it to dataframe
grocery_trans = pd.DataFrame(encoded_data, columns = encoder.columns_)


## check the results

grocery_trans.head()

## There are a total of 167 different columns (items)

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [5]:
# Check the support value in order to get an idea about the treshold value for min_support

# Based on the results I will choose a very small support
support = grocery_trans.mean()
support = pd.DataFrame(support, columns=['support']).sort_values('support',ascending=False)

print(support)

                        support
whole milk             0.157923
other vegetables       0.122101
rolls/buns             0.110005
soda                   0.097106
yogurt                 0.085879
...                         ...
rubbing alcohol        0.000334
bags                   0.000267
baby cosmetics         0.000200
kitchen utensil        0.000067
preservation products  0.000067

[167 rows x 1 columns]


In [6]:
# Check the frequent itemset with different values in order to select one that will yield a good amount of rules

frequent_itemsets_1 = mlxtend.frequent_patterns.apriori(grocery_trans, min_support = 0.001, max_len = 3, use_colnames = True)
frequent_itemsets_2 = mlxtend.frequent_patterns.apriori(grocery_trans, min_support = 0.05, max_len = 3, use_colnames = True)
frequent_itemsets_3 = mlxtend.frequent_patterns.apriori(grocery_trans, min_support = 0.01, max_len = 2, use_colnames = True)

# Print the frequent itemsets

print('Frequent itemset with min_support 0.001:', len(frequent_itemsets_1),)
print('Frequent itemset with min_support 0.05:', len(frequent_itemsets_2))
print('Frequent itemset with min_support 0.01:', len(frequent_itemsets_3),)


## Result: 

## Frequent itemset with min_support 0.001: 750
## Frequent itemset with min_support 0.05: 11
## Frequent itemset with min_support 0.01: 69

## I will use  the frequent_itemsets based on min_support = 0.001, max_len = 3

Frequent itemset with min_support 0.001: 750
Frequent itemset with min_support 0.05: 11
Frequent itemset with min_support 0.01: 69


In [7]:
## the final frequent_itemsets with min_support 0.001 and max_len 3

frequent_itemsets = mlxtend.frequent_patterns.apriori(grocery_trans, min_support = 0.001, max_len = 3, use_colnames = True)

# check the results
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.004010,(Instant food products)
1,0.021386,(UHT-milk)
2,0.001470,(abrasive cleaner)
3,0.001938,(artif. sweetener)
4,0.008087,(baking powder)
...,...,...
745,0.001136,"(sausage, whole milk, rolls/buns)"
746,0.001002,"(whole milk, soda, rolls/buns)"
747,0.001337,"(whole milk, rolls/buns, yogurt)"
748,0.001069,"(sausage, whole milk, soda)"


In [8]:
# Now I will determine the relevant rules based on lift, of which 1.0 is the critical value

## I chose to focus on lift, since that is easier to decide than confidence. If I wanted to further filter the rules
## I would set some more specific parameters on antecedent support, consequent support and confidence 

## Results: 240 rules
## based on the amount of remaining rules I will not filter them any further

rules = mlxtend.frequent_patterns.association_rules(frequent_itemsets, metric = "lift", min_threshold = 1.0)

#Print association rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(tropical fruit),(UHT-milk),0.067767,0.021386,0.001537,0.022682,1.060617,8.785064e-05,1.001326
1,(UHT-milk),(tropical fruit),0.021386,0.067767,0.001537,0.071875,1.060617,8.785064e-05,1.004426
2,(brown bread),(beef),0.037626,0.033950,0.001537,0.040853,1.203301,2.597018e-04,1.007196
3,(beef),(brown bread),0.033950,0.037626,0.001537,0.045276,1.203301,2.597018e-04,1.008012
4,(beef),(citrus fruit),0.033950,0.053131,0.001804,0.053150,1.000349,6.297697e-07,1.000020
...,...,...,...,...,...,...,...,...,...
235,"(sausage, yogurt)",(whole milk),0.005748,0.157923,0.001470,0.255814,1.619866,5.626300e-04,1.131541
236,"(whole milk, yogurt)",(sausage),0.011161,0.060349,0.001470,0.131737,2.182917,7.967480e-04,1.082219
237,(sausage),"(whole milk, yogurt)",0.060349,0.011161,0.001470,0.024363,2.182917,7.967480e-04,1.013532
238,(whole milk),"(sausage, yogurt)",0.157923,0.005748,0.001470,0.009310,1.619866,5.626300e-04,1.003596


In [9]:
## Recommendations based on sausage

## there are 16 rules containing sausage (counting the instances where sausage is the ONLY antecedent)

## Some recommendations based on sausage: beveraged, bottled beer, curd, dessert, frozen meals, frozen vegetables, grapes,
## misc beverages, pastry, salty snack, sliced cheese, soda, yogurt, whole milk, rolls/buns

sausage = rules['antecedents'].apply(lambda x: 'sausage' in x)
rules[sausage]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
20,(sausage),(beverages),0.060349,0.016574,0.001537,0.025471,1.536764,0.000537,1.009129
32,(sausage),(bottled beer),0.060349,0.045312,0.003342,0.055371,1.222,0.000607,1.010649
102,(sausage),(curd),0.060349,0.033683,0.002941,0.048726,1.446615,0.000908,1.015814
106,(sausage),(dessert),0.060349,0.023592,0.00147,0.024363,1.032711,4.7e-05,1.000791
132,(sausage),(frozen meals),0.060349,0.016775,0.00127,0.021041,1.254327,0.000257,1.004358
136,(sausage),(frozen vegetables),0.060349,0.028002,0.002072,0.03433,1.225966,0.000382,1.006553
143,(sausage),(grapes),0.060349,0.014436,0.001069,0.017719,1.227431,0.000198,1.003342
166,(sausage),(misc. beverages),0.060349,0.015772,0.001069,0.017719,1.123412,0.000117,1.001982
178,(sausage),(pastry),0.060349,0.051728,0.003208,0.053156,1.027617,8.6e-05,1.001509
194,(sausage),(salty snack),0.060349,0.01878,0.001136,0.018826,1.002475,3e-06,1.000047


In [13]:
## Recommendation based on citrus fruit

## 7 rules

## Recommendations: beef, butter, candy, frozen vegetable, napkins, specialty chocolate, yogurt

Citrus = rules['antecedents'].apply(lambda x: 'citrus fruit' in x)
rules[Citrus]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
5,(citrus fruit),(beef),0.053131,0.03395,0.001804,0.033962,1.000349,6.297697e-07,1.000012
47,(citrus fruit),(butter),0.053131,0.03522,0.001938,0.036478,1.035712,6.682705e-05,1.001305
58,(citrus fruit),(candy),0.053131,0.014369,0.001002,0.018868,1.31312,0.0002390445,1.004586
83,(citrus fruit),(frozen vegetables),0.053131,0.028002,0.001604,0.030189,1.078074,0.000116159,1.002254
84,(citrus fruit),(napkins),0.053131,0.022121,0.001403,0.026415,1.194106,0.0002281374,1.00441
87,(citrus fruit),(specialty chocolate),0.053131,0.015973,0.001403,0.026415,1.653762,0.0005548137,1.010726
88,(citrus fruit),(yogurt),0.053131,0.085879,0.004611,0.086792,1.010642,4.855926e-05,1.001001
