# Overview

The instruction is to practice market basket analysis (association rules).

The rules are as below:
* Support minimum of 0.002
* Confidence minimum of 0.05
* Lift a minimum of 3

In [1]:
#Packages
!pip install apyori  ## Installing apriori library
import numpy as np # linear algebra
import pandas as pd # Data pre-processing
import seaborn as sns # Required for plotting

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5973 sha256=b5b126f3c5d37ee439d79c70d6acbe9e086719f3ab57ac8b329795eec1969c32
  Stored in directory: /root/.cache/pip/wheels/1b/02/6c/a45230be8603bd95c0a51cd2b289aefdd860c1a100eab73661
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


# import dataset

In [3]:
# import dataset from spreadsheet
sheet_url = 'https://drive.google.com/file/d/1vfmsAn7vWiHF-rVoFsmu0XINeuWsvxkB/view?usp=sharing' 
#use this trick to get the id of the file
sheet = 'https://drive.google.com/uc?id=' + sheet_url.split('/')[-2] 

#add sep=';' parameter to pandas read_csv function in order to tell the function, to use semicolon (;) as separator
df = pd.read_csv (sheet, sep=',') 
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB


In [6]:
df.isnull().sum().sort_values(ascending=False)

Member_number      0
Date               0
itemDescription    0
dtype: int64

In [8]:
df['Member_number'] = df['Member_number'].apply(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  object
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: object(3)
memory usage: 908.7+ KB


# Data Pre Processing

In [9]:
member= df[['Member_number', 'itemDescription']].sort_values(by = 'Member_number', ascending = False) ## Selecting only required variables for modelling
member['itemDescription'] = member['itemDescription'].str.strip() # Removing white spaces if any
member

Unnamed: 0,Member_number,itemDescription
3578,5000,soda
34885,5000,semi-finished bread
11728,5000,fruit/vegetable juice
9340,5000,bottled beer
19727,5000,root vegetables
...,...,...
13331,1000,whole milk
17778,1000,pickled vegetables
6388,1000,sausage
20992,1000,semi-finished bread


In [12]:
trx = [a[1]['itemDescription'].tolist() for a in list(member.groupby(['Member_number']))] ## Combing all the items in list format for each cutomer
trx

[['misc. beverages',
  'sausage',
  'soda',
  'yogurt',
  'salty snack',
  'pastry',
  'canned beer',
  'hygiene articles',
  'whole milk',
  'pickled vegetables',
  'sausage',
  'semi-finished bread',
  'whole milk'],
 ['beef',
  'sausage',
  'frankfurter',
  'soda',
  'whipped/sour cream',
  'frankfurter',
  'rolls/buns',
  'soda',
  'white bread',
  'whole milk',
  'whole milk',
  'curd'],
 ['specialty chocolate',
  'butter',
  'whole milk',
  'butter milk',
  'tropical fruit',
  'sugar',
  'other vegetables',
  'frozen vegetables'],
 ['dental care',
  'rolls/buns',
  'rolls/buns',
  'frozen meals',
  'rolls/buns',
  'sausage',
  'detergent',
  'root vegetables'],
 ['frozen fish',
  'shopping bags',
  'hygiene articles',
  'rolls/buns',
  'whole milk',
  'canned beer',
  'root vegetables',
  'tropical fruit',
  'whole milk',
  'chocolate',
  'other vegetables',
  'red/blush wine',
  'pastry',
  'rolls/buns',
  'packaged fruit/vegetables',
  'dish cleaner',
  'pip fruit',
  'other ve

# Importing and Setting the apriori options
* 1. Minimum Support
* 2. Minimum Confidence
* 3. Minimum Lift
* 4. Length = the number of items 

In [13]:
#Importing and Setting the apriori options
#1. Minimum Support
#2. Minimum Confidence
#3. Minimum Lift
#4. Lenght = the number of items 

In [15]:
from apyori import apriori ## Importing apriori package
rules = apriori(transactions = trx, min_support = 0.002, min_confidence = 0.05, min_lift = 3, min_length = 2, max_length = 2) ## Model Creation

In [16]:
results = list(rules)

In [17]:
## Creating user-defined function for arranging the results obtained from model into readable format

def inspect(results):
    x         = [tuple(result[2][0][0])[0] for result in results]
    y         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    return list(zip(x, y, supports, confidences, lifts))
resultsinDataFrame = pd.DataFrame(inspect(results), columns = ['X', 'Y', 'Support', 'Confidence', 'Lift'])

In [18]:
resultsinDataFrame.nlargest(n=10, columns="Lift") ## Showing best possible scenarios

Unnamed: 0,X,Y,Support,Confidence,Lift
0,kitchen towels,UHT-milk,0.002309,0.3,3.821569
1,potato products,beef,0.002565,0.454545,3.802185
2,canned fruit,coffee,0.002309,0.428571,3.728954
4,flour,mayonnaise,0.002309,0.06338,3.338599
6,sparkling wine,waffles,0.002565,0.217391,3.150154
5,rice,napkins,0.003079,0.244898,3.011395
3,meat spreads,domestic eggs,0.003592,0.4,3.004239
